rav1e-0.7.1/.cargo_vcs_info.json0000644000000001360000000000100120570ustar { "git": { "sha1": "a8d05d0c43826a465b60dbadd0ab7f1327d75371" }, "path_in_vcs": "" }rav1e-0.7.1/Cargo.lock0000644000001504770000000000100100500ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "addr2line" version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" dependencies = [ "gimli", ] [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "aho-corasick" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" dependencies = [ "memchr", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anstyle-parse" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", "windows-sys 0.52.0", ] [[package]] name = "anyhow" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" [[package]] name = "aom-sys" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "623675d7dbe1d65db81582231b0971384277a4dd2006763021ba2436e86f812d" dependencies = [ "bindgen", "system-deps", ] [[package]] name = "arbitrary" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" [[package]] name = "arg_enum_proc_macro" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "arrayvec" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" dependencies = [ "serde", ] [[package]] name = "assert_cmd" version = "2.0.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "88903cb14723e4d4003335bb7f8a14f27691649105346a0f0957466c096adfe6" dependencies = [ "anstyle", "bstr", "doc-comment", "predicates", "predicates-core", "predicates-tree", "wait-timeout", ] [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "av-metrics" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "996ce95bbdb0203e5b91d4a0c9b81c0d67d11c80f884482a0c1ea19e732e3530" dependencies = [ "crossbeam", "itertools 0.10.5", "lab", "num-traits", "rayon", "thiserror", "v_frame", ] [[package]] name = "av1-grain" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6678909d8c5d46a42abcf571271e15fdbc0a225e3646cf23762cd415046c78bf" dependencies = [ "anyhow", "arrayvec", "log", "nom", "num-rational", "serde", "v_frame", ] [[package]] name = "backtrace" version = "0.3.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" dependencies = [ "addr2line", "cc", "cfg-if", "libc", "miniz_oxide", "object", "rustc-demangle", ] [[package]] name = "bindgen" version = "0.69.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2" dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", "lazy_static", "lazycell", "log", "peeking_take_while", "prettyplease", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", "syn", "which", ] [[package]] name = "bitflags" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "bitstream-io" version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06c9989a51171e2e81038ab168b6ae22886fe9ded214430dbb4f41c28cf176da" [[package]] name = "bstr" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", "regex-automata", "serde", ] [[package]] name = "built" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d17f4d6e4dc36d1a02fbedc2753a096848e7c1b0772f7654eab8e2c927dd53" dependencies = [ "git2", ] [[package]] name = "bumpalo" version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "bytemuck" version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.0.83" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ "jobserver", "libc", ] [[package]] name = "cexpr" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ "nom", ] [[package]] name = "cfg-expr" version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6100bc57b6209840798d95cb2775684849d332f7bd788db2a8c8caf7ef82a41a" dependencies = [ "smallvec", "target-lexicon", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "ciborium" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" [[package]] name = "ciborium-ll" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", "half", ] [[package]] name = "clang-sys" version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67523a3b4be3ce1989d607a828d036249522dd9c1c8de7f4dd2dae43a37369d1" dependencies = [ "glob", "libc", "libloading", ] [[package]] name = "clap" version = "4.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33e92c5c1a78c62968ec57dbc2440366a2d6e5a23faf829970ff1585dc6b18e2" dependencies = [ "clap_builder", "clap_derive", ] [[package]] name = "clap_builder" version = "4.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4323769dc8a61e2c39ad7dc26f6f2800524691a44d74fe3d1071a5c24db6370" dependencies = [ "anstream", "anstyle", "clap_lex", "terminal_size", ] [[package]] name = "clap_complete" version = "4.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97aeaa95557bd02f23fbb662f981670c3d20c5a26e69f7354b28f57092437fcd" dependencies = [ "clap", ] [[package]] name = "clap_derive" version = "4.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" dependencies = [ "heck", "proc-macro2", "quote", "syn", ] [[package]] name = "clap_lex" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" [[package]] name = "color_quant" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorchoice" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] name = "console" version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" dependencies = [ "encode_unicode", "lazy_static", "libc", 
"unicode-width", "windows-sys 0.52.0", ] [[package]] name = "crc32fast" version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" dependencies = [ "cfg-if", ] [[package]] name = "criterion" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" dependencies = [ "anes", "cast", "ciborium", "clap", "criterion-plot", "is-terminal", "itertools 0.10.5", "num-traits", "once_cell", "oorandom", "plotters", "rayon", "regex", "serde", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools 0.10.5", ] [[package]] name = "crossbeam" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-epoch", "crossbeam-queue", "crossbeam-utils", ] [[package]] name = "crossbeam-channel" version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "176dc175b78f56c0f321911d9c8eb2b77a78a4860b9c19db83835fea1a46649b" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-deque" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-queue" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" [[package]] name = "diff" version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" [[package]] name = "difflib" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" [[package]] name = "doc-comment" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "either" version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "encode_unicode" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "env_logger" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "log", "regex", ] [[package]] name = "equivalent" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", "windows-sys 0.52.0", ] [[package]] name = "fdeflate" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209098dd6dfc4445aa6111f0e98653ac323eaa4dfd212c9ca3931bf9955c31bd" dependencies = [ "simd-adler32", ] [[package]] name = "fern" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9f0c14694cbd524c8720dd69b0e3179344f04ebb5f90f2e4a440c6ea3b2f1ee" dependencies = [ "log", ] [[package]] name = "flate2" version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "miniz_oxide", ] [[package]] name = "form_urlencoded" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ "percent-encoding", ] [[package]] name = "getrandom" version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "gimli" version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] name = "git2" version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fbf97ba92db08df386e10c8ede66a2a0369bd277090afd8710e19e38de9ec0cd" dependencies = [ "bitflags 2.4.1", "libc", "libgit2-sys", "log", "url", ] [[package]] name = "glob" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "half" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" [[package]] name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" [[package]] name = "heck" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "home" version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = "idna" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" dependencies = [ "unicode-bidi", "unicode-normalization", 
] [[package]] name = "image" version = "0.24.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f3dfdbdd72063086ff443e297b61695500514b1e41095b6fb9a5ab48a70a711" dependencies = [ "bytemuck", "byteorder", "color_quant", "num-rational", "num-traits", "png", ] [[package]] name = "indexmap" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", "hashbrown", ] [[package]] name = "interpolate_name" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "is-terminal" version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", "rustix", "windows-sys 0.52.0", ] [[package]] name = "itertools" version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" dependencies = [ "either", ] [[package]] name = "itertools" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" dependencies = [ "either", ] [[package]] name = "itertools" version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" dependencies = [ "either", ] [[package]] name = "itoa" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "ivf" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552f657140ee72c552b728601179c10abea14cd7d815de2d75d75dea42485eca" dependencies = [ "bitstream-io", ] [[package]] name = "jobserver" version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" dependencies = [ "libc", ] [[package]] name = "js-sys" version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" dependencies = [ "wasm-bindgen", ] [[package]] name = "lab" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf36173d4167ed999940f804952e6b08197cae5ad5d572eb4db150ce8ad5d58f" [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "lazycell" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "libdav1d-sys" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12c9cc342dc258130a727ad15f48d01ebb181aafec30dd65338d8e51db930572" dependencies = [ "libc", ] [[package]] name = 
"libfuzzer-sys" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" dependencies = [ "arbitrary", "cc", "once_cell", ] [[package]] name = "libgit2-sys" version = "0.16.1+1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2a2bb3680b094add03bb3732ec520ece34da31a8cd2d633d1389d0f0fb60d0c" dependencies = [ "cc", "libc", "libz-sys", "pkg-config", ] [[package]] name = "libloading" version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c571b676ddfc9a8c12f1f3d3085a7b163966a8fd8098a90640953ce5f6170161" dependencies = [ "cfg-if", "windows-sys 0.48.0", ] [[package]] name = "libz-sys" version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "295c17e837573c8c821dbaeb3cceb3d745ad082f7572191409e69cbc1b3fd050" dependencies = [ "cc", "libc", "pkg-config", "vcpkg", ] [[package]] name = "linux-raw-sys" version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" [[package]] name = "log" version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "maybe-rayon" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea1f30cedd69f0a2954655f7188c6a834246d2bcf1e315e2ac40c4b24dc9519" dependencies = [ "cfg-if", "rayon", ] [[package]] name = "memchr" version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "minimal-lexical" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" dependencies = [ "adler", "simd-adler32", ] [[package]] name = "nasm-rs" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe4d98d0065f4b1daf164b3eafb11974c94662e5e2396cf03f32d0bb5c17da51" dependencies = [ "rayon", ] [[package]] name = "new_debug_unreachable" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "nom" version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", ] [[package]] name = "noop_proc_macro" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0676bb32a98c1a483ce53e500a81ad9c3d5b3f7c920c28c24e9cb0980d0b5bc8" [[package]] name = "nu-ansi-term" version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" dependencies = [ "overload", "winapi", ] [[package]] name = "num-bigint" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" dependencies = [ "autocfg", "num-integer", 
"num-traits", ] [[package]] name = "num-derive" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfb77679af88f8b125209d354a202862602672222e7f2313fdd6dc349bad4712" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "num-integer" version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", ] [[package]] name = "num-rational" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", "num-bigint", "num-integer", "num-traits", ] [[package]] name = "num-traits" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] [[package]] name = "object" version = "0.32.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" dependencies = [ "memchr", ] [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "overload" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "paste" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" [[package]] name = "peeking_take_while" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "percent-encoding" version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pin-project-lite" version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" [[package]] name = "pkg-config" version = "0.3.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" [[package]] name = "plotters" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" dependencies = [ "num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" [[package]] name = "plotters-svg" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" dependencies = [ "plotters-backend", ] [[package]] name = "png" version = "0.17.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "dd75bf2d8dd3702b9707cdbc56a5b9ef42cec752eb8b3bafc01234558442aa64" dependencies = [ "bitflags 1.3.2", "crc32fast", "fdeflate", "flate2", "miniz_oxide", ] [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "predicates" version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" dependencies = [ "anstyle", "difflib", "itertools 0.11.0", "predicates-core", ] [[package]] name = "predicates-core" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b794032607612e7abeb4db69adb4e33590fa6cf1149e95fd7cb00e634b92f174" [[package]] name = "predicates-tree" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "368ba315fb8c5052ab692e68a0eefec6ec57b23a36959c14496f0b0df2c0cecf" dependencies = [ "predicates-core", "termtree", ] [[package]] name = "pretty_assertions" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" dependencies = [ "diff", "yansi", ] [[package]] name = "prettyplease" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", "syn", ] [[package]] name = "proc-macro2" version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" dependencies = [ "unicode-ident", ] [[package]] name = "profiling" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d135ede8821cf6376eb7a64148901e1690b788c11ae94dc297ae917dbc91dc0e" dependencies = [ "profiling-procmacros", "tracing", ] [[package]] name = "profiling-procmacros" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b322d7d65c1ab449be3c890fcbd0db6e1092d0dd05d79dba2dd28032cebeb05" dependencies = [ "quote", "syn", ] [[package]] name = "quickcheck" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6" dependencies = [ "env_logger", "log", "rand", ] [[package]] name = "quote" version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", "rand_core", ] [[package]] name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ "getrandom", ] [[package]] name = "rav1e" version = "0.7.1" dependencies = 
[ "aom-sys", "arbitrary", "arg_enum_proc_macro", "arrayvec", "assert_cmd", "av-metrics", "av1-grain", "backtrace", "bitstream-io", "built", "byteorder", "cc", "cfg-if", "clap", "clap_complete", "console", "criterion", "crossbeam", "fern", "image", "interpolate_name", "itertools 0.12.0", "ivf", "libc", "libdav1d-sys", "libfuzzer-sys", "log", "maybe-rayon", "nasm-rs", "new_debug_unreachable", "nom", "noop_proc_macro", "num-derive", "num-traits", "once_cell", "paste", "pretty_assertions", "profiling", "quickcheck", "rand", "rand_chacha", "scan_fmt", "semver", "serde", "serde-big-array", "signal-hook", "simd_helpers", "system-deps", "thiserror", "toml", "tracing", "tracing-chrome", "tracing-subscriber", "v_frame", "wasm-bindgen", "y4m", ] [[package]] name = "rayon" version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "regex" version = "1.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" [[package]] name = "rustc-demangle" version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustc-hash" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] name = "rustix" version = "0.38.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" dependencies = [ "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", "windows-sys 0.52.0", ] [[package]] name = "ryu" version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "scan_fmt" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b53b0a5db882a8e2fdaae0a43f7b39e7e9082389e978398bdf223a55b581248" [[package]] name = "semver" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" [[package]] name = "serde" version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"63261df402c67811e9ac6def069e4786148c4563f4b50fd4bf30aa370d626b02" dependencies = [ "serde_derive", ] [[package]] name = "serde-big-array" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11fc7cc2c76d73e0f27ee52abbd64eec84d46f370c88371120433196934e4b7f" dependencies = [ "serde", ] [[package]] name = "serde_derive" version = "1.0.195" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46fe8f8603d81ba86327b23a2e9cdf49e1255fb94a4c5f297f6ee0547178ea2c" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "176e46fa42316f18edd598015a5166857fc835ec732f5215eac6b7bdbf0a84f4" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "serde_spanned" version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1" dependencies = [ "serde", ] [[package]] name = "sharded-slab" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ "lazy_static", ] [[package]] name = "shlex" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" [[package]] name = "signal-hook" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" dependencies = [ "libc", "signal-hook-registry", ] [[package]] name = "signal-hook-registry" version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" dependencies = [ "libc", ] [[package]] name = "simd-adler32" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] name = "simd_helpers" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95890f873bec569a0362c235787f3aca6e1e887302ba4840839bcc6459c42da6" dependencies = [ "quote", ] [[package]] name = "smallvec" version = "1.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" [[package]] name = "syn" version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "system-deps" version = "6.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a2d580ff6a20c55dfb86be5f9c238f67835d0e81cbdea8bf5680e0897320331" dependencies = [ "cfg-expr", "heck", "pkg-config", "toml", "version-compare", ] [[package]] name = "target-lexicon" version = "0.12.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69758bda2e78f098e4ccb393021a0963bb3442eac05f135c30f61b7370bbafae" [[package]] name = "terminal_size" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" dependencies = [ "rustix", "windows-sys 0.48.0", ] [[package]] name = "termtree" version = "0.4.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "thiserror" version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" version = "1.0.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "thread_local" version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" dependencies = [ "cfg-if", "once_cell", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "tinyvec" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" dependencies = [ "tinyvec_macros", ] [[package]] name = "tinyvec_macros" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "toml" version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1a195ec8c9da26928f773888e0742ca3ca1040c6cd859c919c9f59c1954ab35" dependencies = [ "serde", "serde_spanned", "toml_datetime", "toml_edit", ] [[package]] name = "toml_datetime" version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" dependencies = [ "serde", ] [[package]] name = "toml_edit" version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d34d383cd00a163b4a5b85053df514d45bc330f6de7737edfe0a93311d1eaa03" dependencies = [ "indexmap", "serde", "serde_spanned", "toml_datetime", "winnow", ] [[package]] name = "tracing" version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ "pin-project-lite", "tracing-attributes", "tracing-core", ] [[package]] name = "tracing-attributes" version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "tracing-chrome" version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "496b3cd5447f7ff527bbbf19b071ad542a000adf297d4127078b4dfdb931f41a" dependencies = [ "serde_json", "tracing-core", "tracing-subscriber", ] [[package]] name = "tracing-core" version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", "valuable", ] [[package]] name = "tracing-log" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ "log", "once_cell", 
"tracing-core", ] [[package]] name = "tracing-subscriber" version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "nu-ansi-term", "sharded-slab", "smallvec", "thread_local", "tracing-core", "tracing-log", ] [[package]] name = "unicode-bidi" version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" dependencies = [ "tinyvec", ] [[package]] name = "unicode-width" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "url" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "v_frame" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c372e4e6fad129795fb86fda6021b258948560b39883b80ed00510a7d19846b0" dependencies = [ "cfg-if", "noop_proc_macro", "num-derive", "num-traits", "profiling", "serde", ] [[package]] name = "valuable" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "version-compare" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "579a42fc0b8e0c63b76519a339be31bed574929511fa53c1a3acae26eb258f29" [[package]] name = "wait-timeout" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" dependencies = [ "libc", ] [[package]] name = "walkdir" version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" dependencies = [ "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" [[package]] name = "web-sys" version = "0.3.66" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "which" version = "4.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" dependencies = [ "either", "home", "once_cell", "rustix", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ "windows-targets 0.48.5", ] [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets 0.52.0", ] [[package]] name = "windows-targets" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ "windows_aarch64_gnullvm 0.48.5", "windows_aarch64_msvc 0.48.5", "windows_i686_gnu 0.48.5", "windows_i686_msvc 0.48.5", "windows_x86_64_gnu 0.48.5", "windows_x86_64_gnullvm 0.48.5", "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ "windows_aarch64_gnullvm 0.52.0", "windows_aarch64_msvc 0.52.0", "windows_i686_gnu 
0.52.0", "windows_i686_msvc 0.52.0", "windows_x86_64_gnu 0.52.0", "windows_x86_64_gnullvm 0.52.0", "windows_x86_64_msvc 0.52.0", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" [[package]] name = "winnow" version = "0.5.33" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7520bbdec7211caa7c4e682eb1fbe07abe20cee6756b6e00f537c82c11816aa" dependencies = [ "memchr", ] [[package]] name = "y4m" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a5a4b21e1a62b67a2970e6831bc091d7b87e119e7f9791aef9702e3bef04448" [[package]] name = "yansi" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"09041cd90cf85f7f8b2df60c646f853b7f535ce68f85244eb6731cf89fa498ec" rav1e-0.7.1/Cargo.toml0000644000000136260000000000100100650ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.70.0" name = "rav1e" version = "0.7.1" authors = ["Thomas Daede "] build = "build.rs" include = [ "/Cargo.toml", "/LICENSE", "/PATENTS", "/README.md", "/build.rs", "/cbindgen.toml", "/src/**", ] autobins = false autobenches = false default-run = "rav1e" description = "The fastest and safest AV1 encoder" readme = "README.md" license = "BSD-2-Clause" repository = "https://github.com/xiph/rav1e/" [package.metadata.docs.rs] no-default-features = true [profile.bench] incremental = true [profile.dev] opt-level = 1 [profile.release] lto = "thin" debug = 2 incremental = true [profile.release-no-lto] lto = "off" inherits = "release" [profile.release-strip] inherits = "release" strip = "symbols" [lib] bench = false [[bin]] name = "rav1e" bench = false required-features = ["binaries"] [[bin]] name = "rav1e-ch" bench = false required-features = [ "binaries", "channel-api", "unstable", ] [[bench]] name = "bench" path = "benches/bench.rs" harness = false required-features = ["bench"] [dependencies.aom-sys] version = "0.3.3" optional = true [dependencies.arg_enum_proc_macro] version = "0.3.4" [dependencies.arrayvec] version = "0.7" [dependencies.av-metrics] version = "0.9.1" optional = true default-features = false [dependencies.av1-grain] version = "0.2.2" [dependencies.backtrace] version = "0.3" optional = true [dependencies.bitstream-io] version = "2" [dependencies.byteorder] version = "1.5.0" optional = true [dependencies.cfg-if] version = "1.0" [dependencies.clap] version = "4.4.11" features = [ "color", "std", "wrap_help", "derive", ] optional = true default-features = false [dependencies.clap_complete] version = "4.4.5" optional = true [dependencies.console] version = "0.15" optional = true [dependencies.crossbeam] version = "0.8" optional = true [dependencies.dav1d-sys] version = "0.6.0" optional = true package = "libdav1d-sys" [dependencies.fern] version = "0.6" optional = true [dependencies.image] version = "0.24.7" features = ["png"] optional = true default-features = false [dependencies.itertools] version = "0.12" [dependencies.ivf] version = "0.1" optional = true [dependencies.libc] version = "0.2" [dependencies.log] version = "0.4" [dependencies.new_debug_unreachable] version = "1.0.4" [dependencies.nom] version = "7.1.3" optional = true [dependencies.noop_proc_macro] version = "0.3.0" [dependencies.num-derive] version = "0.4" [dependencies.num-traits] version = "0.2" [dependencies.once_cell] version = "1.19.0" [dependencies.paste] version = "1.0" [dependencies.profiling] version = "1" [dependencies.rayon] version = "0.1" default-features = false package = "maybe-rayon" [dependencies.scan_fmt] version = "0.2.6" optional = true default-features = false [dependencies.serde] version = "1.0" features = ["derive"] optional = true [dependencies.serde-big-array] version = "0.5.1" optional = true [dependencies.simd_helpers] version = "0.1" [dependencies.thiserror] 
version = "1.0" [dependencies.toml] version = "0.8" optional = true [dependencies.tracing] version = "0.1.40" optional = true [dependencies.tracing-chrome] version = "0.7.1" optional = true [dependencies.tracing-subscriber] version = "0.3.18" optional = true [dependencies.v_frame] version = "0.3.7" [dependencies.wasm-bindgen] version = "0.2.89" optional = true [dependencies.y4m] version = "0.8" optional = true [dev-dependencies.assert_cmd] version = "2.0" [dev-dependencies.criterion] version = "0.5" [dev-dependencies.interpolate_name] version = "0.2.4" [dev-dependencies.nom] version = "7.1.3" [dev-dependencies.pretty_assertions] version = "1.4.0" [dev-dependencies.quickcheck] version = "1.0" [dev-dependencies.rand] version = "0.8" [dev-dependencies.rand_chacha] version = "0.3" [dev-dependencies.semver] version = "1.0" [build-dependencies.built] version = "0.7.1" features = [] [build-dependencies.cc] version = "1.0" features = ["parallel"] optional = true [build-dependencies.nasm-rs] version = "0.2" features = ["parallel"] optional = true [features] asm = [ "nasm-rs", "cc", ] bench = [] binaries = [ "ivf", "y4m", "clap", "clap_complete", "scan_fmt", "fern", "console", "av-metrics", "nom", ] capi = ["scan_fmt"] channel-api = ["crossbeam"] check_asm = [] decode_test = ["aom-sys"] decode_test_dav1d = ["dav1d-sys"] default = [ "binaries", "asm", "threading", "signal_support", "git_version", ] desync_finder = ["backtrace"] dump_ivf = ["ivf"] dump_lookahead_data = [ "byteorder", "image", ] git_version = ["built/git2"] quick_test = [] scenechange = [] serialize = [ "serde", "toml", "v_frame/serialize", "serde-big-array", "av1-grain/serialize", ] signal_support = ["signal-hook"] threading = ["rayon/threads"] tracing = [ "profiling/profile-with-tracing", "tracing-subscriber", "tracing-chrome", "dep:tracing", ] unstable = [] wasm = ["wasm-bindgen"] [target."cfg(any(decode_test, decode_test_dav1d))".dependencies.system-deps] version = "6" [target."cfg(fuzzing)".dependencies.arbitrary] version = "1.3" [target."cfg(fuzzing)".dependencies.interpolate_name] version = "0.2.4" [target."cfg(fuzzing)".dependencies.libfuzzer-sys] version = "0.4.7" [target."cfg(fuzzing)".dependencies.rand] version = "0.8" [target."cfg(fuzzing)".dependencies.rand_chacha] version = "0.3" [target."cfg(unix)".dependencies.signal-hook] version = "0.3" optional = true rav1e-0.7.1/Cargo.toml.orig000064400000000000000000000115111046102023000135350ustar 00000000000000[package] name = "rav1e" version = "0.7.1" authors = ["Thomas Daede "] edition = "2021" rust-version = "1.70.0" build = "build.rs" include = [ "/Cargo.toml", "/LICENSE", "/PATENTS", "/README.md", "/build.rs", "/cbindgen.toml", "/src/**", ] license = "BSD-2-Clause" description = "The fastest and safest AV1 encoder" readme = "README.md" repository = "https://github.com/xiph/rav1e/" autobenches = false autobins = false default-run = "rav1e" [features] unstable = [] channel-api = ["crossbeam"] decode_test = ["aom-sys"] decode_test_dav1d = ["dav1d-sys"] binaries = [ "ivf", "y4m", "clap", "clap_complete", "scan_fmt", "fern", "console", "av-metrics", "nom", ] default = ["binaries", "asm", "threading", "signal_support", "git_version"] git_version = ["built/git2"] asm = ["nasm-rs", "cc"] threading = ["rayon/threads"] signal_support = ["signal-hook"] dump_ivf = ["ivf"] quick_test = [] desync_finder = ["backtrace"] bench = [] check_asm = [] capi = ["scan_fmt"] tracing = [ "profiling/profile-with-tracing", "tracing-subscriber", "tracing-chrome", "dep:tracing" ] scenechange = [] 
serialize = ["serde", "toml", "v_frame/serialize", "serde-big-array", "av1-grain/serialize"] wasm = ["wasm-bindgen"] # Enables debug dumping of lookahead computation results, specifically: # - i-qres.png: quarter-resolution luma planes, # - i-hres.png: half-resolution luma planes, # - i-mvs.bin: motion vectors, # - i-imps.bin: block importances, # - i-activity_scales.bin: spatial scales, # - i-distortion_scales.bin: temporal scales, # - i-spatiotemporal_scales.bin, # - i-thresholds.bin: segmentation thresholds. dump_lookahead_data = ["byteorder", "image"] [dependencies] arg_enum_proc_macro = "0.3.4" bitstream-io = "2" cfg-if = "1.0" clap = { version = "4.4.11", optional = true, default-features = false, features = [ "color", "std", "wrap_help", "derive", ] } clap_complete = { version = "4.4.5", optional = true } libc = "0.2" y4m = { version = "0.8", optional = true } backtrace = { version = "0.3", optional = true } num-traits = "0.2" num-derive = "0.4" paste = "1.0" noop_proc_macro = "0.3.0" serde = { version = "1.0", features = ["derive"], optional = true } dav1d-sys = { version = "0.6.0", package = "libdav1d-sys", optional = true } aom-sys = { version = "0.3.3", optional = true } scan_fmt = { version = "0.2.6", optional = true, default-features = false } ivf = { version = "0.1", path = "ivf/", optional = true } v_frame = "0.3.7" av-metrics = { version = "0.9.1", optional = true, default-features = false } rayon = { package = "maybe-rayon", version = "0.1", default-features = false } crossbeam = { version = "0.8", optional = true } toml = { version = "0.8", optional = true } arrayvec = "0.7" thiserror = "1.0" byteorder = { version = "1.5.0", optional = true } log = "0.4" console = { version = "0.15", optional = true } fern = { version = "0.6", optional = true } itertools = "0.12" simd_helpers = "0.1" wasm-bindgen = { version = "0.2.89", optional = true } nom = { version = "7.1.3", optional = true } new_debug_unreachable = "1.0.4" once_cell = "1.19.0" av1-grain = "0.2.2" serde-big-array = { version = "0.5.1", optional = true } profiling = { version = "1" } tracing-subscriber = { version = "0.3.18", optional = true } tracing-chrome = { version = "0.7.1", optional = true } tracing = { version = "0.1.40", optional = true } [dependencies.image] version = "0.24.7" optional = true default-features = false features = ["png"] [build-dependencies] cc = { version = "1.0", optional = true, features = ["parallel"] } built = { version = "0.7.1", features = [] } [build-dependencies.nasm-rs] version = "0.2" optional = true features = ["parallel"] [target.'cfg(unix)'.dependencies] signal-hook = { version = "0.3", optional = true } [dev-dependencies] assert_cmd = "2.0" criterion = "0.5" pretty_assertions = "1.4.0" interpolate_name = "0.2.4" nom = "7.1.3" quickcheck = "1.0" rand = "0.8" rand_chacha = "0.3" semver = "1.0" [target.'cfg(fuzzing)'.dependencies] arbitrary = "1.3" interpolate_name = "0.2.4" libfuzzer-sys = "0.4.7" rand = "0.8" rand_chacha = "0.3" [target.'cfg(any(decode_test, decode_test_dav1d))'.dependencies] system-deps = "6" [[bin]] name = "rav1e" required-features = ["binaries"] bench = false [[bin]] name = "rav1e-ch" required-features = ["binaries", "channel-api", "unstable"] bench = false [lib] bench = false [[bench]] name = "bench" path = "benches/bench.rs" required-features = ["bench"] harness = false [profile.dev] opt-level = 1 [profile.release] debug = true incremental = true lto = "thin" # windows-gnu should use it until rust-lang/rust#98302 is not fixed [profile.release-no-lto] 
inherits = "release" lto = "off" [profile.release-strip] inherits = "release" strip = "symbols" [profile.bench] incremental = true [workspace] members = [".", "ivf"] [package.metadata.docs.rs] no-default-features = true rav1e-0.7.1/LICENSE000064400000000000000000000024641046102023000116620ustar 00000000000000BSD 2-Clause License Copyright (c) 2017-2023, the rav1e contributors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. rav1e-0.7.1/PATENTS000064400000000000000000000131061046102023000117110ustar 00000000000000Alliance for Open Media Patent License 1.0 1. License Terms. 1.1. Patent License. Subject to the terms and conditions of this License, each Licensor, on behalf of itself and successors in interest and assigns, grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as expressly stated in this License) patent license to its Necessary Claims to make, use, sell, offer for sale, import or distribute any Implementation. 1.2. Conditions. 1.2.1. Availability. As a condition to the grant of rights to Licensee to make, sell, offer for sale, import or distribute an Implementation under Section 1.1, Licensee must make its Necessary Claims available under this License, and must reproduce this License with any Implementation as follows: a. For distribution in source code, by including this License in the root directory of the source code with its Implementation. b. For distribution in any other form (including binary, object form, and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, GDSII, etc.)), by including this License in the documentation, legal notices, and/or other written materials provided with the Implementation. 1.2.2. Additional Conditions. This license is directly from Licensor to Licensee. Licensee acknowledges as a condition of benefiting from it that no rights from Licensor are received from suppliers, distributors, or otherwise in connection with this License. 1.3. Defensive Termination. 
If any Licensee, its Affiliates, or its agents initiates patent litigation or files, maintains, or voluntarily participates in a lawsuit against another entity or any person asserting that any Implementation infringes Necessary Claims, any patent licenses granted under this License directly to the Licensee are immediately terminated as of the date of the initiation of action unless 1) that suit was in response to a corresponding suit regarding an Implementation first brought against an initiating entity, or 2) that suit was brought to enforce the terms of this License (including intervention in a third-party action by a Licensee). 1.4. Disclaimers. The Reference Implementation and Specification are provided "AS IS" and without warranty. The entire risk as to implementing or otherwise using the Reference Implementation or Specification is assumed by the implementer and user. Licensor expressly disclaims any warranties (express, implied, or otherwise), including implied warranties of merchantability, non-infringement, fitness for a particular purpose, or title, related to the material. IN NO EVENT WILL LICENSOR BE LIABLE TO ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR NOT THE OTHER PARTRY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 2. Definitions. 2.1. Affiliate. Affiliate means an entity that directly or indirectly Controls, is Controlled by, or is under common Control of that party. 2.2. Control. Control means direct or indirect control of more than 50% of the voting power to elect directors of that corporation, or for any other entity, the power to direct management of such entity. 2.3. Decoder. "Decoder" means any decoder that conforms fully with all non-optional portions of the Specification. 2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can be decoded by a Decoder only to the extent it produces such a bitstream. 2.5. Final Deliverable. Final Deliverable means the final version of a deliverable approved by the Alliance for Open Media as a Final Deliverable. 2.6. Implementation. "Implementation" means any implementation, including the Reference Implementation, that is an Encoder and/or a Decoder. An Implementation also includes components of an Implementation only to the extent they are used as part of an Implementation. 2.7. License. License means this license. 2.8. Licensee. Licensee means any person or entity who exercises patent rights granted under this License. 2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers for sale, imports or distributes any Implementation, or (ii) a person or entity that has a licensing obligation to the Implementation as a result of its membership and/or participation in the Alliance for Open Media working group that developed the Specification. 2.10. Necessary Claims. "Necessary Claims" means all claims of patents or patent applications, (a) that currently or at any time in the future, are owned or controlled by the Licensor, and (b) (i) would be an Essential Claim as defined by the W3C Policy as of February 5, 2004 (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) as if the Specification was a W3C Recommendation; or (ii) are infringed by the Reference Implementation. 2.11. Reference Implementation. 
Reference Implementation means an Encoder and/or Decoder released by the Alliance for Open Media as a Final Deliverable. 2.12. Specification. Specification means the specification designated by the Alliance for Open Media as a Final Deliverable for which this License was issued. rav1e-0.7.1/README.md000064400000000000000000000143401046102023000121300ustar 00000000000000# rav1e [![Actions Status][actions badge]][actions] [![CodeCov][codecov badge]][codecov] The fastest and safest AV1 encoder.
Table of Contents

- [Overview](#overview)
- [Features](#features)
- [Documentation](#documentation)
- [Releases](#releases)
- [Building](#building)
  - [Dependency: NASM](#dependency-nasm)
  - [Release binary](#release-binary)
  - [Unstable features](#unstable-features)
  - [Target-specific builds](#target-specific-builds)
  - [Building the C-API](#building-the-c-api)
- [Usage](#usage)
  - [Compressing video](#compressing-video)
  - [Decompressing video](#decompressing-video)
  - [Configuring](#configuring)
    - [Features](#features-1)
- [Contributing](#contributing)
- [Getting in Touch](#getting-in-touch)
## Overview rav1e is an AV1 video encoder. It is designed to eventually cover all use cases, though in its current form it is most suitable for cases where libaom (the reference encoder) is too slow. ## Features * Intra, inter, and switch frames * 64x64 superblocks * 4x4 to 64x64 RDO-selected square and rectangular blocks * DC, H, V, Paeth, smooth, and all directional prediction modes * DCT, (FLIP-)ADST and identity transforms (up to 64x64, 16x16 and 32x32 respectively) * 8-, 10- and 12-bit depth color * 4:2:0, 4:2:2 and 4:4:4 chroma sampling * 11 speed settings (0-10, exhaustive to near real-time) * Constant quantizer and target bitrate (single- and multi-pass) encoding modes * Still picture mode ## Documentation Find the documentation in [`doc/`](doc/README.md) ## Releases For the foreseeable future, a weekly pre-release of rav1e will be [published](https://github.com/xiph/rav1e/releases) every Tuesday. ## Building ### Toolchain: Rust rav1e currently requires Rust 1.70.0 or later to build. ### Dependency: NASM Some `x86_64`-specific optimizations require [NASM](https://nasm.us/) `2.14.02` or newer and are enabled by default. `strip` will be used if available to remove the local symbols from the asm objects. The CI is testing against `nasm 2.15.05`, so bugs for other versions might happen. If you find one please open an issue!
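To check which version of `nasm`, if any, is already on your `PATH` (it should report 2.14.02 or newer):

```sh
nasm -v
```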
Install nasm

**ubuntu 20.04** (`nasm 2.14.02`)
```sh
sudo apt install nasm
```
**ubuntu 18.04** (`nasm 2.14.02`)
```sh
sudo apt install nasm-mozilla
# link nasm into $PATH
sudo ln /usr/lib/nasm-mozilla/bin/nasm /usr/local/bin/
```
**fedora 31, 32** (`nasm 2.14.02`)
```sh
sudo dnf install nasm
```
**windows** (`nasm 2.15.05`)
Have a [NASM binary](https://www.nasm.us/pub/nasm/releasebuilds/) in your system PATH.
```sh
$NASM_VERSION="2.15.05" # or newer
$LINK="https://www.nasm.us/pub/nasm/releasebuilds/$NASM_VERSION/win64"
curl --ssl-no-revoke -LO "$LINK/nasm-$NASM_VERSION-win64.zip"
7z e -y "nasm-$NASM_VERSION-win64.zip" -o "C:\nasm"
# set path for the current sessions
set PATH="%PATH%;C:\nasm"
```
**macOS** (`nasm 2.15.05`)
```sh
brew install nasm
```
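If installing NASM is not an option, the assembly can be skipped entirely by leaving out the default `asm` feature. A possible invocation, pieced together from the feature list in `Cargo.toml` (treat it as a sketch rather than an officially documented command):

```sh
cargo build --release --no-default-features --features binaries,threading,signal_support
```

The resulting binary only uses the pure-Rust code paths, so expect it to be noticeably slower than an `asm`-enabled build.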
### Release binary To build release binary in `target/release/rav1e` run: ```sh cargo build --release ``` ### Unstable features Experimental API and Features can be enabled by using the `unstable` feature. ```sh cargo build --features ,unstable ``` #### Current unstable features - Channel API: ```sh cargo build --features channel-api,unstable ``` Those Features and API are bound to change and evolve, do not rely on them staying the same over releases. ### Target-specific builds The rust compiler can produce a binary that is about 11%-13% faster if it can use `avx2`, `bmi1`, `bmi2`, `fma`, `lzcnt` and `popcnt` in the general code, you may allow it by issuing: ```sh RUSTFLAGS="-C target-cpu=native" cargo build --release # or RUSTFLAGS="-C target-cpu=x86-64-v3" cargo build --release ``` The resulting binary will not work on cpus that do not sport the same set of extensions enabled. > **NOTE** : You may use `rustc --print target-cpus` to check if the cpu is supported, if not `-C target-cpu=native` would be a no-op. ### Building the C-API **rav1e** provides a C-compatible set of library, header and pkg-config file. To build and install it you can use [cargo-c](https://crates.io/crates/cargo-c): ```sh cargo install cargo-c cargo cinstall --release ``` Please refer to the cargo-c [installation](https://github.com/lu-zero/cargo-c#installation) instructions. ## Usage ### Compressing video Input videos must be in [y4m format](https://wiki.multimedia.cx/index.php/YUV4MPEG2). The monochrome color format is not supported. ```sh cargo run --release --bin rav1e -- input.y4m -o output.ivf ``` _(Find a y4m-file for testing at [`tests/small_input.y4m`](tests/small_input.y4m) or at http://ultravideo.cs.tut.fi/#testsequences)_ ### Decompressing video Encoder output should be compatible with any AV1 decoder compliant with the v1.0.0 specification. You can decode using [dav1d](https://code.videolan.org/videolan/dav1d), which is now packaged [![in over 40 repositories](https://repology.org/badge/tiny-repos/dav1d.svg)](https://repology.org/project/dav1d/versions). ```sh dav1d -i output.ivf -o output.y4m ``` ### Configuring rav1e has several optional features that can be enabled by passing `--features` to cargo. Passing `--all-features` is discouraged. #### Features Find a full list in feature-table in [`Cargo.toml`](Cargo.toml) * `asm` - enabled by default. When enabled, assembly is built for the platforms supporting it. * `x86_64`: Requires [`nasm`](#dependency-nasm). * `aarch64` * Requires `gas` * Alternative: Use `clang` assembler by setting `CC=clang` **NOTE**: `SSE2` is always enabled on `x86_64`, `neon` is always enabled for aarch64, you may set the environment variable `RAV1E_CPU_TARGET` to `rust` to disable all the assembly-optimized routines at the runtime. ## Contributing Please read our guide to [contributing to rav1e](CONTRIBUTING.md). ## Getting in Touch Come chat with us on the IRC channel #daala on [Libera.Chat](https://libera.chat/)! You can also use a [web client](https://web.libera.chat/?channel=#daala) to join with a web browser. [actions]: https://github.com/xiph/rav1e/actions [codecov]: https://codecov.io/gh/xiph/rav1e [actions badge]: https://github.com/xiph/rav1e/workflows/rav1e/badge.svg [codecov badge]: https://codecov.io/gh/xiph/rav1e/branch/master/graph/badge.svg rav1e-0.7.1/build.rs000064400000000000000000000201421046102023000123130ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(clippy::print_literal)] #![allow(clippy::unused_io_amount)] #[allow(unused_imports)] use std::env; use std::fs; use std::path::{Path, PathBuf}; #[allow(dead_code)] fn rerun_dir>(dir: P) { for entry in fs::read_dir(dir).unwrap() { let entry = entry.unwrap(); let path = entry.path(); println!("cargo:rerun-if-changed={}", path.to_string_lossy()); if path.is_dir() { rerun_dir(path); } } } #[allow(dead_code)] fn hash_changed( files: &[&str], out_dir: &str, config: &Path, ) -> Option<([u8; 8], PathBuf)> { use std::collections::hash_map::DefaultHasher; use std::hash::Hasher; let mut hasher = DefaultHasher::new(); let paths = files .iter() .map(Path::new) .chain(std::iter::once(config)) .chain(std::iter::once(Path::new("build.rs"))); for path in paths { if let Ok(buf) = std::fs::read(path) { hasher.write(&buf); } else { panic!("Cannot open {}", path.display()); } } if let Some(cmd) = strip_command() { hasher.write(cmd.as_bytes()); } let hash = hasher.finish().to_be_bytes(); let hash_path = Path::new(&out_dir).join("asm.hash"); if let Ok(old_hash) = std::fs::read(&hash_path) { if old_hash == hash { return None; } } Some((hash, hash_path)) } #[cfg(feature = "asm")] fn build_nasm_files() { use std::fs::File; use std::io::Write; let out_dir = env::var("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("config.asm"); let mut config_file = File::create(&dest_path).unwrap(); config_file.write(b" %define private_prefix rav1e\n").unwrap(); config_file.write(b" %define ARCH_X86_32 0\n").unwrap(); config_file.write(b" %define ARCH_X86_64 1\n").unwrap(); config_file.write(b" %define PIC 1\n").unwrap(); config_file.write(b" %define STACK_ALIGNMENT 16\n").unwrap(); config_file.write(b" %define HAVE_AVX512ICL 1\n").unwrap(); if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config_file.write(b" %define PREFIX 1\n").unwrap(); } let asm_files = &[ "src/x86/cdef_avx2.asm", "src/x86/cdef_avx512.asm", "src/x86/cdef_dist.asm", "src/x86/cdef_rav1e.asm", "src/x86/cdef_sse.asm", "src/x86/cdef16_avx2.asm", "src/x86/cdef16_avx512.asm", "src/x86/cdef16_sse.asm", "src/x86/ipred_avx2.asm", "src/x86/ipred_avx512.asm", "src/x86/ipred_sse.asm", "src/x86/ipred16_avx2.asm", "src/x86/ipred16_avx512.asm", "src/x86/ipred16_sse.asm", "src/x86/itx_avx2.asm", "src/x86/itx_avx512.asm", "src/x86/itx_sse.asm", "src/x86/itx16_avx2.asm", "src/x86/itx16_avx512.asm", "src/x86/itx16_sse.asm", "src/x86/looprestoration_avx2.asm", "src/x86/looprestoration_avx512.asm", "src/x86/looprestoration_sse.asm", "src/x86/looprestoration16_avx2.asm", "src/x86/looprestoration16_avx512.asm", "src/x86/looprestoration16_sse.asm", "src/x86/mc_avx2.asm", "src/x86/mc_avx512.asm", "src/x86/mc_sse.asm", "src/x86/mc16_avx2.asm", "src/x86/mc16_avx512.asm", "src/x86/mc16_sse.asm", "src/x86/me.asm", "src/x86/sad_avx.asm", "src/x86/sad_plane.asm", "src/x86/sad_sse2.asm", "src/x86/satd.asm", "src/x86/satd16_avx2.asm", "src/x86/sse.asm", "src/x86/tables.asm", ]; if let Some((hash, hash_path)) = hash_changed(asm_files, &out_dir, &dest_path) { let mut config_include_arg = 
String::from("-I"); config_include_arg.push_str(&out_dir); config_include_arg.push('/'); let mut nasm = nasm_rs::Build::new(); nasm.min_version(2, 14, 0); for file in asm_files { nasm.file(file); } nasm.flag(&config_include_arg); nasm.flag("-Isrc/"); let obj = nasm.compile_objects().unwrap_or_else(|e| { println!("cargo:warning={e}"); panic!("NASM build failed. Make sure you have nasm installed or disable the \"asm\" feature.\n\ You can get NASM from https://nasm.us or your system's package manager.\n\nerror: {e}"); }); // cc is better at finding the correct archiver let mut cc = cc::Build::new(); for o in obj { cc.object(o); } cc.compile("rav1easm"); // Strip local symbols from the asm library since they // confuse the debugger. if let Some(strip) = strip_command() { let _ = std::process::Command::new(strip) .arg("-x") .arg(Path::new(&out_dir).join("librav1easm.a")) .status(); } std::fs::write(hash_path, &hash[..]).unwrap(); } else { println!("cargo:rustc-link-search={out_dir}"); } println!("cargo:rustc-link-lib=static=rav1easm"); rerun_dir("src/x86"); rerun_dir("src/ext/x86"); } fn strip_command() -> Option { let target = env::var("TARGET").expect("TARGET"); // follows Cargo's naming convention for the linker setting let normalized_target = target.replace('-', "_").to_uppercase(); let explicit_strip = env::var(format!("CARGO_TARGET_{normalized_target}_STRIP")) .ok() .or_else(|| env::var("STRIP").ok()); if explicit_strip.is_some() { return explicit_strip; } // strip command is target-specific, e.g. macOS's strip breaks MUSL's archives let host = env::var("HOST").expect("HOST"); if host != target { return None; } Some("strip".into()) } #[cfg(feature = "asm")] fn build_asm_files() { use std::fs::File; use std::io::Write; let out_dir = env::var("OUT_DIR").unwrap(); let dest_path = Path::new(&out_dir).join("config.h"); let mut config_file = File::create(&dest_path).unwrap(); if env::var("CARGO_CFG_TARGET_VENDOR").unwrap() == "apple" { config_file.write(b" #define PREFIX 1\n").unwrap(); } config_file.write(b" #define PRIVATE_PREFIX rav1e_\n").unwrap(); config_file.write(b" #define ARCH_AARCH64 1\n").unwrap(); config_file.write(b" #define ARCH_ARM 0\n").unwrap(); config_file.write(b" #define CONFIG_LOG 1 \n").unwrap(); config_file.write(b" #define HAVE_ASM 1\n").unwrap(); config_file.sync_all().unwrap(); let asm_files = &[ "src/arm/64/cdef.S", "src/arm/64/cdef16.S", "src/arm/64/cdef_dist.S", "src/arm/64/mc.S", "src/arm/64/mc16.S", "src/arm/64/itx.S", "src/arm/64/itx16.S", "src/arm/64/ipred.S", "src/arm/64/ipred16.S", "src/arm/64/sad.S", "src/arm/64/satd.S", "src/arm/64/sse.S", "src/arm/tables.S", ]; if let Some((hash, hash_path)) = hash_changed(asm_files, &out_dir, &dest_path) { cc::Build::new() .files(asm_files) .include(".") .include(&out_dir) .compile("rav1e-aarch64"); std::fs::write(hash_path, &hash[..]).unwrap(); } else { println!("cargo:rustc-link-search={out_dir}"); println!("cargo:rustc-link-lib=static=rav1e-aarch64"); } rerun_dir("src/arm"); } #[allow(unused_variables)] fn main() { built::write_built_file().expect("Failed to acquire build-time information"); let arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); let os = env::var("CARGO_CFG_TARGET_OS").unwrap(); // let env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); #[cfg(feature = "asm")] { if arch == "x86_64" { println!("cargo:rustc-cfg={}", "nasm_x86_64"); build_nasm_files() } if arch == "aarch64" { println!("cargo:rustc-cfg={}", "asm_neon"); build_asm_files() } } if os == "windows" && cfg!(feature = "decode_test") { 
panic!("Unsupported feature on this platform!"); } println!("cargo:rustc-env=PROFILE={}", env::var("PROFILE").unwrap()); if let Ok(value) = env::var("CARGO_CFG_TARGET_FEATURE") { println!("cargo:rustc-env=CARGO_CFG_TARGET_FEATURE={value}"); } println!( "cargo:rustc-env=CARGO_ENCODED_RUSTFLAGS={}", env::var("CARGO_ENCODED_RUSTFLAGS").unwrap() ); } rav1e-0.7.1/cbindgen.toml000064400000000000000000000006621046102023000133210ustar 00000000000000header = "// SPDX-License-Identifier: MIT" sys_includes = ["stddef.h", "stdint.h", "stdlib.h"] no_includes = true include_guard = "RAV1E_H" tab_width = 4 style = "Type" language = "C" cpp_compat = true [parse] parse_deps = true include = ['rav1e', 'v_frame'] [export] prefix = "Ra" item_types = ["enums", "structs", "unions", "typedefs", "opaque", "functions"] [enum] rename_variants = "ScreamingSnakeCase" prefix_with_name = true rav1e-0.7.1/src/activity.rs000064400000000000000000000214461046102023000136470ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::frame::*; use crate::rdo::DistortionScale; use crate::tiling::*; use crate::util::*; use itertools::izip; #[derive(Debug, Default, Clone)] pub struct ActivityMask { variances: Box<[u32]>, } impl ActivityMask { #[profiling::function] pub fn from_plane(luma_plane: &Plane) -> ActivityMask { let PlaneConfig { width, height, .. } = luma_plane.cfg; // Width and height are padded to 8×8 block size. let w_in_imp_b = width.align_power_of_two_and_shift(3); let h_in_imp_b = height.align_power_of_two_and_shift(3); let aligned_luma = Rect { x: 0_isize, y: 0_isize, width: w_in_imp_b << 3, height: h_in_imp_b << 3, }; let luma = PlaneRegion::new(luma_plane, aligned_luma); let mut variances = Vec::with_capacity(w_in_imp_b * h_in_imp_b); for y in 0..h_in_imp_b { for x in 0..w_in_imp_b { let block_rect = Area::Rect { x: (x << 3) as isize, y: (y << 3) as isize, width: 8, height: 8, }; let block = luma.subregion(block_rect); let variance = variance_8x8(&block); variances.push(variance); } } ActivityMask { variances: variances.into_boxed_slice() } } #[profiling::function] pub fn fill_scales( &self, bit_depth: usize, activity_scales: &mut Box<[DistortionScale]>, ) { for (dst, &src) in activity_scales.iter_mut().zip(self.variances.iter()) { *dst = ssim_boost(src, src, bit_depth); } } } // Adapted from the source variance calculation in `cdef_dist_wxh_8x8`. #[inline(never)] fn variance_8x8(src: &PlaneRegion<'_, T>) -> u32 { debug_assert!(src.plane_cfg.xdec == 0); debug_assert!(src.plane_cfg.ydec == 0); // Sum into columns to improve auto-vectorization let mut sum_s_cols: [u16; 8] = [0; 8]; let mut sum_s2_cols: [u32; 8] = [0; 8]; // Check upfront that 8 rows are available. 
let _row = &src[7]; for j in 0..8 { let row = &src[j][0..8]; for (sum_s, sum_s2, s) in izip!(&mut sum_s_cols, &mut sum_s2_cols, row) { // Don't convert directly to u32 to allow better vectorization let s: u16 = u16::cast_from(*s); *sum_s += s; // Convert to u32 to avoid overflows when multiplying let s: u32 = s as u32; *sum_s2 += s * s; } } // Sum together the sum of columns let sum_s = sum_s_cols.iter().copied().map(u64::from).sum::(); let sum_s2 = sum_s2_cols.iter().copied().map(u64::from).sum::(); // Use sums to calculate variance u32::try_from(sum_s2 - ((sum_s * sum_s + 32) >> 6)).unwrap_or(u32::MAX) } /// `rsqrt` result stored in fixed point w/ scaling such that: /// `rsqrt = output.rsqrt_norm / (1 << output.shift)` struct RsqrtOutput { norm: u16, shift: u8, } /// Fixed point `rsqrt` for `ssim_boost` fn ssim_boost_rsqrt(x: u64) -> RsqrtOutput { const INSHIFT: u8 = 16; const OUTSHIFT: u8 = 14; let k = ((ILog::ilog(x) - 1) >> 1) as i16; /*t is x in the range [0.25, 1) in QINSHIFT, or x*2^(-s). Shift by log2(x) - log2(0.25*(1 << INSHIFT)) to ensure 0.25 lower bound.*/ let s: i16 = 2 * k - (INSHIFT as i16 - 2); let t: u16 = if s > 0 { x >> s } else { x << -s } as u16; /*We want to express od_rsqrt() in terms of od_rsqrt_norm(), which is defined as (2^OUTSHIFT)/sqrt(t*(2^-INSHIFT)) with t=x*(2^-s). This simplifies to 2^(OUTSHIFT+(INSHIFT/2)+(s/2))/sqrt(x), so the caller needs to shift right by OUTSHIFT + INSHIFT/2 + s/2.*/ let rsqrt_shift: u8 = (OUTSHIFT as i16 + ((s + INSHIFT as i16) >> 1)) as u8; #[inline(always)] const fn mult16_16_q15(a: i32, b: i32) -> i32 { (a * b) >> 15 } /* Reciprocal sqrt approximation where the input is in the range [0.25,1) in Q16 and the output is in the range (1.0, 2.0] in Q14). */ /* Range of n is [-16384,32767] ([-0.5,1) in Q15). */ let n: i32 = t as i32 - 32768; debug_assert!(n >= -16384); /* Get a rough guess for the root. The optimal minimax quadratic approximation (using relative error) is r = 1.437799046117536+n*(-0.823394375837328+n*0.4096419668459485). Coefficients here, and the final result r, are Q14. */ let rsqrt: i32 = 23557 + mult16_16_q15(n, -13490 + mult16_16_q15(n, 6711)); debug_assert!((16384..32768).contains(&rsqrt)); RsqrtOutput { norm: rsqrt as u16, shift: rsqrt_shift } } #[inline(always)] pub fn ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> DistortionScale { DistortionScale(apply_ssim_boost( DistortionScale::default().0, svar, dvar, bit_depth, )) } /// Apply ssim boost to a given input #[inline(always)] pub fn apply_ssim_boost( input: u32, svar: u32, dvar: u32, bit_depth: usize, ) -> u32 { let coeff_shift = bit_depth - 8; // Scale dvar and svar to lbd range to prevent overflows. let svar = (svar >> (2 * coeff_shift)) as u64; let dvar = (dvar >> (2 * coeff_shift)) as u64; // The constants are such that when source and destination variance are equal, // ssim_boost ~= (x/2)^(-1/3) where x = variance / scale and the scale is // (maximum variance / sample range) << (bit depth - 8). // C2 is the variance floor, equivalent to a flat block of mean valued samples // with a single maximum value sample. 
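// Written out linearly, the boost computed below is
//   output = input * (C1 / C3) * (svar + dvar + C2) / sqrt(C1^2 + svar * dvar),
// which matches the floating-point reference used in the tests at the bottom of this file.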
const C1: u64 = 3355; const C2: u64 = 16128; const C3: u64 = 12338; const RATIO_SHIFT: u8 = 14; const RATIO: u64 = (((C1 << (RATIO_SHIFT + 1)) / C3) + 1) >> 1; // C1 (svar + dvar + C2) // input * ---- * -------------------------- // C3 sqrt(C1^2 + svar * dvar) let rsqrt = ssim_boost_rsqrt((C1 * C1) + svar * dvar); ((input as u64 * (((RATIO * (svar + dvar + C2)) * rsqrt.norm as u64) >> RATIO_SHIFT)) >> rsqrt.shift) as u32 } #[cfg(test)] mod ssim_boost_tests { use super::*; use interpolate_name::interpolate_test; use rand::Rng; /// Test to make sure extreme values of `ssim_boost` don't overflow. #[test] fn overflow_test() { // Test variance for 8x8 region with a bit depth of 12 let max_pix_diff = (1 << 12) - 1; let max_pix_sse = max_pix_diff * max_pix_diff; let max_variance = max_pix_diff * 8 * 8 / 4; apply_ssim_boost(max_pix_sse * 8 * 8, max_variance, max_variance, 12); } /// Floating point reference version of `ssim_boost` fn reference_ssim_boost(svar: u32, dvar: u32, bit_depth: usize) -> f64 { let coeff_shift = bit_depth - 8; let var_scale = 1f64 / (1 << (2 * coeff_shift)) as f64; let svar = svar as f64 * var_scale; let dvar = dvar as f64 * var_scale; // These constants are from ssim boost and need to be updated if the // constants in ssim boost change. const C1: f64 = 3355f64; const C2: f64 = 16128f64; const C3: f64 = 12338f64; const RATIO: f64 = C1 / C3; RATIO * (svar + dvar + C2) / f64::sqrt(C1.mul_add(C1, svar * dvar)) } /// Test that `ssim_boost` has sufficient accuracy. #[test] fn accuracy_test() { let mut rng = rand::thread_rng(); let mut max_relative_error = 0f64; let bd = 12; // Test different log scale ranges for the variance. // Each scale is tested multiple times with randomized variances. for scale in 0..(bd + 3 * 2 - 2) { for _ in 0..40 { let svar = rng.gen_range(0..(1 << scale)); let dvar = rng.gen_range(0..(1 << scale)); let float = reference_ssim_boost(svar, dvar, 12); let fixed = apply_ssim_boost(1 << 23, svar, dvar, 12) as f64 / (1 << 23) as f64; // Compare the two versions max_relative_error = max_relative_error.max(f64::abs(1f64 - fixed / float)); } } assert!( max_relative_error < 0.05, "SSIM boost error too high. Measured max relative error: {}.", max_relative_error ); } #[interpolate_test(8, 8)] #[interpolate_test(10, 10)] #[interpolate_test(12, 12)] fn reciprocal_cube_root_test(bd: usize) { let mut max_relative_error = 0f64; let scale = ((1 << bd) - 1) << (6 - 2 + bd - 8); for svar in scale..(scale << 2) { let float = ((scale << 1) as f64 / svar as f64).cbrt(); let fixed = apply_ssim_boost(1 << 23, svar, svar, bd) as f64 / (1 << 23) as f64; // Compare the two versions max_relative_error = max_relative_error.max(f64::abs(1f64 - fixed / float)); } assert!( max_relative_error < 0.0273, "SSIM boost error too high. Measured max relative error: {}.", max_relative_error ); } } rav1e-0.7.1/src/api/channel/by_gop.rs000064400000000000000000000231721046102023000154510ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
use crate::api::channel::data::*; use crate::api::config::*; use crate::api::util::*; use crate::api::EncoderConfig; use crate::api::InterConfig; use crossbeam::channel::*; // use crate::encoder::*; use crate::config::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; use crate::scenechange::SceneChangeDetector; use crate::util::Pixel; use std::collections::BTreeMap; use std::sync::Arc; struct SubGop { frames: Vec>>, end_gop: bool, } /* impl SubGop { fn build_fi(&self) -> Vec> { todo!() } } */ // TODO: Make the detector logic fitting the model struct SceneChange { frames: usize, pyramid_size: usize, processed: u64, last_keyframe: u64, detector: SceneChangeDetector, } impl SceneChange { fn new(pyramid_size: usize, enc: &EncoderConfig) -> Self { let seq = Arc::new(Sequence::new(enc)); let detector = SceneChangeDetector::new( enc.clone(), CpuFeatureLevel::default(), pyramid_size, seq, ); Self { frames: 0, pyramid_size, processed: 0, last_keyframe: 0, detector } } // Tell where to split the lookahead // fn split(&mut self, lookahead: &[Arc>]) -> Option<(usize, bool)> { self.processed += 1; let lookahead_ref: Vec<_> = lookahead[self.frames..].iter().collect(); let new_gop = self.detector.analyze_next_frame( &lookahead_ref, self.processed, self.last_keyframe, ); if new_gop { self.last_keyframe = self.processed; } if self.frames > self.pyramid_size { self.frames -= self.pyramid_size + 1; Some((self.pyramid_size + 2, new_gop)) } else if new_gop { let frames = self.frames + 1; self.frames = 0; Some((frames, true)) } else { self.frames += 1; None } } } struct WorkLoad { s_recv: Receiver>, send: Sender>, } struct WorkerPoolSend { recv_workers: Receiver>>>, send_reassemble: Sender<(usize, Receiver>)>, count: usize, } impl WorkerPoolSend { fn get_worker(&mut self) -> Option>> { self.recv_workers.recv().ok().map(|sender| { let (s_send, s_recv) = unbounded(); let (send, recv) = unbounded(); let _ = self.send_reassemble.send((self.count, recv)); let wl = WorkLoad { s_recv, send }; let _ = sender.send(Some(wl)); self.count += 1; s_send }) } } struct WorkerPoolRecv { recv_reassemble: Receiver<(usize, Receiver>)>, recv_workers: Receiver>>>, } // TODO: make it Drop ? 
impl WorkerPoolRecv { fn close(&self) { for worker in self.recv_workers.iter() { let _ = worker.send(None); } } } fn workerpool( s: &rayon::ScopeFifo, workers: usize, mut cfg: Config, ) -> (WorkerPoolSend, WorkerPoolRecv) { let (send_workers, recv_workers) = bounded(workers); let (send_reassemble, recv_reassemble) = unbounded(); // TODO: unpack send_frame in process cfg.enc.speed_settings.scene_detection_mode = SceneDetectionSpeed::None; for _ in 0..workers { let (send_workload, recv_workload) = unbounded::>>(); let send_workload2 = send_workload.clone(); let send_back = send_workers.clone(); let cfg = cfg.clone(); s.spawn_fifo(move |_| { for wl in recv_workload.iter() { match wl { Some(wl) => { let mut inner = cfg.new_inner().unwrap(); for s in wl.s_recv.iter() { for f in s.frames { while !inner.needs_more_fi_lookahead() { let r = inner.receive_packet(); match r { Ok(p) => { wl.send.send(p).unwrap(); } Err(EncoderStatus::Encoded) => {} _ => todo!("Error management {:?}", r), } } let _ = inner.send_frame(Some(f), None); } } inner.limit = Some(inner.frame_count); let _ = inner.send_frame(None, None); loop { match inner.receive_packet() { Ok(p) => wl.send.send(p).unwrap(), Err(EncoderStatus::LimitReached) => break, Err(EncoderStatus::Encoded) => {} _ => todo!("Error management"), } } let _ = send_back.send(send_workload2.clone()); } None => break, } } }); let _ = send_workers.send(send_workload); } ( WorkerPoolSend { recv_workers: recv_workers.clone(), send_reassemble, count: 0, }, WorkerPoolRecv { recv_reassemble, recv_workers }, ) } fn reassemble( pool: WorkerPoolRecv
, s: &rayon::ScopeFifo, send_packet: Sender>, ) { s.spawn_fifo(move |_| { let mut pending = BTreeMap::new(); let mut last_idx = 0; let mut packet_index = 0; for (idx, recv) in pool.recv_reassemble.iter() { pending.insert(idx, recv); while let Some(recv) = pending.remove(&last_idx) { for mut p in recv { // patch up the packet_index p.input_frameno = packet_index; let _ = send_packet.send(p); packet_index += 1; } last_idx += 1; } } while !pending.is_empty() { if let Some(recv) = pending.remove(&last_idx) { for mut p in recv { // patch up the packet_index p.input_frameno = packet_index; let _ = send_packet.send(p); packet_index += 1; } } last_idx += 1; } pool.close(); }); } impl Config { // Group the incoming frames in Gops, emit a SubGop at time. fn scenechange( &self, s: &rayon::ScopeFifo, r: Receiver>, ) -> Receiver> { let inter_cfg = InterConfig::new(&self.enc); let pyramid_size = inter_cfg.keyframe_lookahead_distance() as usize; let lookahead_distance = pyramid_size + 1 + 1; let (send, recv) = bounded(lookahead_distance * 2); let mut sc = SceneChange::new(pyramid_size, &self.enc); s.spawn_fifo(move |_| { let mut lookahead = Vec::new(); for f in r.iter() { let (frame, _params) = f; lookahead.push(frame.unwrap()); // we need at least lookahead_distance frames to reason if lookahead.len() < lookahead_distance { continue; } if let Some((split_pos, end_gop)) = sc.split(&lookahead) { let rem = lookahead.split_off(split_pos); let _ = send.send(SubGop { frames: lookahead, end_gop }); lookahead = rem; } } while lookahead.len() > lookahead_distance { if let Some((split_pos, end_gop)) = sc.split(&lookahead) { let rem = lookahead.split_off(split_pos); let _ = send.send(SubGop { frames: lookahead, end_gop }); lookahead = rem; } } if !lookahead.is_empty() { let _ = send.send(SubGop { frames: lookahead, end_gop: true }); } }); recv } /// Encode the subgops, dispatch each Gop to an available worker fn encode( &self, s: &rayon::ScopeFifo, workers: usize, r: Receiver>, send_packet: Sender>, ) { let (mut workers, recv) = workerpool(s, workers, self.clone()); s.spawn_fifo(move |_| { let mut sg_send = workers.get_worker().unwrap(); for sb in r.iter() { let end_gop = sb.end_gop; let _ = sg_send.send(sb); if end_gop { sg_send = workers.get_worker().unwrap(); } } }); reassemble(recv, s, send_packet) } /// Create a single pass by-gop encoder channel /// /// Drop the `FrameSender` endpoint to flush the encoder. /// /// # Errors /// /// - Returns `InvalidConfig` if configuration is invalid. 
pub fn new_by_gop_channel( &self, slots: usize, ) -> Result, InvalidConfig> { let rc = &self.rate_control; if rc.emit_pass_data || rc.summary.is_some() { return Err(InvalidConfig::RateControlConfigurationMismatch); } self.validate()?; // TODO: make it user-settable let input_len = self.enc.speed_settings.rdo_lookahead_frames as usize * 4; let frame_limit = i32::MAX as u64; let (send_frame, receive_frame) = bounded(input_len); let (send_packet, receive_packet) = unbounded(); let cfg = self.clone(); let pool = self.new_thread_pool(); // TODO: move the accounting threads outside the threadpool let run = move || { let _ = rayon::scope_fifo(|s| { let sg_recv = cfg.scenechange(s, receive_frame); cfg.encode(s, slots, sg_recv, send_packet); }); }; if let Some(pool) = pool { pool.spawn_fifo(run); } else { rayon::spawn_fifo(run); } let channel = ( FrameSender::new(frame_limit, send_frame, Arc::new(self.enc.clone())), PacketReceiver { receiver: receive_packet, config: Arc::new(self.enc.clone()), }, ); Ok(channel) } } rav1e-0.7.1/src/api/channel/data.rs000064400000000000000000000271031046102023000151010ustar 00000000000000// Copyright (c) 2018-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::api::color::*; use crate::api::config::EncoderConfig; use crate::api::context::RcData; use crate::api::util::*; use crate::encoder::*; use crate::frame::*; use crate::util::Pixel; use bitstream_io::*; use crossbeam::channel::{Receiver, Sender}; use thiserror::Error; use std::io; use std::sync::Arc; /// An error returned from the `send` methods. /// /// The message could not be sent because the channel is disconnected. /// /// The error contains the message so it can be recovered. #[derive(PartialEq, Eq, Clone, Copy, Error)] #[error("sending on a disconnected channel")] pub struct SendError(pub T); /// An error returned from the `try_send` methods. /// /// The error contains the message being sent so it can be recovered. #[derive(PartialEq, Eq, Clone, Copy, Error)] pub enum TrySendError { /// The message could not be sent because the channel is full. #[error("sending on a full channel")] Full(T), /// The message could not be sent because the channel is disconnected. #[error("sending on a disconnected channel")] Disconnected(T), } /// An error returned from the `recv` methods. /// /// A message could not be received because the channel is empty and disconnected. /// #[derive(PartialEq, Eq, Clone, Copy, Debug, Error)] #[error("receiving on an empty and disconnected channel")] pub struct RecvError; /// An error returned from the `try_recv` methods. /// #[derive(PartialEq, Eq, Clone, Copy, Debug, Error)] pub enum TryRecvError { /// A message could not be received because the channel is empty. #[error("receiving on an empty channel")] Empty, /// The message could not be received because the channel is empty and disconnected. 
#[error("receiving on an empty and disconnected channel")] Disconnected, } impl SendError { fn from(value: crossbeam::channel::SendError) -> Self { Self(value.0) } } impl TrySendError { fn from(value: crossbeam::channel::TrySendError) -> Self { use crossbeam::channel::TrySendError::*; match value { Full(v) => TrySendError::Full(v), Disconnected(v) => TrySendError::Disconnected(v), } } } impl RecvError { fn from(_: crossbeam::channel::RecvError) -> Self { RecvError } } impl TryRecvError { fn from(value: crossbeam::channel::TryRecvError) -> Self { use crossbeam::channel::TryRecvError::*; match value { Empty => TryRecvError::Empty, Disconnected => TryRecvError::Disconnected, } } } /// Endpoint to send previous-pass statistics data pub struct RcDataSender { pub(crate) sender: Sender, pub(crate) limit: u64, pub(crate) count: u64, } impl RcDataSender { pub(crate) fn new(limit: u64, sender: Sender) -> RcDataSender { Self { sender, limit, count: 0 } } /// # Errors /// /// - `TrySendError::Full` if the message could not be sent because the channel is full. /// - `TrySendError::Disconnected` if the message could not be sent /// because the channel is disconnected. pub fn try_send( &mut self, data: RcData, ) -> Result<(), TrySendError> { if self.limit <= self.count { Err(TrySendError::Full(data)) } else { let r = self.sender.try_send(data).map_err(TrySendError::from); if r.is_ok() { self.count += 1; } r } } /// # Errors /// /// - `SendError` if the message could not be sent because the channel is disconnected. pub fn send(&mut self, data: RcData) -> Result<(), SendError> { if self.limit <= self.count { Err(SendError(data)) } else { let r = self.sender.send(data).map_err(SendError::from); if r.is_ok() { self.count += 1; } r } } pub fn len(&self) -> usize { self.sender.len() } pub fn is_empty(&self) -> bool { self.sender.is_empty() } // TODO: proxy more methods } /// Endpoint to receive current-pass statistics data pub struct RcDataReceiver(pub(crate) Receiver); impl RcDataReceiver { /// Attempts to receive a message from the channel without blocking. /// /// This method will either receive a message from the channel immediately or return an error /// if the channel is empty. /// /// If called on a zero-capacity channel, this method will receive a message only if there /// happens to be a send operation on the other side of the channel at the same time. /// /// # Errors /// /// - `TryRecvError::Empty` if the channel is currently empty. /// - `TryRecvError::Disconnected` if the channel is empty and has been disconnected. pub fn try_recv(&self) -> Result { self.0.try_recv().map_err(TryRecvError::from) } /// Blocks the current thread until a message is received or the channel is empty and /// disconnected. /// /// If the channel is empty and not disconnected, this call will block until the receive /// operation can proceed. If the channel is empty and becomes disconnected, this call will /// wake up and return an error. /// /// If called on a zero-capacity channel, this method will wait for a send operation to appear /// on the other side of the channel. /// /// # Errors /// /// - `RecvError` if the channel is empty and has been disconnected. 
pub fn recv(&self) -> Result { self.0.recv().map_err(RecvError::from) } pub fn len(&self) -> usize { self.0.len() } pub fn is_empty(&self) -> bool { self.0.is_empty() } pub fn iter<'a>(&'a self) -> impl Iterator + 'a { self.0.iter() } pub const fn summary_size(&self) -> usize { crate::rate::TWOPASS_HEADER_SZ } } pub type PassDataChannel = (RcDataSender, RcDataReceiver); pub type FrameInput = (Option>>, Option); /// Endpoint to send frames pub struct FrameSender { sender: Sender>, config: Arc, limit: u64, count: u64, } // Proxy the crossbeam Sender // // TODO: enforce the limit impl FrameSender { pub(crate) fn new( limit: u64, sender: Sender>, config: Arc, ) -> FrameSender { Self { sender, config, limit, count: 0 } } /// # Errors /// /// - `TrySendError::Full` if the message could not be sent because the channel is full. /// - `TrySendError::Disconnected` if the message could not be sent /// because the channel is disconnected. pub fn try_send>( &mut self, frame: F, ) -> Result<(), TrySendError>> { if self.limit <= self.count { Err(TrySendError::Full(frame.into())) } else { let r = self.sender.try_send(frame.into()).map_err(TrySendError::from); if r.is_ok() { self.count += 1; } r } } /// # Errors /// /// - `SendError` if the message could not be sent because the channel is disconnected. pub fn send>( &mut self, frame: F, ) -> Result<(), SendError>> { if self.limit <= self.count { Err(SendError(frame.into())) } else { let r = self.sender.send(frame.into()).map_err(SendError::from); if r.is_ok() { self.count += 1; } r } } pub fn len(&self) -> usize { self.sender.len() } pub fn is_empty(&self) -> bool { self.sender.is_empty() } // TODO: proxy more methods } // Frame factory impl FrameSender { /// Helper to create a new frame with the current encoder configuration #[inline] pub fn new_frame(&self) -> Frame { Frame::new( self.config.width, self.config.height, self.config.chroma_sampling, ) } } /// Endpoint to receive packets pub struct PacketReceiver { pub(crate) receiver: Receiver>, pub(crate) config: Arc, } impl PacketReceiver { /// Attempts to receive a message from the channel without blocking. /// /// This method will either receive a message from the channel immediately or return an error /// if the channel is empty. /// /// If called on a zero-capacity channel, this method will receive a message only if there /// happens to be a send operation on the other side of the channel at the same time. /// /// # Errors /// /// - `TryRecvError::Empty` if the channel is currently empty. /// - `TryRecvError::Disconnected` if the channel is empty and has been disconnected. pub fn try_recv(&self) -> Result, TryRecvError> { self.receiver.try_recv().map_err(TryRecvError::from) } /// Blocks the current thread until a message is received or the channel is empty and /// disconnected. /// /// If the channel is empty and not disconnected, this call will block until the receive /// operation can proceed. If the channel is empty and becomes disconnected, this call will /// wake up and return an error. /// /// If called on a zero-capacity channel, this method will wait for a send operation to appear /// on the other side of the channel. /// /// # Errors /// /// - `RecvError` if the channel is empty and has been disconnected. 
pub fn recv(&self) -> Result, RecvError> { self.receiver.recv().map_err(RecvError::from) } pub fn len(&self) -> usize { self.receiver.len() } pub fn is_empty(&self) -> bool { self.receiver.is_empty() } pub fn iter<'a>(&'a self) -> impl Iterator> + 'a { self.receiver.iter() } } impl PacketReceiver { /// Produces a sequence header matching the current encoding context. /// /// Its format is compatible with the AV1 Matroska and ISOBMFF specification. /// Note that the returned header does not include any config OBUs which are /// required for some uses. See [the specification]. /// /// [the specification]: /// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-section /// /// # Panics /// /// Panics if the header cannot be written in memory. This is unrecoverable, /// and usually indicates the system is out of memory. #[inline] pub fn container_sequence_header(&self) -> Vec { fn sequence_header_inner(seq: &Sequence) -> io::Result> { let mut buf = Vec::new(); { let mut bw = BitWriter::endian(&mut buf, BigEndian); bw.write_bit(true)?; // marker bw.write(7, 1)?; // version bw.write(3, seq.profile)?; bw.write(5, 31)?; // level bw.write_bit(false)?; // tier bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth bw.write_bit(seq.bit_depth == 12)?; // twelve_bit bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs400)?; // monochrome bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y bw.write(2, 0)?; // chroma_sample_position bw.write(3, 0)?; // reserved bw.write_bit(false)?; // initial_presentation_delay_present bw.write(4, 0)?; // reserved } Ok(buf) } let seq = Sequence::new(&self.config); sequence_header_inner(&seq).unwrap() } } /// A channel modeling an encoding process pub type VideoDataChannel = (FrameSender, PacketReceiver); rav1e-0.7.1/src/api/channel/mod.rs000064400000000000000000000242131046102023000147460ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(missing_docs)] use crate::api::config::*; use crate::api::context::RcData; use crate::api::internal::ContextInner; use crate::api::util::*; use crossbeam::channel::*; use crate::rate::RCState; use crate::util::Pixel; use rayon::ThreadPool; use std::sync::Arc; mod data; pub use data::{ FrameInput, FrameSender, PacketReceiver, PassDataChannel, RcDataReceiver, RcDataSender, RecvError, SendError, TryRecvError, TrySendError, VideoDataChannel, }; mod by_gop; pub use by_gop::*; impl Config { pub(crate) fn setup( &self, ) -> Result<(ContextInner, Option>), InvalidConfig> { self.validate()?; let inner = self.new_inner()?; let pool = self.new_thread_pool(); Ok((inner, pool)) } } impl Config { /// Create a single pass encoder channel /// /// Drop the `FrameSender` endpoint to flush the encoder. /// /// # Errors /// /// - Returns `InvalidConfig` if the configuration is invalid. 
pub fn new_channel( &self, ) -> Result, InvalidConfig> { let rc = &self.rate_control; if rc.emit_pass_data || rc.summary.is_some() { return Err(InvalidConfig::RateControlConfigurationMismatch); } let v = if self.slots > 1 { self.new_by_gop_channel(self.slots)? } else { self.new_channel_internal()?.0 }; Ok(v) } /// Create a first pass encoder channel /// /// The pass data information is emitted through this channel. /// /// Drop the `FrameSender` endpoint to flush the encoder. /// The last buffer in the `PassDataReceiver` is the summary of the whole /// encoding process. /// /// # Errors /// /// - Returns `InvalidConfig` if the configuration is invalid. /// /// # Panics /// /// - If the channel cannot be created. An error should be raised before this, /// so a panic indicates a development error. pub fn new_firstpass_channel( &self, ) -> Result<(VideoDataChannel, RcDataReceiver), InvalidConfig> { let rc = &self.rate_control; if !rc.emit_pass_data { return Err(InvalidConfig::RateControlConfigurationMismatch); } if self.slots > 1 { log::warn!( "Parallel gop encoding does not support multi pass rate control" ); } let (v, (_, r)) = self.new_channel_internal()?; Ok((v, r.unwrap())) } /// Create a second pass encoder channel /// /// The encoding process require both frames and pass data to progress. /// /// Drop the `FrameSender` endpoint to flush the encoder. /// /// # Errors /// /// - Returns `InvalidConfig` if the configuration is invalid. /// /// # Panics /// /// - If the channel cannot be created. An error should be raised before this, /// so a panic indicates a development error. pub fn new_secondpass_channel( &self, ) -> Result<(VideoDataChannel, RcDataSender), InvalidConfig> { let rc = &self.rate_control; if rc.emit_pass_data || rc.summary.is_none() { return Err(InvalidConfig::RateControlConfigurationMismatch); } if self.slots > 1 { log::warn!( "Parallel gop encoding does not support multi pass rate control" ); } let (v, (s, _)) = self.new_channel_internal()?; Ok((v, s.unwrap())) } /// Create a multipass encoder channel /// /// The `PacketReceiver` may block if not enough pass statistics data /// are sent through the `PassDataSender` endpoint /// /// Drop the `FrameSender` endpoint to flush the encoder. /// The last buffer in the `PassDataReceiver` is the summary of the whole /// encoding process. /// /// # Errors /// /// - Returns `InvalidConfig` if the configuration is invalid. /// /// # Panics /// /// - If the channel cannot be created. An error should be raised before this, /// so a panic indicates a development error. 
pub fn new_multipass_channel( &self, ) -> Result<(VideoDataChannel, PassDataChannel), InvalidConfig> { let rc = &self.rate_control; if rc.summary.is_none() || !rc.emit_pass_data { return Err(InvalidConfig::RateControlConfigurationMismatch); } if self.slots > 1 { log::warn!( "Parallel gop encoding does not support multi pass rate control" ); } let (v, (s, r)) = self.new_channel_internal()?; Ok((v, (s.unwrap(), r.unwrap()))) } } trait RcFirstPass { fn send_pass_data(&mut self, rc_state: &mut RCState); fn send_pass_summary(&mut self, rc_state: &mut RCState); } trait RcSecondPass { fn feed_pass_data( &mut self, inner: &mut ContextInner, ) -> Result<(), ()>; } impl RcFirstPass for Sender { fn send_pass_data(&mut self, rc_state: &mut RCState) { if let Some(data) = rc_state.emit_frame_data() { let data = data.to_vec().into_boxed_slice(); self.send(RcData::Frame(data)).unwrap(); } else { unreachable!( "The encoder received more frames than its internal limit allows" ); } } fn send_pass_summary(&mut self, rc_state: &mut RCState) { let data = rc_state.emit_summary(); let data = data.to_vec().into_boxed_slice(); self.send(RcData::Summary(data)).unwrap(); } } impl RcFirstPass for Option> { fn send_pass_data(&mut self, rc_state: &mut RCState) { if let Some(s) = self.as_mut() { s.send_pass_data(rc_state) } } fn send_pass_summary(&mut self, rc_state: &mut RCState) { if let Some(s) = self.as_mut() { s.send_pass_summary(rc_state) } } } impl RcSecondPass for Receiver { fn feed_pass_data( &mut self, inner: &mut ContextInner, ) -> Result<(), ()> { while inner.rc_state.twopass_in_frames_needed() > 0 && !inner.done_processing() { if let Ok(RcData::Frame(data)) = self.recv() { inner .rc_state .parse_frame_data_packet(data.as_ref()) .unwrap_or_else(|_| todo!("Error reporting")); } else { todo!("Error reporting"); } } Ok(()) } } impl RcSecondPass for Option> { fn feed_pass_data( &mut self, inner: &mut ContextInner, ) -> Result<(), ()> { match self.as_mut() { Some(s) => s.feed_pass_data(inner), None => Ok(()), } } } impl Config { #[allow(clippy::type_complexity)] fn new_channel_internal( &self, ) -> Result< (VideoDataChannel, (Option, Option)), InvalidConfig, > { // The inner context is already configured to use the summary at this point. 
let (mut inner, pool) = self.setup()?; // TODO: make it user-settable let input_len = self.enc.speed_settings.rdo_lookahead_frames as usize * 2; let (send_frame, receive_frame) = bounded(input_len); let (send_packet, receive_packet) = unbounded(); let rc = &self.rate_control; let (mut send_rc_pass1, rc_data_receiver) = if rc.emit_pass_data { let (send_rc_pass1, receive_rc_pass1) = unbounded(); (Some(send_rc_pass1), Some(RcDataReceiver(receive_rc_pass1))) } else { (None, None) }; let (rc_data_sender, mut receive_rc_pass2, frame_limit) = if rc .summary .is_some() { let (frame_limit, pass_limit) = rc.summary.as_ref().map(|s| (s.ntus as u64, s.total as u64)).unwrap(); inner.limit = Some(frame_limit); let (send_rc_pass2, receive_rc_pass2) = unbounded(); ( Some(RcDataSender::new(pass_limit, send_rc_pass2)), Some(receive_rc_pass2), frame_limit, ) } else { (None, None, i32::MAX as u64) }; let config = Arc::new(self.enc.clone()); let channel = ( FrameSender::new(frame_limit, send_frame, config.clone()), PacketReceiver { receiver: receive_packet, config }, ); let pass_channel = (rc_data_sender, rc_data_receiver); let run = move || { for f in receive_frame.iter() { // info!("frame in {}", inner.frame_count); while !inner.needs_more_fi_lookahead() { receive_rc_pass2.feed_pass_data(&mut inner).unwrap(); // needs_more_fi_lookahead() should guard for missing output_frameno // already. // // this call should return either Ok or Err(Encoded) let has_pass_data = match inner.receive_packet() { Ok(p) => { send_packet.send(p).unwrap(); true } Err(EncoderStatus::Encoded) => true, Err(EncoderStatus::NotReady) => todo!("Error reporting"), _ => unreachable!(), }; if has_pass_data { send_rc_pass1.send_pass_data(&mut inner.rc_state); } } let (frame, params) = f; let _ = inner.send_frame(frame, params); // TODO make sure it cannot fail. } inner.limit = Some(inner.frame_count); let _ = inner.send_frame(None, None); loop { receive_rc_pass2.feed_pass_data(&mut inner).unwrap(); let r = inner.receive_packet(); let has_pass_data = match r { Ok(p) => { // warn!("Sending out {}", p.input_frameno); send_packet.send(p).unwrap(); true } Err(EncoderStatus::LimitReached) => break, Err(EncoderStatus::Encoded) => true, Err(EncoderStatus::NotReady) => todo!("Error reporting"), _ => unreachable!(), }; if has_pass_data { send_rc_pass1.send_pass_data(&mut inner.rc_state); } } send_rc_pass1.send_pass_summary(&mut inner.rc_state); }; if let Some(pool) = pool { pool.spawn(run); } else { rayon::spawn(run); } Ok((channel, pass_channel)) } } rav1e-0.7.1/src/api/color.rs000064400000000000000000000140731046102023000137000ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
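// The types in this module are the color metadata that can be attached to
// an `EncoderConfig` (`chroma_sample_position`, `pixel_range`,
// `color_description`, `content_light`, `mastering_display` in
// src/api/config/encoder.rs). A minimal sketch of signalling sRGB-style
// content, with the `EncoderConfig` wiring assumed from that file:
//
//   let mut enc = EncoderConfig::default();
//   enc.chroma_sampling = ChromaSampling::Cs444;
//   enc.pixel_range = PixelRange::Full;
//   enc.color_description = Some(ColorDescription {
//     color_primaries: ColorPrimaries::BT709,
//     transfer_characteristics: TransferCharacteristics::SRGB,
//     matrix_coefficients: MatrixCoefficients::Identity,
//   });
//
// `Config::validate()` accepts this combination; for non-monochrome
// streams, an sRGB triple with subsampled chroma or limited range is
// rejected as `ColorConfigurationMismatch`.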
use crate::serialize::*; use crate::wasm_bindgen::*; use arg_enum_proc_macro::ArgEnum; use num_derive::FromPrimitive; /// Sample position for subsampled chroma #[wasm_bindgen] #[derive( Copy, Clone, Debug, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, Default, )] #[repr(C)] pub enum ChromaSamplePosition { /// The source video transfer function must be signaled /// outside the AV1 bitstream. #[default] Unknown, /// Horizontally co-located with (0, 0) luma sample, vertically positioned /// in the middle between two luma samples. Vertical, /// Co-located with (0, 0) luma sample. Colocated, } pub use v_frame::pixel::ChromaSampling; /// Supported Color Primaries /// /// As defined by “Color primaries” section of ISO/IEC 23091-4/ITU-T H.273 #[derive( ArgEnum, Debug, Clone, Copy, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, Default, )] #[repr(C)] pub enum ColorPrimaries { /// BT.709 BT709 = 1, /// Unspecified, must be signaled or inferred outside of the bitstream #[default] Unspecified, /// BT.470 System M (historical) BT470M = 4, /// BT.470 System B, G (historical) BT470BG, /// BT.601-7 525 (SMPTE 170 M) BT601, /// SMPTE 240M (historical) SMPTE240, /// Generic film GenericFilm, /// BT.2020, BT.2100 BT2020, /// SMPTE 248 (CIE 1921 XYZ) XYZ, /// SMPTE RP 431-2 SMPTE431, /// SMPTE EG 432-1 SMPTE432, /// EBU Tech. 3213-E EBU3213 = 22, } /// Supported Transfer Characteristics /// /// As defined by “Transfer characteristics” section of ISO/IEC 23091-4/ITU-TH.273. #[derive( ArgEnum, Debug, Clone, Copy, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, Default, )] #[repr(C)] pub enum TransferCharacteristics { /// BT.709 BT709 = 1, /// Unspecified, must be signaled or inferred outside of the bitstream #[default] Unspecified, /// BT.470 System M (historical) BT470M = 4, /// BT.470 System B, G (historical) BT470BG, /// BT.601-7 525 (SMPTE 170 M) BT601, /// SMPTE 240 M SMPTE240, /// Linear Linear, /// Logarithmic (100:1 range) Log100, /// Logarithmic ((100 * √10):1 range) Log100Sqrt10, /// IEC 61966-2-4 IEC61966, /// BT.1361 extended color gamut system (historical) BT1361, /// sRGB or sYCC SRGB, /// BT.2020 10-bit systems BT2020_10Bit, /// BT.2020 12-bit systems BT2020_12Bit, /// SMPTE ST 2084, ITU BT.2100 PQ SMPTE2084, /// SMPTE ST 428 SMPTE428, /// BT.2100 HLG (Hybrid Log Gamma), ARIB STD-B67 HLG, } /// Matrix coefficients /// /// As defined by the “Matrix coefficients” section of ISO/IEC 23091-4/ITU-TH.273. #[derive( ArgEnum, Debug, Clone, Copy, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, Default, )] #[repr(C)] pub enum MatrixCoefficients { /// Identity matrix Identity = 0, /// BT.709 BT709, /// Unspecified, must be signaled or inferred outside of the bitstream. #[default] Unspecified, /// US FCC 73.628 FCC = 4, /// BT.470 System B, G (historical) BT470BG, /// BT.601-7 525 (SMPTE 170 M) BT601, /// SMPTE 240 M SMPTE240, /// YCgCo YCgCo, /// BT.2020 non-constant luminance, BT.2100 YCbCr BT2020NCL, /// BT.2020 constant luminance BT2020CL, /// SMPTE ST 2085 YDzDx SMPTE2085, /// Chromaticity-derived non-constant luminance ChromatNCL, /// Chromaticity-derived constant luminance ChromatCL, /// BT.2020 ICtCp ICtCp, } /// Signal the content color description #[derive(Copy, Clone, Debug, Serialize, Deserialize)] pub struct ColorDescription { /// Color primaries. pub color_primaries: ColorPrimaries, /// Transfer charasteristics. pub transfer_characteristics: TransferCharacteristics, /// Matrix coefficients. 
pub matrix_coefficients: MatrixCoefficients, } impl ColorDescription { pub(crate) fn is_srgb_triple(self) -> bool { self.color_primaries == ColorPrimaries::BT709 && self.transfer_characteristics == TransferCharacteristics::SRGB && self.matrix_coefficients == MatrixCoefficients::Identity } } /// Allowed pixel value range /// /// C.f. `VideoFullRangeFlag` variable specified in ISO/IEC 23091-4/ITU-T H.273 #[wasm_bindgen] #[derive( ArgEnum, Debug, Clone, Copy, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, Default, )] #[repr(C)] pub enum PixelRange { /// Studio swing representation #[default] Limited, /// Full swing representation Full, } /// High dynamic range content light level /// /// As defined by CEA-861.3, Appendix A. #[derive(Copy, Clone, Debug, Serialize, Deserialize)] pub struct ContentLight { /// Maximum content light level pub max_content_light_level: u16, /// Maximum frame-average light level pub max_frame_average_light_level: u16, } /// Chromaticity coordinates as defined by CIE 1931, expressed as 0.16 /// fixed-point values. #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[repr(C)] pub struct ChromaticityPoint { /// The X coordinate. pub x: u16, /// The Y coordinate. pub y: u16, } /// High dynamic range mastering display color volume /// /// As defined by CIE 1931 #[derive(Copy, Clone, Debug, Serialize, Deserialize)] pub struct MasteringDisplay { /// Chromaticity coordinates in Red, Green, Blue order /// expressed as 0.16 fixed-point pub primaries: [ChromaticityPoint; 3], /// Chromaticity coordinates expressed as 0.16 fixed-point pub white_point: ChromaticityPoint, /// 24.8 fixed-point maximum luminance in candelas per square meter pub max_luminance: u32, /// 18.14 fixed-point minimum luminance in candelas per square meter pub min_luminance: u32, } rav1e-0.7.1/src/api/config/encoder.rs000064400000000000000000000262471046102023000154540ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use itertools::*; use crate::api::color::*; use crate::api::config::GrainTableSegment; use crate::api::{Rational, SpeedSettings}; use crate::encoder::Tune; use crate::serialize::{Deserialize, Serialize}; use std::fmt; // We add 1 to rdo_lookahead_frames in a bunch of places. pub(crate) const MAX_RDO_LOOKAHEAD_FRAMES: usize = usize::MAX - 1; // Due to the math in RCState::new() regarding the reservoir frame delay. pub(crate) const MAX_MAX_KEY_FRAME_INTERVAL: u64 = i32::MAX as u64 / 3; /// Encoder settings which impact the produced bitstream. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct EncoderConfig { // output size /// Width of the frames in pixels. pub width: usize, /// Height of the frames in pixels. pub height: usize, /// Sample aspect ratio (for anamorphic video). pub sample_aspect_ratio: Rational, /// Video time base. pub time_base: Rational, // data format and ancillary color information /// Bit depth. pub bit_depth: usize, /// Chroma subsampling. pub chroma_sampling: ChromaSampling, /// Chroma sample position. 
pub chroma_sample_position: ChromaSamplePosition, /// Pixel value range. pub pixel_range: PixelRange, /// Content color description (primaries, transfer characteristics, matrix). pub color_description: Option<ColorDescription>, /// HDR mastering display parameters. pub mastering_display: Option<MasteringDisplay>, /// HDR content light parameters. pub content_light: Option<ContentLight>, /// AV1 level index to target (0-31). /// If None, allow the encoder to decide. /// Currently, rav1e is unable to guarantee that the output bitstream /// meets the rate limitations of the specified level. pub level_idx: Option<u8>, /// Enable signaling timing info in the bitstream. pub enable_timing_info: bool, /// Still picture mode flag. pub still_picture: bool, /// Flag to force all frames to be error resilient. pub error_resilient: bool, /// Interval between switch frames (0 to disable) pub switch_frame_interval: u64, // encoder configuration /// The *minimum* interval between two keyframes pub min_key_frame_interval: u64, /// The *maximum* interval between two keyframes pub max_key_frame_interval: u64, /// The number of temporal units over which to distribute the reservoir /// usage. pub reservoir_frame_delay: Option<i32>, /// Flag to enable low latency mode. /// /// In this mode the frame reordering is disabled. pub low_latency: bool, /// The base quantizer to use. pub quantizer: usize, /// The minimum allowed base quantizer to use in bitrate mode. pub min_quantizer: u8, /// The target bitrate for the bitrate mode. pub bitrate: i32, /// Metric to tune the quality for. pub tune: Tune, /// Parameters for grain synthesis. pub film_grain_params: Option<Vec<GrainTableSegment>>, /// Number of tiles horizontally. Must be a power of two. /// /// Overridden by [`tiles`], if present. /// /// [`tiles`]: #structfield.tiles pub tile_cols: usize, /// Number of tiles vertically. Must be a power of two. /// /// Overridden by [`tiles`], if present. /// /// [`tiles`]: #structfield.tiles pub tile_rows: usize, /// Total number of tiles desired. /// /// Encoder will try to optimally split to reach this number of tiles, /// rounded up. Overrides [`tile_cols`] and [`tile_rows`]. /// /// [`tile_cols`]: #structfield.tile_cols /// [`tile_rows`]: #structfield.tile_rows pub tiles: usize, /// Settings which affect the encoding speed vs. quality trade-off. pub speed_settings: SpeedSettings, } /// Default preset for `EncoderConfig`: it is a balance between quality and /// speed. See [`with_speed_preset()`]. /// /// [`with_speed_preset()`]: struct.EncoderConfig.html#method.with_speed_preset impl Default for EncoderConfig { fn default() -> Self { const DEFAULT_SPEED: u8 = 6; Self::with_speed_preset(DEFAULT_SPEED) } } impl EncoderConfig { /// This is a preset which provides default settings according to a speed /// value in the specific range 0–10. Each speed value corresponds to a /// different preset. See [`from_preset()`]. If the input value is greater /// than 10, it will result in the same settings as 10.
/// /// [`from_preset()`]: struct.SpeedSettings.html#method.from_preset pub fn with_speed_preset(speed: u8) -> Self { EncoderConfig { width: 640, height: 480, sample_aspect_ratio: Rational { num: 1, den: 1 }, time_base: Rational { num: 1, den: 30 }, bit_depth: 8, chroma_sampling: ChromaSampling::Cs420, chroma_sample_position: ChromaSamplePosition::Unknown, pixel_range: Default::default(), color_description: None, mastering_display: None, content_light: None, level_idx: None, enable_timing_info: false, still_picture: false, error_resilient: false, switch_frame_interval: 0, min_key_frame_interval: 12, max_key_frame_interval: 240, min_quantizer: 0, reservoir_frame_delay: None, low_latency: false, quantizer: 100, bitrate: 0, tune: Tune::default(), film_grain_params: None, tile_cols: 0, tile_rows: 0, tiles: 0, speed_settings: SpeedSettings::from_preset(speed), } } /// Sets the minimum and maximum keyframe interval, handling special cases as needed. pub fn set_key_frame_interval( &mut self, min_interval: u64, max_interval: u64, ) { self.min_key_frame_interval = min_interval; // Map an input value of 0 to an infinite interval self.max_key_frame_interval = if max_interval == 0 { MAX_MAX_KEY_FRAME_INTERVAL } else { max_interval }; } /// Returns the video frame rate computed from [`time_base`]. /// /// [`time_base`]: #structfield.time_base pub fn frame_rate(&self) -> f64 { Rational::from_reciprocal(self.time_base).as_f64() } /// Computes the render width and height of the stream based /// on [`width`], [`height`], and [`sample_aspect_ratio`]. /// /// [`width`]: #structfield.width /// [`height`]: #structfield.height /// [`sample_aspect_ratio`]: #structfield.sample_aspect_ratio pub fn render_size(&self) -> (usize, usize) { let sar = self.sample_aspect_ratio.as_f64(); if sar > 1.0 { ((self.width as f64 * sar).round() as usize, self.height) } else { (self.width, (self.height as f64 / sar).round() as usize) } } /// Is temporal RDO enabled ? #[inline] pub const fn temporal_rdo(&self) -> bool { // Note: This function is called frequently, unlike most other functions here. // `compute_distortion_scale` computes a scaling factor for the distortion // of an 8x8 block (4x4 blocks simply use the scaling of the enclosing 8x8 // block). As long as distortion is always computed on <= 8x8 blocks, this // has the property that the scaled distortion of a 2Nx2N block is always // equal to the sum of the scaled distortions of the NxN sub-blocks it's // made of, this is a necessary property to be able to do RDO between // multiple partition sizes properly. Unfortunately, when tx domain // distortion is used, distortion is only known at the tx block level which // might be bigger than 8x8. So temporal RDO is always disabled in that case. 
!self.speed_settings.transform.tx_domain_distortion } /// Describes whether the output is targeted as HDR pub fn is_hdr(&self) -> bool { self .color_description .map(|colors| { colors.transfer_characteristics == TransferCharacteristics::SMPTE2084 }) .unwrap_or(false) } pub(crate) fn get_film_grain_at( &self, timestamp: u64, ) -> Option<&GrainTableSegment> { self.film_grain_params.as_ref().and_then(|entries| { entries.iter().find(|entry| { timestamp >= entry.start_time && timestamp < entry.end_time }) }) } pub(crate) fn get_film_grain_mut_at( &mut self, timestamp: u64, ) -> Option<&mut GrainTableSegment> { self.film_grain_params.as_mut().and_then(|entries| { entries.iter_mut().find(|entry| { timestamp >= entry.start_time && timestamp < entry.end_time }) }) } } impl fmt::Display for EncoderConfig { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { let pairs = [ ("keyint_min", self.min_key_frame_interval.to_string()), ("keyint_max", self.max_key_frame_interval.to_string()), ("quantizer", self.quantizer.to_string()), ("bitrate", self.bitrate.to_string()), ("min_quantizer", self.min_quantizer.to_string()), ("low_latency", self.low_latency.to_string()), ("tune", self.tune.to_string()), ( "rdo_lookahead_frames", self.speed_settings.rdo_lookahead_frames.to_string(), ), ( "multiref", (!self.low_latency || self.speed_settings.multiref).to_string(), ), ("fast_deblock", self.speed_settings.fast_deblock.to_string()), ( "scene_detection_mode", self.speed_settings.scene_detection_mode.to_string(), ), ("cdef", self.speed_settings.cdef.to_string()), ("lrf", self.speed_settings.lrf.to_string()), ("enable_timing_info", self.enable_timing_info.to_string()), ( "min_block_size", self.speed_settings.partition.partition_range.min.to_string(), ), ( "max_block_size", self.speed_settings.partition.partition_range.max.to_string(), ), ( "encode_bottomup", self.speed_settings.partition.encode_bottomup.to_string(), ), ( "non_square_partition_max_threshold", self .speed_settings .partition .non_square_partition_max_threshold .to_string(), ), ( "reduced_tx_set", self.speed_settings.transform.reduced_tx_set.to_string(), ), ( "tx_domain_distortion", self.speed_settings.transform.tx_domain_distortion.to_string(), ), ( "tx_domain_rate", self.speed_settings.transform.tx_domain_rate.to_string(), ), ( "rdo_tx_decision", self.speed_settings.transform.rdo_tx_decision.to_string(), ), ( "prediction_modes", self.speed_settings.prediction.prediction_modes.to_string(), ), ( "fine_directional_intra", self.speed_settings.prediction.fine_directional_intra.to_string(), ), ( "include_near_mvs", self.speed_settings.motion.include_near_mvs.to_string(), ), ( "use_satd_subpel", self.speed_settings.motion.use_satd_subpel.to_string(), ), ]; write!( f, "{}", pairs.iter().map(|pair| format!("{}={}", pair.0, pair.1)).join(" ") ) } } rav1e-0.7.1/src/api/config/mod.rs000064400000000000000000000333411046102023000146050ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
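// A minimal sketch of how the `Config` builder defined in this module is
// typically assembled and turned into an encoder (the `u8` pixel type is
// illustrative; see the builder methods and `new_context` below):
//
//   let cfg = Config::new()
//     .with_encoder_config(EncoderConfig::default())
//     .with_threads(4);
//   let mut ctx: Context<u8> = cfg.new_context()?;
//
// `new_context` runs `validate()` internally, so an invalid combination of
// settings surfaces here as an `InvalidConfig` error.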
use thiserror::Error; use rayon::{ThreadPool, ThreadPoolBuilder}; use std::sync::Arc; use crate::api::{ChromaSampling, Context, ContextInner, PixelRange}; use crate::util::Pixel; mod encoder; pub use encoder::*; pub use av1_grain::*; use crate::levels::*; mod rate; pub use rate::Error as RateControlError; pub use rate::{RateControlConfig, RateControlSummary}; mod speedsettings; pub use speedsettings::*; pub use crate::tiling::TilingInfo; /// Enumeration of possible invalid configuration errors. #[derive(Debug, Clone, Copy, Eq, PartialEq, Error)] #[non_exhaustive] pub enum InvalidConfig { /// The width is invalid. #[error("invalid width {0} (expected >= 16, <= 65535)")] InvalidWidth(usize), /// The height is invalid. #[error("invalid height {0} (expected >= 16, <= 65535)")] InvalidHeight(usize), /// Aspect ratio numerator is invalid. #[error("invalid aspect ratio numerator {0} (expected > 0)")] InvalidAspectRatioNum(usize), /// Aspect ratio denominator is invalid. #[error("invalid aspect ratio denominator {0} (expected > 0)")] InvalidAspectRatioDen(usize), /// The render width (width adjusted based on the aspect ratio) is invalid. #[error("invalid render width {0} (expected >= 1, <= 65535")] InvalidRenderWidth(usize), /// The render height (height adjusted based on the aspect ratio) is invalid. #[error("invalid render height {0} (expected >= 1, <= 65535")] InvalidRenderHeight(usize), /// RDO lookahead frame count is invalid. #[error( "invalid rdo lookahead frames {actual} (expected <= {max} and >= {min})" )] InvalidRdoLookaheadFrames { /// The actual value. actual: usize, /// The maximal supported value. max: usize, /// The minimal supported value. min: usize, }, /// Maximal keyframe interval is invalid. #[error("invalid max keyframe interval {actual} (expected <= {max})")] InvalidMaxKeyFrameInterval { /// The actual value. actual: u64, /// The maximal supported value. max: u64, }, /// Tile columns is invalid. #[error("invalid tile cols {0} (expected power of 2)")] InvalidTileCols(usize), /// Tile rows is invalid. #[error("invalid tile rows {0} (expected power of 2)")] InvalidTileRows(usize), /// Framerate numerator is invalid. #[error("invalid framerate numerator {actual} (expected > 0, <= {max})")] InvalidFrameRateNum { /// The actual value. actual: u64, /// The maximal supported value. max: u64, }, /// Framerate denominator is invalid. #[error("invalid framerate denominator {actual} (expected > 0, <= {max})")] InvalidFrameRateDen { /// The actual value. actual: u64, /// The maximal supported value. max: u64, }, /// Reservoir frame delay is invalid. #[error("invalid reservoir frame delay {0} (expected >= 12, <= 131072)")] InvalidReservoirFrameDelay(i32), /// Reservoir frame delay is invalid. #[error( "invalid switch frame interval {0} (must only be used with low latency mode)" )] InvalidSwitchFrameInterval(u64), /// An option unsupported in still picture mode was enabled along with it. #[error("invalid option {0} specified with still picture mode")] InvalidOptionWithStillPicture(&'static str), /// The rate control needs a target bitrate in order to produce results #[error("The rate control requires a target bitrate")] TargetBitrateNeeded, /// The configuration #[error("Mismatch in the rate control configuration")] RateControlConfigurationMismatch, /// The color configuration mismatches AV1 constraints. #[error("Mismatch in the color configuration")] ColorConfigurationMismatch, /// The specified level is undefined in the current version of AV1. 
#[error("Specified level is undefined")] LevelUndefined, /// The configuration exceeded the specified level constraints. #[error("Constraints exceeded for specified level")] LevelConstraintsExceeded, } /// Contains the encoder configuration. #[derive(Clone, Debug, Default)] pub struct Config { /// Settings which impact the produced bitstream. pub(crate) enc: EncoderConfig, /// Rate control configuration pub(crate) rate_control: RateControlConfig, /// The number of threads in the threadpool. pub(crate) threads: usize, /// Shared thread pool pub(crate) pool: Option>, #[cfg(feature = "unstable")] /// Number of parallel encoding slots pub(crate) slots: usize, } impl Config { /// Create a default configuration /// /// same as `Default::default()` pub fn new() -> Self { Config::default() } /// Set the encoder configuration /// /// `EncoderConfig` contains the settings impacting the /// codec features used in the produced bitstream. pub fn with_encoder_config(mut self, enc: EncoderConfig) -> Self { self.enc = enc; self } /// Set the number of workers in the threadpool /// /// The threadpool is shared across all the different parallel /// components in the encoder. /// /// If it is left unset, the encoder will use the default global /// threadpool provided by Rayon instead. pub const fn with_threads(mut self, threads: usize) -> Self { self.threads = threads; self } /// Set the rate control configuration /// /// The default configuration is single pass pub const fn with_rate_control( mut self, rate_control: RateControlConfig, ) -> Self { self.rate_control = rate_control; self } #[cfg(feature = "unstable")] /// Use the provided threadpool /// /// It takes priority over `with_threads()` pub fn with_thread_pool(mut self, pool: Arc) -> Self { self.pool = Some(pool); self } #[cfg(feature = "unstable")] /// Set the maximum number of GOPs to encode in parallel pub const fn with_parallel_gops(mut self, slots: usize) -> Self { self.slots = slots; self } } fn check_tile_log2(n: usize) -> bool { let tile_log2 = TilingInfo::tile_log2(1, n); if tile_log2.is_none() { return false; } let tile_log2 = tile_log2.unwrap(); ((1 << tile_log2) - n) == 0 || n == 0 } impl Config { pub(crate) fn new_inner( &self, ) -> Result, InvalidConfig> { assert!( 8 * std::mem::size_of::() >= self.enc.bit_depth, "The Pixel u{} does not match the Config bit_depth {}", 8 * std::mem::size_of::(), self.enc.bit_depth ); self.validate()?; let mut config = self.enc.clone(); config.set_key_frame_interval( config.min_key_frame_interval, config.max_key_frame_interval, ); // FIXME: inter unsupported with 4:2:2 and 4:4:4 chroma sampling let chroma_sampling = config.chroma_sampling; // FIXME: tx partition for intra not supported for chroma 422 if chroma_sampling == ChromaSampling::Cs422 { config.speed_settings.transform.rdo_tx_decision = false; } let mut inner = ContextInner::new(&config); if let Some(ref s) = self.rate_control.summary { inner.rc_state.init_second_pass(); inner.rc_state.setup_second_pass(s); } // First-pass parameters depend on whether second-pass is in effect. // So `init_first_pass` must follow `init_second_pass`. if self.rate_control.emit_pass_data { let maybe_pass1_log_base_q = (self.rate_control.summary.is_none()) .then(|| inner.rc_state.select_pass1_log_base_q(&inner, 0)); inner.rc_state.init_first_pass(maybe_pass1_log_base_q); } Ok(inner) } /// Create a new threadpool with this configuration if set, /// or return `None` if global threadpool should be used instead. 
pub(crate) fn new_thread_pool(&self) -> Option> { if let Some(ref p) = self.pool { Some(p.clone()) } else if self.threads != 0 { let pool = ThreadPoolBuilder::new().num_threads(self.threads).build().unwrap(); Some(Arc::new(pool)) } else { None } } /// Creates a [`Context`] with this configuration. /// /// # Errors /// /// Returns `InvalidConfig` if the config is invalid. /// /// # Examples /// /// ``` /// use rav1e::prelude::*; /// /// # fn main() -> Result<(), InvalidConfig> { /// let cfg = Config::default(); /// let ctx: Context = cfg.new_context()?; /// # Ok(()) /// # } /// ``` /// /// [`Context`]: struct.Context.html pub fn new_context(&self) -> Result, InvalidConfig> { let inner = self.new_inner()?; let config = (*inner.config).clone(); let pool = self.new_thread_pool(); Ok(Context { is_flushing: false, inner, pool, config }) } /// Validates the configuration. /// /// # Errors /// /// - Returns `InvalidConfig` if the tiling config is invalid. pub fn validate(&self) -> Result<(), InvalidConfig> { use InvalidConfig::*; let config = &self.enc; if (config.still_picture && config.width < 1) || (!config.still_picture && config.width < 16) || config.width > u16::MAX as usize { return Err(InvalidWidth(config.width)); } if (config.still_picture && config.height < 1) || (!config.still_picture && config.height < 16) || config.height > u16::MAX as usize { return Err(InvalidHeight(config.height)); } if config.sample_aspect_ratio.num == 0 { return Err(InvalidAspectRatioNum( config.sample_aspect_ratio.num as usize, )); } if config.sample_aspect_ratio.den == 0 { return Err(InvalidAspectRatioDen( config.sample_aspect_ratio.den as usize, )); } let (render_width, render_height) = config.render_size(); if render_width == 0 || render_width > u16::MAX as usize { return Err(InvalidRenderWidth(render_width)); } if render_height == 0 || render_height > u16::MAX as usize { return Err(InvalidRenderHeight(render_height)); } if config.speed_settings.rdo_lookahead_frames > MAX_RDO_LOOKAHEAD_FRAMES || config.speed_settings.rdo_lookahead_frames < 1 { return Err(InvalidRdoLookaheadFrames { actual: config.speed_settings.rdo_lookahead_frames, max: MAX_RDO_LOOKAHEAD_FRAMES, min: 1, }); } if config.max_key_frame_interval > MAX_MAX_KEY_FRAME_INTERVAL { return Err(InvalidMaxKeyFrameInterval { actual: config.max_key_frame_interval, max: MAX_MAX_KEY_FRAME_INTERVAL, }); } if !check_tile_log2(config.tile_cols) { return Err(InvalidTileCols(config.tile_cols)); } if !check_tile_log2(config.tile_rows) { return Err(InvalidTileRows(config.tile_rows)); } if config.time_base.num == 0 || config.time_base.num > u32::MAX as u64 { return Err(InvalidFrameRateNum { actual: config.time_base.num, max: u32::MAX as u64, }); } if config.time_base.den == 0 || config.time_base.den > u32::MAX as u64 { return Err(InvalidFrameRateDen { actual: config.time_base.den, max: u32::MAX as u64, }); } if let Some(delay) = config.reservoir_frame_delay { if !(12..=131_072).contains(&delay) { return Err(InvalidReservoirFrameDelay(delay)); } } if config.switch_frame_interval > 0 && !config.low_latency { return Err(InvalidSwitchFrameInterval(config.switch_frame_interval)); } if config.enable_timing_info && config.still_picture { return Err(InvalidOptionWithStillPicture("enable_timing_info")); } // if let Some(color_description) = config.color_description { if config.chroma_sampling != ChromaSampling::Cs400 && color_description.is_srgb_triple() { if config.pixel_range != PixelRange::Full { return Err(ColorConfigurationMismatch); } if config.chroma_sampling != 
ChromaSampling::Cs444 { return Err(ColorConfigurationMismatch); } } } if let Some(level_idx) = config.level_idx { if level_idx > 31 { return Err(LevelUndefined); } if level_idx < 31 { if !AV1_LEVEL_DEFINED[level_idx as usize] { return Err(LevelUndefined); } if config.width * config.height > AV1_LEVEL_MAX_PIC_SIZE[level_idx as usize] { return Err(LevelConstraintsExceeded); } if config.width > AV1_LEVEL_MAX_H_SIZE[level_idx as usize] { return Err(LevelConstraintsExceeded); } if config.height > AV1_LEVEL_MAX_V_SIZE[level_idx as usize] { return Err(LevelConstraintsExceeded); } if ((config.width * config.height) as u64 * config.time_base.num + config.time_base.den - 1) / config.time_base.den > AV1_LEVEL_MAX_DISPLAY_RATE[level_idx as usize] as u64 { return Err(LevelConstraintsExceeded); } } } // TODO: add more validation let rc = &self.rate_control; if (rc.emit_pass_data || rc.summary.is_some()) && config.bitrate == 0 { return Err(TargetBitrateNeeded); } Ok(()) } /// Provide the tiling information for the current Config /// /// Useful for reporting and debugging. /// /// # Errors /// /// - Returns `InvalidConfig` if the tiling config is invalid. pub fn tiling_info(&self) -> Result { self.validate()?; let seq = crate::encoder::Sequence::new(&self.enc); Ok(seq.tiling) } } rav1e-0.7.1/src/api/config/rate.rs000064400000000000000000000045421046102023000147620ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use thiserror::Error; use crate::rate::*; /// Rate control errors #[derive(Debug, Error)] pub enum Error { /// The summary provided is not compatible with the current encoder version #[error("Incompatible version {0}")] InvalidVersion(i64), /// The summary provided is possibly corrupted #[error("The summary content is invalid: {0}")] CorruptedSummary(String), } /// Rate control configuration #[derive(Clone, Debug, Default)] pub struct RateControlConfig { pub(crate) emit_pass_data: bool, pub(crate) summary: Option, } pub use crate::rate::RCSummary as RateControlSummary; impl RateControlSummary { /// Deserializes a byte slice into a `RateControlSummary` pub(crate) fn from_slice(bytes: &[u8]) -> Result { let mut de = RCDeserialize::default(); let _ = de.buffer_fill(bytes, 0, TWOPASS_HEADER_SZ); de.parse_summary().map_err(Error::CorruptedSummary) } } impl RateControlConfig { /// Create a rate control configuration from a serialized summary /// /// # Errors /// /// Returns an error if the serialized data is invalid. pub fn from_summary_slice(bytes: &[u8]) -> Result { Ok(Self { summary: Some(RateControlSummary::from_slice(bytes)?), ..Default::default() }) } /// Create a default rate control configuration /// /// By default the encoder is in single pass mode. 
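///
/// A short sketch of the two builder flavours (the `summary_bytes` buffer
/// is assumed to come from an earlier first-pass run):
///
/// ```ignore
/// // First pass: ask the encoder to emit pass data.
/// let first_pass = RateControlConfig::new().with_emit_data(true);
///
/// // Second pass: rebuild the configuration from the stored summary.
/// let second_pass = RateControlConfig::from_summary_slice(&summary_bytes)?;
/// ```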
pub fn new() -> Self { Default::default() } /// Set a rate control summary /// /// Enable the second pass encoding mode pub const fn with_summary(mut self, summary: RateControlSummary) -> Self { self.summary = Some(summary); self } /// Emit the current pass data /// /// The pass data will be used in a second pass encoding session pub const fn with_emit_data(mut self, emit: bool) -> Self { self.emit_pass_data = emit; self } } rav1e-0.7.1/src/api/config/speedsettings.rs000064400000000000000000000300161046102023000167030ustar 00000000000000// Copyright (c) 2020-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use num_derive::*; use crate::partition::BlockSize; use crate::serialize::{Deserialize, Serialize}; use std::fmt; // NOTE: Add Structures at the end. /// Contains the speed settings. #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[non_exhaustive] pub struct SpeedSettings { /// Enables inter-frames to have multiple reference frames. /// /// Enabled is slower. pub multiref: bool, /// Enables fast deblocking filter. pub fast_deblock: bool, /// The number of lookahead frames to be used for temporal RDO. /// /// Higher is slower. pub rdo_lookahead_frames: usize, /// Which scene detection mode to use. Standard is slower, but best. pub scene_detection_mode: SceneDetectionSpeed, /// Enables CDEF. pub cdef: bool, /// Enables LRF. pub lrf: bool, /// Enable searching loop restoration units when no transforms have been coded /// restoration unit. pub lru_on_skip: bool, /// The amount of search done for self guided restoration. pub sgr_complexity: SGRComplexityLevel, /// Search level for segmentation. /// /// Full search is at least twice as slow. pub segmentation: SegmentationLevel, // NOTE: put enums and basic type fields above /// Speed settings related to partition decision pub partition: PartitionSpeedSettings, /// Speed settings related to transform size and type decision pub transform: TransformSpeedSettings, /// Speed settings related to intra prediction mode selection pub prediction: PredictionSpeedSettings, /// Speed settings related to motion estimation and motion vector selection pub motion: MotionSpeedSettings, } impl Default for SpeedSettings { /// The default settings are equivalent to speed 0 fn default() -> Self { SpeedSettings { multiref: true, fast_deblock: false, rdo_lookahead_frames: 40, scene_detection_mode: SceneDetectionSpeed::Standard, cdef: true, lrf: true, lru_on_skip: true, sgr_complexity: SGRComplexityLevel::Full, segmentation: SegmentationLevel::Complex, partition: PartitionSpeedSettings { encode_bottomup: true, non_square_partition_max_threshold: BlockSize::BLOCK_64X64, partition_range: PartitionRange::new( BlockSize::BLOCK_4X4, BlockSize::BLOCK_64X64, ), }, transform: TransformSpeedSettings { reduced_tx_set: false, // TX domain distortion is always faster, with no significant quality change, // although it will be ignored when Tune == Psychovisual. 
tx_domain_distortion: true, tx_domain_rate: false, rdo_tx_decision: true, enable_inter_tx_split: false, }, prediction: PredictionSpeedSettings { prediction_modes: PredictionModesSetting::ComplexAll, fine_directional_intra: true, }, motion: MotionSpeedSettings { include_near_mvs: true, use_satd_subpel: true, me_allow_full_search: true, }, } } } impl SpeedSettings { /// Set the speed setting according to a numeric speed preset. pub fn from_preset(speed: u8) -> Self { // The default settings are equivalent to speed 0 let mut settings = SpeedSettings::default(); if speed >= 1 { settings.lru_on_skip = false; settings.segmentation = SegmentationLevel::Simple; } if speed >= 2 { settings.partition.non_square_partition_max_threshold = BlockSize::BLOCK_8X8; settings.prediction.prediction_modes = PredictionModesSetting::ComplexKeyframes; } if speed >= 3 { settings.rdo_lookahead_frames = 30; settings.partition.partition_range = PartitionRange::new(BlockSize::BLOCK_8X8, BlockSize::BLOCK_64X64); } if speed >= 4 { settings.partition.encode_bottomup = false; } if speed >= 5 { settings.sgr_complexity = SGRComplexityLevel::Reduced; settings.motion.include_near_mvs = false; } if speed >= 6 { settings.rdo_lookahead_frames = 20; settings.transform.rdo_tx_decision = false; settings.transform.reduced_tx_set = true; settings.motion.me_allow_full_search = false; } if speed >= 7 { settings.prediction.prediction_modes = PredictionModesSetting::Simple; // Multiref is enabled automatically if low_latency is false. // // If low_latency is true, enabling multiref allows using multiple // backwards references. low_latency false enables both forward and // backwards references. settings.multiref = false; settings.fast_deblock = true; } if speed >= 8 { settings.rdo_lookahead_frames = 10; settings.lrf = false; } if speed >= 9 { // 8x8 is fast enough to use until very high speed levels, // because 8x8 with reduced TX set is faster but with equivalent // or better quality compared to 16x16 (to which reduced TX set does not apply). settings.partition.partition_range = PartitionRange::new(BlockSize::BLOCK_16X16, BlockSize::BLOCK_32X32); // FIXME: With unknown reasons, inter_tx_split does not work if reduced_tx_set is false settings.transform.enable_inter_tx_split = true; } if speed >= 10 { settings.scene_detection_mode = SceneDetectionSpeed::Fast; settings.partition.partition_range = PartitionRange::new(BlockSize::BLOCK_32X32, BlockSize::BLOCK_32X32); settings.motion.use_satd_subpel = false; } settings } } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[cfg_attr(test, derive(Default))] /// Speed settings related to transform size and type decision pub struct TransformSpeedSettings { /// Enables reduced transform set. /// /// Enabled is faster. pub reduced_tx_set: bool, /// Enables using transform-domain distortion instead of pixel-domain. /// /// Enabled is faster. pub tx_domain_distortion: bool, /// Enables using transform-domain rate estimation. /// /// Enabled is faster. pub tx_domain_rate: bool, /// Enables searching transform size and type with RDO. /// /// Enabled is slower. pub rdo_tx_decision: bool, /// Enable tx split for inter mode block. pub enable_inter_tx_split: bool, } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[cfg_attr(test, derive(Default))] /// Speed settings related to partition decision pub struct PartitionSpeedSettings { /// Enables bottom-up encoding, rather than top-down. /// /// Enabled is slower. 
pub encode_bottomup: bool, /// Allow non-square partition type outside of frame borders /// on any blocks at or below this size. pub non_square_partition_max_threshold: BlockSize, /// Range of partition sizes that can be used. Larger ranges are slower. /// /// Must be based on square block sizes, so e.g. 8×4 isn't allowed here. pub partition_range: PartitionRange, } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[cfg_attr(test, derive(Default))] /// Speed settings related to motion estimation and motion vector selection pub struct MotionSpeedSettings { /// Use SATD instead of SAD for subpixel search. /// /// Enabled is slower. pub use_satd_subpel: bool, /// Enables searching near motion vectors during RDO. /// /// Enabled is slower. pub include_near_mvs: bool, /// Enable full search in some parts of motion estimation. Allowing full /// search is slower. pub me_allow_full_search: bool, } #[derive(Clone, Copy, Debug, Serialize, Deserialize)] #[cfg_attr(test, derive(Default))] /// Speed settings related to intra prediction mode selection pub struct PredictionSpeedSettings { /// Prediction modes to search. /// /// Complex settings are slower. pub prediction_modes: PredictionModesSetting, /// Use fine directional intra prediction pub fine_directional_intra: bool, } /// Range of block sizes to use. #[derive(Clone, Copy, Debug, Serialize, Deserialize)] pub struct PartitionRange { pub(crate) min: BlockSize, pub(crate) max: BlockSize, } impl PartitionRange { /// Creates a new partition range with min and max partition sizes. /// /// # Panics /// /// - Panics if `max` is larger than `min`. /// - Panics if either `min` or `max` are not square. pub fn new(min: BlockSize, max: BlockSize) -> Self { assert!(max >= min); // Topdown search checks the min block size for PARTITION_SPLIT only, so // the min block size must be square. assert!(min.is_sqr()); // Rectangular max partition sizes have not been tested. assert!(max.is_sqr()); Self { min, max } } } #[cfg(test)] impl Default for PartitionRange { fn default() -> Self { PartitionRange::new(BlockSize::BLOCK_4X4, BlockSize::BLOCK_64X64) } } /// Prediction modes to search. #[derive( Clone, Copy, Debug, PartialOrd, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, )] pub enum SceneDetectionSpeed { /// Fastest scene detection using pixel-wise comparison Fast, /// Scene detection using motion vectors and cost estimates Standard, /// Completely disable scene detection and only place keyframes /// at fixed intervals. None, } impl fmt::Display for SceneDetectionSpeed { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!( f, "{}", match self { SceneDetectionSpeed::Fast => "Fast", SceneDetectionSpeed::Standard => "Standard", SceneDetectionSpeed::None => "None", } ) } } /// Prediction modes to search. #[derive( Clone, Copy, Debug, PartialOrd, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, )] pub enum PredictionModesSetting { /// Only simple prediction modes. Simple, /// Search all prediction modes on key frames and simple modes on other /// frames. ComplexKeyframes, /// Search all prediction modes on all frames. 
ComplexAll, } impl fmt::Display for PredictionModesSetting { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!( f, "{}", match self { PredictionModesSetting::Simple => "Simple", PredictionModesSetting::ComplexKeyframes => "Complex-KFs", PredictionModesSetting::ComplexAll => "Complex-All", } ) } } #[cfg(test)] impl Default for PredictionModesSetting { fn default() -> Self { PredictionModesSetting::Simple } } /// Search level for self guided restoration #[derive( Clone, Copy, Debug, PartialOrd, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, )] pub enum SGRComplexityLevel { /// Search all sgr parameters Full, /// Search a reduced set of sgr parameters Reduced, } impl fmt::Display for SGRComplexityLevel { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!( f, "{}", match self { SGRComplexityLevel::Full => "Full", SGRComplexityLevel::Reduced => "Reduced", } ) } } /// Search level for segmentation #[derive( Clone, Copy, Debug, PartialOrd, PartialEq, Eq, FromPrimitive, Serialize, Deserialize, )] pub enum SegmentationLevel { /// No segmentation is signalled. Disabled, /// Segmentation index is derived from source statistics. Simple, /// Segmentation index range is derived from source statistics. Complex, /// Search all segmentation indices. Full, } impl fmt::Display for SegmentationLevel { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!( f, "{}", match self { SegmentationLevel::Disabled => "Disabled", SegmentationLevel::Simple => "Simple", SegmentationLevel::Complex => "Complex", SegmentationLevel::Full => "Full", } ) } } rav1e-0.7.1/src/api/context.rs000064400000000000000000000364721046102023000142550ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![deny(missing_docs)] use crate::api::color::*; use crate::api::config::*; use crate::api::internal::*; use crate::api::util::*; use bitstream_io::*; use crate::encoder::*; use crate::frame::*; use crate::util::Pixel; use std::fmt; use std::io; use std::sync::Arc; /// The encoder context. /// /// Contains the encoding state. pub struct Context { pub(crate) inner: ContextInner, pub(crate) config: EncoderConfig, pub(crate) pool: Option>, pub(crate) is_flushing: bool, } impl Context { /// Allocates and returns a new frame. /// /// # Examples /// /// ``` /// use rav1e::prelude::*; /// /// # fn main() -> Result<(), InvalidConfig> { /// let cfg = Config::default(); /// let ctx: Context = cfg.new_context()?; /// let frame = ctx.new_frame(); /// # Ok(()) /// # } /// ``` #[inline] pub fn new_frame(&self) -> Frame { Frame::new( self.config.width, self.config.height, self.config.chroma_sampling, ) } /// Sends the frame for encoding. /// /// This method adds the frame into the frame queue and runs the first passes /// of the look-ahead computation. /// /// Passing `None` is equivalent to calling [`flush`]. /// /// The caller is responsible for padding the invisible portion of the frame, /// if multiple references to the frame are held. 
/// Calling [`Plane::pad()`] after filling each plane or equivalent is required. /// /// # Errors /// /// If this method is called with a frame after the encoder has been flushed /// or the encoder internal limit is hit (`std::i32::MAX` frames) the /// [`EncoderStatus::EnoughData`] error is returned. /// /// # Examples /// /// ``` /// use rav1e::prelude::*; /// /// # fn main() -> Result<(), Box> { /// # if false { /// let cfg = Config::default(); /// let mut ctx: Context = cfg.new_context().unwrap(); /// let f1 = ctx.new_frame(); /// let f2 = f1.clone(); /// let info = FrameParameters { /// frame_type_override: FrameTypeOverride::Key, /// opaque: None, /// ..Default::default() /// }; /// /// // Send the plain frame data /// ctx.send_frame(f1)?; /// // Send the data and the per-frame parameters /// // In this case the frame is forced to be a keyframe. /// ctx.send_frame((f2, info))?; /// // Flush the encoder, it is equivalent to a call to `flush()` /// ctx.send_frame(None)?; /// # } /// # Ok(()) /// # } /// ``` /// /// [`flush`]: #method.flush /// [`EncoderStatus::EnoughData`]: enum.EncoderStatus.html#variant.EnoughData #[inline] pub fn send_frame(&mut self, frame: F) -> Result<(), EncoderStatus> where F: IntoFrame, { let (frame, params) = frame.into(); if frame.is_none() { if self.is_flushing { return Ok(()); } self.inner.limit = Some(self.inner.frame_count); self.is_flushing = true; } else if self.is_flushing || (self.inner.config.still_picture && self.inner.frame_count > 0) { return Err(EncoderStatus::EnoughData); // The rate control can process at most std::i32::MAX frames } else if self.inner.frame_count == std::i32::MAX as u64 - 1 { self.inner.limit = Some(self.inner.frame_count); self.is_flushing = true; } let inner = &mut self.inner; let run = move || inner.send_frame(frame, params); match &self.pool { Some(pool) => pool.install(run), None => run(), } } /// Returns the first-pass data of a two-pass encode for the frame that was /// just encoded. /// /// This should be called BEFORE every call to [`receive_packet`] (including /// the very first one), even if no packet was produced by the last call to /// [`receive_packet`], if any (i.e., [`EncoderStatus::Encoded`] was /// returned). It needs to be called once more after /// [`EncoderStatus::LimitReached`] is returned, to retrieve the header that /// should be written to the front of the stats file (overwriting the /// placeholder header that was emitted at the start of encoding). /// /// It is still safe to call this function when [`receive_packet`] returns /// any other error. It will return `None` instead of returning a duplicate /// copy of the previous frame's data. /// /// [`receive_packet`]: #method.receive_packet /// [`EncoderStatus::Encoded`]: enum.EncoderStatus.html#variant.Encoded /// [`EncoderStatus::LimitReached`]: /// enum.EncoderStatus.html#variant.LimitReached #[inline] pub fn twopass_out(&mut self) -> Option<&[u8]> { self.inner.rc_state.twopass_out(self.inner.done_processing()) } /// Returns the number of bytes of the stats file needed before the next /// frame of the second pass in a two-pass encode can be encoded. /// /// This is a lower bound (more might be required), but if `0` is returned, /// then encoding can proceed. This is just a hint to the application, and /// does not need to be called for encoding the second pass to work, so long /// as the application continues to provide more data to [`twopass_in`] in a /// loop until [`twopass_in`] returns `0`. 
/// /// [`twopass_in`]: #method.twopass_in #[inline] pub fn twopass_bytes_needed(&mut self) -> usize { self.inner.rc_state.twopass_in(None).unwrap_or(0) } /// Provides the stats data produced in the first pass of a two-pass encode /// to the second pass. /// /// On success this returns the number of bytes of the data which were /// consumed. When encoding the second pass of a two-pass encode, this should /// be called repeatedly in a loop before every call to [`receive_packet`] /// (including the very first one) until no bytes are consumed, or until /// [`twopass_bytes_needed`] returns `0`. /// /// [`receive_packet`]: #method.receive_packet /// [`twopass_bytes_needed`]: #method.twopass_bytes_needed /// /// # Errors /// /// Returns `Err(EncoderStatus::Failure)` if the two-pass data is invalid. #[inline] pub fn twopass_in(&mut self, buf: &[u8]) -> Result { self.inner.rc_state.twopass_in(Some(buf)).or(Err(EncoderStatus::Failure)) } /// Encodes the next frame and returns the encoded data. /// /// This method is where the main encoding work is done. /// /// # Errors /// /// May return `Err(EncoderStatus)`, which should be handled by the caller. /// /// # Examples /// /// Encoding a single frame: /// /// ``` /// use rav1e::prelude::*; /// /// # fn main() -> Result<(), Box> { /// # if false { /// let cfg = Config::default(); /// let mut ctx: Context = cfg.new_context()?; /// let frame = ctx.new_frame(); /// /// ctx.send_frame(frame)?; /// ctx.flush(); /// /// loop { /// match ctx.receive_packet() { /// Ok(packet) => { /* Mux the packet. */ }, /// Err(EncoderStatus::Encoded) => (), /// Err(EncoderStatus::LimitReached) => break, /// Err(err) => Err(err)?, /// } /// } /// # } /// # Ok(()) /// # } /// ``` /// /// Encoding a sequence of frames: /// /// ``` /// use std::sync::Arc; /// use rav1e::prelude::*; /// /// fn encode_frames( /// ctx: &mut Context, /// mut frames: impl Iterator> /// ) -> Result<(), EncoderStatus> { /// // This is a slightly contrived example, intended to showcase the /// // various statuses that can be returned from receive_packet(). /// // Assume that, for example, there are a lot of frames in the /// // iterator, which are produced lazily, so you don't want to send /// // them all in at once as to not exhaust the memory. /// loop { /// match ctx.receive_packet() { /// Ok(packet) => { /* Mux the packet. */ }, /// Err(EncoderStatus::Encoded) => { /// // A frame was encoded without emitting a packet. This is /// // normal, just proceed as usual. /// }, /// Err(EncoderStatus::LimitReached) => { /// // All frames have been encoded. Time to break out of the /// // loop. /// break; /// }, /// Err(EncoderStatus::NeedMoreData) => { /// // The encoder has requested additional frames. Push the /// // next frame in, or flush the encoder if there are no /// // frames left (on None). /// ctx.send_frame(frames.next().map(Arc::new))?; /// }, /// Err(EncoderStatus::EnoughData) => { /// // Since we aren't trying to push frames after flushing, /// // this should never happen in this example. /// unreachable!(); /// }, /// Err(EncoderStatus::NotReady) => { /// // We're not doing two-pass encoding, so this can never /// // occur. /// unreachable!(); /// }, /// Err(EncoderStatus::Failure) => { /// return Err(EncoderStatus::Failure); /// }, /// } /// } /// /// Ok(()) /// } /// # fn main() -> Result<(), Box> { /// # if false { /// # let mut enc = EncoderConfig::default(); /// # // So it runs faster. 
/// # enc.width = 16; /// # enc.height = 16; /// # let cfg = Config::new().with_encoder_config(enc); /// # let mut ctx: Context = cfg.new_context()?; /// # /// # let frames = vec![ctx.new_frame(); 4].into_iter(); /// # encode_frames(&mut ctx, frames); /// # } /// # Ok(()) /// # } /// ``` #[inline] pub fn receive_packet(&mut self) -> Result, EncoderStatus> { let inner = &mut self.inner; let mut run = move || inner.receive_packet(); match &self.pool { Some(pool) => pool.install(run), None => run(), } } /// Flushes the encoder. /// /// Flushing signals the end of the video. After the encoder has been /// flushed, no additional frames are accepted. /// /// # Panics /// /// Panics if `send_frame` returns an `Err`. /// This should never happen when calling it with `None` /// and indicates a development error. #[inline] pub fn flush(&mut self) { self.send_frame(None).unwrap(); } /// Produces a sequence header matching the current encoding context. /// /// Its format is compatible with the AV1 Matroska and ISOBMFF specification. /// Note that the returned header does not include any config OBUs which are /// required for some uses. See [the specification]. /// /// [the specification]: /// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-section /// /// # Panics /// /// Panics if the header cannot be written in memory. This is unrecoverable, /// and usually indicates the system is out of memory. #[inline] pub fn container_sequence_header(&self) -> Vec { fn sequence_header_inner(seq: &Sequence) -> io::Result> { let mut buf = Vec::new(); { let mut bw = BitWriter::endian(&mut buf, BigEndian); bw.write_bit(true)?; // marker bw.write(7, 1)?; // version bw.write(3, seq.profile)?; bw.write(5, 31)?; // level bw.write_bit(false)?; // tier bw.write_bit(seq.bit_depth > 8)?; // high_bitdepth bw.write_bit(seq.bit_depth == 12)?; // twelve_bit bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs400)?; // monochrome bw.write_bit(seq.chroma_sampling != ChromaSampling::Cs444)?; // chroma_subsampling_x bw.write_bit(seq.chroma_sampling == ChromaSampling::Cs420)?; // chroma_subsampling_y bw.write(2, 0)?; // chroma_sample_position bw.write(3, 0)?; // reserved bw.write_bit(false)?; // initial_presentation_delay_present bw.write(4, 0)?; // reserved } Ok(buf) } let seq = Sequence::new(&self.config); sequence_header_inner(&seq).unwrap() } } /// Rate Control Data pub enum RcData { /// A Rate Control Summary Packet /// /// It is emitted once, after the encoder is flushed. /// /// It contains a summary of the rate control information for the /// encoding process that just terminated. Summary(Box<[u8]>), /// A Rate Control Frame-specific Packet /// /// It is emitted every time a frame is processed. /// /// The information contained is required to encode its matching /// frame in a second pass encoding. Frame(Box<[u8]>), } impl Context { /// Return the Rate Control Summary Packet size /// /// It is useful mainly to preserve space when saving /// both Rate Control Summary and Frame Packets in a single file. pub fn rc_summary_size(&self) -> usize { crate::rate::TWOPASS_HEADER_SZ } /// Return the first pass data /// /// Call it after `receive_packet`, it returns a packet or the encoder /// lifecycle statuses [`EncoderStatus::Encoded`] and /// [`EncoderStatus::LimitReached`]. /// /// [`EncoderStatus::Encoded`]: enum.EncoderStatus.html#variant.Encoded /// [`EncoderStatus::LimitReached`]: /// enum.EncoderStatus.html#variant.LimitReached /// /// It will return a `RcData::Summary` once the encoder is flushed. 
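///
/// One possible shape of the first-pass drain loop (the packet handling
/// and the destination of the pass data are illustrative):
///
/// ```ignore
/// loop {
///   let done = match ctx.receive_packet() {
///     Ok(_packet) => { /* mux the packet */ false }
///     Err(EncoderStatus::Encoded) => false,
///     Err(EncoderStatus::LimitReached) => true,
///     Err(e) => return Err(e),
///   };
///   match ctx.rc_receive_pass_data() {
///     Some(RcData::Frame(_data)) => { /* store per-frame pass data */ }
///     Some(RcData::Summary(_data)) => { /* store the summary */ }
///     None => {}
///   }
///   if done { break; }
/// }
/// ```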
pub fn rc_receive_pass_data(&mut self) -> Option { if self.inner.done_processing() && self.inner.rc_state.pass1_data_retrieved { let data = self.inner.rc_state.emit_summary(); Some(RcData::Summary(data.to_vec().into_boxed_slice())) } else if self.inner.rc_state.pass1_data_retrieved { None } else if let Some(data) = self.inner.rc_state.emit_frame_data() { Some(RcData::Frame(data.to_vec().into_boxed_slice())) } else { unreachable!( "The encoder received more frames than its internal limit allows" ) } } /// Lower bound number of pass data packets required to progress the /// encoding process. /// /// It should be called iteratively until it returns 0. pub fn rc_second_pass_data_required(&self) -> usize { if self.inner.done_processing() { 0 } else { self.inner.rc_state.twopass_in_frames_needed() as usize } } /// Feed the first pass Rate Control data to the encoder, /// Frame-specific Packets only. /// /// Call it before `receive_packet()` /// /// # Errors /// /// Returns `EncoderStatus::Failure` if the data provided is incorrect pub fn rc_send_pass_data( &mut self, data: &[u8], ) -> Result<(), EncoderStatus> { self .inner .rc_state .parse_frame_data_packet(data) .map_err(|_| EncoderStatus::Failure) } } impl fmt::Debug for Context { fn fmt( &self, f: &mut fmt::Formatter<'_>, ) -> std::result::Result<(), fmt::Error> { write!( f, "{{ \ config: {:?}, \ is_flushing: {}, \ }}", self.config, self.is_flushing, ) } } rav1e-0.7.1/src/api/internal.rs000064400000000000000000001614401046102023000143770ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![deny(missing_docs)] use crate::activity::ActivityMask; use crate::api::lookahead::*; use crate::api::{ EncoderConfig, EncoderStatus, FrameType, Opaque, Packet, T35, }; use crate::color::ChromaSampling::Cs400; use crate::cpu_features::CpuFeatureLevel; use crate::dist::get_satd; use crate::encoder::*; use crate::frame::*; use crate::partition::*; use crate::rate::{ RCState, FRAME_NSUBTYPES, FRAME_SUBTYPE_I, FRAME_SUBTYPE_P, FRAME_SUBTYPE_SEF, }; use crate::scenechange::SceneChangeDetector; use crate::stats::EncoderStats; use crate::tiling::Area; use crate::util::Pixel; use arrayvec::ArrayVec; use std::cmp; use std::collections::{BTreeMap, BTreeSet}; use std::env; use std::fs; use std::path::PathBuf; use std::sync::Arc; /// The set of options that controls frame re-ordering and reference picture /// selection. /// The options stored here are invariant over the whole encode. #[derive(Debug, Clone, Copy)] pub struct InterConfig { /// Whether frame re-ordering is enabled. reorder: bool, /// Whether P-frames can use multiple references. pub(crate) multiref: bool, /// The depth of the re-ordering pyramid. /// The current code cannot support values larger than 2. pub(crate) pyramid_depth: u64, /// Number of input frames in group. pub(crate) group_input_len: u64, /// Number of output frames in group. /// This includes both hidden frames and "show existing frame" frames. group_output_len: u64, /// Interval between consecutive S-frames. /// Keyframes reset this interval. 
/// This MUST be a multiple of group_input_len. pub(crate) switch_frame_interval: u64, } impl InterConfig { pub(crate) fn new(enc_config: &EncoderConfig) -> InterConfig { let reorder = !enc_config.low_latency; // A group always starts with (group_output_len - group_input_len) hidden // frames, followed by group_input_len shown frames. // The shown frames iterate over the input frames in order, with frames // already encoded as hidden frames now displayed with Show Existing // Frame. // For example, for a pyramid depth of 2, the group is as follows: // |TU |TU |TU |TU // idx_in_group_output: 0 1 2 3 4 5 // input_frameno: 4 2 1 SEF 3 SEF // output_frameno: 1 2 3 4 5 6 // level: 0 1 2 1 2 0 // ^^^^^ ^^^^^^^^^^^^^ // hidden shown // TODO: This only works for pyramid_depth <= 2 --- after that we need // more hidden frames in the middle of the group. let pyramid_depth = if reorder { 2 } else { 0 }; let group_input_len = 1 << pyramid_depth; let group_output_len = group_input_len + pyramid_depth; let switch_frame_interval = enc_config.switch_frame_interval; assert!(switch_frame_interval % group_input_len == 0); InterConfig { reorder, multiref: reorder || enc_config.speed_settings.multiref, pyramid_depth, group_input_len, group_output_len, switch_frame_interval, } } /// Get the index of an output frame in its re-ordering group given the output /// frame number of the frame in the current keyframe gop. /// When re-ordering is disabled, this always returns 0. pub(crate) fn get_idx_in_group_output( &self, output_frameno_in_gop: u64, ) -> u64 { // The first frame in the GOP should be a keyframe and is not re-ordered, // so we should not be calling this function on it. debug_assert!(output_frameno_in_gop > 0); (output_frameno_in_gop - 1) % self.group_output_len } /// Get the order-hint of an output frame given the output frame number of the /// frame in the current keyframe gop and the index of that output frame /// in its re-ordering gorup. pub(crate) fn get_order_hint( &self, output_frameno_in_gop: u64, idx_in_group_output: u64, ) -> u32 { // The first frame in the GOP should be a keyframe, but currently this // function only handles inter frames. // We could return 0 for keyframes if keyframe support is needed. debug_assert!(output_frameno_in_gop > 0); // Which P-frame group in the current gop is this output frame in? // Subtract 1 because the first frame in the gop is always a keyframe. let group_idx = (output_frameno_in_gop - 1) / self.group_output_len; // Get the offset to the corresponding input frame. // TODO: This only works with pyramid_depth <= 2. let offset = if idx_in_group_output < self.pyramid_depth { self.group_input_len >> idx_in_group_output } else { idx_in_group_output - self.pyramid_depth + 1 }; // Construct the final order hint relative to the start of the group. (self.group_input_len * group_idx + offset) as u32 } /// Get the level of the current frame in the pyramid. pub(crate) const fn get_level(&self, idx_in_group_output: u64) -> u64 { if !self.reorder { 0 } else if idx_in_group_output < self.pyramid_depth { // Hidden frames are output first (to be shown in the future). idx_in_group_output } else { // Shown frames // TODO: This only works with pyramid_depth <= 2. 
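      // For the depth-2 pyramid diagrammed in `new()` above, this branch
      // maps the shown frames as:
      //   idx_in_group_output: 2 3 4 5
      //   level:               2 1 2 0
      // (the hidden frames at idx 0 and 1 are handled by the branch above,
      // where level == idx_in_group_output).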
pos_to_lvl( idx_in_group_output - self.pyramid_depth + 1, self.pyramid_depth, ) } } pub(crate) const fn get_slot_idx(&self, level: u64, order_hint: u32) -> u32 { // Frames with level == 0 are stored in slots 0..4, and frames with higher // values of level in slots 4..8 if level == 0 { (order_hint >> self.pyramid_depth) & 3 } else { // This only works with pyramid_depth <= 4. 3 + level as u32 } } pub(crate) const fn get_show_frame(&self, idx_in_group_output: u64) -> bool { idx_in_group_output >= self.pyramid_depth } pub(crate) const fn get_show_existing_frame( &self, idx_in_group_output: u64, ) -> bool { // The self.reorder test here is redundant, but short-circuits the rest, // avoiding a bunch of work when it's false. self.reorder && self.get_show_frame(idx_in_group_output) && (idx_in_group_output - self.pyramid_depth + 1).count_ones() == 1 && idx_in_group_output != self.pyramid_depth } pub(crate) fn get_input_frameno( &self, output_frameno_in_gop: u64, gop_input_frameno_start: u64, ) -> u64 { if output_frameno_in_gop == 0 { gop_input_frameno_start } else { let idx_in_group_output = self.get_idx_in_group_output(output_frameno_in_gop); let order_hint = self.get_order_hint(output_frameno_in_gop, idx_in_group_output); gop_input_frameno_start + order_hint as u64 } } const fn max_reordering_latency(&self) -> u64 { self.group_input_len } pub(crate) const fn keyframe_lookahead_distance(&self) -> u64 { self.max_reordering_latency() + 1 } pub(crate) const fn allowed_ref_frames(&self) -> &[RefType] { use crate::partition::RefType::*; if self.reorder { &ALL_INTER_REFS } else if self.multiref { &[LAST_FRAME, LAST2_FRAME, LAST3_FRAME, GOLDEN_FRAME] } else { &[LAST_FRAME] } } } // Thin wrapper for frame-related data // that gets cached and reused throughout the life of a frame. #[derive(Clone)] pub(crate) struct FrameData { pub(crate) fi: FrameInvariants, pub(crate) fs: FrameState, } impl FrameData { pub(crate) fn new(fi: FrameInvariants, frame: Arc>) -> Self { let fs = FrameState::new_with_frame(&fi, frame); FrameData { fi, fs } } } type FrameQueue = BTreeMap>>>; type FrameDataQueue = BTreeMap>>; // the fields pub(super) are accessed only by the tests pub(crate) struct ContextInner { pub(crate) frame_count: u64, pub(crate) limit: Option, pub(crate) output_frameno: u64, pub(super) inter_cfg: InterConfig, pub(super) frames_processed: u64, /// Maps *input_frameno* to frames pub(super) frame_q: FrameQueue, /// Maps *output_frameno* to frame data pub(super) frame_data: FrameDataQueue, /// A list of the input_frameno for keyframes in this encode. /// Needed so that we don't need to keep all of the frame_invariants in /// memory for the whole life of the encode. // TODO: Is this needed at all? keyframes: BTreeSet, // TODO: Is this needed at all? keyframes_forced: BTreeSet, /// A storage space for reordered frames. packet_data: Vec, /// Maps `output_frameno` to `gop_output_frameno_start`. gop_output_frameno_start: BTreeMap, /// Maps `output_frameno` to `gop_input_frameno_start`. pub(crate) gop_input_frameno_start: BTreeMap, keyframe_detector: SceneChangeDetector, pub(crate) config: Arc, seq: Arc, pub(crate) rc_state: RCState, maybe_prev_log_base_q: Option, /// The next `input_frameno` to be processed by lookahead. next_lookahead_frame: u64, /// The next `output_frameno` to be computed by lookahead. 
next_lookahead_output_frameno: u64, /// Optional opaque to be sent back to the user opaque_q: BTreeMap, /// Optional T35 metadata per frame t35_q: BTreeMap>, } impl ContextInner { pub fn new(enc: &EncoderConfig) -> Self { // initialize with temporal delimiter let packet_data = TEMPORAL_DELIMITER.to_vec(); let mut keyframes = BTreeSet::new(); keyframes.insert(0); let maybe_ac_qi_max = if enc.quantizer < 255 { Some(enc.quantizer as u8) } else { None }; let seq = Arc::new(Sequence::new(enc)); let inter_cfg = InterConfig::new(enc); let lookahead_distance = inter_cfg.keyframe_lookahead_distance() as usize; ContextInner { frame_count: 0, limit: None, inter_cfg, output_frameno: 0, frames_processed: 0, frame_q: BTreeMap::new(), frame_data: BTreeMap::new(), keyframes, keyframes_forced: BTreeSet::new(), packet_data, gop_output_frameno_start: BTreeMap::new(), gop_input_frameno_start: BTreeMap::new(), keyframe_detector: SceneChangeDetector::new( enc.clone(), CpuFeatureLevel::default(), lookahead_distance, seq.clone(), ), config: Arc::new(enc.clone()), seq, rc_state: RCState::new( enc.width as i32, enc.height as i32, enc.time_base.den as i64, enc.time_base.num as i64, enc.bitrate, maybe_ac_qi_max, enc.min_quantizer, enc.max_key_frame_interval as i32, enc.reservoir_frame_delay, ), maybe_prev_log_base_q: None, next_lookahead_frame: 1, next_lookahead_output_frameno: 0, opaque_q: BTreeMap::new(), t35_q: BTreeMap::new(), } } #[profiling::function] pub fn send_frame( &mut self, mut frame: Option>>, params: Option, ) -> Result<(), EncoderStatus> { if let Some(ref mut frame) = frame { use crate::api::color::ChromaSampling; let EncoderConfig { width, height, chroma_sampling, .. } = *self.config; let planes = if chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; // Try to add padding if let Some(ref mut frame) = Arc::get_mut(frame) { for plane in frame.planes[..planes].iter_mut() { plane.pad(width, height); } } // Enforce that padding is added for (p, plane) in frame.planes[..planes].iter().enumerate() { assert!( plane.probe_padding(width, height), "Plane {p} was not padded before passing Frame to send_frame()." ); } } let input_frameno = self.frame_count; let is_flushing = frame.is_none(); if !is_flushing { self.frame_count += 1; } self.frame_q.insert(input_frameno, frame); if let Some(params) = params { if params.frame_type_override == FrameTypeOverride::Key { self.keyframes_forced.insert(input_frameno); } if let Some(op) = params.opaque { self.opaque_q.insert(input_frameno, op); } self.t35_q.insert(input_frameno, params.t35_metadata); } if !self.needs_more_frame_q_lookahead(self.next_lookahead_frame) { let lookahead_frames = self .frame_q .range(self.next_lookahead_frame - 1..) .filter_map(|(&_input_frameno, frame)| frame.as_ref()) .collect::>>>(); if is_flushing { // This is the last time send_frame is called, process all the // remaining frames. 
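        // `successors` yields ever-shorter tails of the remaining frames:
        // for a window [a, b, c] it visits [a, b, c], [b, c], then [c], so
        // `compute_keyframe_placement` runs once for each remaining frame
        // with whatever lookahead is still available, until fewer than two
        // frames remain.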
for cur_lookahead_frames in std::iter::successors(Some(&lookahead_frames[..]), |s| s.get(1..)) { if cur_lookahead_frames.len() < 2 { // All frames have been processed break; } Self::compute_keyframe_placement( cur_lookahead_frames, &self.keyframes_forced, &mut self.keyframe_detector, &mut self.next_lookahead_frame, &mut self.keyframes, ); } } else { Self::compute_keyframe_placement( &lookahead_frames, &self.keyframes_forced, &mut self.keyframe_detector, &mut self.next_lookahead_frame, &mut self.keyframes, ); } } self.compute_frame_invariants(); Ok(()) } /// Indicates whether more frames need to be read into the frame queue /// in order for frame queue lookahead to be full. fn needs_more_frame_q_lookahead(&self, input_frameno: u64) -> bool { let lookahead_end = self.frame_q.keys().last().cloned().unwrap_or(0); let frames_needed = input_frameno + self.inter_cfg.keyframe_lookahead_distance() + 1; lookahead_end < frames_needed && self.needs_more_frames(lookahead_end) } /// Indicates whether more frames need to be processed into `FrameInvariants` /// in order for FI lookahead to be full. pub fn needs_more_fi_lookahead(&self) -> bool { let ready_frames = self.get_rdo_lookahead_frames().count(); ready_frames < self.config.speed_settings.rdo_lookahead_frames + 1 && self.needs_more_frames(self.next_lookahead_frame) } pub fn needs_more_frames(&self, frame_count: u64) -> bool { self.limit.map(|limit| frame_count < limit).unwrap_or(true) } fn get_rdo_lookahead_frames( &self, ) -> impl Iterator)> { self .frame_data .iter() .skip_while(move |(&output_frameno, _)| { output_frameno < self.output_frameno }) .filter_map(|(fno, data)| data.as_ref().map(|data| (fno, data))) .filter(|(_, data)| !data.fi.is_show_existing_frame()) .take(self.config.speed_settings.rdo_lookahead_frames + 1) } fn next_keyframe_input_frameno( &self, gop_input_frameno_start: u64, ignore_limit: bool, ) -> u64 { let next_detected = self .keyframes .iter() .find(|&&input_frameno| input_frameno > gop_input_frameno_start) .cloned(); let mut next_limit = gop_input_frameno_start + self.config.max_key_frame_interval; if !ignore_limit && self.limit.is_some() { next_limit = next_limit.min(self.limit.unwrap()); } if next_detected.is_none() { return next_limit; } cmp::min(next_detected.unwrap(), next_limit) } fn set_frame_properties( &mut self, output_frameno: u64, ) -> Result<(), EncoderStatus> { let fi = self.build_frame_properties(output_frameno)?; self.frame_data.insert( output_frameno, fi.map(|fi| { let frame = self .frame_q .get(&fi.input_frameno) .as_ref() .unwrap() .as_ref() .unwrap(); FrameData::new(fi, frame.clone()) }), ); Ok(()) } #[allow(unused)] pub fn build_dump_properties() -> PathBuf { let mut data_location = PathBuf::new(); if env::var_os("RAV1E_DATA_PATH").is_some() { data_location.push(&env::var_os("RAV1E_DATA_PATH").unwrap()); } else { data_location.push(&env::current_dir().unwrap()); data_location.push(".lookahead_data"); } fs::create_dir_all(&data_location).unwrap(); data_location } fn build_frame_properties( &mut self, output_frameno: u64, ) -> Result>, EncoderStatus> { let (prev_gop_output_frameno_start, prev_gop_input_frameno_start) = if output_frameno == 0 { (0, 0) } else { ( self.gop_output_frameno_start[&(output_frameno - 1)], self.gop_input_frameno_start[&(output_frameno - 1)], ) }; self .gop_output_frameno_start .insert(output_frameno, prev_gop_output_frameno_start); self .gop_input_frameno_start .insert(output_frameno, prev_gop_input_frameno_start); let output_frameno_in_gop = output_frameno - 
self.gop_output_frameno_start[&output_frameno]; let mut input_frameno = self.inter_cfg.get_input_frameno( output_frameno_in_gop, self.gop_input_frameno_start[&output_frameno], ); if self.needs_more_frame_q_lookahead(input_frameno) { return Err(EncoderStatus::NeedMoreData); } let t35_metadata = if let Some(t35) = self.t35_q.remove(&input_frameno) { t35 } else { Box::new([]) }; if output_frameno_in_gop > 0 { let next_keyframe_input_frameno = self.next_keyframe_input_frameno( self.gop_input_frameno_start[&output_frameno], false, ); let prev_input_frameno = self.get_previous_fi(output_frameno).input_frameno; if input_frameno >= next_keyframe_input_frameno { if !self.inter_cfg.reorder || ((output_frameno_in_gop - 1) % self.inter_cfg.group_output_len == 0 && prev_input_frameno == (next_keyframe_input_frameno - 1)) { input_frameno = next_keyframe_input_frameno; // If we'll return early, do it before modifying the state. match self.frame_q.get(&input_frameno) { Some(Some(_)) => {} _ => { return Err(EncoderStatus::NeedMoreData); } } *self.gop_output_frameno_start.get_mut(&output_frameno).unwrap() = output_frameno; *self.gop_input_frameno_start.get_mut(&output_frameno).unwrap() = next_keyframe_input_frameno; } else { let fi = FrameInvariants::new_inter_frame( self.get_previous_coded_fi(output_frameno), &self.inter_cfg, self.gop_input_frameno_start[&output_frameno], output_frameno_in_gop, next_keyframe_input_frameno, self.config.error_resilient, t35_metadata, ); assert!(fi.is_none()); return Ok(fi); } } } match self.frame_q.get(&input_frameno) { Some(Some(_)) => {} _ => { return Err(EncoderStatus::NeedMoreData); } } // Now that we know the input_frameno, look up the correct frame type let frame_type = if self.keyframes.contains(&input_frameno) { FrameType::KEY } else { FrameType::INTER }; if frame_type == FrameType::KEY { *self.gop_output_frameno_start.get_mut(&output_frameno).unwrap() = output_frameno; *self.gop_input_frameno_start.get_mut(&output_frameno).unwrap() = input_frameno; } let output_frameno_in_gop = output_frameno - self.gop_output_frameno_start[&output_frameno]; if output_frameno_in_gop == 0 { let fi = FrameInvariants::new_key_frame( self.config.clone(), self.seq.clone(), self.gop_input_frameno_start[&output_frameno], t35_metadata, ); Ok(Some(fi)) } else { let next_keyframe_input_frameno = self.next_keyframe_input_frameno( self.gop_input_frameno_start[&output_frameno], false, ); let fi = FrameInvariants::new_inter_frame( self.get_previous_coded_fi(output_frameno), &self.inter_cfg, self.gop_input_frameno_start[&output_frameno], output_frameno_in_gop, next_keyframe_input_frameno, self.config.error_resilient, t35_metadata, ); assert!(fi.is_some()); Ok(fi) } } fn get_previous_fi(&self, output_frameno: u64) -> &FrameInvariants { let res = self .frame_data .iter() .filter(|(fno, _)| **fno < output_frameno) .rfind(|(_, fd)| fd.is_some()) .unwrap(); &res.1.as_ref().unwrap().fi } fn get_previous_coded_fi(&self, output_frameno: u64) -> &FrameInvariants { let res = self .frame_data .iter() .filter(|(fno, _)| **fno < output_frameno) .rfind(|(_, fd)| { fd.as_ref().map(|fd| !fd.fi.is_show_existing_frame()).unwrap_or(false) }) .unwrap(); &res.1.as_ref().unwrap().fi } pub(crate) fn done_processing(&self) -> bool { self.limit.map(|limit| self.frames_processed == limit).unwrap_or(false) } /// Computes lookahead motion vectors and fills in `lookahead_mvs`, /// `rec_buffer` and `lookahead_rec_buffer` on the `FrameInvariants`. 
This /// function must be called after every new `FrameInvariants` is initially /// computed. #[profiling::function] fn compute_lookahead_motion_vectors(&mut self, output_frameno: u64) { let frame_data = self.frame_data.get(&output_frameno).unwrap(); // We're only interested in valid frames which are not show-existing-frame. // Those two don't modify the rec_buffer so there's no need to do anything // special about it either, it'll propagate on its own. if frame_data .as_ref() .map(|fd| fd.fi.is_show_existing_frame()) .unwrap_or(true) { return; } let qps = { let fti = frame_data.as_ref().unwrap().fi.get_frame_subtype(); self.rc_state.select_qi( self, output_frameno, fti, self.maybe_prev_log_base_q, 0, ) }; let frame_data = self.frame_data.get_mut(&output_frameno).unwrap().as_mut().unwrap(); let fs = &mut frame_data.fs; let fi = &mut frame_data.fi; let coded_data = fi.coded_frame_data.as_mut().unwrap(); #[cfg(feature = "dump_lookahead_data")] { let data_location = Self::build_dump_properties(); let plane = &fs.input_qres; let mut file_name = format!("{:010}-qres", fi.input_frameno); let buf: Vec<_> = plane.iter().map(|p| p.as_()).collect(); image::GrayImage::from_vec( plane.cfg.width as u32, plane.cfg.height as u32, buf, ) .unwrap() .save(data_location.join(file_name).with_extension("png")) .unwrap(); let plane = &fs.input_hres; file_name = format!("{:010}-hres", fi.input_frameno); let buf: Vec<_> = plane.iter().map(|p| p.as_()).collect(); image::GrayImage::from_vec( plane.cfg.width as u32, plane.cfg.height as u32, buf, ) .unwrap() .save(data_location.join(file_name).with_extension("png")) .unwrap(); } // Do not modify the next output frame's FrameInvariants. if self.output_frameno == output_frameno { // We do want to propagate the lookahead_rec_buffer though. let rfs = Arc::new(ReferenceFrame { order_hint: fi.order_hint, width: fi.width as u32, height: fi.height as u32, render_width: fi.render_width, render_height: fi.render_height, // Use the original frame contents. frame: fs.input.clone(), input_hres: fs.input_hres.clone(), input_qres: fs.input_qres.clone(), cdfs: fs.cdfs, frame_me_stats: fs.frame_me_stats.clone(), output_frameno, segmentation: fs.segmentation, }); for i in 0..REF_FRAMES { if (fi.refresh_frame_flags & (1 << i)) != 0 { coded_data.lookahead_rec_buffer.frames[i] = Some(Arc::clone(&rfs)); coded_data.lookahead_rec_buffer.deblock[i] = fs.deblock; } } return; } // Our lookahead_rec_buffer should be filled with correct original frame // data from the previous frames. Copy it into rec_buffer because that's // what the MV search uses. During the actual encoding rec_buffer is // overwritten with its correct values anyway. fi.rec_buffer = coded_data.lookahead_rec_buffer.clone(); // Estimate lambda with rate-control dry-run fi.set_quantizers(&qps); // TODO: as in the encoding code, key frames will have no references. // However, for block importance purposes we want key frames to act as // P-frames in this instance. // // Compute the motion vectors. 
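    // At this point `fi.rec_buffer` holds copies of the *original* frames
    // (taken from `lookahead_rec_buffer`) and the quantizers come from the
    // rate-control dry run above, so the search below yields approximate
    // lookahead MVs rather than the final encode-time ones.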
compute_motion_vectors(fi, fs, &self.inter_cfg); let coded_data = fi.coded_frame_data.as_mut().unwrap(); #[cfg(feature = "dump_lookahead_data")] { use crate::partition::RefType::*; let data_location = Self::build_dump_properties(); let file_name = format!("{:010}-mvs", fi.input_frameno); let second_ref_frame = if !self.inter_cfg.multiref { LAST_FRAME // make second_ref_frame match first } else if fi.idx_in_group_output == 0 { LAST2_FRAME } else { ALTREF_FRAME }; // Use the default index, it corresponds to the last P-frame or to the // backwards lower reference (so the closest previous frame). let index = if second_ref_frame.to_index() != 0 { 0 } else { 1 }; let me_stats = &fs.frame_me_stats.read().expect("poisoned lock")[index]; use byteorder::{NativeEndian, WriteBytesExt}; // dynamic allocation: debugging only let mut buf = vec![]; buf.write_u64::(me_stats.rows as u64).unwrap(); buf.write_u64::(me_stats.cols as u64).unwrap(); for y in 0..me_stats.rows { for x in 0..me_stats.cols { let mv = me_stats[y][x].mv; buf.write_i16::(mv.row).unwrap(); buf.write_i16::(mv.col).unwrap(); } } ::std::fs::write( data_location.join(file_name).with_extension("bin"), buf, ) .unwrap(); } // Set lookahead_rec_buffer on this FrameInvariants for future // FrameInvariants to pick it up. let rfs = Arc::new(ReferenceFrame { order_hint: fi.order_hint, width: fi.width as u32, height: fi.height as u32, render_width: fi.render_width, render_height: fi.render_height, // Use the original frame contents. frame: fs.input.clone(), input_hres: fs.input_hres.clone(), input_qres: fs.input_qres.clone(), cdfs: fs.cdfs, frame_me_stats: fs.frame_me_stats.clone(), output_frameno, segmentation: fs.segmentation, }); for i in 0..REF_FRAMES { if (fi.refresh_frame_flags & (1 << i)) != 0 { coded_data.lookahead_rec_buffer.frames[i] = Some(Arc::clone(&rfs)); coded_data.lookahead_rec_buffer.deblock[i] = fs.deblock; } } } /// Computes lookahead intra cost approximations and fills in /// `lookahead_intra_costs` on the `FrameInvariants`. fn compute_lookahead_intra_costs(&mut self, output_frameno: u64) { let frame_data = self.frame_data.get(&output_frameno).unwrap(); let fd = &frame_data.as_ref(); // We're only interested in valid frames which are not show-existing-frame. if fd.map(|fd| fd.fi.is_show_existing_frame()).unwrap_or(true) { return; } let fi = &fd.unwrap().fi; self .frame_data .get_mut(&output_frameno) .unwrap() .as_mut() .unwrap() .fi .coded_frame_data .as_mut() .unwrap() .lookahead_intra_costs = self .keyframe_detector .intra_costs .remove(&fi.input_frameno) .unwrap_or_else(|| { let frame = self.frame_q[&fi.input_frameno].as_ref().unwrap(); let temp_plane = self .keyframe_detector .temp_plane .get_or_insert_with(|| frame.planes[0].clone()); // We use the cached values from scenechange if available, // otherwise we need to calculate them here. 
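          // The fallback below computes the same per-importance-block
          // DC_PRED SATD costs that the scene change detector would have
          // cached (see `estimate_intra_costs` in lookahead.rs).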
estimate_intra_costs( temp_plane, &**frame, fi.sequence.bit_depth, fi.cpu_feature_level, ) }); } #[profiling::function] pub fn compute_keyframe_placement( lookahead_frames: &[&Arc>], keyframes_forced: &BTreeSet, keyframe_detector: &mut SceneChangeDetector, next_lookahead_frame: &mut u64, keyframes: &mut BTreeSet, ) { if keyframes_forced.contains(next_lookahead_frame) || keyframe_detector.analyze_next_frame( lookahead_frames, *next_lookahead_frame, *keyframes.iter().last().unwrap(), ) { keyframes.insert(*next_lookahead_frame); } *next_lookahead_frame += 1; } #[profiling::function] pub fn compute_frame_invariants(&mut self) { while self.set_frame_properties(self.next_lookahead_output_frameno).is_ok() { self .compute_lookahead_motion_vectors(self.next_lookahead_output_frameno); if self.config.temporal_rdo() { self.compute_lookahead_intra_costs(self.next_lookahead_output_frameno); } self.next_lookahead_output_frameno += 1; } } #[profiling::function] fn update_block_importances( fi: &FrameInvariants, me_stats: &crate::me::FrameMEStats, frame: &Frame, reference_frame: &Frame, bit_depth: usize, bsize: BlockSize, len: usize, reference_frame_block_importances: &mut [f32], ) { let coded_data = fi.coded_frame_data.as_ref().unwrap(); let plane_org = &frame.planes[0]; let plane_ref = &reference_frame.planes[0]; let lookahead_intra_costs_lines = coded_data.lookahead_intra_costs.chunks_exact(coded_data.w_in_imp_b); let block_importances_lines = coded_data.block_importances.chunks_exact(coded_data.w_in_imp_b); lookahead_intra_costs_lines .zip(block_importances_lines) .zip(me_stats.rows_iter().step_by(2)) .enumerate() .flat_map( |(y, ((lookahead_intra_costs, block_importances), me_stats_line))| { lookahead_intra_costs .iter() .zip(block_importances.iter()) .zip(me_stats_line.iter().step_by(2)) .enumerate() .map(move |(x, ((&intra_cost, &future_importance), &me_stat))| { let mv = me_stat.mv; // Coordinates of the top-left corner of the reference block, in MV // units. let reference_x = x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; let reference_y = y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; let region_org = plane_org.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let region_ref = plane_ref.region(Area::Rect { x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let inter_cost = get_satd( ®ion_org, ®ion_ref, bsize.width(), bsize.height(), bit_depth, fi.cpu_feature_level, ) as f32; let intra_cost = intra_cost as f32; // let intra_cost = lookahead_intra_costs[x] as f32; // let future_importance = block_importances[x]; let propagate_fraction = if intra_cost <= inter_cost { 0. } else { 1. - inter_cost / intra_cost }; let propagate_amount = (intra_cost + future_importance) * propagate_fraction / len as f32; (propagate_amount, reference_x, reference_y) }) }, ) .for_each(|(propagate_amount, reference_x, reference_y)| { let mut propagate = |block_x_in_mv_units, block_y_in_mv_units, fraction| { let x = block_x_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; let y = block_y_in_mv_units / IMP_BLOCK_SIZE_IN_MV_UNITS; // TODO: propagate partially if the block is partially off-frame // (possible on right and bottom edges)? 
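          // Each call to this closure below adds one bilinear fraction of
          // `propagate_amount` to the importance block at the given MV-unit
          // coordinates; the reference block overlaps at most four such
          // blocks. E.g. if its top-left corner lands 2.5 px right and
          // 1.5 px down of a block boundary (20 and 12 in 1/8-pel units),
          // the four fractions are 44*52/4096, 20*52/4096, 44*12/4096 and
          // 20*12/4096, which sum to exactly 1.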
if x >= 0 && y >= 0 && (x as usize) < coded_data.w_in_imp_b && (y as usize) < coded_data.h_in_imp_b { reference_frame_block_importances [y as usize * coded_data.w_in_imp_b + x as usize] += propagate_amount * fraction; } }; // Coordinates of the top-left corner of the block intersecting the // reference block from the top-left. let top_left_block_x = (reference_x - if reference_x < 0 { IMP_BLOCK_SIZE_IN_MV_UNITS - 1 } else { 0 }) / IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS; let top_left_block_y = (reference_y - if reference_y < 0 { IMP_BLOCK_SIZE_IN_MV_UNITS - 1 } else { 0 }) / IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS; debug_assert!(reference_x >= top_left_block_x); debug_assert!(reference_y >= top_left_block_y); let top_right_block_x = top_left_block_x + IMP_BLOCK_SIZE_IN_MV_UNITS; let top_right_block_y = top_left_block_y; let bottom_left_block_x = top_left_block_x; let bottom_left_block_y = top_left_block_y + IMP_BLOCK_SIZE_IN_MV_UNITS; let bottom_right_block_x = top_right_block_x; let bottom_right_block_y = bottom_left_block_y; let top_left_block_fraction = ((top_right_block_x - reference_x) * (bottom_left_block_y - reference_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; propagate(top_left_block_x, top_left_block_y, top_left_block_fraction); let top_right_block_fraction = ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) * (bottom_left_block_y - reference_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; propagate( top_right_block_x, top_right_block_y, top_right_block_fraction, ); let bottom_left_block_fraction = ((top_right_block_x - reference_x) * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - bottom_left_block_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; propagate( bottom_left_block_x, bottom_left_block_y, bottom_left_block_fraction, ); let bottom_right_block_fraction = ((reference_x + IMP_BLOCK_SIZE_IN_MV_UNITS - top_right_block_x) * (reference_y + IMP_BLOCK_SIZE_IN_MV_UNITS - bottom_left_block_y)) as f32 / IMP_BLOCK_AREA_IN_MV_UNITS as f32; propagate( bottom_right_block_x, bottom_right_block_y, bottom_right_block_fraction, ); }); } /// Computes the block importances for the current output frame. #[profiling::function] fn compute_block_importances(&mut self) { // SEF don't need block importances. if self.frame_data[&self.output_frameno] .as_ref() .unwrap() .fi .is_show_existing_frame() { return; } // Get a list of output_framenos that we want to propagate through. let output_framenos = self .get_rdo_lookahead_frames() .map(|(&output_frameno, _)| output_frameno) .collect::>(); // The first one should be the current output frame. assert_eq!(output_framenos[0], self.output_frameno); // First, initialize them all with zeros. for output_frameno in output_framenos.iter() { let fi = &mut self .frame_data .get_mut(output_frameno) .unwrap() .as_mut() .unwrap() .fi; for x in fi.coded_frame_data.as_mut().unwrap().block_importances.iter_mut() { *x = 0.; } } // Now compute and propagate the block importances from the end. The // current output frame will get its block importances from the future // frames. let bsize = BlockSize::from_width_and_height( IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE, ); for &output_frameno in output_framenos.iter().skip(1).rev() { // TODO: see comment above about key frames not having references. if self .frame_data .get(&output_frameno) .unwrap() .as_ref() .unwrap() .fi .frame_type == FrameType::KEY { continue; } // Remove fi from the map temporarily and put it back in in the end of // the iteration. 
This is required because we need to mutably borrow // referenced fis from the map, and that wouldn't be possible if this was // an active borrow. // // Performance note: Contrary to intuition, // removing the data and re-inserting it at the end // is more performant because it avoids a very expensive clone. let output_frame_data = self.frame_data.remove(&output_frameno).unwrap().unwrap(); { let fi = &output_frame_data.fi; let fs = &output_frame_data.fs; let frame = self.frame_q[&fi.input_frameno].as_ref().unwrap(); // There can be at most 3 of these. let mut unique_indices = ArrayVec::<_, 3>::new(); for (mv_index, &rec_index) in fi.ref_frames.iter().enumerate() { if !unique_indices.iter().any(|&(_, r)| r == rec_index) { unique_indices.push((mv_index, rec_index)); } } let bit_depth = self.config.bit_depth; let frame_data = &mut self.frame_data; let len = unique_indices.len(); let lookahead_me_stats = fs.frame_me_stats.read().expect("poisoned lock"); // Compute and propagate the importance, split evenly between the // referenced frames. unique_indices.iter().for_each(|&(mv_index, rec_index)| { // Use rec_buffer here rather than lookahead_rec_buffer because // rec_buffer still contains the reference frames for the current frame // (it's only overwritten when the frame is encoded), while // lookahead_rec_buffer already contains reference frames for the next // frame (for the reference propagation to work correctly). let reference = fi.rec_buffer.frames[rec_index as usize].as_ref().unwrap(); let reference_frame = &reference.frame; let reference_output_frameno = reference.output_frameno; let me_stats = &lookahead_me_stats[mv_index]; // We should never use frame as its own reference. assert_ne!(reference_output_frameno, output_frameno); if let Some(reference_frame_block_importances) = frame_data.get_mut(&reference_output_frameno).map(|data| { &mut data .as_mut() .unwrap() .fi .coded_frame_data .as_mut() .unwrap() .block_importances }) { Self::update_block_importances( fi, me_stats, frame, reference_frame, bit_depth, bsize, len, reference_frame_block_importances, ); } }); } self.frame_data.insert(output_frameno, Some(output_frame_data)); } if !output_framenos.is_empty() { let fi = &mut self .frame_data .get_mut(&output_framenos[0]) .unwrap() .as_mut() .unwrap() .fi; let coded_data = fi.coded_frame_data.as_mut().unwrap(); let block_importances = coded_data.block_importances.iter(); let lookahead_intra_costs = coded_data.lookahead_intra_costs.iter(); let distortion_scales = coded_data.distortion_scales.iter_mut(); for ((&propagate_cost, &intra_cost), distortion_scale) in block_importances.zip(lookahead_intra_costs).zip(distortion_scales) { *distortion_scale = crate::rdo::distortion_scale_for( propagate_cost as f64, intra_cost as f64, ); } #[cfg(feature = "dump_lookahead_data")] { use byteorder::{NativeEndian, WriteBytesExt}; let coded_data = fi.coded_frame_data.as_ref().unwrap(); let mut buf = vec![]; let data_location = Self::build_dump_properties(); let file_name = format!("{:010}-imps", fi.input_frameno); buf.write_u64::(coded_data.h_in_imp_b as u64).unwrap(); buf.write_u64::(coded_data.w_in_imp_b as u64).unwrap(); buf.write_u64::(fi.get_frame_subtype() as u64).unwrap(); for y in 0..coded_data.h_in_imp_b { for x in 0..coded_data.w_in_imp_b { buf .write_f32::(f64::from( coded_data.distortion_scales[y * coded_data.w_in_imp_b + x], ) as f32) .unwrap(); } } ::std::fs::write( data_location.join(file_name).with_extension("bin"), buf, ) .unwrap(); } } } pub(crate) fn encode_packet( &mut self, 
cur_output_frameno: u64, ) -> Result, EncoderStatus> { if self .frame_data .get(&cur_output_frameno) .unwrap() .as_ref() .unwrap() .fi .is_show_existing_frame() { if !self.rc_state.ready() { return Err(EncoderStatus::NotReady); } self.encode_show_existing_packet(cur_output_frameno) } else if let Some(Some(_)) = self.frame_q.get( &self .frame_data .get(&cur_output_frameno) .unwrap() .as_ref() .unwrap() .fi .input_frameno, ) { if !self.rc_state.ready() { return Err(EncoderStatus::NotReady); } self.encode_normal_packet(cur_output_frameno) } else { Err(EncoderStatus::NeedMoreData) } } #[profiling::function] pub fn encode_show_existing_packet( &mut self, cur_output_frameno: u64, ) -> Result, EncoderStatus> { let frame_data = self.frame_data.get_mut(&cur_output_frameno).unwrap().as_mut().unwrap(); let sef_data = encode_show_existing_frame( &frame_data.fi, &mut frame_data.fs, &self.inter_cfg, ); let bits = (sef_data.len() * 8) as i64; self.packet_data.extend(sef_data); self.rc_state.update_state( bits, FRAME_SUBTYPE_SEF, frame_data.fi.show_frame, 0, false, false, ); let (rec, source) = if frame_data.fi.show_frame { (Some(frame_data.fs.rec.clone()), Some(frame_data.fs.input.clone())) } else { (None, None) }; self.output_frameno += 1; let input_frameno = frame_data.fi.input_frameno; let frame_type = frame_data.fi.frame_type; let qp = frame_data.fi.base_q_idx; let enc_stats = frame_data.fs.enc_stats.clone(); self.finalize_packet(rec, source, input_frameno, frame_type, qp, enc_stats) } #[profiling::function] pub fn encode_normal_packet( &mut self, cur_output_frameno: u64, ) -> Result, EncoderStatus> { let mut frame_data = self.frame_data.remove(&cur_output_frameno).unwrap().unwrap(); let mut log_isqrt_mean_scale = 0i64; if let Some(coded_data) = frame_data.fi.coded_frame_data.as_mut() { if self.config.tune == Tune::Psychovisual { let frame = self.frame_q[&frame_data.fi.input_frameno].as_ref().unwrap(); coded_data.activity_mask = ActivityMask::from_plane(&frame.planes[0]); coded_data.activity_mask.fill_scales( frame_data.fi.sequence.bit_depth, &mut coded_data.activity_scales, ); log_isqrt_mean_scale = coded_data.compute_spatiotemporal_scores(); } else { coded_data.activity_mask = ActivityMask::default(); log_isqrt_mean_scale = coded_data.compute_temporal_scores(); } #[cfg(feature = "dump_lookahead_data")] { use crate::encoder::Scales::*; let input_frameno = frame_data.fi.input_frameno; if self.config.tune == Tune::Psychovisual { coded_data.dump_scales( Self::build_dump_properties(), ActivityScales, input_frameno, ); coded_data.dump_scales( Self::build_dump_properties(), SpatiotemporalScales, input_frameno, ); } coded_data.dump_scales( Self::build_dump_properties(), DistortionScales, input_frameno, ); } } let fti = frame_data.fi.get_frame_subtype(); let qps = self.rc_state.select_qi( self, cur_output_frameno, fti, self.maybe_prev_log_base_q, log_isqrt_mean_scale, ); frame_data.fi.set_quantizers(&qps); if self.rc_state.needs_trial_encode(fti) { let mut trial_fs = frame_data.fs.clone(); let data = encode_frame(&frame_data.fi, &mut trial_fs, &self.inter_cfg); self.rc_state.update_state( (data.len() * 8) as i64, fti, frame_data.fi.show_frame, qps.log_target_q, true, false, ); let qps = self.rc_state.select_qi( self, cur_output_frameno, fti, self.maybe_prev_log_base_q, log_isqrt_mean_scale, ); frame_data.fi.set_quantizers(&qps); } let data = encode_frame(&frame_data.fi, &mut frame_data.fs, &self.inter_cfg); #[cfg(feature = "dump_lookahead_data")] { let input_frameno = frame_data.fi.input_frameno; let 
data_location = Self::build_dump_properties(); frame_data.fs.segmentation.dump_threshold(data_location, input_frameno); } let enc_stats = frame_data.fs.enc_stats.clone(); self.maybe_prev_log_base_q = Some(qps.log_base_q); // TODO: Add support for dropping frames. self.rc_state.update_state( (data.len() * 8) as i64, fti, frame_data.fi.show_frame, qps.log_target_q, false, false, ); self.packet_data.extend(data); let planes = if frame_data.fi.sequence.chroma_sampling == Cs400 { 1 } else { 3 }; Arc::get_mut(&mut frame_data.fs.rec).unwrap().pad( frame_data.fi.width, frame_data.fi.height, planes, ); let (rec, source) = if frame_data.fi.show_frame { (Some(frame_data.fs.rec.clone()), Some(frame_data.fs.input.clone())) } else { (None, None) }; update_rec_buffer(cur_output_frameno, &mut frame_data.fi, &frame_data.fs); // Copy persistent fields into subsequent FrameInvariants. let rec_buffer = frame_data.fi.rec_buffer.clone(); for subsequent_fi in self .frame_data .iter_mut() .skip_while(|(&output_frameno, _)| output_frameno <= cur_output_frameno) // Here we want the next valid non-show-existing-frame inter frame. // // Copying to show-existing-frame frames isn't actually required // for correct encoding, but it's needed for the reconstruction to // work correctly. .filter_map(|(_, frame_data)| frame_data.as_mut().map(|fd| &mut fd.fi)) .take_while(|fi| fi.frame_type != FrameType::KEY) { subsequent_fi.rec_buffer = rec_buffer.clone(); subsequent_fi.set_ref_frame_sign_bias(); // Stop after the first non-show-existing-frame. if !subsequent_fi.is_show_existing_frame() { break; } } self.frame_data.insert(cur_output_frameno, Some(frame_data)); let frame_data = self.frame_data.get(&cur_output_frameno).unwrap().as_ref().unwrap(); let fi = &frame_data.fi; self.output_frameno += 1; if fi.show_frame { let input_frameno = fi.input_frameno; let frame_type = fi.frame_type; let qp = fi.base_q_idx; self.finalize_packet( rec, source, input_frameno, frame_type, qp, enc_stats, ) } else { Err(EncoderStatus::Encoded) } } #[profiling::function] pub fn receive_packet(&mut self) -> Result, EncoderStatus> { if self.done_processing() { return Err(EncoderStatus::LimitReached); } if self.needs_more_fi_lookahead() { return Err(EncoderStatus::NeedMoreData); } // Find the next output_frameno corresponding to a non-skipped frame. self.output_frameno = self .frame_data .iter() .skip_while(|(&output_frameno, _)| output_frameno < self.output_frameno) .find(|(_, data)| data.is_some()) .map(|(&output_frameno, _)| output_frameno) .ok_or(EncoderStatus::NeedMoreData)?; // TODO: doesn't play well with the below check? let input_frameno = self.frame_data[&self.output_frameno].as_ref().unwrap().fi.input_frameno; if !self.needs_more_frames(input_frameno) { return Err(EncoderStatus::LimitReached); } if self.config.temporal_rdo() { // Compute the block importances for the current output frame. 
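      // Importances are propagated backwards from the frames in the RDO
      // lookahead window and then turned into per-block distortion scales
      // (see `compute_block_importances` above).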
self.compute_block_importances(); } let cur_output_frameno = self.output_frameno; let mut ret = self.encode_packet(cur_output_frameno); if let Ok(ref mut pkt) = ret { self.garbage_collect(pkt.input_frameno); pkt.opaque = self.opaque_q.remove(&pkt.input_frameno); } ret } fn finalize_packet( &mut self, rec: Option>>, source: Option>>, input_frameno: u64, frame_type: FrameType, qp: u8, enc_stats: EncoderStats, ) -> Result, EncoderStatus> { let data = self.packet_data.clone(); self.packet_data.clear(); if write_temporal_delimiter(&mut self.packet_data).is_err() { return Err(EncoderStatus::Failure); } self.frames_processed += 1; Ok(Packet { data, rec, source, input_frameno, frame_type, qp, enc_stats, opaque: None, }) } #[profiling::function] fn garbage_collect(&mut self, cur_input_frameno: u64) { if cur_input_frameno == 0 { return; } let frame_q_start = self.frame_q.keys().next().cloned().unwrap_or(0); for i in frame_q_start..cur_input_frameno { self.frame_q.remove(&i); } if self.output_frameno < 2 { return; } let fi_start = self.frame_data.keys().next().cloned().unwrap_or(0); for i in fi_start..(self.output_frameno - 1) { self.frame_data.remove(&i); self.gop_output_frameno_start.remove(&i); self.gop_input_frameno_start.remove(&i); } } /// Counts the number of output frames of each subtype in the next /// `reservoir_frame_delay` temporal units (needed for rate control). /// Returns the number of output frames (excluding SEF frames) and output TUs /// until the last keyframe in the next `reservoir_frame_delay` temporal units, /// or the end of the interval, whichever comes first. /// The former is needed because it indicates the number of rate estimates we /// will make. /// The latter is needed because it indicates the number of times new bitrate /// is added to the buffer. pub(crate) fn guess_frame_subtypes( &self, nframes: &mut [i32; FRAME_NSUBTYPES + 1], reservoir_frame_delay: i32, ) -> (i32, i32) { for fti in 0..=FRAME_NSUBTYPES { nframes[fti] = 0; } // Two-pass calls this function before receive_packet(), and in particular // before the very first send_frame(), when the following maps are empty. // In this case, return 0 as the default value. let mut prev_keyframe_input_frameno = *self .gop_input_frameno_start .get(&self.output_frameno) .unwrap_or_else(|| { assert!(self.output_frameno == 0); &0 }); let mut prev_keyframe_output_frameno = *self .gop_output_frameno_start .get(&self.output_frameno) .unwrap_or_else(|| { assert!(self.output_frameno == 0); &0 }); let mut prev_keyframe_ntus = 0; // Does not include SEF frames. let mut prev_keyframe_nframes = 0; let mut acc: [i32; FRAME_NSUBTYPES + 1] = [0; FRAME_NSUBTYPES + 1]; // Updates the frame counts with the accumulated values when we hit a // keyframe. fn collect_counts( nframes: &mut [i32; FRAME_NSUBTYPES + 1], acc: &mut [i32; FRAME_NSUBTYPES + 1], ) { for fti in 0..=FRAME_NSUBTYPES { nframes[fti] += acc[fti]; acc[fti] = 0; } acc[FRAME_SUBTYPE_I] += 1; } let mut output_frameno = self.output_frameno; let mut ntus = 0; // Does not include SEF frames. let mut nframes_total = 0; while ntus < reservoir_frame_delay { let output_frameno_in_gop = output_frameno - prev_keyframe_output_frameno; let is_kf = if let Some(Some(frame_data)) = self.frame_data.get(&output_frameno) { if frame_data.fi.frame_type == FrameType::KEY { prev_keyframe_input_frameno = frame_data.fi.input_frameno; // We do not currently use forward keyframes, so they should always // end the current TU (thus we always increment ntus below). 
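          // (Here a TU, or temporal unit, is one shown frame together with
          // any hidden frames coded ahead of it, which is why `ntus` only
          // advances on shown frames.)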
debug_assert!(frame_data.fi.show_frame); true } else { false } } else { // It is possible to be invoked for the first time from twopass_out() // before receive_packet() is called, in which case frame_invariants // will not be populated. // Force the first frame in each GOP to be a keyframe in that case. output_frameno_in_gop == 0 }; if is_kf { collect_counts(nframes, &mut acc); prev_keyframe_output_frameno = output_frameno; prev_keyframe_ntus = ntus; prev_keyframe_nframes = nframes_total; output_frameno += 1; ntus += 1; nframes_total += 1; continue; } let idx_in_group_output = self.inter_cfg.get_idx_in_group_output(output_frameno_in_gop); let input_frameno = prev_keyframe_input_frameno + self .inter_cfg .get_order_hint(output_frameno_in_gop, idx_in_group_output) as u64; // For rate control purposes, ignore any limit on frame count that has // been set. // We pretend that we will keep encoding frames forever to prevent the // control loop from driving us into the rails as we come up against a // hard stop (with no more chance to correct outstanding errors). let next_keyframe_input_frameno = self.next_keyframe_input_frameno(prev_keyframe_input_frameno, true); // If we are re-ordering, we may skip some output frames in the final // re-order group of the GOP. if input_frameno >= next_keyframe_input_frameno { // If we have encoded enough whole groups to reach the next keyframe, // then start the next keyframe gop. if 1 + (output_frameno - prev_keyframe_output_frameno) / self.inter_cfg.group_output_len * self.inter_cfg.group_input_len >= next_keyframe_input_frameno - prev_keyframe_input_frameno { collect_counts(nframes, &mut acc); prev_keyframe_input_frameno = input_frameno; prev_keyframe_output_frameno = output_frameno; prev_keyframe_ntus = ntus; prev_keyframe_nframes = nframes_total; // We do not currently use forward keyframes, so they should always // end the current TU. output_frameno += 1; ntus += 1; } output_frameno += 1; continue; } if self.inter_cfg.get_show_existing_frame(idx_in_group_output) { acc[FRAME_SUBTYPE_SEF] += 1; } else { // TODO: Implement golden P-frames. let fti = FRAME_SUBTYPE_P + (self.inter_cfg.get_level(idx_in_group_output) as usize); acc[fti] += 1; nframes_total += 1; } if self.inter_cfg.get_show_frame(idx_in_group_output) { ntus += 1; } output_frameno += 1; } if prev_keyframe_output_frameno <= self.output_frameno { // If there were no keyframes at all, or only the first frame was a // keyframe, the accumulators never flushed and still contain counts for // the entire buffer. // In both cases, we return these counts. collect_counts(nframes, &mut acc); (nframes_total, ntus) } else { // Otherwise, we discard what remains in the accumulators as they contain // the counts from and past the last keyframe. 
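      // Return the frame and TU counts as they stood at the last keyframe,
      // per the "until the last keyframe in the interval" contract in the
      // doc comment above.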
(prev_keyframe_nframes, prev_keyframe_ntus) } } } rav1e-0.7.1/src/api/lookahead.rs000064400000000000000000000207731046102023000145150ustar 00000000000000use crate::api::internal::InterConfig; use crate::config::EncoderConfig; use crate::context::{BlockOffset, FrameBlocks, TileBlockOffset}; use crate::cpu_features::CpuFeatureLevel; use crate::dist::get_satd; use crate::encoder::{ FrameInvariants, FrameState, Sequence, IMPORTANCE_BLOCK_SIZE, }; use crate::frame::{AsRegion, PlaneOffset}; use crate::me::{estimate_tile_motion, RefMEStats}; use crate::partition::{get_intra_edges, BlockSize}; use crate::predict::{IntraParam, PredictionMode}; use crate::tiling::{Area, PlaneRegion, TileRect}; use crate::transform::TxSize; use crate::util::Aligned; use crate::Pixel; use rayon::iter::*; use std::sync::Arc; use v_frame::frame::Frame; use v_frame::pixel::CastFromPrimitive; use v_frame::plane::Plane; pub(crate) const IMP_BLOCK_MV_UNITS_PER_PIXEL: i64 = 8; pub(crate) const IMP_BLOCK_SIZE_IN_MV_UNITS: i64 = IMPORTANCE_BLOCK_SIZE as i64 * IMP_BLOCK_MV_UNITS_PER_PIXEL; pub(crate) const IMP_BLOCK_AREA_IN_MV_UNITS: i64 = IMP_BLOCK_SIZE_IN_MV_UNITS * IMP_BLOCK_SIZE_IN_MV_UNITS; #[profiling::function] pub(crate) fn estimate_intra_costs( temp_plane: &mut Plane, frame: &Frame, bit_depth: usize, cpu_feature_level: CpuFeatureLevel, ) -> Box<[u32]> { let plane = &frame.planes[0]; let plane_after_prediction = temp_plane; let bsize = BlockSize::from_width_and_height( IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE, ); let tx_size = bsize.tx_size(); let h_in_imp_b = plane.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane.cfg.width / IMPORTANCE_BLOCK_SIZE; let mut intra_costs = Vec::with_capacity(h_in_imp_b * w_in_imp_b); for y in 0..h_in_imp_b { for x in 0..w_in_imp_b { let plane_org = plane.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); // TODO: other intra prediction modes. 
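      // Each 8x8 importance block currently gets a single DC_PRED estimate:
      // gather its intra edge samples, predict into the scratch plane, and
      // take the SATD against the source block as its intra cost.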
let mut edge_buf = Aligned::uninit_array(); let edge_buf = get_intra_edges( &mut edge_buf, &plane.as_region(), TileBlockOffset(BlockOffset { x, y }), 0, 0, bsize, PlaneOffset { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, }, TxSize::TX_8X8, bit_depth, Some(PredictionMode::DC_PRED), false, IntraParam::None, ); let mut plane_after_prediction_region = plane_after_prediction .region_mut(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); PredictionMode::DC_PRED.predict_intra( TileRect { x: x * IMPORTANCE_BLOCK_SIZE, y: y * IMPORTANCE_BLOCK_SIZE, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }, &mut plane_after_prediction_region, tx_size, bit_depth, &[], // Not used by DC_PRED IntraParam::None, None, // Not used by DC_PRED &edge_buf, cpu_feature_level, ); let plane_after_prediction_region = plane_after_prediction.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let intra_cost = get_satd( &plane_org, &plane_after_prediction_region, bsize.width(), bsize.height(), bit_depth, cpu_feature_level, ); intra_costs.push(intra_cost); } } intra_costs.into_boxed_slice() } #[profiling::function] pub(crate) fn estimate_importance_block_difference( frame: Arc>, ref_frame: Arc>, ) -> f64 { let plane_org = &frame.planes[0]; let plane_ref = &ref_frame.planes[0]; let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE; let mut imp_block_costs = 0; (0..h_in_imp_b).for_each(|y| { (0..w_in_imp_b).for_each(|x| { // Coordinates of the top-left corner of the reference block, in MV // units. let region_org = plane_org.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let region_ref = plane_ref.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let sum_8x8_block = |region: &PlaneRegion| { region .rows_iter() .map(|row| { // 16-bit precision is sufficient for an 8px row, as IMPORTANCE_BLOCK_SIZE * (2^12 - 1) < 2^16 - 1, // so overflow is not possible row.iter().map(|pixel| u16::cast_from(*pixel)).sum::() as i64 }) .sum::() }; let histogram_org_sum = sum_8x8_block(®ion_org); let histogram_ref_sum = sum_8x8_block(®ion_ref); let count = (IMPORTANCE_BLOCK_SIZE * IMPORTANCE_BLOCK_SIZE) as i64; let mean = (((histogram_org_sum + count / 2) / count) - ((histogram_ref_sum + count / 2) / count)) .abs(); imp_block_costs += mean as u64; }); }); imp_block_costs as f64 / (w_in_imp_b * h_in_imp_b) as f64 } #[profiling::function] pub(crate) fn estimate_inter_costs( frame: Arc>, ref_frame: Arc>, bit_depth: usize, mut config: EncoderConfig, sequence: Arc, buffer: RefMEStats, ) -> f64 { config.low_latency = true; config.speed_settings.multiref = false; let inter_cfg = InterConfig::new(&config); let last_fi = FrameInvariants::new_key_frame( Arc::new(config), sequence, 0, Box::new([]), ); let mut fi = FrameInvariants::new_inter_frame( &last_fi, &inter_cfg, 0, 1, 2, false, Box::new([]), ) .unwrap(); // Compute the motion vectors. 
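  // A throwaway FrameState is built with a zero-sized reconstruction frame
  // and the forced low-latency, single-reference config above, since only
  // the motion estimation stats are consumed afterwards.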
let mut fs = FrameState::new_with_frame_and_me_stats_and_rec( &fi, Arc::clone(&frame), buffer, // We do not use this field, so we can avoid the expensive allocation Arc::new(Frame { planes: [ Plane::new(0, 0, 0, 0, 0, 0), Plane::new(0, 0, 0, 0, 0, 0), Plane::new(0, 0, 0, 0, 0, 0), ], }), ); compute_motion_vectors(&mut fi, &mut fs, &inter_cfg); // Estimate inter costs let plane_org = &frame.planes[0]; let plane_ref = &ref_frame.planes[0]; let h_in_imp_b = plane_org.cfg.height / IMPORTANCE_BLOCK_SIZE; let w_in_imp_b = plane_org.cfg.width / IMPORTANCE_BLOCK_SIZE; let stats = &fs.frame_me_stats.read().expect("poisoned lock")[0]; let bsize = BlockSize::from_width_and_height( IMPORTANCE_BLOCK_SIZE, IMPORTANCE_BLOCK_SIZE, ); let mut inter_costs = 0; (0..h_in_imp_b).for_each(|y| { (0..w_in_imp_b).for_each(|x| { let mv = stats[y * 2][x * 2].mv; // Coordinates of the top-left corner of the reference block, in MV // units. let reference_x = x as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.col as i64; let reference_y = y as i64 * IMP_BLOCK_SIZE_IN_MV_UNITS + mv.row as i64; let region_org = plane_org.region(Area::Rect { x: (x * IMPORTANCE_BLOCK_SIZE) as isize, y: (y * IMPORTANCE_BLOCK_SIZE) as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); let region_ref = plane_ref.region(Area::Rect { x: reference_x as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, y: reference_y as isize / IMP_BLOCK_MV_UNITS_PER_PIXEL as isize, width: IMPORTANCE_BLOCK_SIZE, height: IMPORTANCE_BLOCK_SIZE, }); inter_costs += get_satd( ®ion_org, ®ion_ref, bsize.width(), bsize.height(), bit_depth, fi.cpu_feature_level, ) as u64; }); }); inter_costs as f64 / (w_in_imp_b * h_in_imp_b) as f64 } #[profiling::function] pub(crate) fn compute_motion_vectors( fi: &mut FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) { let mut blocks = FrameBlocks::new(fi.w_in_b, fi.h_in_b); fi.sequence .tiling .tile_iter_mut(fs, &mut blocks) .collect::>() .into_par_iter() .for_each(|mut ctx| { let ts = &mut ctx.ts; estimate_tile_motion(fi, ts, inter_cfg); }); } rav1e-0.7.1/src/api/mod.rs000064400000000000000000000021151046102023000133330ustar 00000000000000// Copyright (c) 2018-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![deny(missing_docs)] /// Channel-based encoder #[cfg(all(feature = "channel-api", feature = "unstable"))] pub mod channel; /// Color model information pub mod color; /// Encoder Configuration pub mod config; /// Encoder Context pub mod context; /// Internal implementation pub(crate) mod internal; /// Lookahead-specific methods pub(crate) mod lookahead; mod util; #[cfg(test)] mod test; #[cfg(all(feature = "channel-api", feature = "unstable"))] pub use channel::*; pub use color::*; pub use config::*; pub use context::*; pub(crate) use internal::*; pub use util::*; rav1e-0.7.1/src/api/test.rs000064400000000000000000001474511046102023000135500ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::encoder::FrameInvariants; use crate::prelude::*; use std::sync::Arc; use interpolate_name::interpolate_test; fn setup_config( w: usize, h: usize, speed: u8, quantizer: usize, bit_depth: usize, chroma_sampling: ChromaSampling, min_keyint: u64, max_keyint: u64, bitrate: i32, low_latency: bool, switch_frame_interval: u64, no_scene_detection: bool, rdo_lookahead_frames: usize, min_quantizer: Option, ) -> Config { let mut enc = EncoderConfig::with_speed_preset(speed); enc.quantizer = quantizer; enc.min_key_frame_interval = min_keyint; enc.max_key_frame_interval = max_keyint; enc.low_latency = low_latency; enc.switch_frame_interval = switch_frame_interval; enc.width = w; enc.height = h; enc.bit_depth = bit_depth; enc.chroma_sampling = chroma_sampling; enc.bitrate = bitrate; if no_scene_detection { enc.speed_settings.scene_detection_mode = SceneDetectionSpeed::None; } enc.speed_settings.rdo_lookahead_frames = rdo_lookahead_frames; if let Some(min_quantizer) = min_quantizer { enc.min_quantizer = min_quantizer; } Config::new().with_encoder_config(enc).with_threads(1) } fn setup_encoder( w: usize, h: usize, speed: u8, quantizer: usize, bit_depth: usize, chroma_sampling: ChromaSampling, min_keyint: u64, max_keyint: u64, bitrate: i32, low_latency: bool, switch_frame_interval: u64, no_scene_detection: bool, rdo_lookahead_frames: usize, min_quantizer: Option, ) -> Context { let cfg = setup_config( w, h, speed, quantizer, bit_depth, chroma_sampling, min_keyint, max_keyint, bitrate, low_latency, switch_frame_interval, no_scene_detection, rdo_lookahead_frames, min_quantizer, ); cfg.new_context().unwrap() } /* fn fill_frame(ra: &mut ChaChaRng, frame: &mut Frame) { for plane in frame.planes.iter_mut() { let stride = plane.cfg.stride; for row in plane.data.chunks_mut(stride) { for pixel in row { let v: u8 = ra.gen(); *pixel = T::cast_from(v); } } } } */ fn fill_frame_const(frame: &mut Frame, value: T) { for plane in frame.planes.iter_mut() { let stride = plane.cfg.stride; for row in plane.data.chunks_mut(stride) { for pixel in row { *pixel = value; } } } } #[cfg(feature = "channel-api")] mod channel { use super::*; #[interpolate_test(low_latency_no_scene_change, true, true)] #[interpolate_test(reorder_no_scene_change, false, true)] #[interpolate_test(low_latency_scene_change_detection, true, false)] #[interpolate_test(reorder_scene_change_detection, false, false)] fn flush(low_lantency: bool, no_scene_detection: bool) { let cfg = setup_config( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 150, 200, 0, low_lantency, 0, no_scene_detection, 10, None, ); let limit = 41; let (mut sf, rp) = cfg.new_channel::().unwrap(); for _ in 0..limit { let input = sf.new_frame(); let _ = sf.send(input); } drop(sf); let mut count = 0; for _ in 0..limit { let _ = rp .recv() .map(|_| { eprintln!("Packet Received {}/{}", count, limit); count += 1; }) .unwrap(); } assert_eq!(limit, count); } } #[interpolate_test(low_latency_no_scene_change, true, true)] #[interpolate_test(reorder_no_scene_change, false, true)] #[interpolate_test(low_latency_scene_change_detection, true, false)] #[interpolate_test(reorder_scene_change_detection, false, false)] fn flush(low_lantency: bool, no_scene_detection: bool) { let mut 
ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 150, 200, 0, low_lantency, 0, no_scene_detection, 10, None, ); let limit = 41; for _ in 0..limit { let input = ctx.new_frame(); let _ = ctx.send_frame(input); } ctx.flush(); let mut count = 0; 'out: for _ in 0..limit { loop { match ctx.receive_packet() { Ok(_) => { eprintln!("Packet Received {}/{}", count, limit); count += 1; } Err(EncoderStatus::EnoughData) => { eprintln!("{:?}", EncoderStatus::EnoughData); break 'out; } Err(e) => { eprintln!("{:?}", e); break; } } } } assert_eq!(limit, count); } #[interpolate_test(low_latency_no_scene_change, true, true)] #[interpolate_test(reorder_no_scene_change, false, true)] #[interpolate_test(low_latency_scene_change_detection, true, false)] #[interpolate_test(reorder_scene_change_detection, false, false)] fn flush_unlimited(low_lantency: bool, no_scene_detection: bool) { let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 150, 200, 0, low_lantency, 0, no_scene_detection, 10, None, ); let limit = 41; for _ in 0..limit { let input = ctx.new_frame(); let _ = ctx.send_frame(input); } ctx.flush(); let mut count = 0; 'out: for _ in 0..limit { loop { match ctx.receive_packet() { Ok(_) => { eprintln!("Packet Received {}/{}", count, limit); count += 1; } Err(EncoderStatus::EnoughData) => { eprintln!("{:?}", EncoderStatus::EnoughData); break 'out; } Err(e) => { eprintln!("{:?}", e); break; } } } } assert_eq!(limit, count); } fn send_frames( ctx: &mut Context, limit: u64, scene_change_at: u64, ) { for i in 0..limit { if i < scene_change_at { send_test_frame(ctx, T::min_value()); } else { send_test_frame(ctx, T::max_value()); } } } fn send_test_frame(ctx: &mut Context, content_value: T) { let mut input = ctx.new_frame(); fill_frame_const(&mut input, content_value); let _ = ctx.send_frame(Arc::new(input)); } fn get_frame_invariants( ctx: Context, ) -> impl Iterator>> { ctx.inner.frame_data.into_values().map(|v| v.map(|v| v.fi)) } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] fn output_frameno_low_latency_minus(missing: u64) { // Test output_frameno configurations when there are less frames // than the perfect subgop size, in no-reorder mode. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, true, 0, true, 10, None, ); let limit = 10 - missing; send_frames(&mut ctx, limit, 0); ctx.flush(); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match missing { 0 => { &[ Some(0), // I-frame Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame Some(6), Some(7), Some(8), Some(9), ][..] } 1 => { &[ Some(0), // I-frame Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame Some(6), Some(7), Some(8), ][..] } _ => unreachable!(), } ); } #[test] fn switch_frame_interval() { // Test output_frameno configurations when there are less frames // than the perfect subgop size, in no-reorder mode. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, true, 2, true, 10, None, ); let limit = 10; send_frames(&mut ctx, limit, 0); ctx.flush(); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| (fi.input_frameno, fi.frame_type))) .collect::>(); assert_eq!( &data[..], &[ Some((0, FrameType::KEY)), Some((1, FrameType::INTER)), Some((2, FrameType::SWITCH)), Some((3, FrameType::INTER)), Some((4, FrameType::SWITCH)), Some((5, FrameType::KEY)), Some((6, FrameType::INTER)), Some((7, FrameType::SWITCH)), Some((8, FrameType::INTER)), Some((9, FrameType::SWITCH)), ][..] 
); } #[test] fn minimum_frame_delay() { let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, true, 0, true, 1, None, ); let limit = 4; // 4 frames in for 1 frame out (delay of 3 frames) send_frames(&mut ctx, limit, 0); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| (fi.input_frameno, fi.frame_type))) .collect::>(); assert_eq!(&data[..], &[Some((0, FrameType::KEY))][..]); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] fn pyramid_level_low_latency_minus(missing: u64) { // Test pyramid_level configurations when there are less frames // than the perfect subgop size, in no-reorder mode. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, true, 0, true, 10, None, ); let limit = 10 - missing; send_frames(&mut ctx, limit, 0); ctx.flush(); // data[output_frameno] = pyramid_level assert!(get_frame_invariants(ctx) .map(|fi| fi.unwrap().pyramid_level) .all(|pyramid_level| pyramid_level == 0)); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn output_frameno_reorder_minus(missing: u64) { // Test output_frameno configurations when there are less frames // than the perfect subgop size. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, false, 0, true, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10 - missing; send_frames(&mut ctx, limit, 0); ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match missing { 0 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame Some(9), Some(7), Some(6), Some(7), Some(8), Some(9), ][..] } 1 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, Some(7), Some(6), Some(7), Some(8), None, ][..] } 2 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, Some(7), Some(6), Some(7), None, None, ][..] } 3 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, None, Some(6), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame ][..] } _ => unreachable!(), } ); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn pyramid_level_reorder_minus(missing: u64) { // Test pyramid_level configurations when there are less frames // than the perfect subgop size. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, false, 0, true, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10 - missing; send_frames(&mut ctx, limit, 0); ctx.flush(); // data[output_frameno] = pyramid_level let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.pyramid_level)) .collect::>(); assert_eq!( &data[..], match missing { 0 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), ][..] 
} 1 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), // I-frame None, Some(1), Some(2), Some(1), Some(2), None, ][..] } 2 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), // I-frame None, Some(1), Some(2), Some(1), None, None, ][..] } 3 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), // I-frame None, None, Some(2), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), // I-frame ][..] } _ => unreachable!(), } ); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn output_frameno_reorder_scene_change_at(scene_change_at: u64) { // Test output_frameno configurations when there's a scene change at the // th frame. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10; send_frames(&mut ctx, limit, scene_change_at); ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match scene_change_at { 0 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), Some(9), Some(7), Some(6), Some(7), Some(8), Some(9), ][..] } 1 => { &[ Some(0), // I-frame Some(1), // I-frame Some(5), Some(3), Some(2), Some(3), Some(4), Some(5), Some(6), None, Some(8), Some(7), Some(8), Some(9), None, ][..] } 2 => { &[ Some(0), // I-frame None, None, Some(1), None, None, None, Some(2), // I-frame Some(6), Some(4), Some(3), Some(4), Some(5), Some(6), Some(7), None, Some(9), Some(8), Some(9), None, None, ][..] } 3 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), None, None, Some(3), // I-frame Some(7), Some(5), Some(4), Some(5), Some(6), Some(7), Some(8), None, None, Some(9), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), Some(3), None, Some(4), // I-frame Some(8), Some(6), Some(5), Some(6), Some(7), Some(8), Some(9), ][..] } _ => unreachable!(), } ); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn pyramid_level_reorder_scene_change_at(scene_change_at: u64) { // Test pyramid_level configurations when there's a scene change at the // th frame. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10; send_frames(&mut ctx, limit, scene_change_at); ctx.flush(); // data[output_frameno] = pyramid_level let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.pyramid_level)) .collect::>(); assert_eq!( &data[..], match scene_change_at { 0 => { &[ Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), ][..] } 1 => { &[ Some(0), // I-frame Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), None, Some(1), Some(2), Some(1), Some(2), None, ][..] 
} 2 => { &[ Some(0), // I-frame None, None, Some(2), None, None, None, Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), None, Some(1), Some(2), Some(1), None, None, ][..] } 3 => { &[ Some(0), // I-frame None, Some(1), Some(2), Some(1), None, None, Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), None, None, Some(2), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame None, Some(1), Some(2), Some(1), Some(2), None, Some(0), // I-frame Some(0), Some(1), Some(2), Some(1), Some(2), Some(0), Some(0), ][..] } _ => unreachable!(), } ); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn output_frameno_incremental_reorder_minus(missing: u64) { // Test output_frameno configurations when there are less frames // than the perfect subgop size, computing the lookahead data incrementally. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 5, 5, 0, false, 0, true, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10 - missing; for _ in 0..limit { send_frames(&mut ctx, 1, 0); } ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match missing { 0 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame Some(9), Some(7), Some(6), Some(7), Some(8), Some(9), ][..] } 1 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, Some(7), Some(6), Some(7), Some(8), None, ][..] } 2 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, Some(7), Some(6), Some(7), None, None, ][..] } 3 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame None, None, Some(6), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), // I-frame ][..] } _ => unreachable!(), } ); } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn output_frameno_incremental_reorder_scene_change_at(scene_change_at: u64) { // Test output_frameno configurations when there's a scene change at the // th frame, computing the lookahead data incrementally. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 10; for i in 0..limit { send_frames(&mut ctx, 1, scene_change_at.saturating_sub(i)); } ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match scene_change_at { 0 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), Some(9), Some(7), Some(6), Some(7), Some(8), Some(9), ][..] } 1 => { &[ Some(0), // I-frame Some(1), // I-frame Some(5), Some(3), Some(2), Some(3), Some(4), Some(5), Some(6), None, Some(8), Some(7), Some(8), Some(9), None, ][..] 
} 2 => { &[ Some(0), // I-frame None, None, Some(1), None, None, None, Some(2), // I-frame Some(6), Some(4), Some(3), Some(4), Some(5), Some(6), Some(7), None, Some(9), Some(8), Some(9), None, None, ][..] } 3 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), None, None, Some(3), // I-frame Some(7), Some(5), Some(4), Some(5), Some(6), Some(7), Some(8), None, None, Some(9), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), Some(3), None, Some(4), // I-frame Some(8), Some(6), Some(5), Some(6), Some(7), Some(8), Some(9), ][..] } _ => unreachable!(), } ); } fn send_frame_kf(ctx: &mut Context, keyframe: bool) { let input = ctx.new_frame(); let frame_type_override = if keyframe { FrameTypeOverride::Key } else { FrameTypeOverride::No }; let opaque = Some(Opaque::new(keyframe)); let fp = FrameParameters { frame_type_override, opaque, t35_metadata: Box::new([]), }; let _ = ctx.send_frame((input, fp)); } #[test] fn test_opaque_delivery() { let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); let kf_at = 3; let limit = 5; for i in 0..limit { send_frame_kf(&mut ctx, kf_at == i); } ctx.flush(); while let Ok(pkt) = ctx.receive_packet() { let Packet { opaque, input_frameno, .. } = pkt; if let Some(opaque) = opaque { let kf = opaque.downcast::().unwrap(); assert_eq!(kf, Box::new(input_frameno == kf_at)); } } } fn send_frame_t35(ctx: &mut Context) { let input = ctx.new_frame(); let frame_type_override = FrameTypeOverride::No; let opaque = None; let t35_metadata = Box::new([T35 { country_code: 0xFF, country_code_extension_byte: 0x00, data: Box::new(*b"AYAYA"), }]); let fp = FrameParameters { frame_type_override, opaque, t35_metadata }; let _ = ctx.send_frame((input, fp)); } #[test] fn test_t35_parameter() { let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); let limit = 2; for _ in 0..limit { send_frame_t35(&mut ctx); } ctx.flush(); while let Ok(_) = ctx.receive_packet() {} } #[interpolate_test(0, 0)] #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] #[interpolate_test(4, 4)] fn output_frameno_incremental_reorder_keyframe_at(kf_at: u64) { // Test output_frameno configurations when there's a forced keyframe at the // th frame, computing the lookahead data incrementally. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 5; for i in 0..limit { send_frame_kf(&mut ctx, kf_at == i); } ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], match kf_at { 0 => { &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), ][..] } 1 => { &[ Some(0), // I-frame Some(1), // I-frame None, Some(3), Some(2), Some(3), Some(4), None, ][..] } 2 => { &[ Some(0), // I-frame None, None, Some(1), None, None, None, Some(2), // I-frame None, Some(4), Some(3), Some(4), None, None, ][..] } 3 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), None, None, Some(3), // I-frame None, None, Some(4), None, None, None, ][..] } 4 => { &[ Some(0), // I-frame None, Some(2), Some(1), Some(2), Some(3), None, Some(4), // I-frame ][..] 
} _ => unreachable!(), } ); } #[interpolate_test(1, 1)] #[interpolate_test(2, 2)] #[interpolate_test(3, 3)] fn output_frameno_no_scene_change_at_short_flash(flash_at: u64) { // Test output_frameno configurations when there's a single-frame flash at the // th frame. let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 5, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); let limit = 5; for i in 0..limit { if i == flash_at { send_test_frame(&mut ctx, u8::MIN); } else { send_test_frame(&mut ctx, u8::MAX); } } ctx.flush(); // data[output_frameno] = (input_frameno, !invalid) let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), ] ); } #[test] fn output_frameno_no_scene_change_at_flash_smaller_than_max_len_flash() { // Test output_frameno configurations when there's a multi-frame flash // with length equal to the max flash length let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 10, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); ctx.flush(); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), None, Some(6), Some(5), Some(6), Some(7), None, ] ); } #[test] fn output_frameno_scene_change_before_flash_longer_than_max_flash_len() { // Test output_frameno configurations when there's a multi-frame flash // with length greater than the max flash length let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 10, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. 
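// Input sequence: two dark frames, five bright frames, then five dark
// frames. The bright run exceeds the maximum flash length, so scene
// detection places a key frame at the first bright frame (input frame 2)
// instead of treating the run as a flash, which is what the expected
// output_frameno layout asserted below encodes.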
assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MAX); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); ctx.flush(); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], &[ Some(0), // I-frame None, None, Some(1), None, None, None, Some(2), // I-frame Some(6), Some(4), Some(3), Some(4), Some(5), Some(6), Some(10), Some(8), Some(7), Some(8), Some(9), Some(10), None, None, Some(11), None, None, None, ] ); } #[test] fn output_frameno_scene_change_after_multiple_flashes() { // Test output_frameno configurations when there are multiple consecutive flashes let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 10, 0, false, 0, false, 10, None, ); // TODO: when we support more pyramid depths, this test will need tweaks. assert_eq!(ctx.inner.inter_cfg.pyramid_depth, 2); assert_eq!(ctx.inner.inter_cfg.group_input_len, 4); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, u8::MIN); send_test_frame(&mut ctx, 40); send_test_frame(&mut ctx, 100); send_test_frame(&mut ctx, 160); send_test_frame(&mut ctx, 240); send_test_frame(&mut ctx, 240); send_test_frame(&mut ctx, 240); send_test_frame(&mut ctx, 240); send_test_frame(&mut ctx, 240); send_test_frame(&mut ctx, 240); ctx.flush(); let data = get_frame_invariants(ctx) .map(|fi| fi.map(|fi| fi.input_frameno)) .collect::>(); assert_eq!( &data[..], &[ Some(0), // I-frame Some(4), Some(2), Some(1), Some(2), Some(3), Some(4), Some(5), Some(9), Some(7), Some(6), Some(7), Some(8), Some(9), None, None, Some(10), None, None, None, ] ); } #[derive(Clone, Copy)] struct LookaheadTestExpectations { pre_receive_frame_q_lens: [usize; 60], pre_receive_fi_lens: [usize; 60], post_receive_frame_q_lens: [usize; 60], post_receive_fi_lens: [usize; 60], } #[test] fn lookahead_size_properly_bounded_8() { const LOOKAHEAD_SIZE: usize = 8; const EXPECTATIONS: LookaheadTestExpectations = LookaheadTestExpectations { pre_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, 21, 19, 20, 20, ], pre_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, 19, 14, 20, 19, ], post_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, 18, 19, 19, 20, ], post_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, 14, 14, 19, 19, ], }; lookahead_size_properly_bounded(LOOKAHEAD_SIZE, false, &EXPECTATIONS); } #[test] fn lookahead_size_properly_bounded_10() { const 
LOOKAHEAD_SIZE: usize = 10; const EXPECTATIONS: LookaheadTestExpectations = LookaheadTestExpectations { pre_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, 21, 22, 23, 20, ], pre_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, 19, 19, 25, 19, ], post_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, 21, 22, 19, 20, ], post_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, ], }; lookahead_size_properly_bounded(LOOKAHEAD_SIZE, false, &EXPECTATIONS); } #[test] fn lookahead_size_properly_bounded_16() { const LOOKAHEAD_SIZE: usize = 16; const EXPECTATIONS: LookaheadTestExpectations = LookaheadTestExpectations { pre_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, 29, 27, 28, 28, ], pre_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 19, 19, 25, 25, 25, 25, 31, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, 31, 26, 32, 31, ], post_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, 26, 27, 27, 28, ], post_receive_fi_lens: [ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 7, 7, 7, 7, 13, 13, 13, 13, 19, 19, 19, 19, 25, 25, 25, 25, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, 26, 26, 31, 31, ], }; lookahead_size_properly_bounded(LOOKAHEAD_SIZE, false, &EXPECTATIONS); } #[test] fn lookahead_size_properly_bounded_lowlatency_8() { const LOOKAHEAD_SIZE: usize = 8; const EXPECTATIONS: LookaheadTestExpectations = LookaheadTestExpectations { pre_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, ], pre_receive_fi_lens: [ 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, ], post_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, ], post_receive_fi_lens: [ 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, ], }; lookahead_size_properly_bounded(LOOKAHEAD_SIZE, true, &EXPECTATIONS); } #[test] fn lookahead_size_properly_bounded_lowlatency_1() { const LOOKAHEAD_SIZE: usize = 1; const EXPECTATIONS: LookaheadTestExpectations = LookaheadTestExpectations { pre_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ], pre_receive_fi_lens: [ 0, 0, 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ], post_receive_frame_q_lens: [ 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ], post_receive_fi_lens: [ 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ], }; lookahead_size_properly_bounded(LOOKAHEAD_SIZE, true, &EXPECTATIONS); } fn lookahead_size_properly_bounded( rdo_lookahead: usize, low_latency: bool, expectations: &LookaheadTestExpectations, ) { // Test that lookahead reads in the proper number of frames at once let mut ctx = setup_encoder::( 64, 80, 10, 100, 8, ChromaSampling::Cs420, 0, 100, 0, low_latency, 0, true, rdo_lookahead, None, ); const LIMIT: usize = 60; let mut pre_receive_frame_q_lens = [0; LIMIT]; let mut pre_receive_fi_lens = [0; LIMIT]; let mut post_receive_frame_q_lens = [0; LIMIT]; let mut post_receive_fi_lens = [0; LIMIT]; for i in 0..LIMIT { let input = ctx.new_frame(); let _ = ctx.send_frame(input); pre_receive_frame_q_lens[i] = ctx.inner.frame_q.len(); pre_receive_fi_lens[i] = ctx.inner.frame_data.len(); while ctx.receive_packet().is_ok() { // Receive packets until lookahead consumed, due to pyramids receiving frames in groups } post_receive_frame_q_lens[i] = ctx.inner.frame_q.len(); post_receive_fi_lens[i] = ctx.inner.frame_data.len(); } assert_eq!( &pre_receive_frame_q_lens[..], &expectations.pre_receive_frame_q_lens[..] ); assert_eq!(&pre_receive_fi_lens[..], &expectations.pre_receive_fi_lens[..]); assert_eq!( &post_receive_frame_q_lens[..], &expectations.post_receive_frame_q_lens[..] ); assert_eq!( &post_receive_fi_lens[..], &expectations.post_receive_fi_lens[..] 
); ctx.flush(); let end = ctx.inner.frame_q.get(&(LIMIT as u64)); assert!(end.is_some()); assert!(end.unwrap().is_none()); while let Ok(_) | Err(EncoderStatus::Encoded) = ctx.receive_packet() { // Receive packets until all frames consumed } assert_eq!(ctx.inner.frames_processed, LIMIT as u64); } #[test] fn zero_frames() { let config = Config::default(); let mut ctx: Context = config.new_context().unwrap(); ctx.flush(); assert_eq!(ctx.receive_packet(), Err(EncoderStatus::LimitReached)); } #[test] fn tile_cols_overflow() { let enc = EncoderConfig { tile_cols: usize::MAX, ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn max_key_frame_interval_overflow() { let enc = EncoderConfig { max_key_frame_interval: i32::MAX as u64, reservoir_frame_delay: None, ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn target_bitrate_overflow() { let enc = EncoderConfig { bitrate: i32::MAX, time_base: Rational::new(i64::MAX as u64, 1), ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn time_base_den_divide_by_zero() { let enc = EncoderConfig { time_base: Rational::new(1, 0), ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn large_width_assert() { let enc = EncoderConfig { width: u32::MAX as usize, ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn reservoir_max_overflow() { let enc = EncoderConfig { reservoir_frame_delay: Some(i32::MAX), bitrate: i32::MAX, time_base: Rational::new(i32::MAX as u64 * 2, 1), ..Default::default() }; let config = Config::new().with_encoder_config(enc); let _: Result, _> = config.new_context(); } #[test] fn zero_width() { let enc = EncoderConfig { width: 0, ..Default::default() }; let config = Config::new().with_encoder_config(enc); let res: Result, _> = config.new_context(); assert!(res.is_err()); } #[test] fn rdo_lookahead_frames_overflow() { let enc = EncoderConfig { speed_settings: SpeedSettings { rdo_lookahead_frames: usize::MAX, ..Default::default() }, ..Default::default() }; let config = Config::new().with_encoder_config(enc); let res: Result, _> = config.new_context(); assert!(res.is_err()); } #[test] fn log_q_exp_overflow() { let enc = EncoderConfig { width: 16, height: 16, sample_aspect_ratio: Rational::new(1, 1), bit_depth: 8, chroma_sampling: ChromaSampling::Cs420, chroma_sample_position: ChromaSamplePosition::Unknown, pixel_range: PixelRange::Limited, color_description: None, mastering_display: None, content_light: None, level_idx: Some(31), enable_timing_info: false, still_picture: false, error_resilient: false, switch_frame_interval: 0, time_base: Rational { num: 1, den: 25 }, min_key_frame_interval: 12, max_key_frame_interval: 240, reservoir_frame_delay: None, low_latency: false, quantizer: 100, min_quantizer: 64, bitrate: 1, tune: Tune::Psychovisual, film_grain_params: None, tile_cols: 0, tile_rows: 0, tiles: 0, speed_settings: SpeedSettings { multiref: false, fast_deblock: true, rdo_lookahead_frames: 40, scene_detection_mode: SceneDetectionSpeed::None, cdef: true, lrf: true, partition: PartitionSpeedSettings { partition_range: PartitionRange::new( BlockSize::BLOCK_64X64, BlockSize::BLOCK_64X64, ), encode_bottomup: false, non_square_partition_max_threshold: 
BlockSize::BLOCK_4X4, }, transform: TransformSpeedSettings { reduced_tx_set: true, tx_domain_distortion: true, tx_domain_rate: false, rdo_tx_decision: false, ..Default::default() }, prediction: PredictionSpeedSettings { prediction_modes: PredictionModesSetting::Simple, ..Default::default() }, motion: MotionSpeedSettings { include_near_mvs: false, use_satd_subpel: false, ..Default::default() }, ..Default::default() }, }; let config = Config::new().with_encoder_config(enc).with_threads(1); let mut ctx: Context = config.new_context().unwrap(); for _ in 0..2 { ctx.send_frame(ctx.new_frame()).unwrap(); } ctx.flush(); ctx.receive_packet().unwrap(); let _ = ctx.receive_packet(); } #[test] fn guess_frame_subtypes_assert() { let enc = EncoderConfig { width: 16, height: 16, sample_aspect_ratio: Rational::new(1, 1), bit_depth: 8, chroma_sampling: ChromaSampling::Cs420, chroma_sample_position: ChromaSamplePosition::Unknown, pixel_range: PixelRange::Limited, color_description: None, mastering_display: None, content_light: None, level_idx: Some(31), enable_timing_info: false, still_picture: false, error_resilient: false, switch_frame_interval: 0, time_base: Rational { num: 1, den: 25 }, min_key_frame_interval: 0, max_key_frame_interval: 1, reservoir_frame_delay: None, low_latency: false, quantizer: 100, min_quantizer: 0, bitrate: 16384, tune: Tune::Psychovisual, film_grain_params: None, tile_cols: 0, tile_rows: 0, tiles: 0, speed_settings: SpeedSettings { multiref: false, fast_deblock: true, rdo_lookahead_frames: 40, scene_detection_mode: SceneDetectionSpeed::None, cdef: true, lrf: true, partition: PartitionSpeedSettings { partition_range: PartitionRange::new( BlockSize::BLOCK_64X64, BlockSize::BLOCK_64X64, ), encode_bottomup: false, non_square_partition_max_threshold: BlockSize::BLOCK_4X4, }, transform: TransformSpeedSettings { reduced_tx_set: true, tx_domain_distortion: true, tx_domain_rate: false, rdo_tx_decision: false, ..Default::default() }, prediction: PredictionSpeedSettings { prediction_modes: PredictionModesSetting::Simple, ..Default::default() }, motion: MotionSpeedSettings { include_near_mvs: false, use_satd_subpel: false, ..Default::default() }, ..Default::default() }, }; let config = Config::new().with_encoder_config(enc).with_threads(1); let mut ctx: Context = config.new_context().unwrap(); ctx.send_frame(ctx.new_frame()).unwrap(); ctx.flush(); ctx.receive_packet().unwrap(); } #[test] fn min_quantizer_bounds_correctly() { let mut ctx = setup_encoder::( 64, 80, 10, 255, 8, ChromaSampling::Cs420, 25, 25, 25000, true, 0, true, 1, Some(100), ); let limit = 25; send_frames(&mut ctx, limit, 0); ctx.flush(); for i in 0..limit { ctx.inner.encode_packet(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert_eq!(68, frame_data.fi.base_q_idx); } else { assert_eq!(96, frame_data.fi.base_q_idx); } } let mut ctx = setup_encoder::( 64, 80, 10, 255, 8, ChromaSampling::Cs420, 25, 25, 2000, true, 0, true, 1, Some(100), ); let limit = 25; send_frames(&mut ctx, limit, 0); ctx.flush(); for i in 0..limit { ctx.inner.encode_packet(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert!(frame_data.fi.base_q_idx > 68); } else { assert!(frame_data.fi.base_q_idx > 96); } } } #[test] fn max_quantizer_bounds_correctly() { let mut ctx = setup_encoder::( 64, 80, 10, 120, 8, ChromaSampling::Cs420, 25, 25, 2000, true, 0, true, 1, None, ); let limit = 25; send_frames(&mut ctx, limit, 0); ctx.flush(); for i in 0..limit { 
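// With a generous bitrate (25000), rate control would otherwise choose a
// quantizer below the configured min_quantizer, so these assertions pin
// the clamped result: base_q_idx 68 for the key frame and 96 for inter
// frames. The second half of this test lowers the bitrate to 2000, where
// the floor no longer binds and base_q_idx is asserted to land above
// those values.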
ctx.inner.encode_packet(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert_eq!(95, frame_data.fi.base_q_idx); } else { assert_eq!(115, frame_data.fi.base_q_idx); } } let mut ctx = setup_encoder::( 64, 80, 10, 120, 8, ChromaSampling::Cs420, 25, 25, 20000, true, 0, true, 1, None, ); let limit = 25; send_frames(&mut ctx, limit, 0); ctx.flush(); for i in 0..limit { ctx.inner.encode_packet(i).unwrap(); let frame_data = ctx.inner.frame_data.get(&i).unwrap().as_ref().unwrap(); if i == 0 { assert!(frame_data.fi.base_q_idx < 95); } else { assert!(frame_data.fi.base_q_idx < 115); } } } rav1e-0.7.1/src/api/util.rs000064400000000000000000000212471046102023000135400ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![deny(missing_docs)] use crate::frame::*; use crate::serialize::{Deserialize, Serialize}; use crate::stats::EncoderStats; use crate::util::Pixel; use std::any::Any; use std::fmt; use std::sync::Arc; use thiserror::*; /// Opaque type to be passed from Frame to Packet #[derive(Debug)] pub struct Opaque(Box); impl Opaque { /// Wrap a type in the opaque struct pub fn new(t: T) -> Self { Opaque(Box::new(t) as Box) } /// Attempt to downcast the opaque to a concrete type. /// /// # Errors /// /// Returns `Err(Self)` if the value could not be downcast to `T`. pub fn downcast(self) -> Result, Opaque> { if self.0.is::() { // SAFETY: We verified the type of `T` before this cast. unsafe { let raw: *mut (dyn Any + Send + Sync) = Box::into_raw(self.0); Ok(Box::from_raw(raw as *mut T)) } } else { Err(self) } } } // TODO: use the num crate? /// A rational number. #[derive(Clone, Copy, Debug)] #[repr(C)] pub struct Rational { /// Numerator. pub num: u64, /// Denominator. pub den: u64, } impl Rational { /// Creates a rational number from the given numerator and denominator. pub const fn new(num: u64, den: u64) -> Self { Rational { num, den } } /// Returns a rational number that is the reciprocal of the given one. pub const fn from_reciprocal(reciprocal: Self) -> Self { Rational { num: reciprocal.den, den: reciprocal.num } } /// Returns the rational number as a floating-point number. pub fn as_f64(self) -> f64 { self.num as f64 / self.den as f64 } } #[cfg(feature = "serialize")] impl serde::Serialize for Rational { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { (self.num, self.den).serialize(serializer) } } #[cfg(feature = "serialize")] impl<'a> serde::Deserialize<'a> for Rational { fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'a>, { let (num, den) = serde::Deserialize::deserialize(deserializer)?; Ok(Rational::new(num, den)) } } /// Possible types of a frame. #[allow(dead_code, non_camel_case_types)] #[derive(Debug, Eq, PartialEq, Clone, Copy, Serialize, Deserialize)] #[repr(C)] pub enum FrameType { /// Key frame. KEY, /// Inter-frame. INTER, /// Intra-only frame. INTRA_ONLY, /// Switching frame. 
SWITCH, } impl FrameType { /// Returns whether frame can have inter blocks #[inline] pub fn has_inter(self) -> bool { self == FrameType::INTER || self == FrameType::SWITCH } /// Returns whether frame is only intra blocks #[inline] pub fn all_intra(self) -> bool { self == FrameType::KEY || self == FrameType::INTRA_ONLY } } impl fmt::Display for FrameType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::FrameType::*; match self { KEY => write!(f, "Key frame"), INTER => write!(f, "Inter frame"), INTRA_ONLY => write!(f, "Intra only frame"), SWITCH => write!(f, "Switching frame"), } } } /// A single T.35 metadata packet. #[derive(Clone, Debug, Default)] pub struct T35 { /// Country code. pub country_code: u8, /// Country code extension bytes (if country_code == 0xFF) pub country_code_extension_byte: u8, /// T.35 payload. pub data: Box<[u8]>, } /// Status that can be returned by [`Context`] functions. /// /// [`Context`]: struct.Context.html #[derive(Clone, Copy, Debug, Eq, PartialEq, Error)] pub enum EncoderStatus { /// The encoder needs more data to produce an output packet. /// /// May be emitted by [`Context::receive_packet()`] when frame reordering is /// enabled. /// /// [`Context::receive_packet()`]: struct.Context.html#method.receive_packet #[error("need more data")] NeedMoreData, /// There are enough frames in the queue. /// /// May be emitted by [`Context::send_frame()`] when trying to send a frame /// after the encoder has been flushed. /// /// [`Context::send_frame()`]: struct.Context.html#method.send_frame #[error("enough data")] EnoughData, /// The encoder has already produced the number of frames requested. /// /// May be emitted by [`Context::receive_packet()`] after a flush request had /// been processed or the frame limit had been reached. /// /// [`Context::receive_packet()`]: struct.Context.html#method.receive_packet #[error("limit reached")] LimitReached, /// A frame had been encoded but not emitted yet. #[error("encoded")] Encoded, /// Generic fatal error. #[error("failure")] Failure, /// A frame was encoded in the first pass of a 2-pass encode, but its stats /// data was not retrieved with [`Context::twopass_out()`], or not enough /// stats data was provided in the second pass of a 2-pass encode to encode /// the next frame. /// /// [`Context::twopass_out()`]: struct.Context.html#method.twopass_out #[error("not ready")] NotReady, } /// Represents a packet. /// /// A packet contains one shown frame together with zero or more additional /// frames. #[derive(Debug, Serialize, Deserialize)] pub struct Packet { /// The packet data. pub data: Vec, /// The reconstruction of the shown frame. #[cfg_attr(feature = "serialize", serde(skip))] pub rec: Option>>, /// The Reference Frame #[cfg_attr(feature = "serialize", serde(skip))] pub source: Option>>, /// The number of the input frame corresponding to the one shown frame in the /// TU stored in this packet. Since AV1 does not explicitly reorder frames, /// these will increase sequentially. // TODO: When we want to add VFR support, we will need a more explicit time // stamp here. pub input_frameno: u64, /// Type of the shown frame. pub frame_type: FrameType, /// QP selected for the frame. 
pub qp: u8, /// Block-level encoding stats for the frame pub enc_stats: EncoderStats, /// Optional user-provided opaque data #[cfg_attr(feature = "serialize", serde(skip))] pub opaque: Option, } impl PartialEq for Packet { fn eq(&self, other: &Self) -> bool { self.data == other.data && self.input_frameno == other.input_frameno && self.frame_type == other.frame_type && self.qp == other.qp } } impl fmt::Display for Packet { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "Frame {} - {} - {} bytes", self.input_frameno, self.frame_type, self.data.len() ) } } /// Types which can be converted into frames. /// /// This trait is used in [`Context::send_frame`] to allow for passing in /// frames with optional frame parameters and optionally frames wrapped in /// `Arc` (to allow for zero-copy, since the encoder uses frames in `Arc` /// internally). /// /// [`Context::send_frame`]: struct.Context.html#method.send_frame pub trait IntoFrame { /// Converts the type into a tuple of frame and parameters. fn into(self) -> (Option>>, Option); } impl IntoFrame for Option>> { fn into(self) -> (Option>>, Option) { (self, None) } } impl IntoFrame for Arc> { fn into(self) -> (Option>>, Option) { (Some(self), None) } } impl IntoFrame for (Arc>, FrameParameters) { fn into(self) -> (Option>>, Option) { (Some(self.0), Some(self.1)) } } impl IntoFrame for (Arc>, Option) { fn into(self) -> (Option>>, Option) { (Some(self.0), self.1) } } impl IntoFrame for Frame { fn into(self) -> (Option>>, Option) { (Some(Arc::new(self)), None) } } impl IntoFrame for (Frame, FrameParameters) { fn into(self) -> (Option>>, Option) { (Some(Arc::new(self.0)), Some(self.1)) } } impl IntoFrame for (Frame, Option) { fn into(self) -> (Option>>, Option) { (Some(Arc::new(self.0)), self.1) } } rav1e-0.7.1/src/arm/32/cdef.S000064400000000000000000000453041046102023000134740ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" // n1 = s0/d0 // w1 = d0/q0 // n2 = s4/d2 // w2 = d2/q1 .macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldrh r12, [\s1, #-2] vldr \n1, [\s1] vdup.16 d4, r12 ldrh r12, [\s1, #\w] vmov.16 d4[1], r12 ldrh r12, [\s2, #-2] vldr \n2, [\s2] vmov.16 d4[2], r12 ldrh r12, [\s2, #\w] vmovl.u8 q0, d0 vmov.16 d4[3], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s8, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s9, [r0, #2*\w] add r0, r0, #2*\stride vstr s10, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldrh r12, [\s1, #-2] vldr \n1, [\s1] vdup.16 d4, r12 ldrh r12, [\s2, #-2] vldr \n2, [\s2] vmovl.u8 q0, d0 vmov.16 d4[1], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s8, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s9, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr \n1, [\s1] ldrh r12, [\s1, #\w] vldr \n2, [\s2] vdup.16 d4, r12 ldrh r12, [\s2, #\w] vmovl.u8 q0, d0 vmov.16 d4[1], r12 vmovl.u8 q1, d2 vmovl.u8 q2, d4 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s8, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vldr \n1, [\s1] vldr \n2, [\s2] vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\w2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride .endif 3: .endm .macro load_n_incr dst, src, incr, w .if \w == 4 vld1.32 {\dst\()[0]}, [\src, :32], \incr .else vld1.8 {\dst\()}, [\src, :64], \incr .endif .endm // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // n1 = s0/d0 // w1 = d0/q0 // n2 = s4/d2 // w2 = d2/q1 .macro padding_func w, stride, n1, w1, n2, w2, align function cdef_padding\w\()_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] cmp r7, #0xf // fully edged beq cdef_padding\w\()_edged_8bpc_neon vmov.i16 q3, #0x8000 tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif b 3f 1: // CDEF_HAVE_TOP add r8, r4, r2 sub r0, r0, #2*(2*\stride) pad_top_bottom r4, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 0 // Middle section 3: tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.16 {d2[]}, [r3, :16]! ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmov.16 d2[1], r12 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s4, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s5, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {d2[]}, [r3, :16]! 
load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s4, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b b 3f 2: tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldrh r12, [r1, #\w] load_n_incr d0, r1, r2, \w vdup.16 d2, r12 subs r6, r6, #1 vmovl.u8 q0, d0 vmovl.u8 q1, d2 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s4, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr d0, r1, r2, \w subs r6, r6, #1 vmovl.u8 q0, d0 vstr s12, [r0, #-4] vst1.16 {\w1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b 3: tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM add r8, r5, r2 pad_top_bottom r5, r8, \w, \stride, \n1, \w1, \n2, \w2, \align, 1 endfunc .endm padding_func 8, 16, d0, q0, d2, q1, 128 padding_func 4, 8, s0, d0, s4, d2, 64 // void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg, align function cdef_padding\w\()_edged_8bpc_neon sub r0, r0, #(2*\stride) ldrh r12, [r4, #-2] vldr \reg, [r4] add r8, r4, r2 strh r12, [r0, #-2] ldrh r12, [r4, #\w] vstr \reg, [r0] strh r12, [r0, #\w] ldrh r12, [r8, #-2] vldr \reg, [r8] strh r12, [r0, #\stride-2] ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] add r0, r0, #2*\stride 0: ldrh r12, [r3], #2 vldr \reg, [r1] str r12, [r0, #-2] ldrh r12, [r1, #\w] add r1, r1, r2 subs r6, r6, #1 vstr \reg, [r0] str r12, [r0, #\w] add r0, r0, #\stride bgt 0b ldrh r12, [r5, #-2] vldr \reg, [r5] add r8, r5, r2 strh r12, [r0, #-2] ldrh r12, [r5, #\w] vstr \reg, [r0] strh r12, [r0, #\w] ldrh r12, [r8, #-2] vldr \reg, [r8] strh r12, [r0, #\stride-2] ldrh r12, [r8, #\w] vstr \reg, [r0, #\stride] strh r12, [r0, #\stride+\w] pop {r4-r8,pc} endfunc .endm padding_func_edged 8, 16, d0, 64 padding_func_edged 4, 8, s0, 32 tables filter 8, 8 filter 4, 8 find_dir 8 .macro load_px_8 d11, d12, d21, d22, w .if \w == 8 add r6, r2, r9 // x + off sub r9, r2, r9 // x - off vld1.8 {\d11}, [r6] // p0 add r6, r6, #16 // += stride vld1.8 {\d21}, [r9] // p1 add r9, r9, #16 // += stride vld1.8 {\d12}, [r6] // p0 vld1.8 {\d22}, [r9] // p1 .else add r6, r2, r9 // x + off sub r9, r2, r9 // x - off vld1.32 {\d11[0]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d21[0]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d11[1]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d21[1]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d12[0]}, [r6] // p0 add r6, r6, #8 // += stride vld1.32 {\d22[0]}, [r9] // p1 add r9, r9, #8 // += stride vld1.32 {\d12[1]}, [r6] // p0 vld1.32 {\d22[1]}, [r9] // p1 .endif .endm .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min .if \min vmin.u8 q3, q3, \s1 vmax.u8 q4, q4, \s1 vmin.u8 q3, q3, \s2 vmax.u8 q4, q4, \s2 .endif vabd.u8 q8, q0, \s1 // abs(diff) vabd.u8 q11, q0, \s2 // abs(diff) vshl.u8 q9, q8, \shift // abs(diff) >> shift vshl.u8 q12, q11, \shift // abs(diff) >> shift vqsub.u8 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) vqsub.u8 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) vcgt.u8 q10, q0, \s1 // px > p0 vcgt.u8 q13, q0, \s2 // px > p1 vmin.u8 q9, q9, q8 // 
imin(abs(diff), clip) vmin.u8 q12, q12, q11 // imin(abs(diff), clip) vneg.s8 q8, q9 // -imin() vneg.s8 q11, q12 // -imin() vbsl q10, q8, q9 // constrain() = imax(imin(diff, clip), -clip) vdup.8 d18, \tap // taps[k] vbsl q13, q11, q12 // constrain() = imax(imin(diff, clip), -clip) vmlal.s8 q1, d20, d18 // sum += taps[k] * constrain() vmlal.s8 q1, d26, d18 // sum += taps[k] * constrain() vmlal.s8 q2, d21, d18 // sum += taps[k] * constrain() vmlal.s8 q2, d27, d18 // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func_8 w, pri, sec, min, suffix function cdef_filter\w\suffix\()_edged_neon .if \pri movrel_local r8, pri_taps and r9, r3, #1 add r8, r8, r9, lsl #1 .endif movrel_local r9, directions\w add r5, r9, r5, lsl #1 vmov.u8 d17, #7 vdup.8 d16, r6 // damping vmov.8 d8[0], r3 vmov.8 d8[1], r4 vclz.i8 d8, d8 // clz(threshold) vsub.i8 d8, d17, d8 // ulog2(threshold) vqsub.u8 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) vneg.s8 d8, d8 // -shift .if \sec vdup.8 q6, d8[1] .endif .if \pri vdup.8 q5, d8[0] .endif 1: .if \w == 8 add r12, r2, #16 vld1.8 {d0}, [r2, :64] // px vld1.8 {d1}, [r12, :64] // px .else add r12, r2, #8 vld1.32 {d0[0]}, [r2, :32] // px add r9, r2, #2*8 vld1.32 {d0[1]}, [r12, :32] // px add r12, r12, #2*8 vld1.32 {d1[0]}, [r9, :32] // px vld1.32 {d1[1]}, [r12, :32] // px .endif vmov.u8 q1, #0 // sum vmov.u8 q2, #0 // sum .if \min vmov.u16 q3, q0 // min vmov.u16 q4, q0 // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov lr, #2 // sec_taps[0] 2: .if \pri ldrsb r9, [r5] // off1 load_px_8 d28, d29, d30, d31, \w .endif .if \sec add r5, r5, #4 // +2*2 ldrsb r9, [r5] // off2 .endif .if \pri ldrb r12, [r8] // *pri_taps vdup.8 q7, r3 // threshold handle_pixel_8 q14, q15, q7, q5, r12, \min .endif .if \sec load_px_8 d28, d29, d30, d31, \w add r5, r5, #8 // +2*4 ldrsb r9, [r5] // off3 vdup.8 q7, r4 // threshold handle_pixel_8 q14, q15, q7, q6, lr, \min load_px_8 d28, d29, d30, d31, \w handle_pixel_8 q14, q15, q7, q6, lr, \min sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; .else add r5, r5, #1 // r5 += 1 .endif subs lr, lr, #1 // sec_tap-- (value) .if \pri add r8, r8, #1 // pri_taps++ (pointer) .endif bne 2b vshr.s16 q14, q1, #15 // -(sum < 0) vshr.s16 q15, q2, #15 // -(sum < 0) vadd.i16 q1, q1, q14 // sum - (sum < 0) vadd.i16 q2, q2, q15 // sum - (sum < 0) vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 vrshr.s16 q2, q2, #4 // (8 + sum - (sum < 0)) >> 4 vaddw.u8 q1, q1, d0 // px + (8 + sum ...) >> 4 vaddw.u8 q2, q2, d1 // px + (8 + sum ...) 
>> 4 vqmovun.s16 d0, q1 vqmovun.s16 d1, q2 .if \min vmin.u8 q0, q0, q4 vmax.u8 q0, q0, q3 // iclip(px + .., min, max) .endif .if \w == 8 vst1.8 {d0}, [r0, :64], r1 add r2, r2, #2*16 // tmp += 2*tmp_stride subs r7, r7, #2 // h -= 2 vst1.8 {d1}, [r0, :64], r1 .else vst1.32 {d0[0]}, [r0, :32], r1 add r2, r2, #4*8 // tmp += 4*tmp_stride vst1.32 {d0[1]}, [r0, :32], r1 subs r7, r7, #4 // h -= 4 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 .endif // Reset pri_taps and directions back to the original point sub r5, r5, #2 .if \pri sub r8, r8, #2 .endif bgt 1b vpop {q4-q7} pop {r4-r9,pc} endfunc .endm .macro filter_8 w filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec .endm filter_8 8 filter_8 4 rav1e-0.7.1/src/arm/32/cdef16.S000064400000000000000000000175071046102023000136470ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" // r1 = d0/q0 // r2 = d2/q1 .macro pad_top_bot_16 s1, s2, w, stride, r1, r2, align, ret tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vldr s8, [\s1, #-4] vld1.16 {\r1}, [\s1, :\align] vldr s9, [\s1, #2*\w] vldr s10, [\s2, #-4] vld1.16 {\r2}, [\s2, :\align] vldr s11, [\s2, #2*\w] vstr s8, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s9, [r0, #2*\w] add r0, r0, #2*\stride vstr s10, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s11, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vldr s8, [\s1, #-4] vld1.16 {\r1}, [\s1, :\align] vldr s9, [\s2, #-4] vld1.16 {\r2}, [\s2, :\align] vstr s8, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s9, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT vld1.16 {\r1}, [\s1, :\align] vldr s8, [\s1, #2*\w] vld1.16 {\r2}, [\s2, :\align] vldr s9, [\s2, #2*\w] vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s8, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s9, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {\r1}, [\s1, :\align] vld1.16 {\r2}, [\s2, :\align] vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride vstr s12, [r0, #-4] vst1.16 {\r2}, [r0, :\align] vstr s12, [r0, #2*\w] .if \ret pop {r4-r8,pc} .else add r0, r0, #2*\stride .endif 3: .endm // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); // r1 = d0/q0 // r2 = d2/q1 .macro padding_func_16 w, stride, r1, r2, align function cdef_padding\w\()_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] vmov.i16 q3, #0x8000 tst r7, #4 // CDEF_HAVE_TOP bne 1f // !CDEF_HAVE_TOP sub r12, r0, #2*(2*\stride+2) vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif b 3f 1: // CDEF_HAVE_TOP add r8, r4, r2 sub r0, r0, #2*(2*\stride) pad_top_bot_16 r4, r8, \w, \stride, \r1, \r2, \align, 0 // Middle section 3: tst r7, #1 // CDEF_HAVE_LEFT beq 2f // CDEF_HAVE_LEFT tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vld1.32 {d2[]}, [r3, :32]! vldr s5, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s5, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.32 {d2[]}, [r3, :32]! 
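// (Editorial note, not part of the original dav1d assembly.) This branch copies the
// middle rows when the left column is available but the right one is not: the two
// left pixels just loaded into d2 are written via s4, the w-pixel row follows, and
// the missing right pair is filled from s12, the low word of q3, which was set to
// the 0x8000 sentinel on entry. The filter tracks its clamping range with
// unsigned-min/signed-max, so the sentinel can never become either bound.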
vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s4, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b b 3f 2: tst r7, #2 // CDEF_HAVE_RIGHT beq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: vldr s4, [r1, #2*\w] vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s4, [r0, #2*\w] add r0, r0, #2*\stride bgt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT vld1.16 {\r1}, [r1, :\align], r2 subs r6, r6, #1 vstr s12, [r0, #-4] vst1.16 {\r1}, [r0, :\align] vstr s12, [r0, #2*\w] add r0, r0, #2*\stride bgt 1b 3: tst r7, #8 // CDEF_HAVE_BOTTOM bne 1f // !CDEF_HAVE_BOTTOM sub r12, r0, #4 vmov.i16 q2, #0x8000 vst1.16 {q2,q3}, [r12]! .if \w == 8 vst1.16 {q2,q3}, [r12]! .endif pop {r4-r8,pc} 1: // CDEF_HAVE_BOTTOM add r8, r5, r2 pad_top_bot_16 r5, r8, \w, \stride, \r1, \r2, \align, 1 endfunc .endm padding_func_16 8, 16, q0, q1, 128 padding_func_16 4, 8, d0, d2, 64 tables filter 8, 16 filter 4, 16 find_dir 16 rav1e-0.7.1/src/arm/32/cdef_tmpl.S000064400000000000000000000455671046102023000145430ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro dir_table w, stride const directions\w .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 .byte 1 * \stride + 0, 2 * \stride + 0 .byte 1 * \stride + 0, 2 * \stride - 1 // Repeated, to avoid & 7 .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 endconst .endm .macro tables dir_table 8, 16 dir_table 4, 8 const pri_taps .byte 4, 2, 3, 3 endconst .endm .macro load_px d11, d12, d21, d22, w .if \w == 8 add r6, r2, r9, lsl #1 // x + off sub r9, r2, r9, lsl #1 // x - off vld1.16 {\d11,\d12}, [r6] // p0 vld1.16 {\d21,\d22}, [r9] // p1 .else add r6, r2, r9, lsl #1 // x + off sub r9, r2, r9, lsl #1 // x - off vld1.16 {\d11}, [r6] // p0 add r6, r6, #2*8 // += stride vld1.16 {\d21}, [r9] // p1 add r9, r9, #2*8 // += stride vld1.16 {\d12}, [r6] // p0 vld1.16 {\d22}, [r9] // p1 .endif .endm .macro handle_pixel s1, s2, thresh_vec, shift, tap, min .if \min vmin.u16 q2, q2, \s1 vmax.s16 q3, q3, \s1 vmin.u16 q2, q2, \s2 vmax.s16 q3, q3, \s2 .endif vabd.u16 q8, q0, \s1 // abs(diff) vabd.u16 q11, q0, \s2 // abs(diff) vshl.u16 q9, q8, \shift // abs(diff) >> shift vshl.u16 q12, q11, \shift // abs(diff) >> shift vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift)) vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift)) vsub.i16 q10, \s1, q0 // diff = p0 - px vsub.i16 q13, \s2, q0 // diff = p1 - px vneg.s16 q8, q9 // -clip vneg.s16 q11, q12 // -clip vmin.s16 q10, q10, q9 // imin(diff, clip) vmin.s16 q13, q13, q12 // imin(diff, clip) vdup.16 q9, \tap // taps[k] vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip) vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip) vmla.i16 q1, q10, q9 // sum += taps[k] * constrain() vmla.i16 q1, q13, q9 // sum += taps[k] * constrain() .endm // void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func w, bpc, pri, sec, min, suffix function cdef_filter\w\suffix\()_\bpc\()bpc_neon .if \bpc == 8 cmp r8, #0xf beq cdef_filter\w\suffix\()_edged_neon .endif .if \pri .if \bpc == 16 clz r9, r9 sub r9, r9, #24 // -bitdepth_min_8 neg r9, r9 // bitdepth_min_8 .endif movrel_local r8, pri_taps .if \bpc == 16 lsr r9, r3, r9 // pri_strength >> bitdepth_min_8 and r9, r9, #1 // (pri_strength >> bitdepth_min_8) & 1 .else and r9, r3, #1 .endif add r8, r8, r9, lsl #1 .endif movrel_local r9, directions\w add r5, r9, r5, lsl #1 vmov.u16 d17, #15 vdup.16 d16, r6 // damping .if \pri vdup.16 q5, r3 // threshold .endif .if \sec vdup.16 q7, r4 // threshold .endif vmov.16 d8[0], r3 vmov.16 d8[1], r4 vclz.i16 d8, d8 // clz(threshold) vsub.i16 d8, d17, d8 // ulog2(threshold) vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold)) vneg.s16 d8, d8 // -shift .if \sec vdup.16 q6, d8[1] .endif .if \pri vdup.16 q4, d8[0] .endif 1: .if \w == 8 vld1.16 {q0}, [r2, :128] // px .else add r12, r2, #2*8 vld1.16 {d0}, [r2, :64] // px vld1.16 {d1}, [r12, :64] // px .endif vmov.u16 q1, #0 // sum .if \min vmov.u16 q2, q0 // min vmov.u16 q3, q0 // max .endif // Instead of 
loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov lr, #2 // sec_taps[0] 2: .if \pri ldrsb r9, [r5] // off1 load_px d28, d29, d30, d31, \w .endif .if \sec add r5, r5, #4 // +2*2 ldrsb r9, [r5] // off2 .endif .if \pri ldrb r12, [r8] // *pri_taps handle_pixel q14, q15, q5, q4, r12, \min .endif .if \sec load_px d28, d29, d30, d31, \w add r5, r5, #8 // +2*4 ldrsb r9, [r5] // off3 handle_pixel q14, q15, q7, q6, lr, \min load_px d28, d29, d30, d31, \w handle_pixel q14, q15, q7, q6, lr, \min sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1; .else add r5, r5, #1 // r5 += 1 .endif subs lr, lr, #1 // sec_tap-- (value) .if \pri add r8, r8, #1 // pri_taps++ (pointer) .endif bne 2b vshr.s16 q14, q1, #15 // -(sum < 0) vadd.i16 q1, q1, q14 // sum - (sum < 0) vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4 vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4 .if \min vmin.s16 q0, q0, q3 vmax.s16 q0, q0, q2 // iclip(px + .., min, max) .endif .if \bpc == 8 vmovn.u16 d0, q0 .endif .if \w == 8 add r2, r2, #2*16 // tmp += tmp_stride subs r7, r7, #1 // h-- .if \bpc == 8 vst1.8 {d0}, [r0, :64], r1 .else vst1.16 {q0}, [r0, :128], r1 .endif .else .if \bpc == 8 vst1.32 {d0[0]}, [r0, :32], r1 .else vst1.16 {d0}, [r0, :64], r1 .endif add r2, r2, #2*16 // tmp += 2*tmp_stride subs r7, r7, #2 // h -= 2 .if \bpc == 8 vst1.32 {d0[1]}, [r0, :32], r1 .else vst1.16 {d1}, [r0, :64], r1 .endif .endif // Reset pri_taps and directions back to the original point sub r5, r5, #2 .if \pri sub r8, r8, #2 .endif bgt 1b vpop {q4-q7} pop {r4-r9,pc} endfunc .endm .macro filter w, bpc filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec function cdef_filter\w\()_\bpc\()bpc_neon, export=1 push {r4-r9,lr} vpush {q4-q7} ldrd r4, r5, [sp, #92] ldrd r6, r7, [sp, #100] .if \bpc == 16 ldrd r8, r9, [sp, #108] .else ldr r8, [sp, #108] .endif cmp r3, #0 // pri_strength bne 1f b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec 1: cmp r4, #0 // sec_strength bne 1f b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri 1: b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec endfunc .endm const div_table, align=4 .short 840, 420, 280, 210, 168, 140, 120, 105 endconst const alt_fact, align=4 .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 endconst .macro cost_alt dest, s1, s2, s3, s4, s5, s6 vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n] vmull.s16 q2, \s2, \s2 vmull.s16 q3, \s3, \s3 vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n] vmull.s16 q12, \s5, \s5 vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact vmla.i32 q1, q2, q14 vmla.i32 q1, q3, q15 vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact vmla.i32 q5, q12, q14 vmla.i32 q5, q6, q15 vadd.i32 d2, d2, d3 vadd.i32 d3, d10, d11 vpadd.i32 \dest, d2, d3 // *cost_ptr .endm .macro find_best s1, s2, s3 .ifnb \s2 vmov.32 lr, \s2 .endif cmp r12, r1 // cost[n] > best_cost itt gt movgt r0, r3 // best_dir = n movgt r1, r12 // best_cost = cost[n] .ifnb \s2 add r3, r3, #1 // n++ cmp lr, r1 // cost[n] > best_cost vmov.32 r12, \s3 itt gt movgt r0, r3 // best_dir = n movgt r1, lr // best_cost = cost[n] add r3, r3, #1 // n++ .endif .endm // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc function cdef_find_dir_\bpc\()bpc_neon, export=1 push {lr} vpush {q4-q7} .if \bpc == 16 clz r3, r3 // 
clz(bitdepth_max) sub lr, r3, #24 // -bitdepth_min_8 .endif sub sp, sp, #32 // cost mov r3, #8 vmov.u16 q1, #0 // q0-q1 sum_diag[0] vmov.u16 q3, #0 // q2-q3 sum_diag[1] vmov.u16 q5, #0 // q4-q5 sum_hv[0-1] vmov.u16 q8, #0 // q6,d16 sum_alt[0] // q7,d17 sum_alt[1] vmov.u16 q9, #0 // q9,d22 sum_alt[2] vmov.u16 q11, #0 vmov.u16 q10, #0 // q10,d23 sum_alt[3] .irpc i, 01234567 .if \bpc == 8 vld1.8 {d30}, [r0, :64], r1 vmov.u8 d31, #128 vsubl.u8 q15, d30, d31 // img[x] - 128 .else vld1.16 {q15}, [r0, :128], r1 vdup.16 q14, lr // -bitdepth_min_8 vshl.u16 q15, q15, q14 vmov.u16 q14, #128 vsub.i16 q15, q15, q14 // img[x] - 128 .endif vmov.u16 q14, #0 .if \i == 0 vmov q0, q15 // sum_diag[0] .else vext.8 q12, q14, q15, #(16-2*\i) vext.8 q13, q15, q14, #(16-2*\i) vadd.i16 q0, q0, q12 // sum_diag[0] vadd.i16 q1, q1, q13 // sum_diag[0] .endif vrev64.16 q13, q15 vswp d26, d27 // [-x] .if \i == 0 vmov q2, q13 // sum_diag[1] .else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q13, q13, q14, #(16-2*\i) vadd.i16 q2, q2, q12 // sum_diag[1] vadd.i16 q3, q3, q13 // sum_diag[1] .endif vpadd.u16 d26, d30, d31 // [(x >> 1)] vmov.u16 d27, #0 vpadd.u16 d24, d26, d28 vpadd.u16 d24, d24, d28 // [y] vmov.u16 r12, d24[0] vadd.i16 q5, q5, q15 // sum_hv[1] .if \i < 4 vmov.16 d8[\i], r12 // sum_hv[0] .else vmov.16 d9[\i-4], r12 // sum_hv[0] .endif .if \i == 0 vmov.u16 q6, q13 // sum_alt[0] .else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q14, q13, q14, #(16-2*\i) vadd.i16 q6, q6, q12 // sum_alt[0] vadd.i16 d16, d16, d28 // sum_alt[0] .endif vrev64.16 d26, d26 // [-(x >> 1)] vmov.u16 q14, #0 .if \i == 0 vmov q7, q13 // sum_alt[1] .else vext.8 q12, q14, q13, #(16-2*\i) vext.8 q13, q13, q14, #(16-2*\i) vadd.i16 q7, q7, q12 // sum_alt[1] vadd.i16 d17, d17, d26 // sum_alt[1] .endif .if \i < 6 vext.8 q12, q14, q15, #(16-2*(3-(\i/2))) vext.8 q13, q15, q14, #(16-2*(3-(\i/2))) vadd.i16 q9, q9, q12 // sum_alt[2] vadd.i16 d22, d22, d26 // sum_alt[2] .else vadd.i16 q9, q9, q15 // sum_alt[2] .endif .if \i == 0 vmov q10, q15 // sum_alt[3] .elseif \i == 1 vadd.i16 q10, q10, q15 // sum_alt[3] .else vext.8 q12, q14, q15, #(16-2*(\i/2)) vext.8 q13, q15, q14, #(16-2*(\i/2)) vadd.i16 q10, q10, q12 // sum_alt[3] vadd.i16 d23, d23, d26 // sum_alt[3] .endif .endr vmov.u32 q15, #105 vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0] vmlal.s16 q12, d9, d9 vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1] vmlal.s16 q13, d11, d11 vadd.s32 d8, d24, d25 vadd.s32 d9, d26, d27 vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17) vmul.i32 d8, d8, d30 // cost[2,6] *= 105 vrev64.16 q1, q1 vrev64.16 q3, q3 vext.8 q1, q1, q1, #10 // sum_diag[0][14-n] vext.8 q3, q3, q3, #10 // sum_diag[1][14-n] vstr s16, [sp, #2*4] // cost[2] vstr s17, [sp, #6*4] // cost[6] movrel_local r12, div_table vld1.16 {q14}, [r12, :128] vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0] vmull.s16 q12, d1, d1 vmlal.s16 q5, d2, d2 vmlal.s16 q12, d3, d3 vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1] vmull.s16 q1, d5, d5 vmlal.s16 q0, d6, d6 vmlal.s16 q1, d7, d7 vmovl.u16 q13, d28 // div_table vmovl.u16 q14, d29 vmul.i32 q5, q5, q13 // cost[0] vmla.i32 q5, q12, q14 vmul.i32 q0, q0, q13 // cost[4] vmla.i32 q0, q1, q14 vadd.i32 d10, d10, d11 vadd.i32 d0, d0, d1 vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1 movrel_local r12, alt_fact vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105 vstr s0, [sp, #0*4] // cost[0] vstr s1, [sp, #4*4] // cost[4] vmovl.u16 q13, d29 // div_table[2*m+1] + 105 vmovl.u16 q14, d30 vmovl.u16 q15, d31 cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3] cost_alt d15, d18, d19, 
d22, d20, d21, d23 // cost[5], cost[7] vstr s28, [sp, #1*4] // cost[1] vstr s29, [sp, #3*4] // cost[3] mov r0, #0 // best_dir vmov.32 r1, d0[0] // best_cost mov r3, #1 // n vstr s30, [sp, #5*4] // cost[5] vstr s31, [sp, #7*4] // cost[7] vmov.32 r12, d14[0] find_best d14[0], d8[0], d14[1] find_best d14[1], d0[1], d15[0] find_best d15[0], d8[1], d15[1] find_best d15[1] eor r3, r0, #4 // best_dir ^4 ldr r12, [sp, r3, lsl #2] sub r1, r1, r12 // best_cost - cost[best_dir ^ 4] lsr r1, r1, #10 str r1, [r2] // *var add sp, sp, #32 vpop {q4-q7} pop {pc} endfunc .endm rav1e-0.7.1/src/arm/32/ipred.S000064400000000000000000003376071046102023000137100ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * Copyright © 2019, B Krishnan Iyer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 push {r4, lr} ldr r4, [sp, #8] clz r3, r3 adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 ldr r3, [r2, r3, lsl #2] vmov.i8 q0, #128 add r2, r2, r3 add r12, r0, r1 lsl r1, r1, #1 bx r2 .align 2 L(ipred_dc_128_tbl): .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4, pc} 8: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4, pc} 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vmov.i8 q1, #128 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vmov.i8 q1, #128 sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 push {r4, lr} ldr lr, [sp, #8] clz r3, r3 adr r4, L(ipred_v_tbl) sub r3, r3, #25 ldr r3, [r4, r3, lsl #2] add r2, r2, #1 add r4, r4, r3 add r12, r0, r1 lsl r1, r1, #1 bx r4 .align 2 L(ipred_v_tbl): .word 640f - L(ipred_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_v_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r2] 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs lr, lr, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4, pc} 80: vld1.8 {d0}, [r2] 8: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs lr, lr, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4, pc} 160: vld1.8 {q0}, [r2] 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vld1.8 {q0, q1}, [r2] 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vld1.8 {q0, q1}, [r2]! sub r1, r1, #32 vld1.8 {q2, q3}, [r2] 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 subs lr, lr, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! 
vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d4, d5, d6, d7}, [r0, :128], r1 vst1.8 {d4, d5, d6, d7}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_h_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] sub r2, r2, #4 mov lr, #-4 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_h_tbl): .word 640f - L(ipred_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_h_tbl) + CONFIG_THUMB .word 8f - L(ipred_h_tbl) + CONFIG_THUMB .word 4f - L(ipred_h_tbl) + CONFIG_THUMB 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.32 {d3[0]}, [r0, :32], r1 vst1.32 {d2[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4-r5, pc} 8: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr vst1.8 {d3}, [r0, :64], r1 vst1.8 {d2}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d1}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4-r5, pc} 160: add r2, r2, #3 mov lr, #-1 16: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, :128], r1 vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128], r1 vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: add r2, r2, #3 mov lr, #-1 sub r1, r1, #16 32: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, :128]! vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128]! vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q0}, [r0, :128], r1 vst1.8 {q1}, [r12, :128], r1 vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: add r2, r2, #3 mov lr, #-1 sub r1, r1, #48 64: vld1.8 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.8 {d2[], d3[]}, [r2], lr vst1.8 {q0}, [r0, :128]! vld1.8 {d4[], d5[]}, [r2], lr vst1.8 {q1}, [r12, :128]! vld1.8 {d6[], d7[]}, [r2], lr vst1.8 {q0}, [r0, :128]! vst1.8 {q1}, [r12, :128]! vst1.8 {q0}, [r0, :128]! vst1.8 {q1}, [r12, :128]! vst1.8 {q0}, [r0, :128], r1 vst1.8 {q1}, [r12, :128], r1 vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! vst1.8 {q2}, [r0, :128]! vst1.8 {q3}, [r12, :128]! 
vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_dc_top_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] add r2, r2, #1 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_top_tbl): .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 d0, d0[0] 4: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 4b pop {r4-r5, pc} 80: vld1.8 {d0}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #3 vdup.8 d0, d0[0] 8: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 8b pop {r4-r5, pc} 160: vld1.8 {d0, d1}, [r2] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] 16: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: vld1.8 {d0, d1, d2, d3}, [r2] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d4, q0, #5 vdup.8 q0, d4[0] vdup.8 q1, d4[0] 32: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: vld1.8 {d0, d1, d2, d3}, [r2]! vaddl.u8 q0, d0, d1 vld1.8 {d4, d5, d6, d7}, [r2] vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d18, q0, #6 vdup.8 q0, d18[0] vdup.8 q1, d18[0] sub r1, r1, #32 64: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] sub r2, r2, r4 clz r3, r3 clz lr, r4 sub lr, lr, #25 adr r5, L(ipred_dc_left_tbl) sub r3, r3, #20 ldr r3, [r5, r3, lsl #2] ldr lr, [r5, lr, lsl #2] add r3, r5, r3 add r5, r5, lr add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_left_tbl): .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB L(ipred_dc_left_h4): vld1.32 {d0[]}, [r2, :32] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #2 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w4): vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt L(ipred_dc_left_w4) pop {r4-r5, pc} L(ipred_dc_left_h8): vld1.8 {d0}, [r2, :64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #3 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w8): vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt L(ipred_dc_left_w8) pop {r4-r5, pc} L(ipred_dc_left_h16): vld1.8 {d0, d1}, [r2, :128] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #4 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w16): vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt L(ipred_dc_left_w16) pop {r4-r5, pc} L(ipred_dc_left_h32): vld1.8 {d0, d1, d2, d3}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #5 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w32): vmov.8 q1, q0 1: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} L(ipred_dc_left_h64): vld1.8 {d0, d1, d2, d3}, [r2, :128]! vld1.8 {d4, d5, d6, d7}, [r2, :128] vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshrn.u16 d0, q0, #6 vdup.8 q0, d0[0] bx r3 L(ipred_dc_left_w64): vmov.8 q1, q0 sub r1, r1, #32 1: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! 
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} endfunc // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_8bpc_neon, export=1 push {r4-r6, lr} ldr r4, [sp, #16] sub r2, r2, r4 add lr, r3, r4 // width + height clz r3, r3 clz r12, r4 vdup.16 q15, lr // width + height adr r5, L(ipred_dc_tbl) rbit lr, lr // rbit(width + height) sub r3, r3, #20 // 25 leading bits, minus table offset 5 sub r12, r12, #25 clz lr, lr // ctz(width + height) ldr r3, [r5, r3, lsl #2] ldr r12, [r5, r12, lsl #2] neg lr, lr // -ctz(width + height) add r3, r5, r3 add r5, r5, r12 vshr.u16 q15, q15, #1 // (width + height) >> 1 vdup.16 q14, lr // -ctz(width + height) add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_tbl): .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB L(ipred_dc_h4): vld1.32 {d0[]}, [r2, :32]! vpaddl.u8 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w4): vld1.32 {d1[]}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d1, d1 vpadd.u16 d1, d1 cmp r4, #4 vadd.s16 d0, d0, d1 vshl.u16 d0, d0, d28 beq 1f // h = 8/16 movw lr, #(0x3334/2) movw r5, #(0x5556/2) cmp r4, #16 it ne movne lr, r5 vdup.16 d30, lr vqdmulh.s16 d0, d0, d30 1: vdup.8 d0, d0[0] 2: vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 subs r4, r4, #4 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[0]}, [r12, :32], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w8): vld1.8 {d2}, [r2] vadd.s16 d0, d0, d30 vpaddl.u8 d2, d2 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #8 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 d0, d0[0] 2: vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d0}, [r12, :64], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h16): vld1.8 {d0, d1}, [r2, :128]! vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w16): vld1.8 {d2, d3}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 vadd.u16 d2, d2, d3 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #16 vadd.s16 d0, d0, d2 vshl.u16 d0, d0, d28 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d0, d0, d24 1: vdup.8 q0, d0[0] 2: vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1}, [r0, :128], r1 vst1.8 {d0, d1}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h32): vld1.8 {d0, d1, d2, d3}, [r2, :128]! 
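// (Editorial note, not part of the original dav1d assembly.) In ipred_dc the
// L(ipred_dc_h*) blocks sum the left column and the L(ipred_dc_w*) blocks add the
// top-row sum, giving dc = (sum + ((w+h)>>1)) / (w+h): q15 holds the rounding bias
// (w+h)>>1 and q14 holds -ctz(w+h), so the divide is a plain shift whenever w+h is
// a power of two; for rectangular blocks the leftover factor of 3 or 5 is applied
// with a vqdmulh by the constant 0x5556/2 or 0x3334/2, i.e. a fixed-point multiply
// by roughly 1/3 or 1/5.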
vaddl.u8 q0, d0, d1 vaddl.u8 q1, d2, d3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w32): vld1.8 {d2, d3, d4, d5}, [r2] vadd.s16 d0, d0, d30 vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vadd.u16 q1, q1, q2 vadd.u16 d2, d2, d3 vpadd.u16 d2, d2 vpadd.u16 d2, d2 cmp r4, #32 vadd.s16 d0, d0, d2 vshl.u16 d4, d0, d28 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #(0x3334/2) movw r5, #(0x5556/2) it ne movne lr, r5 vdup.16 d24, lr vqdmulh.s16 d4, d4, d24 1: vdup.8 q0, d4[0] vdup.8 q1, d4[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h64): vld1.8 {d0, d1, d2, d3}, [r2, :128]! vaddl.u8 q0, d0, d1 vld1.8 {d4, d5, d6, d7}, [r2, :128]! vaddl.u8 q1, d2, d3 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q0, q1 vadd.u16 q1, q2, q3 vadd.u16 q0, q0, q1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 add r2, r2, #1 vpadd.u16 d0, d0 bx r3 L(ipred_dc_w64): vld1.8 {d2, d3, d4, d5}, [r2]! vadd.s16 d0, d0, d30 vaddl.u8 q2, d4, d5 vaddl.u8 q1, d2, d3 vadd.u16 d4, d4, d5 vadd.u16 d2, d2, d3 vld1.8 {d16, d17, d18, d19}, [r2] vpadd.u16 d4, d4 vpadd.u16 d2, d2 vpadd.u16 d4, d4 vpadd.u16 d2, d2 vaddl.u8 q8, d16, d17 vaddl.u8 q9, d18, d19 vadd.u16 d16, d16, d17 vadd.u16 d18, d18, d19 vpadd.u16 d16, d16 vpadd.u16 d18, d18 vpadd.u16 d16, d16 vpadd.u16 d18, d18 vadd.u16 d2, d2, d4 vadd.u16 d3, d16, d18 cmp r4, #64 vadd.s16 d0, d0, d2 vadd.s16 d0, d0, d3 vshl.u16 d18, d0, d28 beq 1f // h = 16/32 movw lr, #(0x5556/2) movt lr, #(0x3334/2) and r5, r4, #31 lsr lr, lr, r5 vdup.16 d30, lr vqdmulh.s16 d18, d18, d30 1: sub r1, r1, #32 vdup.8 q0, d18[0] vdup.8 q1, d18[0] 2: vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.8 {d0, d1, d2, d3}, [r0, :128]! vst1.8 {d0, d1, d2, d3}, [r12, :128]! vst1.8 {d0, d1, d2, d3}, [r0, :128], r1 vst1.8 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} endfunc // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] clz lr, r3 adr r5, L(ipred_paeth_tbl) sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[], d5[]}, [r2] add r8, r2, #1 sub r2, r2, #4 add r5, r5, lr mov r7, #-4 add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_paeth_tbl): .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB 40: vld1.32 {d6[], d7[]}, [r8] vsubl.u8 q8, d6, d4 // top - topleft 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 vzip.32 d0, d1 vzip.32 d2, d3 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d2 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vmov d1, d2 vabd.u8 q10, q3, q9 // tdiff vabd.u8 q11, q2, q9 // tldiff vabd.u8 q9, q0, q9 // ldiff vmin.u8 q12, q10, q11 // min(tdiff, tldiff) vcge.u8 q10, q11, q10 // tldiff >= tdiff vcge.u8 q9, q12, q9 // min(tdiff, tldiff) >= ldiff vbsl q10, q3, q2 // tdiff <= tldiff ? top : topleft vbit q10, q0, q9 // ldiff <= min ? left : ... 
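// (Editorial note, not part of the original dav1d assembly.) Scalar sketch of the
// selection just performed:
//   base = left + top - topleft;
//   if (abs(left - base) <= min(abs(top - base), abs(topleft - base))) out = left;
//   else if (abs(top - base) <= abs(topleft - base))                   out = top;
//   else                                                               out = topleft;
// The vabd/vcge/vbsl/vbit sequence above evaluates this for four rows at once, with
// the chosen pixels ending up in q10 for the stores below.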
vst1.32 {d21[1]}, [r0, :32], r1 vst1.32 {d21[0]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d20[1]}, [r0, :32], r1 vst1.32 {d20[0]}, [r6, :32], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8] vsubl.u8 q8, d6, d4 // top - topleft vmov d7, d6 8: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d1 vaddw.u8 q11, q8, d2 vaddw.u8 q12, q8, d3 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vqmovun.s16 d20, q11 vqmovun.s16 d21, q12 vabd.u8 q11, q3, q9 // tdiff vabd.u8 q12, q3, q10 vabd.u8 q13, q2, q9 // tldiff vabd.u8 q14, q2, q10 vabd.u8 q10, q1, q10 // ldiff vabd.u8 q9, q0, q9 vmin.u8 q15, q12, q14 // min(tdiff, tldiff) vcge.u8 q12, q14, q12 // tldiff >= tdiff vmin.u8 q14, q11, q13 // min(tdiff, tldiff) vcge.u8 q11, q13, q11 // tldiff >= tdiff vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff vcge.u8 q9, q14, q9 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 vst1.8 {d25}, [r0, :64], r1 vst1.8 {d24}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d23}, [r0, :64], r1 vst1.8 {d22}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vld1.8 {d6}, [r8]! mov r12, r3 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 1: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 2: vsubl.u8 q8, d6, d4 // top - topleft vmov d7, d6 vaddw.u8 q9, q8, d0 vaddw.u8 q10, q8, d1 vaddw.u8 q11, q8, d2 vaddw.u8 q12, q8, d3 vqmovun.s16 d18, q9 // base vqmovun.s16 d19, q10 vqmovun.s16 d20, q11 vqmovun.s16 d21, q12 vabd.u8 q11, q3, q9 // tdiff vabd.u8 q12, q3, q10 vabd.u8 q13, q2, q9 // tldiff vabd.u8 q14, q2, q10 vabd.u8 q10, q1, q10 // ldiff vabd.u8 q9, q0, q9 vmin.u8 q15, q12, q14 // min(tdiff, tldiff) vcge.u8 q12, q14, q12 // tldiff >= tdiff vmin.u8 q14, q11, q13 // min(tdiff, tldiff) vcge.u8 q11, q13, q11 // tldiff >= tdiff vcge.u8 q10, q15, q10 // min(tdiff, tldiff) >= ldiff vcge.u8 q9, q14, q9 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 subs r3, r3, #8 vst1.8 {d25}, [r0, :64]! vst1.8 {d24}, [r6, :64]! vst1.8 {d23}, [r5, :64]! vst1.8 {d22}, [lr, :64]! ble 8f vld1.8 {d6}, [r8]! b 2b 8: subs r4, r4, #4 ble 9f // End of horizontal loop, move pointers to next four rows sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 vld1.8 {d6}, [r8]! 
add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: pop {r4-r8, pc} endfunc // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_8bpc_neon, export=1 push {r4-r10, lr} ldr r4, [sp, #32] movrel r10, X(sm_weights) add r12, r10, r4 add r10, r10, r3 clz r9, r3 adr r5, L(ipred_smooth_tbl) sub lr, r2, r4 sub r9, r9, #25 ldr r9, [r5, r9, lsl #2] vld1.8 {d4[]}, [lr] // bottom add r8, r2, #1 add r5, r5, r9 add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_tbl): .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB 40: vld1.32 {d16[]}, [r8] // top vld1.32 {d18[]}, [r10, :32] // weights_hor sub r2, r2, #4 mov r7, #-4 vdup.8 q3, d16[3] // right vsubl.u8 q8, d16, d4 // top-bottom vmovl.u8 q9, d18 // weights_hor 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vzip.32 d1, d0 // left, flipped vzip.32 d3, d2 vzip.32 d20, d21 // weights_ver vzip.32 d22, d23 vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q0, d1, d6 // left-right vsubl.u8 q1, d3, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmla.i16 q12, q1, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q0, q9 // (left flipped) vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vst1.32 {d24[0]}, [r0, :32], r1 vst1.32 {d24[1]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d25[0]}, [r0, :32], r1 vst1.32 {d25[1]}, [r6, :32], r1 bgt 4b pop {r4-r10, pc} 80: vld1.8 {d16}, [r8] // top vld1.8 {d18}, [r10, :64] // weights_hor sub r2, r2, #2 mov r7, #-2 vdup.8 q3, d16[7] // right vsubl.u8 q8, d16, d4 // top-bottom vmovl.u8 q9, d18 // weights_hor 8: vld2.8 {d0[], d1[]}, [r2, :16], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q1, d0, d6 // left-right (left flipped) vsubl.u8 q0, d1, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q1, q9 vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 subs r4, r4, #2 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 bgt 8b pop {r4-r10, pc} 160: 320: 640: add lr, r2, r3 sub r2, r2, #2 mov r7, #-2 vld1.8 {d6[], d7[]}, [lr] // right sub r1, r1, r3 mov r9, r3 1: vld2.8 {d0[], d1[]}, [r2, :16], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vsubl.u8 q1, d0, d6 // left-right (left flipped) vsubl.u8 q0, d1, d6 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 2: vld1.8 {d16}, [r8]! // top vld1.8 {d18}, [r10, :64]! 
// weights_hor vshll.i8 q12, d6, #8 // right*256 vshll.i8 q13, d6, #8 vmovl.u8 q9, d18 // weights_hor vshll.i8 q14, d4, #8 // bottom*256 vshll.i8 q15, d4, #8 vsubl.u8 q8, d16, d4 // top-bottom vmla.i16 q12, q0, q9 // right*256 + (left-right)*weights_hor vmla.i16 q13, q1, q9 vmla.i16 q14, q8, q10 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q15, q8, q11 vhadd.u16 q12, q12, q14 vhadd.u16 q13, q13, q15 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 subs r3, r3, #8 vst1.8 {d24}, [r0, :64]! vst1.8 {d25}, [r6, :64]! bgt 2b subs r4, r4, #2 ble 9f sub r8, r8, r9 sub r10, r10, r9 add r0, r0, r1 add r6, r6, r1 mov r3, r9 b 1b 9: pop {r4-r10, pc} endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_8bpc_neon, export=1 push {r4-r7, lr} ldr r4, [sp, #20] movrel r7, X(sm_weights) add r7, r7, r4 clz lr, r3 adr r5, L(ipred_smooth_v_tbl) sub r12, r2, r4 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[]}, [r12] // bottom add r2, r2, #1 add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_v_tbl): .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r2] // top vsubl.u8 q3, d6, d4 // top-bottom 4: vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver vshll.i8 q10, d4, #8 // bottom*256 vshll.i8 q11, d4, #8 vzip.32 d16, d17 // weights_ver vzip.32 d18, d19 vmovl.u8 q8, d16 // weights_ver vmovl.u8 q9, d18 subs r4, r4, #4 vmla.i16 q10, q3, q8 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q11, q3, q9 vrshrn.i16 d20, q10, #8 vrshrn.i16 d21, q11, #8 vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r6, :32], r1 vst1.32 {d21[0]}, [r0, :32], r1 vst1.32 {d21[1]}, [r6, :32], r1 bgt 4b pop {r4-r7, pc} 80: vld1.8 {d6}, [r2] // top vsubl.u8 q3, d6, d4 // top-bottom 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver vshll.i8 q12, d4, #8 // bottom*256 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmovl.u8 q8, d16 // weights_ver vmovl.u8 q9, d18 vmovl.u8 q10, d20 vmovl.u8 q11, d22 vmla.i16 q12, q3, q8 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q13, q3, q9 vmla.i16 q14, q3, q10 vmla.i16 q15, q3, q11 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vrshrn.i16 d26, q14, #8 vrshrn.i16 d27, q15, #8 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d26}, [r0, :64], r1 vst1.8 {d27}, [r6, :64], r1 bgt 8b pop {r4-r7, pc} 160: 320: 640: vpush {q4-q7} // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver vmovl.u8 q4, d8 // weights_ver vmovl.u8 q5, d10 vmovl.u8 q6, d12 vmovl.u8 q7, d14 2: vld1.8 {q3}, [r2]! 
// top vshll.i8 q8, d4, #8 // bottom*256 vshll.i8 q9, d4, #8 vshll.i8 q10, d4, #8 vshll.i8 q11, d4, #8 vsubl.u8 q0, d6, d4 // top-bottom vsubl.u8 q1, d7, d4 vshll.i8 q12, d4, #8 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmla.i16 q8, q0, q4 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q9, q1, q4 vmla.i16 q10, q0, q5 vmla.i16 q11, q1, q5 vmla.i16 q12, q0, q6 // bottom*256 + (top-bottom)*weights_ver vmla.i16 q13, q1, q6 vmla.i16 q14, q0, q7 vmla.i16 q15, q1, q7 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vrshrn.i16 d18, q10, #8 vrshrn.i16 d19, q11, #8 vrshrn.i16 d20, q12, #8 vrshrn.i16 d21, q13, #8 vrshrn.i16 d22, q14, #8 vrshrn.i16 d23, q15, #8 subs r3, r3, #16 vst1.8 {q8}, [r0, :128]! vst1.8 {q9}, [r6, :128]! vst1.8 {q10}, [r5, :128]! vst1.8 {q11}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r2, r2, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r7, pc} endfunc // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_8bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] movrel r8, X(sm_weights) add r8, r8, r3 clz lr, r3 adr r5, L(ipred_smooth_h_tbl) add r12, r2, r3 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.8 {d4[]}, [r12] // right add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_h_tbl): .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r8, :32] // weights_hor sub r2, r2, #4 mov r7, #-4 vmovl.u8 q3, d6 // weights_hor 4: vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], r7 // left vshll.i8 q8, d4, #8 // right*256 vshll.i8 q9, d4, #8 vzip.32 d3, d2 // left, flipped vzip.32 d1, d0 vsubl.u8 q1, d3, d4 // left-right vsubl.u8 q0, d1, d4 subs r4, r4, #4 vmla.i16 q8, q1, q3 // right*256 + (left-right)*weights_hor vmla.i16 q9, q0, q3 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8, :64] // weights_hor sub r2, r2, #4 mov r7, #-4 vmovl.u8 q3, d6 // weights_hor 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r2, :32], r7 // left vshll.i8 q12, d4, #8 // right*256 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vsubl.u8 q11, d22, d4 // left-right vsubl.u8 q10, d20, d4 vsubl.u8 q9, d18, d4 vsubl.u8 q8, d16, d4 vmla.i16 q12, q11, q3 // right*256 + (left-right)*weights_hor vmla.i16 q13, q10, q3 // (left flipped) vmla.i16 q14, q9, q3 vmla.i16 q15, q8, q3 vrshrn.i16 d24, q12, #8 vrshrn.i16 d25, q13, #8 vrshrn.i16 d26, q14, #8 vrshrn.i16 d27, q15, #8 vst1.8 {d24}, [r0, :64], r1 vst1.8 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d26}, [r0, :64], r1 vst1.8 {d27}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vpush {q4-q7} sub r2, r2, #4 mov r7, #-4 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r2, :32], r7 // left vsubl.u8 q4, d8, d4 // left-right vsubl.u8 q5, d10, d4 vsubl.u8 q6, d12, d4 vsubl.u8 q7, d14, d4 2: vld1.8 {q1}, [r8, :128]! 
// weights_hor vshll.i8 q8, d4, #8 // right*256 vshll.i8 q9, d4, #8 vshll.i8 q10, d4, #8 vshll.i8 q11, d4, #8 vmovl.u8 q0, d2 // weights_hor vmovl.u8 q1, d3 vshll.i8 q12, d4, #8 vshll.i8 q13, d4, #8 vshll.i8 q14, d4, #8 vshll.i8 q15, d4, #8 vmla.i16 q8, q7, q0 // right*256 + (left-right)*weights_hor vmla.i16 q9, q7, q1 // (left flipped) vmla.i16 q10, q6, q0 vmla.i16 q11, q6, q1 vmla.i16 q12, q5, q0 vmla.i16 q13, q5, q1 vmla.i16 q14, q4, q0 vmla.i16 q15, q4, q1 vrshrn.i16 d16, q8, #8 vrshrn.i16 d17, q9, #8 vrshrn.i16 d18, q10, #8 vrshrn.i16 d19, q11, #8 vrshrn.i16 d20, q12, #8 vrshrn.i16 d21, q13, #8 vrshrn.i16 d22, q14, #8 vrshrn.i16 d23, q15, #8 subs r3, r3, #16 vst1.8 {q8}, [r0, :128]! vst1.8 {q9}, [r6, :128]! vst1.8 {q10}, [r5, :128]! vst1.8 {q11}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height); function ipred_filter_8bpc_neon, export=1 push {r4-r8, lr} movw r12, #511 ldrd r4, r5, [sp, #24] and r5, r5, r12 // 511 movrel r6, X(filter_intra_taps) lsl r5, r5, #6 add r6, r6, r5 vld1.8 {d20, d21, d22, d23}, [r6, :128]! clz lr, r3 adr r5, L(ipred_filter_tbl) vld1.8 {d27, d28, d29}, [r6, :64] sub lr, lr, #26 ldr lr, [r5, lr, lsl #2] vmovl.s8 q8, d20 vmovl.s8 q9, d21 add r5, r5, lr vmovl.s8 q10, d22 vmovl.s8 q11, d23 add r6, r0, r1 lsl r1, r1, #1 vmovl.s8 q12, d27 vmovl.s8 q13, d28 vmovl.s8 q14, d29 add r8, r2, #1 sub r2, r2, #2 mov r7, #-2 bx r5 .align 2 L(ipred_filter_tbl): .word 320f - L(ipred_filter_tbl) + CONFIG_THUMB .word 160f - L(ipred_filter_tbl) + CONFIG_THUMB .word 80f - L(ipred_filter_tbl) + CONFIG_THUMB .word 40f - L(ipred_filter_tbl) + CONFIG_THUMB 40: vld1.32 {d0[]}, [r8] // top (0-3) vmovl.u8 q0, d0 // top (0-3) 4: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmovl.u8 q1, d2 // left (0-1) + topleft (2) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vqrshrun.s16 d4, q2, #4 subs r4, r4, #2 vst1.32 {d4[0]}, [r0, :32], r1 vmovl.u8 q0, d4 vst1.32 {d4[1]}, [r6, :32], r1 vmov d0, d1 // move top from [4-7] to [0-3] bgt 4b pop {r4-r8, pc} 80: vld1.8 {d0}, [r8] // top (0-7) vmovl.u8 q0, d0 // top (0-7) 8: vld1.32 {d2[]}, [r2], r7 // left (0-1) + topleft (2) vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmovl.u8 q1, d2 // left (0-1) + topleft (2) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) vqrshrun.s16 d4, q2, #4 vmovl.u8 q1, d4 // first block, in 16 bit vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) vmla.i16 q3, q13, d2[3] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d3[3] // p6(left[1]) * filter(6) 
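// (Editorial note, not part of the original dav1d assembly.) Each 4x2 block of the
// filter-intra prediction is a 7-tap weighted sum of its neighbours,
//   out = iclip_pixel((p0*f0 + p1*f1 + p2*f2 + p3*f3 + p4*f4 + p5*f5 + p6*f6 + 8) >> 4),
// which the vqrshrun.s16 #4 below performs in one step (round, shift by 4, saturate
// to a pixel). The block just produced then provides the top/left neighbours for the
// next 4x2 block to its right.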
vqrshrun.s16 d5, q3, #4 vzip.32 d4, d5 subs r4, r4, #2 vst1.8 {d4}, [r0, :64], r1 vmovl.u8 q0, d5 vst1.8 {d5}, [r6, :64], r1 bgt 8b pop {r4-r8, pc} 160: 320: vpush {q4-q5} sub r1, r1, r3 mov lr, r3 1: vld1.32 {d0[]}, [r2], r7 // left (0-1) + topleft (2) vmovl.u8 q0, d0 // left (0-1) + topleft (2) 2: vld1.8 {q2}, [r8]! // top(0-15) vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) vmovl.u8 q1, d4 // top(0-7) vmovl.u8 q2, d5 // top(8-15) vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) vqrshrun.s16 d6, q3, #4 vmovl.u8 q0, d6 // first block, in 16 bit vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) vmla.i16 q4, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q4, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) vqrshrun.s16 d7, q4, #4 vmovl.u8 q0, d7 // second block, in 16 bit vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q15, q9, d5[0] // p1(top[0]) * filter(1) vmla.i16 q15, q10, d5[1] // p2(top[1]) * filter(2) vmla.i16 q15, q11, d5[2] // p3(top[2]) * filter(3) vqrshrun.s16 d8, q5, #4 vmovl.u8 q0, d8 // third block, in 16 bit vmov.u8 r12, d5[6] vmla.i16 q15, q12, d5[3] // p4(top[3]) * filter(4) vmla.i16 q15, q8, d4[3] // p0(topleft) * filter(0) vmla.i16 q15, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q15, q14, d1[3] // p6(left[1]) * filter(6) vmov.8 d0[4], r12 subs r3, r3, #16 vqrshrun.s16 d9, q15, #4 vst4.32 {d6[0], d7[0], d8[0], d9[0]}, [r0, :128]! vst4.32 {d6[1], d7[1], d8[1], d9[1]}, [r6, :128]! ble 8f vmov.u8 r12, d9[7] vmov.8 d0[0], r12 vmov.u8 r12, d9[3] vmov.8 d0[2], r12 b 2b 8: subs r4, r4, #2 ble 9f sub r8, r6, lr add r0, r0, r1 add r6, r6, r1 mov r3, lr b 1b 9: vpop {q4-q5} pop {r4-r8, pc} endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 push {r4-r5, lr} ldrd r4, r5, [sp, #12] vld1.16 {q0}, [r2, :128] clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 ldr lr, [r12, lr, lsl #2] vmovn.i16 d0, q0 add r12, r12, lr add r2, r0, r1 bx r12 .align 2 L(pal_pred_tbl): .word 640f - L(pal_pred_tbl) + CONFIG_THUMB .word 320f - L(pal_pred_tbl) + CONFIG_THUMB .word 160f - L(pal_pred_tbl) + CONFIG_THUMB .word 80f - L(pal_pred_tbl) + CONFIG_THUMB .word 40f - L(pal_pred_tbl) + CONFIG_THUMB 40: lsl r1, r1, #1 4: vld1.8 {q1}, [r3, :128]! subs r5, r5, #4 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.32 {d2[0]}, [r0, :32], r1 vst1.32 {d2[1]}, [r2, :32], r1 vst1.32 {d3[0]}, [r0, :32], r1 vst1.32 {d3[1]}, [r2, :32], r1 bgt 4b pop {r4-r5, pc} 80: lsl r1, r1, #1 8: vld1.8 {q1, q2}, [r3, :128]! 
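// (Editorial note, not part of the original dav1d assembly.) pal_pred is a palette
// lookup, dst[x] = pal[idx[x]]: the eight 16-bit palette entries were narrowed to
// bytes in d0 up front, so each vtbl.8 below translates eight index bytes into
// eight output pixels in a single instruction.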
subs r5, r5, #4 vtbl.8 d2, {d0}, d2 vtbl.8 d3, {d0}, d3 vst1.8 {d2}, [r0, :64], r1 vtbl.8 d4, {d0}, d4 vst1.8 {d3}, [r2, :64], r1 vtbl.8 d5, {d0}, d5 vst1.8 {d4}, [r0, :64], r1 vst1.8 {d5}, [r2, :64], r1 bgt 8b pop {r4-r5, pc} 160: lsl r1, r1, #1 16: vld1.8 {q8, q9}, [r3, :128]! subs r5, r5, #4 vld1.8 {q10, q11}, [r3, :128]! vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8}, [r0, :128], r1 vtbl.8 d22, {d0}, d22 vst1.8 {q9}, [r2, :128], r1 vtbl.8 d23, {d0}, d23 vst1.8 {q10}, [r0, :128], r1 vst1.8 {q11}, [r2, :128], r1 bgt 16b pop {r4-r5, pc} 320: lsl r1, r1, #1 32: vld1.8 {q8, q9}, [r3, :128]! subs r5, r5, #2 vld1.8 {q10, q11}, [r3, :128]! vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8, q9}, [r0, :128], r1 vtbl.8 d22, {d0}, d22 vtbl.8 d23, {d0}, d23 vst1.8 {q10, q11}, [r2, :128], r1 bgt 32b pop {r4-r5, pc} 640: sub r1, r1, #32 64: vld1.8 {q8, q9}, [r3, :128]! subs r5, r5, #1 vld1.8 {q10, q11}, [r3, :128]! vtbl.8 d16, {d0}, d16 vtbl.8 d17, {d0}, d17 vtbl.8 d18, {d0}, d18 vtbl.8 d19, {d0}, d19 vtbl.8 d20, {d0}, d20 vtbl.8 d21, {d0}, d21 vst1.8 {q8, q9}, [r0, :128]! vtbl.8 d22, {d0}, d22 vtbl.8 d23, {d0}, d23 vst1.8 {q10, q11}, [r0, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_128_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vmov.i16 q0, #128 // dc vdup.i16 q1, r6 // alpha add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 bx r12 .align 2 L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB L(ipred_cfl_splat_w4): vld1.16 {q2, q3}, [r5, :128]! vmul.i16 q2, q2, q1 // diff = ac * alpha vmul.i16 q3, q3, q1 vshr.s16 q8, q2, #15 // sign = diff >> 15 vshr.s16 q9, q3, #15 vadd.i16 q2, q2, q8 // diff + sign vadd.i16 q3, q3, q9 vrshr.s16 q2, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q3, q3, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vqmovun.s16 d4, q2 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d5, q3 vst1.32 {d4[0]}, [r0, :32], r1 vst1.32 {d4[1]}, [r6, :32], r1 subs r4, r4, #4 vst1.32 {d5[0]}, [r0, :32], r1 vst1.32 {d5[1]}, [r6, :32], r1 bgt L(ipred_cfl_splat_w4) pop {r4-r8, pc} L(ipred_cfl_splat_w8): vld1.16 {q8, q9}, [r5, :128]! vld1.16 {q10, q11}, [r5, :128]! 
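// Reference sketch (illustrative C, not part of the upstream source): the cfl "splat"
// loops apply the CfL model per pixel, exactly as the inline comments spell out. alpha is
// the signed scaling factor passed in the last argument; dc is the base prediction chosen
// by the 128/top/left/full variants:
//
//   int diff   = ac[i] * alpha;            // ac is the zero-mean luma AC term
//   int sign   = diff >> 15;               // 0 or -1 (16-bit arithmetic shift)
//   int scaled = (diff + sign + 32) >> 6;
//   dst[i]     = iclip_pixel(dc + scaled);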
vmul.i16 q8, q8, q1 // diff = ac * alpha vmul.i16 q9, q9, q1 vmul.i16 q10, q10, q1 vmul.i16 q11, q11, q1 vshr.s16 q12, q8, #15 // sign = diff >> 15 vshr.s16 q13, q9, #15 vshr.s16 q14, q10, #15 vshr.s16 q15, q11, #15 vadd.i16 q8, q8, q12 // diff + sign vadd.i16 q9, q9, q13 vadd.i16 q10, q10, q14 vadd.i16 q11, q11, q15 vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q9, q9, #6 vrshr.s16 q10, q10, #6 vrshr.s16 q11, q11, #6 vadd.i16 q8, q8, q0 // dc + apply_sign() vadd.i16 q9, q9, q0 vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q0 vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 vqmovun.s16 d19, q11 vst1.8 {d16}, [r0, :64], r1 vst1.8 {d17}, [r6, :64], r1 subs r4, r4, #4 vst1.8 {d18}, [r0, :64], r1 vst1.8 {d19}, [r6, :64], r1 bgt L(ipred_cfl_splat_w8) pop {r4-r8, pc} L(ipred_cfl_splat_w16): add r12, r5, r3, lsl #1 sub r1, r1, r3 mov lr, r3 1: vld1.16 {q8, q9}, [r5, :128]! vmul.i16 q8, q8, q1 // diff = ac * alpha vld1.16 {q10, q11}, [r12, :128]! vmul.i16 q9, q9, q1 vmul.i16 q10, q10, q1 vmul.i16 q11, q11, q1 vshr.s16 q12, q8, #15 // sign = diff >> 15 vshr.s16 q13, q9, #15 vshr.s16 q14, q10, #15 vshr.s16 q15, q11, #15 vadd.i16 q8, q8, q12 // diff + sign vadd.i16 q9, q9, q13 vadd.i16 q10, q10, q14 vadd.i16 q11, q11, q15 vrshr.s16 q8, q8, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshr.s16 q9, q9, #6 vrshr.s16 q10, q10, #6 vrshr.s16 q11, q11, #6 vadd.i16 q8, q8, q0 // dc + apply_sign() vadd.i16 q9, q9, q0 vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q0 vqmovun.s16 d16, q8 // iclip_pixel(dc + apply_sign()) vqmovun.s16 d17, q9 vqmovun.s16 d18, q10 vqmovun.s16 d19, q11 subs r3, r3, #16 vst1.16 {q8}, [r0, :128]! vst1.16 {q9}, [r6, :128]! bgt 1b subs r4, r4, #2 add r5, r5, lr, lsl #1 add r12, r12, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr bgt 1b pop {r4-r8, pc} endfunc // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz lr, r3 adr r12, L(ipred_cfl_top_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vdup.16 q1, r6 // alpha add r2, r2, #1 add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 bx r12 .align 2 L(ipred_cfl_top_tbl): .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB 4: vld1.32 {d0[]}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) 8: vld1.8 {d0}, [r2] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) 16: vld1.8 {q0}, [r2] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) 32: vld1.8 {q2, q3}, [r2] vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q2, q3 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #5 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_left_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 clz lr, r3 clz r8, r4 adr r12, L(ipred_cfl_splat_tbl) adr r7, L(ipred_cfl_left_tbl) sub lr, 
lr, #26 sub r8, r8, #26 ldr lr, [r12, lr, lsl #2] ldr r8, [r7, r8, lsl #2] vdup.16 q1, r6 // alpha add r12, r12, lr add r7, r7, r8 add r6, r0, r1 lsl r1, r1, #1 bx r7 .align 2 L(ipred_cfl_left_tbl): .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB L(ipred_cfl_left_h4): vld1.32 {d0[]}, [r2, :32] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h8): vld1.8 {d0}, [r2, :64] vpaddl.u8 d0, d0 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h16): vld1.8 {q0}, [r2, :128] vaddl.u8 q0, d0, d1 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h32): vld1.8 {q2, q3}, [r2, :128] vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.u16 q0, q2, q3 vadd.u16 d0, d0, d1 vpadd.u16 d0, d0 vpadd.u16 d0, d0 vrshr.u16 d0, d0, #5 vdup.16 q0, d0[0] bx r12 endfunc // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_8bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] sub r2, r2, r4 add r8, r3, r4 // width + height vdup.16 q1, r6 // alpha clz lr, r3 clz r6, r4 vdup.16 d16, r8 // width + height adr r7, L(ipred_cfl_tbl) rbit r8, r8 // rbit(width + height) sub lr, lr, #22 // 26 leading bits, minus table offset 4 sub r6, r6, #26 clz r8, r8 // ctz(width + height) ldr lr, [r7, lr, lsl #2] ldr r6, [r7, r6, lsl #2] neg r8, r8 // -ctz(width + height) add r12, r7, lr add r7, r7, r6 vshr.u16 d16, d16, #1 // (width + height) >> 1 vdup.16 d17, r8 // -ctz(width + height) add r6, r0, r1 lsl r1, r1, #1 bx r7 .align 2 L(ipred_cfl_tbl): .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB L(ipred_cfl_h4): vld1.32 {d0[]}, [r2, :32]! vpaddl.u8 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w4): vld1.32 {d1[]}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 vpadd.u16 d1, d1 cmp r4, #4 vadd.i16 d0, d0, d1 vshl.u16 d0, d0, d17 beq 1f // h = 8/16 movw lr, #(0x3334/2) movw r8, #(0x5556/2) cmp r4, #16 it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): vld1.8 {d0}, [r2, :64]! vpaddl.u8 d0, d0 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w8): vld1.8 {d1}, [r2] vadd.i16 d0, d0, d16 vpaddl.u8 d1, d1 vpadd.i16 d1, d1 vpadd.i16 d1, d1 cmp r4, #8 vadd.i16 d0, d0, d1 vshl.u16 d0, d0, d17 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): vld1.8 {q0}, [r2, :128]! 
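// Reference sketch (illustrative C, not part of the upstream source): ipred_cfl above
// averages the top and left edges. When w == h, w + h is a power of two and the shift by
// ctz(w + h) is exact; for 2:1 and 4:1 blocks an odd factor (3 or 5) remains and is
// divided out with a fixed-point multiply (vqdmulh by 0x5556/2 ~ 1/3, or 0x3334/2 ~ 1/5).
// Roughly:
//
//   int sum = sum(top, w) + sum(left, h);
//   int dc  = (sum + ((w + h) >> 1)) >> ctz(w + h);
//   if (w == 2 * h || h == 2 * w) dc = (dc * 0x5556) >> 16;   // ~ dc / 3
//   if (w == 4 * h || h == 4 * w) dc = (dc * 0x3334) >> 16;   // ~ dc / 5
//
// The 16 bpc dc functions in ipred16.S further down do the same with 32-bit math
// (0xAAAB ~ 1/3 and 0x6667 ~ 1/5, followed by a shift by 17).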
vaddl.u8 q0, d0, d1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w16): vld1.8 {q2}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 vadd.i16 d4, d4, d5 vpadd.i16 d4, d4 vpadd.i16 d4, d4 cmp r4, #16 vadd.i16 d0, d0, d4 vshl.u16 d0, d0, d17 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): vld1.8 {q2, q3}, [r2, :128]! vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0 add r2, r2, #1 vpadd.i16 d0, d0 bx r12 L(ipred_cfl_w32): vld1.8 {q2, q3}, [r2] vadd.i16 d0, d0, d16 vaddl.u8 q2, d4, d5 vaddl.u8 q3, d6, d7 vadd.i16 q2, q2, q3 vadd.i16 d4, d4, d5 vpadd.i16 d4, d4 vpadd.i16 d4, d4 cmp r4, #32 vadd.i16 d0, d0, d4 vshl.u16 d0, d0, d17 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #(0x3334/2) movw r8, #(0x5556/2) it ne movne lr, r8 vdup.16 d18, lr vqdmulh.s16 d0, d0, d18 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_420_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_420_tbl): .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w4): 1: // Copy and subsample input vld1.8 {d0}, [r1, :64], r2 vld1.8 {d2}, [r12, :64], r2 vld1.8 {d1}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vadd.i16 q0, q0, q1 vshl.i16 q0, q0, #1 subs r8, r8, #2 vst1.16 {q0}, [r0, :128]! vadd.i16 q8, q8, q0 bgt 1b cmp r4, #0 vmov d0, d1 vmov d2, d1 vmov d3, d1 L(ipred_cfl_ac_420_w4_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q8, q8, q1 bgt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums vadd.i16 q0, q8, q9 vadd.i16 q1, q10, q11 vpaddl.u16 q0, q0 vpaddl.u16 q1, q1 vadd.i32 q0, q1 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] L(ipred_cfl_ac_420_w4_subtract_dc): 6: // Subtract dc from ac vld1.16 {q0, q1}, [r0, :128] subs r6, r6, #4 vsub.i16 q0, q0, q8 vsub.i16 q1, q1, q8 vst1.16 {q0, q1}, [r0, :128]! 
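// Reference sketch (illustrative C, not part of the upstream source): the 4:2:0 AC path
// above boils down to this scalar form (stride counted in pixels here). Each AC sample
// covers a 2x2 luma block and keeps 3 fractional bits; padding replicates the last real
// column/row, and the block average is then subtracted so the AC plane sums to zero:
//
//   const pixel *row0 = ypx + (2 * y)     * stride;
//   const pixel *row1 = ypx + (2 * y + 1) * stride;
//   ac[y * w + x] = (row0[2*x] + row0[2*x + 1] + row1[2*x] + row1[2*x + 1]) << 1;
//   ...
//   int avg = (sum + (1 << (log2sz - 1))) >> log2sz;   // log2sz = log2(w * h)
//   ac[i] -= avg;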
bgt 6b pop {r4-r8, pc} L(ipred_cfl_ac_420_w8): cmp r3, #0 bne L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vshl.i16 q0, q0, #1 vshl.i16 q1, q2, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov q0, q1 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {d0}, [r1, :64], r2 vld1.16 {d2}, [r12, :64], r2 vld1.16 {d1}, [r1, :64], r2 vld1.16 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vadd.i16 q0, q0, q1 vshl.i16 q0, q0, #1 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov q0, q1 L(ipred_cfl_ac_420_w8_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl r6, r6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): adr r7, L(ipred_cfl_ac_420_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_420_w16_tbl): .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w16_wpad0): 1: // Copy and subsample input, without padding vld1.8 {q0, q1}, [r1, :128], r2 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q12, q13}, [r1, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q12, q12 vpaddl.u8 q13, q13 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q3 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 vshl.i16 q2, q12, #1 vshl.i16 q3, q13, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): 1: // Copy and subsample input, padding 4 vldr d2, [r1, #16] vld1.8 {q0}, [r1, :128], r2 vldr d6, [r12, #16] vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 d2, d2 vldr d26, [r1, #16] vpaddl.u8 q0, q0 vld1.8 {q12}, [r1, :128], r2 vpaddl.u8 d6, d6 vldr d30, [r12, #16] vpaddl.u8 q2, q2 vld1.8 {q14}, [r12, :128], r2 vpaddl.u8 d26, d26 vpaddl.u8 q12, q12 vpaddl.u8 d30, d30 vpaddl.u8 q14, q14 vadd.i16 d2, d2, d6 vadd.i16 q0, q0, q2 vadd.i16 d26, d26, d30 vadd.i16 q12, q12, q14 vshl.i16 d2, d2, #1 vshl.i16 q0, q0, #1 vshl.i16 d6, d26, #1 vshl.i16 q2, q12, #1 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vshl.i16 q0, q0, #1 vshl.i16 q2, q2, #1 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d4}, [r1, :64], r2 vpaddl.u8 q0, q0 vld1.8 {d5}, [r12, :64], r2 vpaddl.u8 q2, q2 vadd.i16 d0, d0, d1 vadd.i16 d4, d4, d5 vshl.i16 d0, d0, #1 vshl.i16 d4, d4, #1 vdup.16 q1, d0[3] vdup.16 q3, d4[3] vdup.16 d1, d0[3] vdup.16 d5, d4[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl r6, r6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_422_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_422_tbl): .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w4): 1: // Copy and subsample input vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d2}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): cmp r3, #0 bne L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding vld1.8 {q0}, [r1, :128], r2 vld1.8 {q1}, [r12, :128], r2 vld1.8 {q2}, [r1, :128], r2 vpaddl.u8 q0, q0 vld1.8 {q3}, [r12, :128], r2 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
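// Reference sketch (illustrative C, not part of the upstream source): 4:2:2 only
// subsamples horizontally, so each AC sample sums two horizontally adjacent luma pixels,
// scaled to the same 3 fractional bits (stride in pixels):
//
//   ac[y * w + x] = (ypx[y * stride + 2*x] + ypx[y * stride + 2*x + 1]) << 2;
//
// Padding and the final average subtraction are shared with the 4:2:0 code via the
// L(ipred_cfl_ac_420_*_hpad) / _calc_subtract_dc labels above.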
vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vld1.8 {d2}, [r1, :64], r2 vld1.8 {d3}, [r12, :64], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vdup.16 d7, d3[3] vmov d6, d3 vdup.16 d5, d2[3] vmov d4, d2 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): adr r7, L(ipred_cfl_ac_422_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_422_w16_tbl): .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w16_wpad0): 1: // Copy and subsample input, without padding vld1.8 {q0, q1}, [r1, :128], r2 vld1.8 {q2, q3}, [r12, :128], r2 vpaddl.u8 q0, q0 vpaddl.u8 q1, q1 vpaddl.u8 q2, q2 vpaddl.u8 q3, q3 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): 1: // Copy and subsample input, padding 4 vldr d2, [r1, #16] vld1.8 {q0}, [r1, :128], r2 vldr d6, [r12, #16] vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 d2, d2 vpaddl.u8 q0, q0 vpaddl.u8 d6, d6 vpaddl.u8 q2, q2 vshl.i16 d2, d2, #2 vshl.i16 q0, q0, #2 vshl.i16 d6, d6, #2 vshl.i16 q2, q2, #2 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.8 {q0}, [r1, :128], r2 vld1.8 {q2}, [r12, :128], r2 vpaddl.u8 q0, q0 vpaddl.u8 q2, q2 vshl.i16 q0, q0, #2 vshl.i16 q2, q2, #2 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d1}, [r12, :64], r2 vpaddl.u8 q0, q0 vshl.i16 q0, q0, #2 vdup.16 q3, d1[3] vdup.16 q1, d0[3] vdup.16 d5, d1[3] vmov d4, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) endfunc // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_444_tbl) sub r8, r8, #26 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_444_tbl): .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w4): 1: // Copy and expand input vld1.32 {d0[]}, [r1, :32], r2 vld1.32 {d0[1]}, [r12, :32], r2 vld1.32 {d2[]}, [r1, :32], r2 vld1.32 {d2[1]}, [r12, :32], r2 vshll.u8 q0, d0, #3 vshll.u8 q1, d2, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): 1: // Copy and expand input vld1.16 {d0}, [r1, :64], r2 vld1.16 {d2}, [r12, :64], r2 vld1.16 {d4}, [r1, :64], r2 vshll.u8 q0, d0, #3 vld1.16 {d6}, [r12, :64], r2 vshll.u8 q1, d2, #3 vshll.u8 q2, d4, #3 vshll.u8 q3, d6, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): cmp r3, #0 bne L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding vld1.8 {q1}, [r1, :128], r2 vld1.8 {q3}, [r12, :128], r2 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q2, d6, #3 vshll.u8 q3, d7, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d4}, [r12, :64], r2 vshll.u8 q0, d0, #3 vshll.u8 q2, d4, #3 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vst1.16 {q2, q3}, [r0, :128]! 
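// Reference sketch (illustrative C, not part of the upstream source): 4:4:4 needs no
// subsampling at all; each luma pixel is just promoted to the common 3-fractional-bit
// scale (vshll.u8 #3) before the shared padding / average-subtract code (stride in
// pixels):
//
//   ac[y * w + x] = ypx[y * stride + x] << 3;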
vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): adr r7, L(ipred_cfl_ac_444_w32_tbl) ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_444_w32_tbl): .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w32_wpad0): 1: // Copy and expand input, without padding vld1.8 {q2, q3}, [r1, :128], r2 vld1.8 {q13, q14}, [r12, :128], r2 vshll.u8 q0, d4, #3 vshll.u8 q1, d5, #3 vshll.u8 q2, d6, #3 vshll.u8 q3, d7, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vshll.u8 q0, d28, #3 vshll.u8 q1, d29, #3 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): 1: // Copy and expand input, padding 8 vldr d4, [r1, #16] vld1.8 {q1}, [r1, :128], r2 vldr d28, [r12, #16] vld1.8 {q13}, [r12, :128], r2 vshll.u8 q2, d4, #3 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vshll.u8 q0, d28, #3 vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vdup.16 q1, d1[3] vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): 1: // Copy and expand input, padding 16 vld1.8 {q1}, [r1, :128], r2 vld1.8 {q13}, [r12, :128], r2 vshll.u8 q0, d2, #3 vshll.u8 q1, d3, #3 vshll.u8 q12, d26, #3 vshll.u8 q13, d27, #3 vdup.16 q2, d3[3] vdup.16 q3, d3[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vdup.16 q0, d27[3] vdup.16 q1, d27[3] vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): 1: // Copy and expand input, padding 24 vld1.8 {d0}, [r1, :64], r2 vld1.8 {d24}, [r12, :64], r2 vshll.u8 q0, d0, #3 vshll.u8 q12, d24, #3 subs r8, r8, #2 vdup.16 q1, d1[3] vdup.16 q2, d1[3] vdup.16 q3, d1[3] vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q8, q8, q0 vadd.i16 q9, q9, q1 vdup.16 q13, d25[3] vdup.16 q0, d25[3] vdup.16 q1, d25[3] vst1.16 {q2, q3}, [r0, :128]! vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q3 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 1b cmp r4, #0 L(ipred_cfl_ac_444_w32_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #1 vst1.16 {q12, q13}, [r0, :128]! vadd.i16 q8, q8, q12 vadd.i16 q9, q9, q13 vst1.16 {q0, q1}, [r0, :128]! 
vadd.i16 q10, q10, q0 vadd.i16 q11, q11, q1 bgt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl r6, r6, #3 // Aggregate the sums, with wider intermediates earlier than in // ipred_cfl_ac_420_w8_calc_subtract_dc. vpaddl.u16 q0, q8 vpaddl.u16 q1, q9 vpaddl.u16 q2, q10 vpaddl.u16 q3, q11 vadd.i32 q0, q0, q1 vadd.i32 q2, q2, q3 vadd.i32 q0, q0, q2 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] b L(ipred_cfl_ac_420_w4_subtract_dc) endfunc rav1e-0.7.1/src/arm/32/ipred16.S000064400000000000000000003753501046102023000140540ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, B Krishnan Iyer * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height, // const int bitdepth_max); function ipred_dc_128_16bpc_neon, export=1 push {r4, lr} ldr r4, [sp, #8] ldr r12, [sp, #24] clz r3, r3 adr r2, L(ipred_dc_128_tbl) sub r3, r3, #25 vdup.16 q0, r12 ldr r3, [r2, r3, lsl #2] add r12, r0, r1 vrshr.u16 q0, q0, #1 add r2, r2, r3 lsl r1, r1, #1 bx r2 .align 2 L(ipred_dc_128_tbl): .word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 160f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB .word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB 4: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4, pc} 8: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 8b pop {r4, pc} 160: vmov q1, q0 16: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vmov q1, q0 sub r1, r1, #32 32: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
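// Reference sketch (illustrative C, not part of the upstream source): for high bit depth
// the DC_128 fill value is derived from bitdepth_max instead of being a constant, which
// is what the vdup.16 + vrshr.u16 #1 pair at the top of this function computes before the
// width-specialised store loops (stride in pixels):
//
//   int dc = (bitdepth_max + 1) >> 1;   // 512 at 10 bpc, 2048 at 12 bpc
//   for (int y = 0; y < h; y++)
//       for (int x = 0; x < w; x++)
//           dst[y * stride + x] = dc;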
vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vmov q1, q0 sub r1, r1, #96 64: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! subs r4, r4, #2 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 push {r4, lr} ldr lr, [sp, #8] clz r3, r3 adr r4, L(ipred_v_tbl) sub r3, r3, #25 ldr r3, [r4, r3, lsl #2] add r2, r2, #2 add r4, r4, r3 add r12, r0, r1 lsl r1, r1, #1 bx r4 .align 2 L(ipred_v_tbl): .word 640f - L(ipred_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_v_tbl) + CONFIG_THUMB 40: vld1.16 {d0}, [r2] 4: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs lr, lr, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4, pc} 80: vld1.16 {q0}, [r2] 8: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 8b pop {r4, pc} 160: vld1.16 {q0, q1}, [r2] 16: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 16b pop {r4, pc} 320: vld1.16 {q0, q1}, [r2]! sub r1, r1, #32 vld1.16 {q2, q3}, [r2] 32: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 subs lr, lr, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d4, d5, d6, d7}, [r0, :128], r1 vst1.16 {d4, d5, d6, d7}, [r12, :128], r1 bgt 32b pop {r4, pc} 640: vld1.16 {q0, q1}, [r2]! sub r1, r1, #96 vld1.16 {q2, q3}, [r2]! vld1.16 {q8, q9}, [r2]! vld1.16 {q10, q11}, [r2]! 64: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d4, d5, d6, d7}, [r0, :128]! vst1.16 {d4, d5, d6, d7}, [r12, :128]! subs lr, lr, #2 vst1.16 {d16, d17, d18, d19}, [r0, :128]! vst1.16 {d16, d17, d18, d19}, [r12, :128]! 
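// Reference sketch (illustrative C, not part of the upstream source): vertical prediction
// is a pure copy of the row above the block into every output row (stride in pixels):
//
//   for (int y = 0; y < h; y++)
//       for (int x = 0; x < w; x++)
//           dst[y * stride + x] = topleft[1 + x];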
vst1.16 {d20, d21, d22, d23}, [r0, :128], r1 vst1.16 {d20, d21, d22, d23}, [r12, :128], r1 bgt 64b pop {r4, pc} endfunc // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_16bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_h_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] sub r2, r2, #2 mov lr, #-2 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_h_tbl): .word 640f - L(ipred_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_h_tbl) + CONFIG_THUMB .word 8f - L(ipred_h_tbl) + CONFIG_THUMB .word 40f - L(ipred_h_tbl) + CONFIG_THUMB 40: sub r2, r2, #6 mov lr, #-8 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2], lr vst1.16 {d3}, [r0, :64], r1 vst1.16 {d2}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d1}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4-r5, pc} 8: vld1.16 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.16 {d2[], d3[]}, [r2], lr vst1.16 {q0}, [r0, :128], r1 vld1.16 {d4[], d5[]}, [r2], lr vst1.16 {q1}, [r12, :128], r1 vld1.16 {d6[], d7[]}, [r2], lr vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r12, :128], r1 bgt 8b pop {r4-r5, pc} 160: sub r1, r1, #16 16: vld1.16 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.16 {d2[], d3[]}, [r2], lr vst1.16 {q0}, [r0, :128]! vld1.16 {d4[], d5[]}, [r2], lr vst1.16 {q1}, [r12, :128]! vld1.16 {d6[], d7[]}, [r2], lr vst1.16 {q0}, [r0, :128], r1 vst1.16 {q1}, [r12, :128], r1 vst1.16 {q2}, [r0, :128]! vst1.16 {q3}, [r12, :128]! vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: sub r1, r1, #48 32: vld1.16 {d0[], d1[]}, [r2], lr subs r4, r4, #4 vld1.16 {d2[], d3[]}, [r2], lr vst1.16 {q0}, [r0, :128]! vld1.16 {d4[], d5[]}, [r2], lr vst1.16 {q1}, [r12, :128]! vld1.16 {d6[], d7[]}, [r2], lr vst1.16 {q0}, [r0, :128]! vst1.16 {q1}, [r12, :128]! vst1.16 {q0}, [r0, :128]! vst1.16 {q1}, [r12, :128]! vst1.16 {q0}, [r0, :128], r1 vst1.16 {q1}, [r12, :128], r1 vst1.16 {q2}, [r0, :128]! vst1.16 {q3}, [r12, :128]! vst1.16 {q2}, [r0, :128]! vst1.16 {q3}, [r12, :128]! vst1.16 {q2}, [r0, :128]! vst1.16 {q3}, [r12, :128]! vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: sub r1, r1, #96 64: vld1.16 {d0[], d1[]}, [r2], lr subs r4, r4, #2 vld1.16 {d4[], d5[]}, [r2], lr vmov q1, q0 vmov q3, q2 vst1.16 {q0, q1}, [r0, :128]! vst1.16 {q2, q3}, [r12, :128]! vst1.16 {q0, q1}, [r0, :128]! vst1.16 {q2, q3}, [r12, :128]! vst1.16 {q0, q1}, [r0, :128]! vst1.16 {q2, q3}, [r12, :128]! 
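// Reference sketch (illustrative C, not part of the upstream source): horizontal
// prediction fills each row with the left-edge pixel of that row; the loops above walk
// the left edge with a negative stride (lr = -2 bytes, i.e. one pixel) starting just
// below topleft (stride in pixels):
//
//   for (int y = 0; y < h; y++)
//       for (int x = 0; x < w; x++)
//           dst[y * stride + x] = topleft[-(1 + y)];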
vst1.16 {q0, q1}, [r0, :128], r1 vst1.16 {q2, q3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_16bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 adr r5, L(ipred_dc_top_tbl) sub r3, r3, #25 ldr r3, [r5, r3, lsl #2] add r2, r2, #2 add r5, r5, r3 add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_top_tbl): .word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB .word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB 40: vld1.16 {d0}, [r2] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 d0, d0[0] 4: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 4b pop {r4-r5, pc} 80: vld1.16 {d0, d1}, [r2] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] 8: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 8b pop {r4-r5, pc} 160: vld1.16 {d0, d1, d2, d3}, [r2] vadd.i16 q0, q0, q1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d4, d0, #4 vdup.16 q0, d4[0] vdup.16 q1, d4[0] 16: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 16b pop {r4-r5, pc} 320: vld1.16 {d0, d1, d2, d3}, [r2]! vld1.16 {d4, d5, d6, d7}, [r2] vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vadd.i16 q0, q0, q2 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d18, q0, #5 vdup.16 q0, d18[0] vdup.16 q1, d18[0] sub r1, r1, #32 32: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 32b pop {r4-r5, pc} 640: vld1.16 {d0, d1, d2, d3}, [r2]! vld1.16 {d4, d5, d6, d7}, [r2]! vadd.i16 q0, q0, q1 vld1.16 {d16, d17, d18, d19}, [r2]! vadd.i16 q2, q2, q3 vld1.16 {d20, d21, d22, d23}, [r2] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q0, q2 vadd.i16 q8, q8, q10 vadd.i16 q0, q0, q8 vadd.i16 d0, d0, d1 vpaddl.u16 d0, d0 vpadd.i32 d0, d0, d0 vrshrn.i32 d18, q0, #6 vdup.16 q0, d18[0] vdup.16 q1, d18[0] sub r1, r1, #96 64: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! subs r4, r4, #2 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
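// Reference sketch (illustrative C, not part of the upstream source): DC_TOP averages
// only the row above the block (DC_LEFT, next, is the mirror image using the left
// column), then fills the block just like DC_128:
//
//   int sum = 0;
//   for (int x = 0; x < w; x++)
//       sum += topleft[1 + x];
//   int dc = (sum + (w >> 1)) >> ctz(w);   // w is a power of two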
vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_16bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] sub r2, r2, r4, lsl #1 clz r3, r3 clz lr, r4 sub lr, lr, #25 adr r5, L(ipred_dc_left_tbl) sub r3, r3, #20 ldr r3, [r5, r3, lsl #2] ldr lr, [r5, lr, lsl #2] add r3, r5, r3 add r5, r5, lr add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_left_tbl): .word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB .word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB L(ipred_dc_left_h4): vld1.16 {d0}, [r2, :64] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] bx r3 L(ipred_dc_left_w4): vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt L(ipred_dc_left_w4) pop {r4-r5, pc} L(ipred_dc_left_h8): vld1.16 {d0, d1}, [r2, :128] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] bx r3 L(ipred_dc_left_w8): vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt L(ipred_dc_left_w8) pop {r4-r5, pc} L(ipred_dc_left_h16): vld1.16 {d0, d1, d2, d3}, [r2, :128] vadd.i16 q0, q0, q1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] bx r3 L(ipred_dc_left_w16): vmov q1, q0 1: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} L(ipred_dc_left_h32): vld1.16 {d0, d1, d2, d3}, [r2, :128]! vld1.16 {d4, d5, d6, d7}, [r2, :128] vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vadd.i16 q0, q0, q2 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d0, q0, #5 vdup.16 q0, d0[0] bx r3 L(ipred_dc_left_w32): sub r1, r1, #32 vmov q1, q0 1: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} L(ipred_dc_left_h64): vld1.16 {d0, d1, d2, d3}, [r2, :128]! vld1.16 {d4, d5, d6, d7}, [r2, :128]! vadd.i16 q0, q0, q1 vld1.16 {d16, d17, d18, d19}, [r2, :128]! 
vadd.i16 q2, q2, q3 vld1.16 {d20, d21, d22, d23}, [r2, :128] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q0, q2 vadd.i16 q8, q8, q10 vadd.i16 q0, q0, q8 vadd.i16 d0, d0, d1 vpaddl.u16 d0, d0 vpadd.i32 d0, d0, d0 vrshrn.i32 d0, q0, #6 vdup.16 q0, d0[0] bx r3 L(ipred_dc_left_w64): sub r1, r1, #96 vmov q1, q0 1: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! subs r4, r4, #2 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 1b pop {r4-r5, pc} endfunc // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_16bpc_neon, export=1 push {r4-r6, lr} ldr r4, [sp, #16] sub r2, r2, r4, lsl #1 add lr, r3, r4 // width + height clz r3, r3 clz r12, r4 vdup.32 q15, lr // width + height adr r5, L(ipred_dc_tbl) rbit lr, lr // rbit(width + height) sub r3, r3, #20 // 25 leading bits, minus table offset 5 sub r12, r12, #25 clz lr, lr // ctz(width + height) ldr r3, [r5, r3, lsl #2] ldr r12, [r5, r12, lsl #2] neg lr, lr // -ctz(width + height) add r3, r5, r3 add r5, r5, r12 vshr.u32 q15, q15, #1 // (width + height) >> 1 vdup.32 q14, lr // -ctz(width + height) add r12, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_dc_tbl): .word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB .word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB L(ipred_dc_h4): vld1.16 {d0}, [r2, :64]! vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r3 L(ipred_dc_w4): vld1.16 {d2}, [r2] vadd.i32 d0, d0, d30 vpadd.i16 d2, d2, d2 vpaddl.u16 d2, d2 cmp r4, #4 vadd.i32 d0, d0, d2 vshl.u32 d0, d0, d28 beq 1f // h = 8/16 cmp r4, #16 movw lr, #0x6667 movw r5, #0xAAAB it ne movne lr, r5 vdup.32 d24, lr vmul.i32 d0, d0, d24 vshr.u32 d0, d0, #17 1: vdup.16 d0, d0[0] 2: vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 subs r4, r4, #4 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d0}, [r12, :64], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h8): vld1.16 {d0, d1}, [r2, :128]! vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r3 L(ipred_dc_w8): vld1.16 {d2, d3}, [r2] vadd.i32 d0, d0, d30 vadd.i16 d2, d2, d3 vpadd.i16 d2, d2, d2 vpaddl.u16 d2, d2 cmp r4, #8 vadd.i32 d0, d0, d2 vshl.u32 d0, d0, d28 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #0x6667 movw r5, #0xAAAB it ne movne lr, r5 vdup.32 d24, lr vmul.i32 d0, d0, d24 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] 2: vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1}, [r0, :128], r1 vst1.16 {d0, d1}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h16): vld1.16 {d0, d1, d2, d3}, [r2, :128]! 
vadd.i16 q0, q0, q1 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r3 L(ipred_dc_w16): vld1.16 {d2, d3, d4, d5}, [r2] vadd.i32 d0, d0, d30 vadd.i16 q1, q1, q2 vadd.i16 d2, d2, d3 vpadd.i16 d2, d2, d1 vpaddl.u16 d2, d2 cmp r4, #16 vadd.i32 d0, d0, d2 vshl.u32 d4, d0, d28 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #0x6667 movw r5, #0xAAAB it ne movne lr, r5 vdup.32 d24, lr vmul.i32 d4, d4, d24 vshr.u32 d4, d4, #17 1: vdup.16 q0, d4[0] vdup.16 q1, d4[0] 2: vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h32): vld1.16 {d0, d1, d2, d3}, [r2, :128]! vld1.16 {d4, d5, d6, d7}, [r2, :128]! vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vadd.i16 q0, q0, q2 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r3 L(ipred_dc_w32): vld1.16 {d2, d3, d4, d5}, [r2]! vadd.i32 d0, d0, d30 vld1.16 {d16, d17, d18, d19}, [r2] vadd.i16 q1, q1, q2 vadd.i16 q8, q8, q9 vadd.i16 q1, q1, q8 vadd.i16 d2, d2, d3 vpadd.i16 d2, d2, d2 vpaddl.u16 d2, d2 cmp r4, #32 vadd.i32 d0, d0, d2 vshl.u32 d4, d0, d28 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #0x6667 movw r5, #0xAAAB it ne movne lr, r5 vdup.32 d24, lr vmul.i32 d4, d4, d24 vshr.u32 d4, d4, #17 1: sub r1, r1, #32 vdup.16 q0, d4[0] vdup.16 q1, d4[0] 2: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 subs r4, r4, #4 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} L(ipred_dc_h64): vld1.16 {d0, d1, d2, d3}, [r2, :128]! vld1.16 {d4, d5, d6, d7}, [r2, :128]! vadd.i16 q0, q0, q1 vld1.16 {d16, d17, d18, d19}, [r2, :128]! vadd.i16 q2, q2, q3 vld1.16 {d20, d21, d22, d23}, [r2, :128]! vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q0, q2 vadd.i16 q8, q8, q10 vadd.i16 q0, q0, q8 vadd.i16 d0, d0, d1 vpaddl.u16 d0, d0 add r2, r2, #2 vpadd.i32 d0, d0, d0 bx r3 L(ipred_dc_w64): vld1.16 {d2, d3, d4, d5}, [r2]! vadd.i32 d0, d0, d30 vld1.16 {d16, d17, d18, d19}, [r2]! vadd.i16 q1, q1, q2 vld1.16 {d20, d21, d22, d23}, [r2]! vadd.i16 q8, q8, q9 vld1.16 {d24, d25, d26, d27}, [r2]! vadd.i16 q10, q10, q11 vadd.i16 q12, q12, q13 vadd.i16 q1, q1, q8 vadd.i16 q10, q10, q12 vadd.i16 q1, q1, q10 vadd.i16 d2, d2, d3 vpaddl.u16 d2, d2 vpadd.i32 d2, d2, d2 cmp r4, #64 vadd.i32 d0, d0, d2 vshl.u32 d4, d0, d28 beq 1f // h = 16/32 cmp r4, #16 movw lr, #0x6667 movw r5, #0xAAAB it ne movne lr, r5 vdup.32 d24, lr vmul.i32 d4, d4, d24 vshr.u32 d4, d4, #17 1: sub r1, r1, #96 vdup.16 q0, d4[0] vdup.16 q1, d4[0] 2: vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! subs r4, r4, #2 vst1.16 {d0, d1, d2, d3}, [r0, :128]! vst1.16 {d0, d1, d2, d3}, [r12, :128]! 
vst1.16 {d0, d1, d2, d3}, [r0, :128], r1 vst1.16 {d0, d1, d2, d3}, [r12, :128], r1 bgt 2b pop {r4-r6, pc} endfunc // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_16bpc_neon, export=1 push {r4-r6, lr} vpush {q4} ldr r4, [sp, #32] clz lr, r3 adr r12, L(ipred_paeth_tbl) sub lr, lr, #25 ldr lr, [r12, lr, lsl #2] vld1.16 {d4[], d5[]}, [r2] add r6, r2, #2 sub r2, r2, #4 add r12, r12, lr mov r5, #-4 add lr, r0, r1 lsl r1, r1, #1 bx r12 .align 2 L(ipred_paeth_tbl): .word 640f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 320f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 160f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 80f - L(ipred_paeth_tbl) + CONFIG_THUMB .word 40f - L(ipred_paeth_tbl) + CONFIG_THUMB 40: sub r2, r2, #4 mov r5, #-8 vld1.16 {d6}, [r6] vsub.i16 d16, d6, d4 // top - topleft vmov d7, d6 vmov d17, d16 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r5 vadd.i16 q9, q8, q0 // base vadd.i16 q10, q8, q1 vabd.s16 q11, q3, q9 // tdiff vabd.s16 q12, q3, q10 vabd.s16 q13, q2, q9 // tldiff vabd.s16 q14, q2, q10 vabd.s16 q9, q0, q9 // ldiff vabd.s16 q10, q1, q10 vmin.u16 q15, q11, q13 // min(tdiff, tldiff) vmin.u16 q4, q12, q14 vcge.u16 q11, q13, q11 // tldiff >= tdiff vcge.u16 q12, q14, q12 vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff vcge.u16 q10, q4, q10 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 vst1.16 {d25}, [r0, :64], r1 vst1.16 {d24}, [lr, :64], r1 subs r4, r4, #4 vst1.16 {d23}, [r0, :64], r1 vst1.16 {d22}, [lr, :64], r1 bgt 4b vpop {q4} pop {r4-r6, pc} 80: 160: 320: 640: vld1.16 {q3}, [r6]! mov r12, r3 sub r1, r1, r3, lsl #1 1: vld2.16 {d0[], d2[]}, [r2, :32], r5 vmov d1, d0 vmov d3, d2 2: vsub.i16 q8, q3, q2 // top - topleft vadd.i16 q9, q8, q0 // base vadd.i16 q10, q8, q1 vabd.s16 q11, q3, q9 // tdiff vabd.s16 q12, q3, q10 vabd.s16 q13, q2, q9 // tldiff vabd.s16 q14, q2, q10 vabd.s16 q9, q0, q9 // ldiff vabd.s16 q10, q1, q10 vmin.u16 q15, q11, q13 // min(tdiff, tldiff) vmin.u16 q4, q12, q14 vcge.u16 q11, q13, q11 // tldiff >= tdiff vcge.u16 q12, q14, q12 vcge.u16 q9, q15, q9 // min(tdiff, tldiff) >= ldiff vcge.u16 q10, q4, q10 vbsl q12, q3, q2 // tdiff <= tldiff ? top : topleft vbsl q11, q3, q2 vbit q12, q1, q10 // ldiff <= min ? left : ... vbit q11, q0, q9 subs r3, r3, #8 vst1.16 {q12}, [r0, :128]! vst1.16 {q11}, [lr, :128]! ble 8f vld1.16 {q3}, [r6]! b 2b 8: subs r4, r4, #2 ble 9f // End of horizontal loop, move pointers to next two rows sub r6, r6, r12, lsl #1 add r0, r0, r1 add lr, lr, r1 vld1.16 {q3}, [r6]! 
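// Reference sketch (illustrative C, not part of the upstream source): the Paeth predictor
// picks, per pixel, whichever of left/top/topleft is closest to the linear estimate
// "base", with ties resolved in the order shown by the vbsl/vbit selects above:
//
//   int base   = left + top - topleft;
//   int ldiff  = abs(left    - base);
//   int tdiff  = abs(top     - base);
//   int tldiff = abs(topleft - base);
//   dst = (ldiff <= tdiff && ldiff <= tldiff) ? left
//       : (tdiff <= tldiff)                   ? top
//       : topleft;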
mov r3, r12 b 1b 9: vpop {q4} pop {r4-r6, pc} endfunc // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_16bpc_neon, export=1 push {r4-r10, lr} ldr r4, [sp, #32] movrel r10, X(sm_weights) add r12, r10, r4 add r10, r10, r3 clz r9, r3 adr r5, L(ipred_smooth_tbl) sub lr, r2, r4, lsl #1 sub r9, r9, #25 ldr r9, [r5, r9, lsl #2] vld1.16 {d4[], d5[]}, [lr] // bottom add r8, r2, #2 add r5, r5, r9 add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_tbl): .word 640f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_tbl) + CONFIG_THUMB 40: vld1.16 {d16}, [r8] // top vld1.32 {d18[]}, [r10, :32] // weights_hor sub r2, r2, #8 mov r7, #-8 vdup.16 q3, d16[3] // right vsub.i16 q8, q8, q2 // top-bottom vmovl.u8 q9, d18 // weights_hor vadd.i16 d19, d4, d6 // bottom+right 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left vld4.8 {d20[], d21[], d22[], d23[]}, [r12, :32]! // weights_ver vshll.u16 q12, d19, #8 // (bottom+right)*256 vshll.u16 q13, d19, #8 vshll.u16 q14, d19, #8 vshll.u16 q15, d19, #8 vzip.32 d20, d21 // weights_ver vzip.32 d22, d23 vsub.i16 q1, q1, q3 // left-right vsub.i16 q0, q0, q3 vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmlal.s16 q12, d3, d18 // += (left-right)*weights_hor vmlal.s16 q13, d2, d18 // (left flipped) vmlal.s16 q14, d1, d18 vmlal.s16 q15, d0, d18 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d16, d21 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d16, d23 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d25}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d27}, [r6, :64], r1 bgt 4b pop {r4-r10, pc} 80: vld1.16 {q8}, [r8] // top vld1.8 {d18}, [r10, :64] // weights_hor sub r2, r2, #4 mov r7, #-4 vdup.16 q3, d17[3] // right vsub.i16 q8, q8, q2 // top-bottom vmovl.u8 q9, d18 // weights_hor vadd.i16 d3, d4, d6 // bottom+right 8: vld2.16 {d0[], d1[]}, [r2, :32], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vshll.u16 q12, d3, #8 // (bottom+right)*256 vshll.u16 q13, d3, #8 vshll.u16 q14, d3, #8 vshll.u16 q15, d3, #8 vsub.i16 q0, q0, q3 // left-right vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor vmlal.s16 q13, d1, d19 // (left flipped) vmlal.s16 q14, d0, d18 vmlal.s16 q15, d0, d19 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d17, d20 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d17, d22 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 subs r4, r4, #2 vst1.16 {q12}, [r0, :128], r1 vst1.16 {q13}, [r6, :128], r1 bgt 8b pop {r4-r10, pc} 160: 320: 640: add lr, r2, r3, lsl #1 sub r2, r2, #4 mov r7, #-4 vld1.16 {d6[], d7[]}, [lr] // right sub r1, r1, r3, lsl #1 mov r9, r3 vadd.i16 d3, d4, d6 // bottom+right 1: vld2.16 {d0[], d1[]}, [r2, :32], r7 // left vld2.8 {d20[], d22[]}, [r12, :16]! // weights_ver vsub.i16 q0, q0, q3 // left-right vmovl.u8 q10, d20 // weights_ver vmovl.u8 q11, d22 2: vld1.8 {d18}, [r10, :64]! // weights_hor vld1.16 {q8}, [r8]! 
// top vshll.u16 q12, d3, #8 // (bottom+right)*256 vshll.u16 q13, d3, #8 vmovl.u8 q9, d18 // weights_hor vshll.u16 q14, d3, #8 vshll.u16 q15, d3, #8 vsub.i16 q8, q8, q2 // top-bottom vmlal.s16 q12, d1, d18 // += (left-right)*weights_hor vmlal.s16 q13, d1, d19 // (left flipped) vmlal.s16 q14, d0, d18 vmlal.s16 q15, d0, d19 vmlal.s16 q12, d16, d20 // += (top-bottom)*weights_ver vmlal.s16 q13, d17, d20 vmlal.s16 q14, d16, d22 vmlal.s16 q15, d17, d22 vrshrn.i32 d24, q12, #9 vrshrn.i32 d25, q13, #9 vrshrn.i32 d26, q14, #9 vrshrn.i32 d27, q15, #9 subs r3, r3, #8 vst1.16 {q12}, [r0, :128]! vst1.16 {q13}, [r6, :128]! bgt 2b subs r4, r4, #2 ble 9f sub r8, r8, r9, lsl #1 sub r10, r10, r9 add r0, r0, r1 add r6, r6, r1 mov r3, r9 b 1b 9: pop {r4-r10, pc} endfunc // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_16bpc_neon, export=1 push {r4-r7, lr} ldr r4, [sp, #20] movrel r7, X(sm_weights) add r7, r7, r4 clz lr, r3 adr r5, L(ipred_smooth_v_tbl) sub r12, r2, r4, lsl #1 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.16 {d4[], d5[]}, [r12] // bottom add r2, r2, #2 add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_v_tbl): .word 640f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_v_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_v_tbl) + CONFIG_THUMB 40: vld1.16 {d6}, [r2] // top vsub.i16 d6, d6, d4 // top-bottom vmov d7, d6 4: vld4.8 {d16[], d17[], d18[], d19[]}, [r7, :32]! // weights_ver vzip.32 d16, d17 // weights_ver vzip.32 d18, d19 vshll.u8 q8, d16, #7 // weights_ver << 7 vshll.u8 q9, d18, #7 vqrdmulh.s16 q10, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q11, q3, q9 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vst1.16 {d20}, [r0, :64], r1 vst1.16 {d21}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d22}, [r0, :64], r1 vst1.16 {d23}, [r6, :64], r1 bgt 4b pop {r4-r7, pc} 80: vld1.16 {q3}, [r2] // top vsub.i16 q3, q3, q2 // top-bottom 8: vld4.8 {d16[], d18[], d20[], d22[]}, [r7, :32]! // weights_ver vshll.u8 q8, d16, #7 // weights_ver << 7 vshll.u8 q9, d18, #7 vshll.u8 q10, d20, #7 vshll.u8 q11, d22, #7 vqrdmulh.s16 q8, q3, q8 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q9, q3, q9 vqrdmulh.s16 q10, q3, q10 vqrdmulh.s16 q11, q3, q11 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vst1.16 {q8}, [r0, :128], r1 vst1.16 {q9}, [r6, :128], r1 subs r4, r4, #4 vst1.16 {q10}, [r0, :128], r1 vst1.16 {q11}, [r6, :128], r1 bgt 8b pop {r4-r7, pc} 160: 320: 640: vpush {q4-q7} // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3, lsl #1 mov r12, r3 1: vld4.8 {d8[], d10[], d12[], d14[]}, [r7, :32]! // weights_ver vshll.u8 q4, d8, #7 // weights_ver << 7 vshll.u8 q5, d10, #7 vshll.u8 q6, d12, #7 vshll.u8 q7, d14, #7 2: vld1.16 {q0, q1}, [r2]! 
// top vsub.i16 q0, q0, q2 // top-bottom vsub.i16 q1, q1, q2 vqrdmulh.s16 q8, q0, q4 // ((top-bottom)*weights_ver + 128) >> 8 vqrdmulh.s16 q9, q1, q4 vqrdmulh.s16 q10, q0, q5 vqrdmulh.s16 q11, q1, q5 vqrdmulh.s16 q12, q0, q6 vqrdmulh.s16 q13, q1, q6 vqrdmulh.s16 q14, q0, q7 vqrdmulh.s16 q15, q1, q7 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q2 vadd.i16 q14, q14, q2 vadd.i16 q15, q15, q2 subs r3, r3, #16 vst1.16 {q8, q9}, [r0, :128]! vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r5, :128]! vst1.16 {q14, q15}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r2, r2, r12, lsl #1 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r7, pc} endfunc // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_16bpc_neon, export=1 push {r4-r8, lr} ldr r4, [sp, #24] movrel r8, X(sm_weights) add r8, r8, r3 clz lr, r3 adr r5, L(ipred_smooth_h_tbl) add r12, r2, r3, lsl #1 sub lr, lr, #25 ldr lr, [r5, lr, lsl #2] vld1.16 {d4[], d5[]}, [r12] // right add r5, r5, lr add r6, r0, r1 lsl r1, r1, #1 bx r5 .align 2 L(ipred_smooth_h_tbl): .word 640f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 320f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 160f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 80f - L(ipred_smooth_h_tbl) + CONFIG_THUMB .word 40f - L(ipred_smooth_h_tbl) + CONFIG_THUMB 40: vld1.32 {d6[]}, [r8, :32] // weights_hor sub r2, r2, #8 mov r7, #-8 vshll.u8 q3, d6, #7 // weights_hor << 7 4: vld4.16 {d0[], d1[], d2[], d3[]}, [r2, :64], r7 // left vsub.i16 q0, q0, q2 // left-right vsub.i16 q1, q1, q2 subs r4, r4, #4 vqrdmulh.s16 q8, q1, q3 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q0, q3 // (left flipped) vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vst1.16 {d17}, [r0, :64], r1 vst1.16 {d16}, [r6, :64], r1 vst1.16 {d19}, [r0, :64], r1 vst1.16 {d18}, [r6, :64], r1 bgt 4b pop {r4-r8, pc} 80: vld1.8 {d6}, [r8, :64] // weights_hor sub r2, r2, #8 mov r7, #-8 vshll.u8 q3, d6, #7 // weights_hor << 7 8: vld1.16 {d23}, [r2, :64], r7 // left subs r4, r4, #4 vsub.i16 d23, d23, d4 // left-right vdup.16 q8, d23[3] // flip left vdup.16 q9, d23[2] vdup.16 q10, d23[1] vdup.16 q11, d23[0] vqrdmulh.s16 q8, q8, q3 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q9, q3 vqrdmulh.s16 q10, q10, q3 vqrdmulh.s16 q11, q11, q3 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vst1.16 {q8}, [r0, :128], r1 vst1.16 {q9}, [r6, :128], r1 vst1.16 {q10}, [r0, :128], r1 vst1.16 {q11}, [r6, :128], r1 bgt 8b pop {r4-r8, pc} 160: 320: 640: vpush {q4-q7} sub r2, r2, #8 mov r7, #-8 // Set up pointers for four rows in parallel; r0, r6, r5, lr add r5, r0, r1 add lr, r6, r1 lsl r1, r1, #1 sub r1, r1, r3, lsl #1 mov r12, r3 1: vld1.16 {d15}, [r2, :64], r7 // left vsub.i16 d15, d15, d4 // left-right vdup.16 q4, d15[3] // flip left vdup.16 q5, d15[2] vdup.16 q6, d15[1] vdup.16 q7, d15[0] 2: vld1.8 {q1}, [r8, :128]! 
// weights_hor subs r3, r3, #16 vshll.u8 q0, d2, #7 // weights_hor << 7 vshll.u8 q1, d3, #7 vqrdmulh.s16 q8, q0, q4 // ((left-right)*weights_hor + 128) >> 8 vqrdmulh.s16 q9, q1, q4 vqrdmulh.s16 q10, q0, q5 vqrdmulh.s16 q11, q1, q5 vqrdmulh.s16 q12, q0, q6 vqrdmulh.s16 q13, q1, q6 vqrdmulh.s16 q14, q0, q7 vqrdmulh.s16 q15, q1, q7 vadd.i16 q8, q8, q2 vadd.i16 q9, q9, q2 vadd.i16 q10, q10, q2 vadd.i16 q11, q11, q2 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q2 vadd.i16 q14, q14, q2 vadd.i16 q15, q15, q2 vst1.16 {q8, q9}, [r0, :128]! vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r5, :128]! vst1.16 {q14, q15}, [lr, :128]! bgt 2b subs r4, r4, #4 ble 9f sub r8, r8, r12 add r0, r0, r1 add r6, r6, r1 add r5, r5, r1 add lr, lr, r1 mov r3, r12 b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height, // const int bitdepth_max); .macro filter_fn bpc function ipred_filter_\bpc\()bpc_neon, export=1 movw r12, #511 ldrd r4, r5, [sp, #88] and r5, r5, r12 // 511 movrel r6, X(filter_intra_taps) lsl r5, r5, #6 add r6, r6, r5 vld1.8 {d20, d21, d22, d23}, [r6, :128]! clz lr, r3 adr r5, L(ipred_filter\bpc\()_tbl) vld1.8 {d27, d28, d29}, [r6, :64] sub lr, lr, #26 ldr lr, [r5, lr, lsl #2] vmovl.s8 q8, d20 vmovl.s8 q9, d21 add r5, r5, lr vmovl.s8 q10, d22 vmovl.s8 q11, d23 add r6, r0, r1 lsl r1, r1, #1 vmovl.s8 q12, d27 vmovl.s8 q13, d28 vmovl.s8 q14, d29 mov r7, #-4 vdup.16 q15, r8 add r8, r2, #2 sub r2, r2, #4 .if \bpc == 10 vmov.i16 q7, #0 .endif bx r5 .align 2 L(ipred_filter\bpc\()_tbl): .word 320f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 160f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 80f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB .word 40f - L(ipred_filter\bpc\()_tbl) + CONFIG_THUMB 40: vld1.16 {d0}, [r8] // top (0-3) 4: vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) .if \bpc == 10 vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vrshr.s16 q2, q2, #4 vmax.s16 q2, q2, q7 .else vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) vqrshrun.s32 d4, q2, #4 vqrshrun.s32 d5, q3, #4 .endif vmin.s16 q2, q2, q15 subs r4, r4, #2 vst1.16 {d4}, [r0, :64], r1 vst1.16 {d5}, [r6, :64], r1 vmov d0, d5 // move top from [4-7] to [0-3] bgt 4b vpop {q4-q7} pop {r4-r8, pc} 80: vld1.16 {q0}, [r8] // top (0-7) 8: vld1.16 {d2}, [r2], r7 // left (0-1) + topleft (2) .if \bpc == 10 vmul.i16 q2, q9, d0[0] // p1(top[0]) * filter(1) vmla.i16 q2, q10, d0[1] // p2(top[1]) * 
filter(2) vmla.i16 q2, q11, d0[2] // p3(top[2]) * filter(3) vmla.i16 q2, q12, d0[3] // p4(top[3]) * filter(4) vmla.i16 q2, q8, d2[2] // p0(topleft) * filter(0) vmla.i16 q2, q13, d2[1] // p5(left[0]) * filter(5) vmla.i16 q2, q14, d2[0] // p6(left[1]) * filter(6) vmul.i16 q3, q9, d1[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d1[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d1[2] // p3(top[2]) * filter(3) vrshr.s16 q2, q2, #4 vmax.s16 q2, q2, q7 vmin.s16 q2, q2, q15 vmla.i16 q3, q12, d1[3] // p4(top[3]) * filter(4) vmla.i16 q3, q8, d0[3] // p0(topleft) * filter(0) vmla.i16 q3, q13, d4[3] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d5[3] // p6(left[1]) * filter(6) vrshr.s16 q3, q3, #4 vmax.s16 q3, q3, q7 .else vmull.s16 q2, d18, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q2, d20, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q2, d22, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q2, d24, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q2, d16, d2[2] // p0(topleft) * filter(0) vmlal.s16 q2, d26, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q2, d28, d2[0] // p6(left[1]) * filter(6) vmull.s16 q3, d19, d0[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d21, d0[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d23, d0[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d25, d0[3] // p4(top[3]) * filter(4) vmlal.s16 q3, d17, d2[2] // p0(topleft) * filter(0) vmlal.s16 q3, d27, d2[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d29, d2[0] // p6(left[1]) * filter(6) vqrshrun.s32 d4, q2, #4 vmull.s16 q4, d18, d1[0] // p1(top[0]) * filter(1) vmlal.s16 q4, d20, d1[1] // p2(top[1]) * filter(2) vmlal.s16 q4, d22, d1[2] // p3(top[2]) * filter(3) vqrshrun.s32 d5, q3, #4 vmin.s16 q2, q2, q15 vmlal.s16 q4, d24, d1[3] // p4(top[3]) * filter(4) vmlal.s16 q4, d16, d0[3] // p0(topleft) * filter(0) vmlal.s16 q4, d26, d4[3] // p5(left[0]) * filter(5) vmlal.s16 q4, d28, d5[3] // p6(left[1]) * filter(6) vmull.s16 q5, d19, d1[0] // p1(top[0]) * filter(1) vmlal.s16 q5, d21, d1[1] // p2(top[1]) * filter(2) vmlal.s16 q5, d23, d1[2] // p3(top[2]) * filter(3) vmlal.s16 q5, d25, d1[3] // p4(top[3]) * filter(4) vmlal.s16 q5, d17, d0[3] // p0(topleft) * filter(0) vmlal.s16 q5, d27, d4[3] // p5(left[0]) * filter(5) vmlal.s16 q5, d29, d5[3] // p6(left[1]) * filter(6) vqrshrun.s32 d6, q4, #4 vqrshrun.s32 d7, q5, #4 .endif vmin.s16 q3, q3, q15 vswp d5, d6 subs r4, r4, #2 vst1.16 {q2}, [r0, :128], r1 vmov q0, q3 vst1.16 {q3}, [r6, :128], r1 bgt 8b vpop {q4-q7} pop {r4-r8, pc} 160: 320: sub r1, r1, r3, lsl #1 mov lr, r3 1: vld1.16 {d0}, [r2], r7 // left (0-1) + topleft (2) 2: vld1.16 {q1, q2}, [r8]! 
// top(0-15) .if \bpc == 10 vmul.i16 q3, q8, d0[2] // p0(topleft) * filter(0) vmla.i16 q3, q13, d0[1] // p5(left[0]) * filter(5) vmla.i16 q3, q14, d0[0] // p6(left[1]) * filter(6) vmla.i16 q3, q9, d2[0] // p1(top[0]) * filter(1) vmla.i16 q3, q10, d2[1] // p2(top[1]) * filter(2) vmla.i16 q3, q11, d2[2] // p3(top[2]) * filter(3) vmla.i16 q3, q12, d2[3] // p4(top[3]) * filter(4) vmul.i16 q4, q9, d3[0] // p1(top[0]) * filter(1) vmla.i16 q4, q10, d3[1] // p2(top[1]) * filter(2) vmla.i16 q4, q11, d3[2] // p3(top[2]) * filter(3) vrshr.s16 q3, q3, #4 vmax.s16 q3, q3, q7 vmin.s16 q3, q3, q15 vmla.i16 q4, q12, d3[3] // p4(top[3]) * filter(4) vmla.i16 q4, q8, d2[3] // p0(topleft) * filter(0) vmla.i16 q4, q13, d6[3] // p5(left[0]) * filter(5) vmla.i16 q4, q14, d7[3] // p6(left[1]) * filter(6) vmul.i16 q5, q9, d4[0] // p1(top[0]) * filter(1) vmla.i16 q5, q10, d4[1] // p2(top[1]) * filter(2) vmla.i16 q5, q11, d4[2] // p3(top[2]) * filter(3) vrshr.s16 q4, q4, #4 vmax.s16 q4, q4, q7 vmin.s16 q4, q4, q15 vmov q0, q4 vmla.i16 q5, q12, d4[3] // p4(top[3]) * filter(4) vmla.i16 q5, q8, d3[3] // p0(topleft) * filter(0) vmla.i16 q5, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q5, q14, d1[3] // p6(left[1]) * filter(6) vmul.i16 q6, q9, d5[0] // p1(top[0]) * filter(1) vmla.i16 q6, q10, d5[1] // p2(top[1]) * filter(2) vmla.i16 q6, q11, d5[2] // p3(top[2]) * filter(3) vrshr.s16 q5, q5, #4 vmax.s16 q5, q5, q7 vmin.s16 q5, q5, q15 vmov q0, q5 vmov.u16 r12, d5[3] vmla.i16 q6, q12, d5[3] // p4(top[3]) * filter(4) vmla.i16 q6, q8, d4[3] // p0(topleft) * filter(0) vmla.i16 q6, q13, d0[3] // p5(left[0]) * filter(5) vmla.i16 q6, q14, d1[3] // p6(left[1]) * filter(6) vmov.16 d0[2], r12 subs r3, r3, #16 vrshr.s16 q6, q6, #4 .else vmull.s16 q3, d16, d0[2] // p0(topleft) * filter(0) vmlal.s16 q3, d26, d0[1] // p5(left[0]) * filter(5) vmlal.s16 q3, d28, d0[0] // p6(left[1]) * filter(6) vmlal.s16 q3, d18, d2[0] // p1(top[0]) * filter(1) vmlal.s16 q3, d20, d2[1] // p2(top[1]) * filter(2) vmlal.s16 q3, d22, d2[2] // p3(top[2]) * filter(3) vmlal.s16 q3, d24, d2[3] // p4(top[3]) * filter(4) vmull.s16 q4, d17, d0[2] // p0(topleft) * filter(0) vmlal.s16 q4, d27, d0[1] // p5(left[0]) * filter(5) vmlal.s16 q4, d29, d0[0] // p6(left[1]) * filter(6) vmlal.s16 q4, d19, d2[0] // p1(top[0]) * filter(1) vmlal.s16 q4, d21, d2[1] // p2(top[1]) * filter(2) vmlal.s16 q4, d23, d2[2] // p3(top[2]) * filter(3) vmlal.s16 q4, d25, d2[3] // p4(top[3]) * filter(4) vqrshrun.s32 d6, q3, #4 vmull.s16 q5, d18, d3[0] // p1(top[0]) * filter(1) vmlal.s16 q5, d20, d3[1] // p2(top[1]) * filter(2) vqrshrun.s32 d7, q4, #4 vmin.s16 q3, q3, q15 vmlal.s16 q5, d22, d3[2] // p3(top[2]) * filter(3) vmlal.s16 q5, d24, d3[3] // p4(top[3]) * filter(4) vmlal.s16 q5, d16, d2[3] // p0(topleft) * filter(0) vmlal.s16 q5, d26, d6[3] // p5(left[0]) * filter(5) vmlal.s16 q5, d28, d7[3] // p6(left[1]) * filter(6) vmull.s16 q6, d19, d3[0] // p1(top[0]) * filter(1) vmlal.s16 q6, d21, d3[1] // p2(top[1]) * filter(2) vmlal.s16 q6, d23, d3[2] // p3(top[2]) * filter(3) vmlal.s16 q6, d25, d3[3] // p4(top[3]) * filter(4) vmlal.s16 q6, d17, d2[3] // p0(topleft) * filter(0) vmlal.s16 q6, d27, d6[3] // p5(left[0]) * filter(5) vmlal.s16 q6, d29, d7[3] // p6(left[1]) * filter(6) vqrshrun.s32 d8, q5, #4 vmull.s16 q7, d18, d4[0] // p1(top[0]) * filter(1) vmlal.s16 q7, d20, d4[1] // p2(top[1]) * filter(2) vmlal.s16 q7, d22, d4[2] // p3(top[2]) * filter(3) vqrshrun.s32 d9, q6, #4 vmin.s16 q0, q4, q15 vmlal.s16 q7, d24, d4[3] // p4(top[3]) * filter(4) vmlal.s16 q7, d16, d3[3] // p0(topleft) * 
filter(0) vmlal.s16 q7, d26, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q7, d28, d1[3] // p6(left[1]) * filter(6) vmin.s16 q4, q4, q15 vmull.s16 q6, d19, d4[0] // p1(top[0]) * filter(1) vmlal.s16 q6, d21, d4[1] // p2(top[1]) * filter(2) vmlal.s16 q6, d23, d4[2] // p3(top[2]) * filter(3) vmlal.s16 q6, d25, d4[3] // p4(top[3]) * filter(4) vmlal.s16 q6, d17, d3[3] // p0(topleft) * filter(0) vmlal.s16 q6, d27, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q6, d29, d1[3] // p6(left[1]) * filter(6) vqrshrun.s32 d10, q7, #4 vmull.s16 q1, d18, d5[0] // p1(top[0]) * filter(1) vmlal.s16 q1, d20, d5[1] // p2(top[1]) * filter(2) vmlal.s16 q1, d22, d5[2] // p3(top[2]) * filter(3) vqrshrun.s32 d11, q6, #4 vmin.s16 q0, q5, q15 vmlal.s16 q1, d24, d5[3] // p4(top[3]) * filter(4) vmlal.s16 q1, d16, d4[3] // p0(topleft) * filter(0) vmlal.s16 q1, d26, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q1, d28, d1[3] // p6(left[1]) * filter(6) vmin.s16 q5, q5, q15 vmov.u16 r12, d5[3] vmull.s16 q7, d19, d5[0] // p1(top[0]) * filter(1) vmlal.s16 q7, d21, d5[1] // p2(top[1]) * filter(2) vmlal.s16 q7, d23, d5[2] // p3(top[2]) * filter(3) vmlal.s16 q7, d25, d5[3] // p4(top[3]) * filter(4) vmlal.s16 q7, d17, d4[3] // p0(topleft) * filter(0) vmlal.s16 q7, d27, d0[3] // p5(left[0]) * filter(5) vmlal.s16 q7, d29, d1[3] // p6(left[1]) * filter(6) vmov.16 d0[2], r12 vqrshrun.s32 d12, q1, #4 subs r3, r3, #16 vqrshrun.s32 d13, q7, #4 .endif vswp q4, q5 .if \bpc == 10 vmax.s16 q6, q6, q7 .endif vswp d7, d10 vmin.s16 q6, q6, q15 vswp d9, d12 vst1.16 {q3, q4}, [r0, :128]! vst1.16 {q5, q6}, [r6, :128]! ble 8f vmov.u16 r12, d13[3] vmov.16 d0[0], r12 vmov.u16 r12, d9[3] vmov.16 d0[1], r12 b 2b 8: subs r4, r4, #2 ble 9f sub r8, r6, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr b 1b 9: vpop {q4-q7} pop {r4-r8, pc} endfunc .endm filter_fn 10 filter_fn 12 function ipred_filter_16bpc_neon, export=1 push {r4-r8, lr} vpush {q4-q7} movw r12, 0x3ff ldr r8, [sp, #104] cmp r8, r12 ble ipred_filter_10bpc_neon b ipred_filter_12bpc_neon endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] ldr r5, [sp, #16] vld1.16 {q14}, [r2, :128] clz lr, r4 adr r12, L(pal_pred_tbl) sub lr, lr, #25 ldr lr, [r12, lr, lsl #2] vmov.i16 q15, #0x100 add r12, r12, lr add r2, r0, r1 bx r12 .align 2 L(pal_pred_tbl): .word 640f - L(pal_pred_tbl) + CONFIG_THUMB .word 320f - L(pal_pred_tbl) + CONFIG_THUMB .word 160f - L(pal_pred_tbl) + CONFIG_THUMB .word 80f - L(pal_pred_tbl) + CONFIG_THUMB .word 40f - L(pal_pred_tbl) + CONFIG_THUMB 40: lsl r1, r1, #1 4: vld1.8 {q1}, [r3, :128]! subs r5, r5, #4 // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... vadd.i8 q0, q1, q1 vadd.i8 q1, q1, q1 vzip.8 q0, q1 vadd.i16 q0, q0, q15 vadd.i16 q1, q1, q15 vtbl.8 d0, {q14}, d0 vtbl.8 d1, {q14}, d1 vst1.16 {d0}, [r0, :64], r1 vtbl.8 d2, {q14}, d2 vst1.16 {d1}, [r2, :64], r1 vtbl.8 d3, {q14}, d3 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d3}, [r2, :64], r1 bgt 4b pop {r4-r5, pc} 80: lsl r1, r1, #1 8: vld1.8 {q1, q2}, [r3, :128]! subs r5, r5, #4 // Prefer doing the adds twice, instead of chaining a vmov after // the add. 
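        // As with the w=4 case above, the index bytes are doubled because the
        // palette in q14 holds 16-bit entries while vtbl.8 looks up single
        // bytes: each index i becomes the byte pair (2*i, 2*i+1). The vzip
        // duplicates 2*i into both bytes of a 16-bit lane, and adding q15
        // (#0x100 per lane) bumps the high byte to 2*i+1, so one vtbl.8 per
        // d register reassembles complete 16-bit pixels.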
vadd.i8 q0, q1, q1 vadd.i8 q1, q1, q1 vadd.i8 q3, q2, q2 vadd.i8 q2, q2, q2 vzip.8 q0, q1 vzip.8 q2, q3 vadd.i16 q0, q0, q15 vadd.i16 q1, q1, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q2, q2, q15 vtbl.8 d1, {q14}, d1 vadd.i16 q3, q3, q15 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vst1.16 {q0}, [r0, :128], r1 vtbl.8 d6, {q14}, d6 vst1.16 {q1}, [r2, :128], r1 vtbl.8 d7, {q14}, d7 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r2, :128], r1 bgt 8b pop {r4-r5, pc} 160: lsl r1, r1, #1 16: vld1.8 {q2, q3}, [r3, :128]! subs r5, r5, #4 vld1.8 {q10, q11}, [r3, :128]! vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128], r1 vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r2, :128], r1 vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r0, :128], r1 vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r2, :128], r1 bgt 16b pop {r4-r5, pc} 320: lsl r1, r1, #1 sub r1, r1, #32 32: vld1.8 {q2, q3}, [r3, :128]! subs r5, r5, #2 vld1.8 {q10, q11}, [r3, :128]! vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128]! vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r0, :128], r1 vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r2, :128]! vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r2, :128], r1 bgt 32b pop {r4-r5, pc} 640: sub r1, r1, #96 64: vld1.8 {q2, q3}, [r3, :128]! subs r5, r5, #1 vld1.8 {q10, q11}, [r3, :128]! vadd.i8 q0, q2, q2 vadd.i8 q1, q2, q2 vadd.i8 q2, q3, q3 vadd.i8 q3, q3, q3 vadd.i8 q8, q10, q10 vadd.i8 q9, q10, q10 vadd.i8 q10, q11, q11 vzip.8 q0, q1 vadd.i8 q11, q11, q11 vzip.8 q2, q3 vzip.8 q8, q9 vadd.i16 q0, q0, q15 vzip.8 q10, q11 vadd.i16 q1, q1, q15 vadd.i16 q2, q2, q15 vadd.i16 q3, q3, q15 vadd.i16 q8, q8, q15 vadd.i16 q9, q9, q15 vadd.i16 q10, q10, q15 vtbl.8 d0, {q14}, d0 vadd.i16 q11, q11, q15 vtbl.8 d1, {q14}, d1 vtbl.8 d2, {q14}, d2 vtbl.8 d3, {q14}, d3 vtbl.8 d4, {q14}, d4 vtbl.8 d5, {q14}, d5 vtbl.8 d6, {q14}, d6 vtbl.8 d7, {q14}, d7 vtbl.8 d16, {q14}, d16 vtbl.8 d17, {q14}, d17 vtbl.8 d18, {q14}, d18 vst1.16 {q0, q1}, [r0, :128]! vtbl.8 d19, {q14}, d19 vtbl.8 d20, {q14}, d20 vst1.16 {q2, q3}, [r0, :128]! vtbl.8 d21, {q14}, d21 vtbl.8 d22, {q14}, d22 vst1.16 {q8, q9}, [r0, :128]! 
vtbl.8 d23, {q14}, d23 vst1.16 {q10, q11}, [r0, :128], r1 bgt 64b pop {r4-r5, pc} endfunc // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_128_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] clz lr, r3 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_128_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vrshr.u16 q0, q15, #1 vdup.16 q1, r6 // alpha add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r12 .align 2 L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w16) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w8) - L(ipred_cfl_128_tbl) + CONFIG_THUMB .word L(ipred_cfl_splat_w4) - L(ipred_cfl_128_tbl) + CONFIG_THUMB L(ipred_cfl_splat_w4): vld1.16 {q8, q9}, [r5, :128]! vmull.s16 q2, d16, d2 // diff = ac * alpha vmull.s16 q3, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q8, #31 vshr.s32 q13, q9, #31 vadd.i32 q2, q2, q10 // diff + sign vadd.i32 q3, q3, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q8, #6 vrshrn.i32 d7, q9, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vst1.16 {d4}, [r0, :64], r1 vst1.16 {d5}, [r6, :64], r1 subs r4, r4, #4 vst1.16 {d6}, [r0, :64], r1 vst1.16 {d7}, [r6, :64], r1 bgt L(ipred_cfl_splat_w4) pop {r4-r8, pc} L(ipred_cfl_splat_w8): vld1.16 {q8, q9}, [r5, :128]! subs r4, r4, #2 vmull.s16 q2, d16, d2 // diff = ac * alpha vmull.s16 q3, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q8, #31 vshr.s32 q13, q9, #31 vadd.i32 q2, q2, q10 // diff + sign vadd.i32 q3, q3, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q8, #6 vrshrn.i32 d7, q9, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vadd.i16 q3, q3, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r6, :128], r1 bgt L(ipred_cfl_splat_w8) pop {r4-r8, pc} L(ipred_cfl_splat_w16): vpush {q4-q7} add r12, r5, r3, lsl #1 sub r1, r1, r3, lsl #1 mov lr, r3 1: vld1.16 {q6, q7}, [r5, :128]! vmull.s16 q2, d12, d2 // diff = ac * alpha vld1.16 {q8, q9}, [r12, :128]! 
vmull.s16 q3, d13, d3 vmull.s16 q4, d14, d2 vmull.s16 q5, d15, d3 vmull.s16 q6, d16, d2 vmull.s16 q7, d17, d3 vmull.s16 q8, d18, d2 vmull.s16 q9, d19, d3 vshr.s32 q10, q2, #31 // sign = diff >> 15 vshr.s32 q11, q3, #31 vshr.s32 q12, q4, #31 vshr.s32 q13, q5, #31 vadd.i32 q2, q2, q10 // diff + sign vshr.s32 q10, q6, #31 vadd.i32 q3, q3, q11 vshr.s32 q11, q7, #31 vadd.i32 q4, q4, q12 vshr.s32 q12, q8, #31 vadd.i32 q5, q5, q13 vshr.s32 q13, q9, #31 vadd.i32 q6, q6, q10 vadd.i32 q7, q7, q11 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vrshrn.i32 d4, q2, #6 // (diff + sign + 32) >> 6 = apply_sign() vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q4, #6 vrshrn.i32 d7, q5, #6 vadd.i16 q2, q2, q0 // dc + apply_sign() vrshrn.i32 d8, q6, #6 vrshrn.i32 d9, q7, #6 vadd.i16 q3, q3, q0 vrshrn.i32 d10, q8, #6 vrshrn.i32 d11, q9, #6 vadd.i16 q4, q4, q0 vadd.i16 q5, q5, q0 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmax.s16 q4, q4, q14 vmax.s16 q5, q5, q14 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vmin.s16 q4, q4, q15 vmin.s16 q5, q5, q15 subs r3, r3, #16 vst1.16 {q2, q3}, [r0, :128]! vst1.16 {q4, q5}, [r6, :128]! bgt 1b subs r4, r4, #2 add r5, r5, lr, lsl #1 add r12, r12, lr, lsl #1 add r0, r0, r1 add r6, r6, r1 mov r3, lr bgt 1b vpop {q4-q7} pop {r4-r8, pc} endfunc // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_top_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] clz lr, r3 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_top_tbl) sub lr, lr, #26 ldr lr, [r12, lr, lsl #2] vdup.16 q1, r6 // alpha add r2, r2, #2 add r12, r12, lr add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r12 .align 2 L(ipred_cfl_top_tbl): .word 32f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 16f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 8f - L(ipred_cfl_top_tbl) + CONFIG_THUMB .word 4f - L(ipred_cfl_top_tbl) + CONFIG_THUMB 4: vld1.16 {d0}, [r2] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) 8: vld1.16 {q0}, [r2] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) 16: vld1.16 {q2, q3}, [r2] vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) 32: vld1.16 {q8, q9}, [r2]! 
vld1.16 {q10, q11}, [r2] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q8, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d0, q0, #5 vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_left_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] sub r2, r2, r4, lsl #1 clz lr, r3 clz r8, r4 vdup.16 q15, r7 // bitdepth_max adr r12, L(ipred_cfl_splat_tbl) adr r7, L(ipred_cfl_left_tbl) sub lr, lr, #26 sub r8, r8, #26 ldr lr, [r12, lr, lsl #2] ldr r8, [r7, r8, lsl #2] vdup.16 q1, r6 // alpha add r12, r12, lr add r7, r7, r8 add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r7 .align 2 L(ipred_cfl_left_tbl): .word L(ipred_cfl_left_h32) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h16) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h8) - L(ipred_cfl_left_tbl) + CONFIG_THUMB .word L(ipred_cfl_left_h4) - L(ipred_cfl_left_tbl) + CONFIG_THUMB L(ipred_cfl_left_h4): vld1.16 {d0}, [r2, :64] vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #2 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h8): vld1.16 {q0}, [r2, :128] vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #3 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h16): vld1.16 {q2, q3}, [r2, :128] vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpadd.i16 d0, d0, d0 vrshr.u16 d0, d0, #4 vdup.16 q0, d0[0] bx r12 L(ipred_cfl_left_h32): vld1.16 {q8, q9}, [r2, :128]! vld1.16 {q10, q11}, [r2, :128] vadd.i16 q8, q8, q9 vadd.i16 q10, q10, q11 vadd.i16 q0, q8, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 vpaddl.u16 d0, d0 vrshrn.i32 d0, q0, #5 vdup.16 q0, d0[0] bx r12 endfunc // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_16bpc_neon, export=1 push {r4-r8, lr} ldrd r4, r5, [sp, #24] ldrd r6, r7, [sp, #32] sub r2, r2, r4, lsl #1 add r8, r3, r4 // width + height vdup.16 q1, r6 // alpha clz lr, r3 clz r6, r4 vdup.32 d16, r8 // width + height vdup.16 q15, r7 // bitdepth_max adr r7, L(ipred_cfl_tbl) rbit r8, r8 // rbit(width + height) sub lr, lr, #22 // 26 leading bits, minus table offset 4 sub r6, r6, #26 clz r8, r8 // ctz(width + height) ldr lr, [r7, lr, lsl #2] ldr r6, [r7, r6, lsl #2] neg r8, r8 // -ctz(width + height) add r12, r7, lr add r7, r7, r6 vshr.u32 d16, d16, #1 // (width + height) >> 1 vdup.32 d17, r8 // -ctz(width + height) add r6, r0, r1 lsl r1, r1, #1 vmov.i16 q14, #0 bx r7 .align 2 L(ipred_cfl_tbl): .word L(ipred_cfl_h32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_h4) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w32) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w16) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w8) - L(ipred_cfl_tbl) + CONFIG_THUMB .word L(ipred_cfl_w4) - L(ipred_cfl_tbl) + CONFIG_THUMB L(ipred_cfl_h4): vld1.16 {d0}, [r2, :64]! 
vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w4): vld1.16 {d1}, [r2] vadd.i32 d0, d0, d16 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #4 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 8/16 cmp r4, #16 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): vld1.16 {q0}, [r2, :128]! vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w8): vld1.16 {q2}, [r2] vadd.i32 d0, d0, d16 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #8 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 4/16/32 cmp r4, #32 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): vld1.16 {q2, q3}, [r2, :128]! vadd.i16 q0, q2, q3 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w16): vld1.16 {q2, q3}, [r2] vadd.i32 d0, d0, d16 vadd.i16 q2, q2, q3 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #16 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 4/8/32/64 tst r4, #(32+16+8) // 16 added to make a consecutive bitmask movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): vld1.16 {q2, q3}, [r2, :128]! vld1.16 {q10, q11}, [r2, :128]! vadd.i16 q2, q2, q3 vadd.i16 q10, q10, q11 vadd.i16 q0, q2, q10 vadd.i16 d0, d0, d1 vpadd.i16 d0, d0, d0 add r2, r2, #2 vpaddl.u16 d0, d0 bx r12 L(ipred_cfl_w32): vld1.16 {q2, q3}, [r2]! vadd.i32 d0, d0, d16 vld1.16 {q10, q11}, [r2]! vadd.i16 q2, q2, q3 vadd.i16 q10, q10, q11 vadd.i16 q2, q2, q10 vadd.i16 d1, d4, d5 vpadd.i16 d1, d1, d1 vpaddl.u16 d1, d1 cmp r4, #32 vadd.i32 d0, d0, d1 vshl.u32 d0, d0, d17 beq 1f // h = 8/16/64 cmp r4, #8 movw lr, #0x6667 movw r8, #0xAAAB it ne movne lr, r8 vdup.32 d18, lr vmul.i32 d0, d0, d18 vshr.u32 d0, d0, #17 1: vdup.16 q0, d0[0] b L(ipred_cfl_splat_w16) endfunc // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_420_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i32 q8, #0 vmov.i32 q9, #0 vmov.i32 q10, #0 vmov.i32 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_420_tbl): .word L(ipred_cfl_ac_420_w16) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w8) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w4) - L(ipred_cfl_ac_420_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w4): 1: // Copy and subsample input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #1 subs r8, r8, #2 vst1.16 {q0}, [r0, :128]! 
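        // The widened sums gathered in q8-q11 here and below are reduced to
        // the block mean in L(ipred_cfl_ac_420_w4_calc_subtract_dc) and then
        // subtracted from the stored AC samples, which is why every store in
        // these loops is paired with vaddw accumulations.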
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 bgt 1b cmp r4, #0 vmov d0, d1 vmov d2, d1 vmov d3, d1 L(ipred_cfl_ac_420_w4_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums vadd.i32 q8, q8, q9 vadd.i32 q10, q10, q11 vadd.i32 q0, q8, q10 vadd.i32 d0, d0, d1 vpadd.i32 d0, d0, d0 // sum sub r0, r0, r6, lsl #3 vrshl.u32 d16, d0, d31 // (sum + (1 << (log2sz - 1))) >>= log2sz vdup.16 q8, d16[0] 6: // Subtract dc from ac vld1.16 {q0, q1}, [r0, :128] subs r6, r6, #4 vsub.i16 q0, q0, q8 vsub.i16 q1, q1, q8 vst1.16 {q0, q1}, [r0, :128]! bgt 6b pop {r4-r8, pc} L(ipred_cfl_ac_420_w8): cmp r3, #0 bne L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vld1.16 {q12, q13}, [r1, :128], r2 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vadd.i16 q12, q12, q2 vadd.i16 q13, q13, q3 vpadd.i16 d2, d24, d25 vpadd.i16 d3, d26, d27 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov q0, q1 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vadd.i16 q0, q0, q1 vadd.i16 q2, q2, q3 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #1 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov q0, q1 L(ipred_cfl_ac_420_w8_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl r6, r6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): adr r7, L(ipred_cfl_ac_420_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_420_w16_tbl): .word L(ipred_cfl_ac_420_w16_wpad0) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad1) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad2) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_420_w16_wpad3) - L(ipred_cfl_ac_420_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_420_w16_wpad0): sub r2, r2, #32 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q12, q13}, [r12, :128]! vld1.16 {q2, q3}, [r1, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vld1.16 {q12, q13}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vadd.i16 q2, q2, q12 vadd.i16 q3, q3, q13 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vshl.i16 q0, q0, #1 vshl.i16 q1, q1, #1 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): sub r2, r2, #32 1: // Copy and subsample input, padding 4 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q12, q13}, [r12, :128]! vld1.16 {q2}, [r1, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vld1.16 {q12}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vadd.i16 q2, q2, q12 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vshl.i16 q0, q0, #1 vshl.i16 d2, d2, #1 subs r8, r8, #1 vdup.16 d3, d2[3] vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q12, q13}, [r12, :128], r2 vadd.i16 q0, q0, q12 vadd.i16 q1, q1, q13 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vshl.i16 q0, q0, #1 subs r8, r8, #1 vdup.16 q1, d1[3] vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q12}, [r12, :128], r2 vadd.i16 q0, q0, q12 vpadd.i16 d0, d0, d1 vshl.i16 d0, d0, #1 subs r8, r8, #1 vdup.16 q1, d0[3] vdup.16 d1, d0[3] vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl r6, r6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_422_tbl) sub r8, r8, #27 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_422_tbl): .word L(ipred_cfl_ac_422_w16) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w8) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w4) - L(ipred_cfl_ac_422_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w4): 1: // Copy and subsample input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! 
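        // Note that the 4:2:0, 4:2:2 and 4:4:4 variants all store AC samples
        // scaled by 8: a sum of four samples shifted left by 1, a sum of two
        // samples shifted left by 2 (as here), or a single sample shifted
        // left by 3, so the shared DC calculation and subtraction operate on
        // the same fixed-point scale for every layout.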
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): cmp r3, #0 bne L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vld1.16 {q12, q13}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d4, d5 vpadd.i16 d3, d6, d7 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d24, d24, d25 vpadd.i16 d25, d26, d27 vpadd.i16 d26, d4, d5 vpadd.i16 d27, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q12, #2 vshl.i16 q3, q13, #2 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vld1.16 {q12}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d24, d24, d25 vpadd.i16 d25, d4, d5 vshl.i16 q0, q0, #2 vshl.i16 q12, q12, #2 vdup.16 d7, d25[3] vmov d6, d25 vdup.16 d5, d24[3] vmov d4, d24 vdup.16 d3, d1[3] vmov d2, d1 vdup.16 d1, d0[3] subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): adr r7, L(ipred_cfl_ac_422_w16_tbl) ldr r3, [r7, r3, lsl #2] add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_422_w16_tbl): .word L(ipred_cfl_ac_422_w16_wpad0) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad1) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad2) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_422_w16_wpad3) - L(ipred_cfl_ac_422_w16_tbl) + CONFIG_THUMB L(ipred_cfl_ac_422_w16_wpad0): sub r2, r2, #32 1: // Copy and subsample input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r12, :128]! vld1.16 {q12, q13}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d24, d25 vpadd.i16 d3, d26, d27 vld1.16 {q12, q13}, [r12, :128], r2 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vpadd.i16 d6, d24, d25 vpadd.i16 d7, d26, d27 vshl.i16 q0, q0, #2 vshl.i16 q1, q1, #2 vshl.i16 q2, q2, #2 vshl.i16 q3, q3, #2 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): sub r2, r2, #32 1: // Copy and subsample input, padding 4 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r12, :128]! vld1.16 {q12}, [r1, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d2, d24, d25 vld1.16 {q12}, [r12, :128], r2 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vpadd.i16 d6, d24, d25 vshl.i16 q0, q0, #2 vshl.i16 d2, d2, #2 vshl.i16 q2, q2, #2 vshl.i16 d6, d6, #2 vdup.16 d3, d2[3] vdup.16 d7, d6[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! 
vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): 1: // Copy and subsample input, padding 8 vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d2, d3 vpadd.i16 d4, d4, d5 vpadd.i16 d5, d6, d7 vshl.i16 q0, q0, #2 vshl.i16 q2, q2, #2 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): 1: // Copy and subsample input, padding 12 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vpadd.i16 d0, d0, d1 vpadd.i16 d1, d4, d5 vshl.i16 q0, q0, #2 vdup.16 q3, d1[3] vdup.16 q1, d0[3] vdup.16 d5, d1[3] vmov d4, d1 vdup.16 d1, d0[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) endfunc // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 push {r4-r8,lr} ldrd r4, r5, [sp, #24] ldr r6, [sp, #32] clz r8, r5 lsl r4, r4, #2 adr r7, L(ipred_cfl_ac_444_tbl) sub r8, r8, #26 ldr r8, [r7, r8, lsl #2] vmov.i16 q8, #0 vmov.i16 q9, #0 vmov.i16 q10, #0 vmov.i16 q11, #0 add r7, r7, r8 sub r8, r6, r4 // height - h_pad rbit lr, r5 // rbit(width) rbit r12, r6 // rbit(height) clz lr, lr // ctz(width) clz r12, r12 // ctz(height) add lr, lr, r12 // log2sz add r12, r1, r2 vdup.32 d31, lr lsl r2, r2, #1 vneg.s32 d31, d31 // -log2sz bx r7 .align 2 L(ipred_cfl_ac_444_tbl): .word L(ipred_cfl_ac_444_w32) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w16) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w8) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w4) - L(ipred_cfl_ac_444_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w4): 1: // Copy and expand input vld1.16 {d0}, [r1, :64], r2 vld1.16 {d1}, [r12, :64], r2 vld1.16 {d2}, [r1, :64], r2 vld1.16 {d3}, [r12, :64], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 bgt 1b cmp r4, #0 vmov d0, d3 vmov d1, d3 vmov d2, d3 b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): 1: // Copy and expand input vld1.16 {q0}, [r1, :128], r2 vld1.16 {q1}, [r12, :128], r2 vld1.16 {q2}, [r1, :128], r2 vld1.16 {q3}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #4 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! 
vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q3 vmov q1, q3 b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): cmp r3, #0 bne L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding vld1.16 {q0, q1}, [r1, :128], r2 vld1.16 {q2, q3}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 vld1.16 {q0}, [r1, :128], r2 vld1.16 {q2}, [r12, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q2, q2, #3 vdup.16 q1, d1[3] vdup.16 q3, d5[3] subs r8, r8, #2 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 vmov q0, q2 vmov q1, q3 b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): adr r7, L(ipred_cfl_ac_444_w32_tbl) ldr r3, [r7, r3, lsl #1] // (w3>>1) << 2 asr r2, r2, #1 add r7, r7, r3 bx r7 .align 2 L(ipred_cfl_ac_444_w32_tbl): .word L(ipred_cfl_ac_444_w32_wpad0) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad2) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad4) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB .word L(ipred_cfl_ac_444_w32_wpad6) - L(ipred_cfl_ac_444_w32_tbl) + CONFIG_THUMB L(ipred_cfl_ac_444_w32_wpad0): sub r2, r2, #32 1: // Copy and expand input, without padding vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2, q3}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 vshl.i16 q3, q3, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): sub r2, r2, #32 1: // Copy and expand input, padding 8 vld1.16 {q0, q1}, [r1, :128]! vld1.16 {q2}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 vshl.i16 q2, q2, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vdup.16 q3, d5[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): 1: // Copy and expand input, padding 16 vld1.16 {q0, q1}, [r1, :128], r2 vshl.i16 q0, q0, #3 vshl.i16 q1, q1, #3 subs r8, r8, #1 vst1.16 {q0, q1}, [r0, :128]! vdup.16 q2, d3[3] vdup.16 q3, d3[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): 1: // Copy and expand input, padding 24 vld1.16 {q0}, [r1, :128], r2 vshl.i16 q0, q0, #3 subs r8, r8, #1 vdup.16 q1, d1[3] vst1.16 {q0, q1}, [r0, :128]! 
vdup.16 q2, d1[3] vdup.16 q3, d1[3] vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 1b cmp r4, #0 L(ipred_cfl_ac_444_w32_hpad): beq 3f // This assumes that all callers already did "cmp r4, #0" 2: // Vertical padding (h_pad > 0) subs r4, r4, #1 vst1.16 {q0, q1}, [r0, :128]! vaddw.u16 q8, q8, d0 vaddw.u16 q9, q9, d1 vaddw.u16 q10, q10, d2 vaddw.u16 q11, q11, d3 vst1.16 {q2, q3}, [r0, :128]! vaddw.u16 q8, q8, d4 vaddw.u16 q9, q9, d5 vaddw.u16 q10, q10, d6 vaddw.u16 q11, q11, d7 bgt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl r6, r6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) endfunc rav1e-0.7.1/src/arm/32/itx.S000064400000000000000000003374361046102023000134110ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); // Most of the functions use the following register layout: // r0-r3 external parameters // r4 function pointer to first transform // r5 function pointer to second transform // r6 output parameter for helper function // r7 input parameter for helper function // r8 input stride for helper function // r9 scratch variable for helper functions // r10-r11 pointer to list of eob thresholds, eob threshold value, // scratch variables within helper functions (backed up) // The SIMD registers most often use the following layout: // d0-d3 multiplication coefficients // d4-d7 scratch registers // d8-d15 unused in some transforms, used for scratch registers in others // d16-v31 inputs/outputs of transforms // Potential further optimizations, that are left unimplemented for now: // - Trying to keep multiplication coefficients in registers across multiple // transform functions. (The register layout is designed to potentially // allow this.) 
// - Use a simplified version of the transforms themselves for cases where // we know a significant number of inputs are zero. E.g. if the eob value // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. const idct_coeffs, align=4 // idct4 .short 2896, 2896*8, 1567, 3784 // idct8 .short 799, 4017, 3406, 2276 // idct16 .short 401, 4076, 3166, 2598 .short 1931, 3612, 3920, 1189 // idct32 .short 201, 4091, 3035, 2751 .short 1751, 3703, 3857, 1380 .short 995, 3973, 3513, 2106 .short 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .short 101*8, 4095*8, 2967*8, -2824*8 .short 1660*8, 3745*8, 3822*8, -1474*8 .short 4076, 401, 4017, 799 .short 4036*8, -700*8, 2359*8, 3349*8 .short 3461*8, -2191*8, 897*8, 3996*8 .short -3166, -2598, -799, -4017 .short 501*8, 4065*8, 3229*8, -2520*8 .short 2019*8, 3564*8, 3948*8, -1092*8 .short 3612, 1931, 2276, 3406 .short 4085*8, -301*8, 2675*8, 3102*8 .short 3659*8, -1842*8, 1285*8, 3889*8 .short -3920, -1189, -3406, -2276 endconst const iadst4_coeffs, align=4 // .h[4-5] can be interpreted as .s[2] .short 1321, 3803, 2482, 3344, 3344, 0 endconst const iadst8_coeffs, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 // idct_coeffs .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst const iadst16_coeffs, align=4 .short 4091, 201, 3973, 995 .short 3703, 1751, 3290, 2440 .short 2751, 3035, 2106, 3513 .short 1380, 3857, 601, 4052 endconst .macro vmull_vmlal d0, s0, s1, c0, c1 vmull.s16 \d0, \s0, \c0 vmlal.s16 \d0, \s1, \c1 .endm .macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 vmull.s16 \d0, \s0, \c0 vmlal.s16 \d0, \s2, \c1 vmull.s16 \d1, \s1, \c0 vmlal.s16 \d1, \s3, \c1 .endm .macro vmull_vmlsl d0, s0, s1, c0, c1 vmull.s16 \d0, \s0, \c0 vmlsl.s16 \d0, \s1, \c1 .endm .macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 vmull.s16 \d0, \s0, \c0 vmlsl.s16 \d0, \s2, \c1 vmull.s16 \d1, \s1, \c0 vmlsl.s16 \d1, \s3, \c1 .endm .macro vqrshrn_8h d0, d1, s0, s1, shift vqrshrn.s32 \d0, \s0, \shift vqrshrn.s32 \d1, \s1, \shift .endm .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 vqrdmulh.s16 \r0, \r0, \c vqrdmulh.s16 \r1, \r1, \c .ifnb \r2 vqrdmulh.s16 \r2, \r2, \c vqrdmulh.s16 \r3, \r3, \c .endif .ifnb \r4 vqrdmulh.s16 \r4, \r4, \c vqrdmulh.s16 \r5, \r5, \c vqrdmulh.s16 \r6, \r6, \c vqrdmulh.s16 \r7, \r7, \c .endif .endm .macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 .ifnb \load vld1.8 {\load}, [\src, :64], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \addsrc vaddw.u8 \adddst, \adddst, \addsrc .endif .ifnb \narrowsrc vqmovun.s16 \narrowdst, \narrowsrc .endif .ifnb \store vst1.8 {\store}, [\dst, :64], r1 .endif .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst load_add_store d2, q8, , , , , , \dst, \src, \shiftbits load_add_store d3, q9, , , , , , \dst, \src, \shiftbits load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits load_add_store , , , , q15, 
d3, d2, \dst, \src, \shiftbits load_add_store , , , , , , d3, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src mov \src, \dst load_add_store d2, q8, , , , , , \dst, \src load_add_store d3, q9, , , , , , \dst, \src load_add_store d4, q10, d2, q8, , , , \dst, \src load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src load_add_store , , d4, q10, q9, d3, d2, \dst, \src load_add_store , , d5, q11, q10, d4, d3, \dst, \src load_add_store , , , , q11, d5, d4, \dst, \src load_add_store , , , , , , d5, \dst, \src .endm .macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src .ifnb \load vld1.32 {\load[0]}, [\src, :32], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #4 .endif .ifnb \load vld1.32 {\load[1]}, [\src, :32], r1 .endif .ifnb \addsrc vaddw.u8 \adddst, \adddst, \addsrc .endif .ifnb \store vst1.32 {\store[0]}, [\dst, :32], r1 .endif .ifnb \narrowsrc vqmovun.s16 \narrowdst, \narrowsrc .endif .ifnb \store vst1.32 {\store[1]}, [\dst, :32], r1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst load_add_store4 d0, , , , , , , \dst, \src load_add_store4 d1, q8, , , , , , \dst, \src load_add_store4 d2, q9, d0, q8, , , , \dst, \src load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src load_add_store4 , , , , q15, d7, d6, \dst, \src load_add_store4 , , , , , , d7, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst load_add_store4 d0, , , , , , , \dst, \src load_add_store4 d1, q8, , , , , , \dst, \src load_add_store4 d2, q9, d0, q8, , , , \dst, \src load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src load_add_store4 , , , , q11, d3, d2, \dst, \src load_add_store4 , , , , , , d3, \dst, \src .endm .macro idct_dc w, h, shift cmp r3, #0 bne 1f vmov.i16 d30, #0 movw r12, #2896*8 vld1.16 {d16[]}, [r2, :16] vdup.16 d0, r12 vqrdmulh.s16 d16, d16, d0[0] vst1.16 {d30[0]}, [r2, :16] .if (\w == 2*\h) || (2*\w == \h) vqrdmulh.s16 d16, d16, d0[0] .endif .if \shift > 0 vrshr.s16 d16, d16, #\shift .endif vqrdmulh.s16 d20, d16, d0[0] mov r3, #\h vrshr.s16 d16, d20, #4 vrshr.s16 d17, d20, #4 b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon 1: vld1.32 {d0[0]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[0]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 subs r3, r3, #4 sub r0, r0, r1, lsl #2 vaddw.u8 q10, q8, d0 vqmovun.s16 d0, q10 vaddw.u8 q11, q8, d1 vst1.32 {d0[0]}, [r0, :32], r1 vqmovun.s16 d1, q11 vst1.32 {d0[1]}, [r0, :32], r1 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 bgt 1b bx lr endfunc function idct_dc_w8_neon 1: vld1.8 {d0}, [r0, :64], r1 vld1.8 {d1}, [r0, :64], r1 vld1.8 {d2}, [r0, :64], r1 vaddw.u8 q10, q8, d0 vld1.8 {d3}, [r0, :64], r1 sub r0, r0, r1, lsl #2 subs r3, r3, #4 vaddw.u8 q11, q8, d1 vqmovun.s16 d0, q10 vaddw.u8 q12, q8, d2 vqmovun.s16 d1, q11 vaddw.u8 q13, q8, d3 vst1.8 {d0}, [r0, :64], r1 vqmovun.s16 d2, q12 vst1.8 {d1}, [r0, :64], r1 vqmovun.s16 d3, q13 vst1.8 {d2}, [r0, :64], r1 vst1.8 {d3}, [r0, :64], r1 bgt 1b bx lr endfunc function idct_dc_w16_neon 1: vld1.8 {q0}, [r0, :128], r1 vld1.8 {q1}, [r0, :128], r1 vld1.8 {q2}, [r0, :128], r1 
subs r3, r3, #4 vaddw.u8 q10, q8, d0 vaddw.u8 q11, q8, d1 vld1.8 {q3}, [r0, :128], r1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, r1, lsl #2 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0}, [r0, :128], r1 vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q1}, [r0, :128], r1 vst1.8 {q2}, [r0, :128], r1 vst1.8 {q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w32_neon 1: vld1.8 {q0, q1}, [r0, :128], r1 subs r3, r3, #2 vld1.8 {q2, q3}, [r0, :128], r1 vaddw.u8 q10, q8, d0 vaddw.u8 q11, q8, d1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, r1, lsl #1 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0, q1}, [r0, :128], r1 vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w64_neon sub r1, r1, #32 1: vld1.8 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.8 {q2, q3}, [r0, :128] vaddw.u8 q10, q8, d0 vaddw.u8 q11, q8, d1 vaddw.u8 q12, q8, d2 vaddw.u8 q13, q8, d3 sub r0, r0, #32 vaddw.u8 q14, q8, d4 vaddw.u8 q15, q8, d5 vqmovun.s16 d0, q10 vqmovun.s16 d1, q11 vaddw.u8 q10, q8, d6 vaddw.u8 q11, q8, d7 vqmovun.s16 d2, q12 vqmovun.s16 d3, q13 vqmovun.s16 d4, q14 vqmovun.s16 d5, q15 vst1.8 {q0, q1}, [r0, :128]! vqmovun.s16 d6, q10 vqmovun.s16 d7, q11 vst1.8 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc .macro iwht4 vadd.i16 d16, d16, d17 vsub.i16 d21, d18, d19 vsub.i16 d20, d16, d21 vshr.s16 d20, d20, #1 vsub.i16 d18, d20, d17 vsub.i16 d17, d20, d19 vadd.i16 d19, d21, d18 vsub.i16 d16, d16, d17 .endm .macro idct_4h_x4 r0, r1, r2, r3 vmull_vmlal q3, \r1, \r3, d0[3], d0[2] vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] vmull_vmlal q1, \r0, \r2, d0[0], d0[0] vqrshrn.s32 d6, q3, #12 vqrshrn.s32 d7, q2, #12 vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] vqrshrn.s32 d2, q1, #12 vqrshrn.s32 d3, q2, #12 vqadd.s16 \r0, d2, d6 vqsub.s16 \r3, d2, d6 vqadd.s16 \r1, d3, d7 vqsub.s16 \r2, d3, d7 .endm .macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] vqrshrn_8h d12, d13, q6, q7, #12 vqrshrn_8h d14, d15, q4, q5, #12 vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] vqrshrn_8h d4, d5, q2, q3, #12 vqrshrn_8h d6, d7, q4, q5, #12 vqadd.s16 \q0, q2, q6 vqsub.s16 \q3, q2, q6 vqadd.s16 \q1, q3, q7 vqsub.s16 \q2, q3, q7 .endm function inv_dct_4h_x4_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] idct_4h_x4 d16, d17, d18, d19 bx lr endfunc function inv_dct_8h_x4_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel_local r12, iadst4_coeffs vld1.16 {d0, d1}, [r12, :128] vsubl.s16 q1, d16, d18 vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d19, d0[2] vmull.s16 q10, d17, d0[3] vaddw.s16 q1, q1, d19 vmull.s16 q3, d16, d0[2] vmlsl.s16 q3, d18, d0[0] vmlsl.s16 q3, d19, d0[1] vadd.s32 q11, q2, q3 vmul.s32 q1, q1, d1[0] vadd.s32 q2, q2, q10 vadd.s32 q3, q3, q10 vsub.s32 q11, q11, q10 vqrshrn.s32 \o0, q2, #12 vqrshrn.s32 \o2, q1, #12 vqrshrn.s32 \o1, q3, #12 vqrshrn.s32 \o3, q11, #12 .endm 
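// Quick reference for the multiply/round pattern used by the transform
// macros above (a sketch, assuming the coefficients are the usual 12-bit
// fixed-point cosine/sine constants from the tables at the top):
// vmull_vmlal/vmull_vmlsl build 32-bit sums/differences of products of two
// 16-bit inputs with two coefficients, and the following vqrshrn #12 narrows
// with rounding and saturation, so each butterfly step computes roughly
//   out_add = sat16(round((a*c0 + b*c1) / 4096))
//   out_sub = sat16(round((a*c0 - b*c1) / 4096))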
function inv_adst_4h_x4_neon, export=1 iadst_4x4 d16, d17, d18, d19 bx lr endfunc function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 d19, d18, d17, d16 bx lr endfunc .macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 movrel_local r12, iadst4_coeffs vld1.16 {d0, d1}, [r12, :128] vsubl.s16 q2, d16, d20 vsubl.s16 q3, d17, d21 vmull.s16 q4, d16, d0[0] vmlal.s16 q4, d20, d0[1] vmlal.s16 q4, d22, d0[2] vmull.s16 q5, d17, d0[0] vmlal.s16 q5, d21, d0[1] vmlal.s16 q5, d23, d0[2] vaddw.s16 q2, q2, d22 vaddw.s16 q3, q3, d23 vmull.s16 q6, d16, d0[2] vmlsl.s16 q6, d20, d0[0] vmlsl.s16 q6, d22, d0[1] vmull.s16 q7, d17, d0[2] vmlsl.s16 q7, d21, d0[0] vmlsl.s16 q7, d23, d0[1] vmul.s32 q10, q2, d1[0] vmul.s32 q11, q3, d1[0] vmull.s16 q2, d18, d0[3] vmull.s16 q3, d19, d0[3] vadd.s32 q8, q4, q2 // out0 vadd.s32 q9, q5, q3 vadd.s32 q4, q4, q6 // out3 vadd.s32 q5, q5, q7 vadd.s32 q6, q6, q2 // out1 vadd.s32 q7, q7, q3 vsub.s32 q4, q4, q2 // out3 vsub.s32 q5, q5, q3 vqrshrn.s32 d20, q10, #12 vqrshrn.s32 d21, q11, #12 vqrshrn.s32 \o0, q8, #12 vqrshrn.s32 \o1, q9, #12 .ifc \o4, d18 vmov q9, q10 .endif vqrshrn.s32 \o2, q6, #12 vqrshrn.s32 \o3, q7, #12 vqrshrn.s32 \o6, q4, #12 vqrshrn.s32 \o7, q5, #12 .endm function inv_adst_8h_x4_neon, export=1 iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 bx lr endfunc function inv_identity_4h_x4_neon, export=1 movw r12, #(5793-4096)*8 vdup.16 d0, r12 vqrdmulh.s16 q2, q8, d0[0] vqrdmulh.s16 q3, q9, d0[0] vqadd.s16 q8, q8, q2 vqadd.s16 q9, q9, q3 bx lr endfunc function inv_identity_8h_x4_neon, export=1 movw r12, #(5793-4096)*8 vdup.16 d0, r12 vqrdmulh.s16 q1, q8, d0[0] vqrdmulh.s16 q2, q9, d0[0] vqrdmulh.s16 q3, q10, d0[0] vqadd.s16 q8, q8, q1 vqrdmulh.s16 q1, q11, d0[0] vqadd.s16 q9, q9, q2 vqadd.s16 q10, q10, q3 vqadd.s16 q11, q11, q1 bx lr endfunc .macro identity_8x4_shift1 r0, r1, r2, r3, c .irp i, \r0, \r1, \r2, \r3 vqrdmulh.s16 q1, \i, \c vrhadd.s16 \i, \i, q1 .endr .endm function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 push {r4-r5,lr} vmov.i16 q15, #0 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q15}, [r2, :128]! vshr.s16 q8, q8, #2 vshr.s16 q9, q9, #2 iwht4 vst1.16 {q15}, [r2, :128]! transpose_4x4h q8, q9, d16, d17, d18, d19 iwht4 vld1.32 {d0[]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon vmov.i16 q15, #0 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q15}, [r2, :128]! blx r4 vst1.16 {q15}, [r2, :128]! 
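        // At this point the first 1-D transform (called through r4) has run on
        // the 4x4 block held in q8-q9 and both halves of the coefficient
        // buffer have been cleared; the transpose below reorients the block so
        // the second 1-D transform (through r5) covers the other direction
        // before the >>4 rounding and the add back into the destination.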
transpose_4x4h q8, q9, d16, d17, d18, d19 blx r5 vld1.32 {d0[]}, [r0, :32], r1 vld1.32 {d0[1]}, [r0, :32], r1 vld1.32 {d1[]}, [r0, :32], r1 vld1.32 {d1[1]}, [r0, :32], r1 vrshr.s16 q8, q8, #4 vrshr.s16 q9, q9, #4 L(itx_4x4_end): sub r0, r0, r1, lsl #2 vaddw.u8 q8, q8, d0 vqmovun.s16 d0, q8 vaddw.u8 q9, q9, d1 vst1.32 {d0[0]}, [r0, :32], r1 vqmovun.s16 d1, q9 vst1.32 {d0[1]}, [r0, :32], r1 vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 pop {r4-r5,pc} endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 push {r4-r5,lr} .ifc \txfm1\()_\txfm2, dct_dct cmp r3, #0 bne 1f vmov.i16 d30, #0 movw r12, #2896*8 vld1.16 {d16[]}, [r2, :16] vdup.16 d4, r12 vst1.16 {d30[0]}, [r2, :16] vqrdmulh.s16 d16, d16, d4[0] vld1.32 {d0[0]}, [r0, :32], r1 vqrdmulh.s16 d20, d16, d4[0] vld1.32 {d0[1]}, [r0, :32], r1 vrshr.s16 d16, d20, #4 vrshr.s16 d17, d20, #4 vld1.32 {d1[0]}, [r0, :32], r1 vmov q9, q8 vld1.32 {d1[1]}, [r0, :32], r1 b L(itx_4x4_end) 1: .endif movrel_local r4, inv_\txfm1\()_4h_x4_neon movrel_local r5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a vqrshrn_8h \r2, \r3, q2, q3, #12 // t4a vqrshrn_8h \r14, \r15, q4, q5, #12 // t7a vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a vqrshrn_8h \r6, \r7, q6, q7, #12 // t5a vqrshrn_8h \r10, \r11, q2, q3, #12 // t6a vqadd.s16 q2, \q1, \q3 // t4 vqsub.s16 \q1, \q1, \q3 // t5a vqadd.s16 q3, \q7, \q5 // t7 vqsub.s16 \q3, \q7, \q5 // t6a vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 vqrshrn_8h d8, d9, q4, q5, #12 // t5 vqrshrn_8h d10, d11, q6, q7, #12 // t6 vqsub.s16 \q7, \q0, q3 // out7 vqadd.s16 \q0, \q0, q3 // out0 vqadd.s16 \q1, \q2, q5 // out1 vqsub.s16 q6, \q2, q5 // out6 vqadd.s16 \q2, \q4, q4 // out2 vqsub.s16 \q5, \q4, q4 // out5 vqadd.s16 \q3, \q6, q2 // out3 vqsub.s16 \q4, \q6, q2 // out4 vmov \q6, q6 // out6 .endm .macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4h_x4 \r0, \r2, \r4, \r6 vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a vqrshrn.s32 \r1, q1, #12 // t4a vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a vqrshrn.s32 \r7, q2, #12 // t7a vqrshrn.s32 \r3, q3, #12 // t5a vqrshrn.s32 \r5, q1, #12 // taa vqadd.s16 d2, \r1, \r3 // t4 vqsub.s16 \r1, \r1, \r3 // t5a vqadd.s16 d3, \r7, \r5 // t7 vqsub.s16 \r3, \r7, \r5 // t6a vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 vqrshrn.s32 d4, q2, #12 // t5 vqrshrn.s32 d5, q3, #12 // t6 vqsub.s16 \r7, \r0, d3 // out7 vqadd.s16 \r0, \r0, d3 // out0 vqadd.s16 \r1, \r2, d5 // out1 vqsub.s16 d6, 
\r2, d5 // out6 vqadd.s16 \r2, \r4, d4 // out2 vqsub.s16 \r5, \r4, d4 // out5 vqadd.s16 \r3, \r6, d2 // out3 vqsub.s16 \r4, \r6, d2 // out4 vmov \r6, d6 // out6 .endm function inv_dct_8h_x8_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {q0}, [r12, :128] idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_dct_4h_x8_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {q0}, [r12, :128] idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc .macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 movrel_local r12, iadst8_coeffs vld1.16 {d0, d1, d2}, [r12, :64] vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] vqrshrn_8h d16, d17, q2, q3, #12 // t0a vqrshrn_8h d30, d31, q4, q5, #12 // t1a vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] vqrshrn_8h d20, d21, q6, q7, #12 // t2a vqrshrn_8h d26, d27, q2, q3, #12 // t3a vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] vqrshrn_8h d24, d25, q4, q5, #12 // t4a vqrshrn_8h d22, d23, q6, q7, #12 // t5a vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] vqrshrn_8h d28, d29, q2, q3, #12 // t6a vqrshrn_8h d18, d19, q4, q5, #12 // t7a vqadd.s16 q2, q8, q12 // t0 vqsub.s16 q3, q8, q12 // t4 vqadd.s16 q4, q15, q11 // t1 vqsub.s16 q5, q15, q11 // t5 vqadd.s16 q6, q10, q14 // t2 vqsub.s16 q7, q10, q14 // t6 vqadd.s16 q10, q13, q9 // t3 vqsub.s16 q11, q13, q9 // t7 vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] vqrshrn_8h d6, d7, q8, q9, #12 // t4a vqrshrn_8h d10, d11, q12, q13, #12 // t5a vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] vqrshrn_8h d14, d15, q14, q15, #12 // t6a vqrshrn_8h d22, d23, q8, q9, #12 // t7a vqadd.s16 \q0, q2, q6 // out0 vqsub.s16 q2, q2, q6 // t2 vqadd.s16 \q7, q4, q10 // out7 vqsub.s16 q4, q4, q10 // t3 vqneg.s16 \q7, \q7 // out7 vqadd.s16 \q1, q3, q7 // out1 vqsub.s16 q3, q3, q7 // t6 vqadd.s16 \q6, q5, q11 // out6 vqsub.s16 q5, q5, q11 // t7 vqneg.s16 \q1, \q1 // out1 vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) vqrshrn_8h d4, d5, q10, q11, #12 // out3 vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) vqrshrn_8h d6, d7, q12, q13, #12 // out5 vqrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) vqrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) vqneg.s16 \q3, q2 // out3 vqneg.s16 \q5, q3 // out5 .endm .macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 movrel_local r12, iadst8_coeffs vld1.16 {d0, d1, d2}, [r12, :64] vmull_vmlal q2, d23, d16, d0[0], d0[1] vmull_vmlsl q3, d23, d16, d0[1], d0[0] vmull_vmlal q4, d21, d18, d0[2], d0[3] vqrshrn.s32 d16, q2, #12 // t0a vqrshrn.s32 d23, q3, #12 // t1a vmull_vmlsl q5, d21, d18, d0[3], d0[2] vmull_vmlal q6, d19, d20, d1[0], d1[1] vqrshrn.s32 d18, q4, #12 // t2a vqrshrn.s32 d21, q5, #12 // t3a vmull_vmlsl q7, d19, d20, d1[1], d1[0] vmull_vmlal q2, d17, d22, d1[2], d1[3] vqrshrn.s32 d20, q6, #12 // t4a vqrshrn.s32 d19, q7, #12 
// t5a vmull_vmlsl q3, d17, d22, d1[3], d1[2] vqrshrn.s32 d22, q2, #12 // t6a vqrshrn.s32 d17, q3, #12 // t7a vqadd.s16 d4, d16, d20 // t0 vqsub.s16 d5, d16, d20 // t4 vqadd.s16 d6, d23, d19 // t1 vqsub.s16 d7, d23, d19 // t5 vqadd.s16 d8, d18, d22 // t2 vqsub.s16 d9, d18, d22 // t6 vqadd.s16 d18, d21, d17 // t3 vqsub.s16 d19, d21, d17 // t7 vmull_vmlal q8, d5, d7, d2[3], d2[2] vmull_vmlsl q10, d5, d7, d2[2], d2[3] vmull_vmlsl q11, d19, d9, d2[3], d2[2] vqrshrn.s32 d5, q8, #12 // t4a vqrshrn.s32 d7, q10, #12 // t5a vmull_vmlal q8, d19, d9, d2[2], d2[3] vqrshrn.s32 d9, q11, #12 // t6a vqrshrn.s32 d19, q8, #12 // t7a vqadd.s16 \r0, d4, d8 // out0 vqsub.s16 d4, d4, d8 // t2 vqadd.s16 \r7, d6, d18 // out7 vqsub.s16 d6, d6, d18 // t3 vqneg.s16 \r7, \r7 // out7 vqadd.s16 \r1, d5, d9 // out1 vqsub.s16 d5, d5, d9 // t6 vqadd.s16 \r6, d7, d19 // out6 vqsub.s16 d7, d7, d19 // t7 vqneg.s16 \r1, \r1 // out1 vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) vqrshrn.s32 d4, q9, #12 // out3 vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) vqrshrn.s32 d5, q10, #12 // out5 vqrshrn.s32 \r2, q9, #12 // out2 (d18 or d21) vqrshrn.s32 \r4, q4, #12 // out4 (d20 or d19) vqneg.s16 \r3, d4 // out3 vqneg.s16 \r5, d5 // out5 .endm function inv_adst_8h_x8_neon, export=1 iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_flipadst_8h_x8_neon, export=1 iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 bx lr endfunc function inv_adst_4h_x8_neon, export=1 iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 bx lr endfunc function inv_flipadst_4h_x8_neon, export=1 iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 bx lr endfunc function inv_identity_8h_x8_neon, export=1 vqshl.s16 q8, q8, #1 vqshl.s16 q9, q9, #1 vqshl.s16 q10, q10, #1 vqshl.s16 q11, q11, #1 vqshl.s16 q12, q12, #1 vqshl.s16 q13, q13, #1 vqshl.s16 q14, q14, #1 vqshl.s16 q15, q15, #1 bx lr endfunc function inv_identity_4h_x8_neon, export=1 vqshl.s16 q8, q8, #1 vqshl.s16 q9, q9, #1 vqshl.s16 q10, q10, #1 vqshl.s16 q11, q11, #1 bx lr endfunc .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_neon vmov.i16 q0, #0 vmov.i16 q1, #0 vld1.16 {q8, q9}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q10, q11}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q12, q13}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! 
vld1.16 {q14, q15}, [r2, :128] vst1.16 {q0, q1}, [r2, :128] .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out .else blx r4 vrshr.s16 q8, q8, #1 vrshr.s16 q9, q9, #1 vrshr.s16 q10, q10, #1 vrshr.s16 q11, q11, #1 vrshr.s16 q12, q12, #1 vrshr.s16 q13, q13, #1 vrshr.s16 q14, q14, #1 vrshr.s16 q15, q15, #1 .endif transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 blx r5 load_add_store_8x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} endfunc .endm def_fn_8x8_base def_fn_8x8_base identity_ .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif push {r4-r5,r7,lr} vpush {q4-q7} movrel_local r5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else movrel_local r4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc .endm def_fn_8x8 dct, dct def_fn_8x8 identity, identity def_fn_8x8 dct, adst def_fn_8x8 dct, flipadst def_fn_8x8 dct, identity def_fn_8x8 adst, dct def_fn_8x8 adst, adst def_fn_8x8 adst, flipadst def_fn_8x8 flipadst, dct def_fn_8x8 flipadst, adst def_fn_8x8 flipadst, flipadst def_fn_8x8 identity, dct def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst function inv_txfm_add_8x4_neon vmov.i16 q14, #0 vmov.i16 q15, #0 movw r12, #2896*8 vdup.16 d0, r12 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q14, q15}, [r2, :128]! vld1.16 {d20, d21, d22, d23}, [r2, :128] vst1.16 {q14, q15}, [r2, :128] scale_input d0[0], q8, q9, q10, q11 blx r4 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 vswp d17, d20 vswp d19, d21 vswp d18, d20 vswp d21, d22 blx r5 load_add_store_8x4 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} endfunc function inv_txfm_add_4x8_neon vmov.i16 q14, #0 vmov.i16 q15, #0 movw r12, #2896*8 vdup.16 d0, r12 vld1.16 {q8, q9}, [r2, :128] vst1.16 {q14, q15}, [r2, :128]! 
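        // d0 was loaded with 2896*8 above; scale_input uses vqrdmulh, which
        // computes roughly (2*a*b + 0x8000) >> 16 with saturation, so every
        // coefficient gets multiplied by about 2896/4096, i.e. roughly
        // 1/sqrt(2), before the first pass. The same constant is what idct_dc
        // applies an extra time for the 2:1 rectangular block sizes.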
vld1.16 {q10, q11}, [r2, :128] vst1.16 {q14, q15}, [r2, :128] scale_input d0[0], q8, q9, q10, q11 blx r4 transpose_4x8h q8, q9, q10, q11 vswp d17, d20 vswp d19, d21 vswp d17, d18 vswp d19, d22 blx r5 load_add_store_4x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,pc} endfunc .macro def_fn_48 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif push {r4-r5,r7,lr} vpush {q4-q7} movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct def_fn_48 \w, \h, identity, identity def_fn_48 \w, \h, dct, adst def_fn_48 \w, \h, dct, flipadst def_fn_48 \w, \h, dct, identity def_fn_48 \w, \h, adst, dct def_fn_48 \w, \h, adst, adst def_fn_48 \w, \h, adst, flipadst def_fn_48 \w, \h, flipadst, dct def_fn_48 \w, \h, flipadst, adst def_fn_48 \w, \h, flipadst, flipadst def_fn_48 \w, \h, identity, dct def_fn_48 \w, \h, adst, identity def_fn_48 \w, \h, flipadst, identity def_fn_48 \w, \h, identity, adst def_fn_48 \w, \h, identity, flipadst .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_dct_4h_x16_neon, export=1 movrel_local r12, idct_coeffs vld1.16 {q0, q1}, [r12, :128] vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a vmull_vmlal q3, d17, d31, d2[1], d2[0] // -> t15a vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a vqrshrn.s32 d17, q2, #12 // t8a vqrshrn.s32 d31, q3, #12 // t15a vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a vqrshrn.s32 d23, q4, #12 // t9a vqrshrn.s32 d25, q2, #12 // t14a vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a vqrshrn.s32 d21, q3, #12 // t10a vqrshrn.s32 d27, q4, #12 // t13a vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a vqrshrn.s32 d19, q2, #12 // t11a vqrshrn.s32 d29, q3, #12 // t12a idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 vqsub.s16 d4, d17, d23 // t9 vqadd.s16 d17, d17, d23 // t8 vqsub.s16 d5, d31, d25 // t14 vqadd.s16 d31, d31, d25 // t15 vqsub.s16 d23, d19, d21 // t10 vqadd.s16 d19, d19, d21 // t11 vqadd.s16 d25, d29, d27 // t12 vqsub.s16 d29, d29, d27 // t13 vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a vqrshrn.s32 d21, q3, #12 // t9a vqrshrn.s32 d27, q4, #12 // t14a vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a vqrshrn.s32 d29, q3, #12 // t13a vneg.s32 q4, q4 vqrshrn.s32 d23, q4, #12 // t10a vqsub.s16 d4, d17, d19 // t11a vqadd.s16 d17, d17, d19 // t8a vqsub.s16 d5, d31, d25 // t12a vqadd.s16 d31, d31, d25 // t15a vqadd.s16 d19, d21, d23 // t9 vqsub.s16 d21, d21, d23 // t10 vqsub.s16 d25, d27, d29 // t13 vqadd.s16 d27, d27, d29 // t14 vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a vqrshrn.s32 d6, q3, #12 // t11 vqrshrn.s32 d7, q4, #12 // t12 vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a vqrshrn.s32 d4, q2, #12 // t10a vqrshrn.s32 d5, q4, #12 // t13a vqadd.s16 d8, d16, d31 // out0 vqsub.s16 d31, d16, d31 // out15 vmov d16, d8 vqadd.s16 d23, d30, d17 // out7 vqsub.s16 d9, d30, d17 // out8 vqadd.s16 d17, d18, d27 // out1 vqsub.s16 d30, d18, d27 // out14 vqadd.s16 d18, d20, d5 // out2 vqsub.s16 d29, d20, d5 // out13 vqadd.s16 d5, d28, d19 // out6 vqsub.s16 d25, d28, d19 // out9 vqadd.s16 d19, d22, d7 // out3 vqsub.s16 d28, d22, d7 // out12 
vqadd.s16 d20, d24, d6 // out4 vqsub.s16 d27, d24, d6 // out11 vqadd.s16 d21, d26, d4 // out5 vqsub.s16 d26, d26, d4 // out10 vmov d24, d9 vmov d22, d5 bx lr endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel_local r12, iadst16_coeffs vld1.16 {q0, q1}, [r12, :128] movrel_local r12, idct_coeffs vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 vqrshrn.s32 d16, q2, #12 // t0 vqrshrn.s32 d31, q3, #12 // t1 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 vqrshrn.s32 d18, q4, #12 // t2 vqrshrn.s32 d29, q2, #12 // t3 vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 vqrshrn.s32 d20, q3, #12 // t4 vqrshrn.s32 d27, q4, #12 // t5 vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 vqrshrn.s32 d22, q2, #12 // t6 vqrshrn.s32 d25, q3, #12 // t7 vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 vqrshrn.s32 d23, q4, #12 // t8 vqrshrn.s32 d24, q2, #12 // t9 vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 vqrshrn.s32 d21, q3, #12 // t10 vqrshrn.s32 d26, q4, #12 // t11 vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 vqrshrn.s32 d19, q2, #12 // t12 vqrshrn.s32 d28, q3, #12 // t13 vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 vqrshrn.s32 d17, q4, #12 // t14 vqrshrn.s32 d30, q2, #12 // t15 vld1.16 {q0}, [r12, :128] vqsub.s16 d2, d16, d23 // t8a vqadd.s16 d16, d16, d23 // t0a vqsub.s16 d3, d31, d24 // t9a vqadd.s16 d31, d31, d24 // t1a vqadd.s16 d23, d18, d21 // t2a vqsub.s16 d18, d18, d21 // t10a vqadd.s16 d24, d29, d26 // t3a vqsub.s16 d29, d29, d26 // t11a vqadd.s16 d21, d20, d19 // t4a vqsub.s16 d20, d20, d19 // t12a vqadd.s16 d26, d27, d28 // t5a vqsub.s16 d27, d27, d28 // t13a vqadd.s16 d19, d22, d17 // t6a vqsub.s16 d22, d22, d17 // t14a vqadd.s16 d28, d25, d30 // t7a vqsub.s16 d25, d25, d30 // t15a vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 vqrshrn.s32 d17, q2, #12 // t8 vqrshrn.s32 d30, q3, #12 // t9 vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 vqrshrn.s32 d18, q4, #12 // t10 vqrshrn.s32 d29, q2, #12 // t11 vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 vqrshrn.s32 d27, q3, #12 // t12 vqrshrn.s32 d20, q4, #12 // t13 vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 vqrshrn.s32 d25, q2, #12 // t14 vqrshrn.s32 d22, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t4 vqadd.s16 d16, d16, d21 // t0 vqsub.s16 d3, d31, d26 // t5 vqadd.s16 d31, d31, d26 // t1 vqadd.s16 d21, d23, d19 // t2 vqsub.s16 d23, d23, d19 // t6 vqadd.s16 d26, d24, d28 // t3 vqsub.s16 d24, d24, d28 // t7 vqadd.s16 d19, d17, d27 // t8a vqsub.s16 d17, d17, d27 // t12a vqadd.s16 d28, d30, d20 // t9a vqsub.s16 d30, d30, d20 // t13a vqadd.s16 d27, d18, d25 // t10a vqsub.s16 d18, d18, d25 // t14a vqadd.s16 d20, d29, d22 // t11a vqsub.s16 d29, d29, d22 // t15a vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a vqrshrn.s32 d22, q2, #12 // t4a vqrshrn.s32 d25, q3, #12 // t5a vmull_vmlal q2, d24, d23, d0[2], d0[3] // -> t7a vmull_vmlal q3, d17, 
d30, d0[3], d0[2] // -> t12 vqrshrn.s32 d24, q4, #12 // t6a vqrshrn.s32 d23, q2, #12 // t7a vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 vqrshrn.s32 d17, q3, #12 // t12 vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 vqrshrn.s32 d29, q4, #12 // t13 vqrshrn.s32 d30, q2, #12 // t14 vqrshrn.s32 d18, q3, #12 // t15 vqsub.s16 d2, d16, d21 // t2a .ifc \o0, d16 vqadd.s16 \o0, d16, d21 // out0 vqsub.s16 d21, d31, d26 // t3a vqadd.s16 \o15,d31, d26 // out15 .else vqadd.s16 d4, d16, d21 // out0 vqsub.s16 d21, d31, d26 // t3a vqadd.s16 \o15,d31, d26 // out15 vmov \o0, d4 .endif vqneg.s16 \o15, \o15 // out15 vqsub.s16 d3, d29, d18 // t15a vqadd.s16 \o13,d29, d18 // out13 vqadd.s16 \o2, d17, d30 // out2 vqsub.s16 d26, d17, d30 // t14a vqneg.s16 \o13,\o13 // out13 vqadd.s16 \o1, d19, d27 // out1 vqsub.s16 d27, d19, d27 // t10 vqadd.s16 \o14,d28, d20 // out14 vqsub.s16 d20, d28, d20 // t11 vqneg.s16 \o1, \o1 // out1 vqadd.s16 \o3, d22, d24 // out3 vqsub.s16 d22, d22, d24 // t6 vqadd.s16 \o12,d25, d23 // out12 vqsub.s16 d23, d25, d23 // t7 vqneg.s16 \o3, \o3 // out3 vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) vqrshrn.s32 d24, q12, #12 // out8 vqrshrn.s32 d4, q2, #12 // out7 vqrshrn.s32 d5, q3, #12 // out5 vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) vqrshrn.s32 d26, q4, #12 // out10 vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) vqrshrn.s32 \o4, q1, #12 // out4 vqrshrn.s32 d7, q3, #12 // out9 vqrshrn.s32 d6, q4, #12 // out11 vqrshrn.s32 \o6, q11, #12 // out6 .ifc \o8, d23 vmov \o8, d24 vmov \o10,d26 .endif vqneg.s16 \o7, d4 // out7 vqneg.s16 \o5, d5 // out5 vqneg.s16 \o11,d6 // out11 vqneg.s16 \o9, d7 // out9 .endm function inv_adst_4h_x16_neon, export=1 iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_flipadst_4h_x16_neon, export=1 iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 bx lr endfunc function inv_identity_4h_x16_neon, export=1 movw r12, #2*(5793-4096)*8 vdup.16 d0, r12 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s16 q1, \i, d0[0] vqadd.s16 \i, \i, \i vqadd.s16 \i, \i, q1 .endr bx lr endfunc .macro identity_4x16_shift2 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s16 q2, \i, \c vshr.s16 q2, q2, #1 vrhadd.s16 \i, \i, q2 .endr .endm .macro identity_4x16_shift1 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s16 q2, \i, \c vrshr.s16 q2, q2, #1 vqadd.s16 \i, \i, q2 .endr .endm .macro identity_8x8_shift1 c identity_4x16_shift1 \c .endm .macro identity_8x8 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s16 q2, \i, \c vqadd.s16 \i, \i, \i vqadd.s16 \i, \i, q2 .endr .endm .macro def_horz_16 scale=0, identity=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x4_neon push {lr} vmov.i16 d7, #0 .if \identity movw r12, #2*(5793-4096)*8 vdup.16 d0, r12 .endif .if \scale movw r12, #2896*8 vdup.16 d1, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64] vst1.16 {d7}, [r7, :64], r8 .endr .if \scale scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif .if \identity .if \shift 
== -2 identity_4x16_shift2 d0[0] .else identity_4x16_shift1 d0[0] .endif .else blx r4 .endif .if \shift > 0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #\shift .endr .endif transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 transpose_4x4h q14, q15, d28, d29, d30, d31 .irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31 vst1.16 {\i}, [r6, :64]! .endr pop {pc} endfunc .endm def_horz_16 scale=0, identity=0, shift=2 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity function inv_txfm_add_vert_4x16_neon push {lr} .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr blx r5 load_add_store_4x16 r6, r7 pop {pc} endfunc function inv_txfm_add_16x16_neon sub_sp_align 512 ldrh r11, [r10], #2 .irp i, 0, 4, 8, 12 add r6, sp, #(\i*16*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 12 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #16*2 blx r9 .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #32 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 512 vpop {q4} pop {r4-r11,pc} endfunc const eob_16x16 .short 10, 36, 78, 256 endconst const eob_16x16_identity .short 4, 8, 12, 256 endconst .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif push {r4-r11,lr} vpush {q4} .ifc \txfm1, identity movrel_local r9, inv_txfm_horz_identity_16x4_neon .else movrel_local r9, inv_txfm_horz_16x4_neon movrel_local r4, inv_\txfm1\()_4h_x16_neon .endif movrel_local r5, inv_\txfm2\()_4h_x16_neon .ifc \txfm1, identity .ifc \txfm2, identity movrel_local r10, eob_16x16 .else movrel_local r10, eob_16x16_identity .endif .else .ifc \txfm2, identity movrel_local r10, eob_16x16_identity .else movrel_local r10, eob_16x16 .endif .endif b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct def_fn_16x16 identity, identity def_fn_16x16 dct, adst def_fn_16x16 dct, flipadst def_fn_16x16 dct, identity def_fn_16x16 adst, dct def_fn_16x16 adst, adst def_fn_16x16 adst, flipadst def_fn_16x16 flipadst, dct def_fn_16x16 flipadst, adst def_fn_16x16 flipadst, flipadst def_fn_16x16 identity, dct .macro def_fn_416_base variant function inv_txfm_\variant\()add_16x4_neon .ifc \variant, identity_ vmov.i16 d4, #0 .irp i, d16, d18, d20, d22 vld1.16 {\i}, [r2, :64] vst1.16 {d4}, [r2, :64]! .endr .irp i, d17, d19, d21, d23 vld1.16 {\i}, [r2, :64] vst1.16 {d4}, [r2, :64]! .endr movw r12, #2*(5793-4096)*8 vdup.16 d0, r12 .irp i, d24, d26, d28, d30 vld1.16 {\i}, [r2, :64] vst1.16 {d4}, [r2, :64]! .endr .irp i, d25, d27, d29, d31 vld1.16 {\i}, [r2, :64] vst1.16 {d4}, [r2, :64]! .endr identity_4x16_shift1 d0[0] .else vmov.i16 q2, #0 vmov.i16 q3, #0 vld1.16 {d16, d17, d18, d19}, [r2, :128] vst1.16 {q2, q3}, [r2, :128]! vld1.16 {d20, d21, d22, d23}, [r2, :128] vst1.16 {q2, q3}, [r2, :128]! vld1.16 {d24, d25, d26, d27}, [r2, :128] vst1.16 {q2, q3}, [r2, :128]! vld1.16 {d28, d29, d30, d31}, [r2, :128] vst1.16 {q2, q3}, [r2, :128]! 
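        // r4 and r5 are filled in by the def_fn_416 wrappers further down:
        // r4 holds the first-pass (length-16) transform invoked here on the
        // four rows of the 16x4 block, r5 the second-pass (length-4) transform
        // invoked after the register swaps and transposes that follow.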
blx r4 vswp d17, d20 vswp d19, d22 vswp d18, d20 vswp d19, d21 .irp i, q8, q9, q10, q11 vrshr.s16 \i, \i, #1 .endr .endif transpose_4x8h q8, q9, q10, q11 blx r5 mov r6, r0 load_add_store_8x4 r6, r7 .ifc \variant, identity_ vmov q8, q12 vmov q9, q13 vmov q10, q14 vmov q11, q15 .else vswp d25, d28 vswp d27, d30 vswp d26, d28 vswp d27, d29 vrshr.s16 q8, q12, #1 vrshr.s16 q9, q13, #1 vrshr.s16 q10, q14, #1 vrshr.s16 q11, q15, #1 .endif transpose_4x8h q8, q9, q10, q11 blx r5 add r6, r0, #8 load_add_store_8x4 r6, r7 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_\variant\()add_4x16_neon vmov.i16 q2, #0 mov r11, #32 cmp r3, r10 blt 1f add r6, r2, #16 .ifc \variant, identity_ .irp i, q12, q13, q14, q15 vld1.16 {\i}, [r6, :128] vst1.16 {q2}, [r6, :128], r11 .endr movw r12, #(5793-4096)*8 vdup.16 d0, r12 identity_8x4_shift1 q12, q13, q14, q15, d0[0] .else .irp i, q8, q9, q10, q11 vld1.16 {\i}, [r6, :128] vst1.16 {q2}, [r6, :128], r11 .endr blx r4 vrshr.s16 q12, q8, #1 vrshr.s16 q13, q9, #1 vrshr.s16 q14, q10, #1 vrshr.s16 q15, q11, #1 .endif transpose_4x8h q12, q13, q14, q15 vswp d27, d29 vswp d26, d28 vswp d27, d30 vswp d25, d28 b 2f 1: .irp i, q12, q13, q14, q15 vmov.i16 \i, #0 .endr 2: vmov.i16 q2, #0 .irp i, q8, q9, q10, q11 vld1.16 {\i}, [r2, :128] vst1.16 {q2}, [r2, :128], r11 .endr .ifc \variant, identity_ movw r12, #(5793-4096)*8 vdup.16 d0, r12 identity_8x4_shift1 q8, q9, q10, q11, d0[0] .else blx r4 .irp i, q8, q9, q10, q11 vrshr.s16 \i, \i, #1 .endr .endif transpose_4x8h q8, q9, q10, q11 vswp d19, d21 vswp d18, d20 vswp d19, d22 vswp d17, d20 blx r5 load_add_store_4x16 r0, r6 vpop {q4-q7} pop {r4-r11,pc} endfunc .endm def_fn_416_base def_fn_416_base identity_ .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif push {r4-r11,lr} vpush {q4-q7} .if \w == 4 movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon mov r10, #\eob_half .else movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct, 29 def_fn_416 \w, \h, identity, identity, 29 def_fn_416 \w, \h, dct, adst, 29 def_fn_416 \w, \h, dct, flipadst, 29 def_fn_416 \w, \h, dct, identity, 8 def_fn_416 \w, \h, adst, dct, 29 def_fn_416 \w, \h, adst, adst, 29 def_fn_416 \w, \h, adst, flipadst, 29 def_fn_416 \w, \h, flipadst, dct, 29 def_fn_416 \w, \h, flipadst, adst, 29 def_fn_416 \w, \h, flipadst, flipadst, 29 def_fn_416 \w, \h, identity, dct, 32 def_fn_416 \w, \h, adst, identity, 8 def_fn_416 \w, \h, flipadst, identity, 8 def_fn_416 \w, \h, identity, adst, 32 def_fn_416 \w, \h, identity, flipadst, 32 .endm def_fns_416 4, 16 def_fns_416 16, 4 .macro def_fn_816_base variant function inv_txfm_\variant\()add_16x8_neon sub_sp_align 256 .irp i, 0, 4 add r6, sp, #(\i*16*2) .if \i > 0 cmp r3, r10 blt 1f .endif add r7, r2, #(\i*2) mov r8, #8*2 blx r9 .endr b 2f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr 2: .irp i, 0, 8 add r7, sp, #(\i*2) mov r8, #32 .irp j, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\j}, [r7, :128], r8 .endr blx r5 add r6, r0, #(\i) load_add_store_8x8 r6, r7 .endr add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_\variant\()add_8x16_neon sub_sp_align 256 .irp i, 0, 8 add r6, sp, #(\i*8*2) .if \i > 0 cmp r3, r10 blt 1f .endif add r7, r2, #(\i*2) mov r8, #16*2 vmov.i16 q2, #0 movw r12, #2896*8 vdup.16 d0, r12 .irp j, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\j}, [r7, :128] vst1.16 {q2}, [r7, :128], r8 .endr scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .ifc \variant, identity_ // The identity shl #1 and downshift vrshr #1 cancel out .else blx r4 .irp j, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \j, \j, #1 .endr .endif transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 vst1.16 {q8, q9}, [r6, :128]! vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r6, :128]! vst1.16 {q14, q15}, [r6, :128]! .endr b 2f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr 2: .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #16 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} endfunc .endm def_fn_816_base def_fn_816_base identity_ .macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif push {r4-r11,lr} vpush {q4-q7} .if \w == 8 movrel_local r4, inv_\txfm1\()_8h_x8_neon movrel_local r5, inv_\txfm2\()_4h_x16_neon .else .ifc \txfm1, identity movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon .else movrel_local r4, inv_\txfm1\()_4h_x16_neon movrel_local r9, inv_txfm_horz_scale_16x4_neon .endif movrel_local r5, inv_\txfm2\()_8h_x8_neon .endif .if \w == 8 mov r10, #\eob_8x8 .else mov r10, #\eob_4x4 .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct, 43, 10 def_fn_816 \w, \h, identity, identity, 43, 10 def_fn_816 \w, \h, dct, adst, 43, 10 def_fn_816 \w, \h, dct, flipadst, 43, 10 def_fn_816 \w, \h, dct, identity, 8, 4 def_fn_816 \w, \h, adst, dct, 43, 10 def_fn_816 \w, \h, adst, adst, 43, 10 def_fn_816 \w, \h, adst, flipadst, 43, 10 def_fn_816 \w, \h, flipadst, dct, 43, 10 def_fn_816 \w, \h, flipadst, adst, 43, 10 def_fn_816 \w, \h, flipadst, flipadst, 43, 10 def_fn_816 \w, \h, identity, dct, 64, 4 def_fn_816 \w, \h, adst, identity, 8, 4 def_fn_816 \w, \h, flipadst, identity, 8, 4 def_fn_816 \w, \h, identity, adst, 64, 4 def_fn_816 \w, \h, identity, flipadst, 64, 4 .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_4h_x16_neon, export=1 movrel_local r12, idct_coeffs, 2*16 vld1.16 {q0, q1}, [r12, :128] sub r12, r12, #2*16 vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a vmull_vmlal q3, d16, d31, d0[1], d0[0] // -> t31a vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a vqrshrn.s32 d16, q2, #12 // t16a vqrshrn.s32 d31, q3, #12 // t31a vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a vqrshrn.s32 d24, q4, #12 // t17a vqrshrn.s32 d23, q2, #12 // t30a vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a vqrshrn.s32 d20, q3, #12 // t18a vqrshrn.s32 d27, q4, #12 // t29a vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a vqrshrn.s32 
d28, q2, #12 // t19a vqrshrn.s32 d19, q3, #12 // t28a vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a vqrshrn.s32 d18, q4, #12 // t20a vqrshrn.s32 d29, q2, #12 // t27a vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a vqrshrn.s32 d26, q3, #12 // t21a vqrshrn.s32 d21, q4, #12 // t26a vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a vqrshrn.s32 d22, q2, #12 // t22a vqrshrn.s32 d25, q3, #12 // t25a vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a vqrshrn.s32 d30, q4, #12 // t23a vqrshrn.s32 d17, q2, #12 // t24a vld1.16 {q0}, [r12, :128] vqsub.s16 d2, d16, d24 // t17 vqadd.s16 d16, d16, d24 // t16 vqsub.s16 d3, d31, d23 // t30 vqadd.s16 d31, d31, d23 // t31 vqsub.s16 d24, d28, d20 // t18 vqadd.s16 d28, d28, d20 // t19 vqadd.s16 d23, d18, d26 // t20 vqsub.s16 d18, d18, d26 // t21 vqsub.s16 d20, d30, d22 // t22 vqadd.s16 d30, d30, d22 // t23 vqadd.s16 d26, d17, d25 // t24 vqsub.s16 d17, d17, d25 // t25 vqsub.s16 d22, d29, d21 // t26 vqadd.s16 d29, d29, d21 // t27 vqadd.s16 d25, d19, d27 // t28 vqsub.s16 d19, d19, d27 // t29 vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a vqrshrn.s32 d21, q2, #12 // t17a vqrshrn.s32 d27, q3, #12 // t30a vneg.s32 q4, q4 // -> t18a vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a vqrshrn.s32 d19, q4, #12 // t18a vqrshrn.s32 d24, q1, #12 // t29a vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a vqrshrn.s32 d22, q2, #12 // t21a vqrshrn.s32 d18, q3, #12 // t26a vneg.s32 q4, q4 // -> t22a vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a vqrshrn.s32 d17, q4, #12 // t22a vqrshrn.s32 d20, q1, #12 // t25a vqsub.s16 d2, d27, d24 // t29 vqadd.s16 d27, d27, d24 // t30 vqsub.s16 d3, d21, d19 // t18 vqadd.s16 d21, d21, d19 // t17 vqsub.s16 d24, d16, d28 // t19a vqadd.s16 d16, d16, d28 // t16a vqsub.s16 d19, d30, d23 // t20a vqadd.s16 d30, d30, d23 // t23a vqsub.s16 d28, d17, d22 // t21 vqadd.s16 d17, d17, d22 // t22 vqadd.s16 d23, d26, d29 // t24a vqsub.s16 d26, d26, d29 // t27a vqadd.s16 d22, d20, d18 // t25 vqsub.s16 d20, d20, d18 // t26 vqsub.s16 d29, d31, d25 // t28a vqadd.s16 d31, d31, d25 // t31a vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 vqrshrn.s32 d18, q2, #12 // t18a vqrshrn.s32 d25, q3, #12 // t29a vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 vqrshrn.s32 d29, q4, #12 // t19 vqrshrn.s32 d24, q1, #12 // t28 vneg.s32 q2, q2 // -> t20 vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a vqrshrn.s32 d26, q2, #12 // t20 vqrshrn.s32 d19, q3, #12 // t27 vneg.s32 q4, q4 // -> t21a vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a vqrshrn.s32 d20, q4, #12 // t21a vqrshrn.s32 d28, q1, #12 // t26a vqsub.s16 d2, d16, d30 // t23 vqadd.s16 d16, d16, d30 // t16 = out16 vqsub.s16 d3, d31, d23 // t24 vqadd.s16 d31, d31, d23 // t31 = out31 vqsub.s16 d23, d21, d17 // t22a vqadd.s16 d17, d21, d17 // t17a = out17 vqadd.s16 d30, d27, d22 // t30a = out30 vqsub.s16 d21, d27, d22 // t25a vqsub.s16 d27, d18, d20 // t21 vqadd.s16 d18, d18, d20 // t18 = out18 vqadd.s16 d4, d29, d26 // t19a = out19 vqsub.s16 d26, d29, d26 // t20a vqadd.s16 d29, d25, 
d28 // t29 = out29 vqsub.s16 d25, d25, d28 // t26 vqadd.s16 d28, d24, d19 // t28a = out28 vqsub.s16 d24, d24, d19 // t27a vmov d19, d4 // out19 vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 vqrshrn.s32 d20, q2, #12 // t20 vqrshrn.s32 d22, q3, #12 // t27 vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 vqrshrn.s32 d26, q2, #12 // t26a vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 vqrshrn.s32 d21, q3, #12 // t21a vqrshrn.s32 d22, q12, #12 // t22 vqrshrn.s32 d25, q2, #12 // t25 vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a vqrshrn.s32 d23, q2, #12 // t23a vqrshrn.s32 d24, q3, #12 // t24a bx lr endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x4_neon push {lr} vmov.i16 d7, #0 lsl r8, r8, #1 .if \scale movw r12, #2896*8 vdup.16 d0, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64] vst1.16 {d7}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .if \scale scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_4h_x16_neon transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 transpose_4x4h q14, q15, d28, d29, d30, d31 .macro store1 r0, r1, r2, r3 vst1.16 {\r0}, [r6, :64]! vst1.16 {\r1}, [r6, :64]! vst1.16 {\r2}, [r6, :64]! vst1.16 {\r3}, [r6, :64]! add r6, r6, #32 .endm store1 d16, d20, d24, d28 store1 d17, d21, d25, d29 store1 d18, d22, d26, d30 store1 d19, d23, d27, d31 .purgem store1 sub r6, r6, #64*4 vmov.i16 d7, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64] vst1.16 {d7}, [r7, :64], r8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in d0[1] scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct32_odd_4h_x16_neon transpose_4x4h q15, q14, d31, d30, d29, d28 transpose_4x4h q13, q12, d27, d26, d25, d24 transpose_4x4h q11, q10, d23, d22, d21, d20 transpose_4x4h q9, q8, d19, d18, d17, d16 .macro store2 r0, r1, r2, r3, shift vld1.16 {q0, q1}, [r6, :128] vqsub.s16 d7, d0, \r0 vqadd.s16 d0, d0, \r0 vqsub.s16 d6, d1, \r1 vqadd.s16 d1, d1, \r1 vqsub.s16 d5, d2, \r2 vqadd.s16 d2, d2, \r2 vqsub.s16 d4, d3, \r3 vqadd.s16 d3, d3, \r3 vrev64.16 q2, q2 vrev64.16 q3, q3 vrshr.s16 q0, q0, #\shift vrshr.s16 q1, q1, #\shift vrshr.s16 q2, q2, #\shift vrshr.s16 q3, q3, #\shift vst1.16 {q0, q1}, [r6, :128]! vst1.16 {q2, q3}, [r6, :128]! 
.endm store2 d31, d27, d23, d19, \shift store2 d30, d26, d22, d18, \shift store2 d29, d25, d21, d17, \shift store2 d28, d24, d20, d16, \shift .purgem store2 pop {pc} endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_4x32_neon push {r10-r11,lr} lsl r8, r8, #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 bl inv_dct_4h_x16_neon .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vst1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 sub r7, r7, r8, lsr #1 bl inv_dct32_odd_4h_x16_neon neg r9, r8 mov r10, r6 .macro combine r0, r1, r2, r3, op, stride vld1.16 {d4}, [r7, :64], \stride vld1.32 {d2[0]}, [r10, :32], r1 vld1.16 {d5}, [r7, :64], \stride vld1.32 {d2[1]}, [r10, :32], r1 \op\().s16 d4, d4, \r0 vld1.16 {d6}, [r7, :64], \stride vld1.32 {d3[0]}, [r10, :32], r1 \op\().s16 d5, d5, \r1 vld1.32 {d3[1]}, [r10, :32], r1 vrshr.s16 q2, q2, #4 \op\().s16 d6, d6, \r2 vld1.16 {d7}, [r7, :64], \stride vaddw.u8 q2, q2, d2 \op\().s16 d7, d7, \r3 vqmovun.s16 d2, q2 vrshr.s16 q3, q3, #4 vst1.32 {d2[0]}, [r6, :32], r1 vaddw.u8 q3, q3, d3 vst1.32 {d2[1]}, [r6, :32], r1 vqmovun.s16 d3, q3 vst1.32 {d3[0]}, [r6, :32], r1 vst1.32 {d3[1]}, [r6, :32], r1 .endm combine d31, d30, d29, d28, vqadd, r8 combine d27, d26, d25, d24, vqadd, r8 combine d23, d22, d21, d20, vqadd, r8 combine d19, d18, d17, d16, vqadd, r8 sub r7, r7, r8 combine d16, d17, d18, d19, vqsub, r9 combine d20, d21, d22, d23, vqsub, r9 combine d24, d25, d26, d27, vqsub, r9 combine d28, d29, d30, d31, vqsub, r9 .purgem combine pop {r10-r11,pc} endfunc const eob_32x32 .short 10, 36, 78, 136, 210, 300, 406, 1024 endconst const eob_16x32 .short 10, 36, 78, 151, 215, 279, 343, 512 endconst const eob_16x32_shortside .short 10, 36, 78, 512 endconst const eob_8x32 // Contrary to the others, this one is only ever used in increments of 8x8 .short 43, 107, 171, 256 endconst function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 push {r4-r7,lr} vmov.i16 q0, #0 movrel_local r5, eob_32x32, 2 mov r6, #2*32 1: mov r12, #0 movrel_local r4, eob_32x32, 2 2: add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r2, :128] vst1.16 {q0}, [r2, :128], r6 .endr transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 load_add_store_8x8 r0, r7, shiftbits=2 ldrh lr, [r4], #4 sub r0, r0, r1, lsl #3 cmp r3, lr add r0, r0, #8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12 add r0, r0, r1, lsl #3 mls r2, r6, r12, r2 add r2, r2, #2*8 b 1b 9: pop {r4-r7,pc} endfunc .macro shift_8_regs op, shift .irp i, q8, q9, q10, q11, q12, q13, q14, q15 \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 push {r4-r7,lr} movw r6, #2896*8 movw r7, #2*(5793-4096)*8 vdup.i16 d0, r6 movrel_local r5, eob_16x32\hshort, 2 vmov.16 d0[1], r7 mov r6, #2*\h 1: mov r12, #0 movrel_local r4, eob_16x32\wshort, 2 2: vmov.i16 q1, #0 add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r2, :128] vst1.16 {q1}, [r2, :128], r6 .endr scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .if \w == 16 // 16x32 identity_8x8_shift1 d0[1] .else // 32x16 
shift_8_regs vqshl.s16, 1 identity_8x8 d0[1] .endif transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 .if \w == 16 load_add_store_8x8 r0, r7, shiftbits=2 .else load_add_store_8x8 r0, r7, shiftbits=4 .endif ldrh lr, [r4], #4 sub r0, r0, r1, lsl #3 cmp r3, lr add r0, r0, #8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12 add r0, r0, r1, lsl #3 mls r2, r6, r12, r2 add r2, r2, #2*8 b 1b 9: pop {r4-r7,pc} endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 push {r4-r5,lr} vmov.i16 q0, #0 movrel_local r4, eob_8x32 mov r12, #2*\h 1: ldrh lr, [r4], #2 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r2, :128] vst1.16 {q0}, [r2, :128], r12 .endr .if \w == 8 // 8x32 shift_8_regs vrshr.s16, 1 .endif transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 cmp r3, lr .if \w == 8 load_add_store_8x8 r0, r5, shiftbits=2 .else load_add_store_8x8 r0, r5, shiftbits=3 .endif blt 9f .if \w == 8 sub r2, r2, r12, lsl #3 add r2, r2, #2*8 .else sub r0, r0, r1, lsl #3 add r0, r0, #8 .endif b 1b 9: pop {r4-r5,pc} endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 idct_dc 32, 32, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 2048 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, sp, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_dct_32x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 2048 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 idct_dc 16, 32, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, sp, #(\i*16*2) add r7, r2, #(\i*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif mov r8, #2*32 bl inv_txfm_horz_scale_16x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #16*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 1024 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 idct_dc 32, 16, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r5, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12 add r6, sp, #(\i*32*2) add r7, r2, #(\i*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 12 ldrh r11, [r10], #2 .endif .endif mov r8, #2*16 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 1024 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 idct_dc 8, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 movrel_local r10, eob_8x32 mov r8, #2*32 mov r9, #32 mov r6, sp 1: vmov.i16 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r2, :128] vst1.16 {q0}, [r2, :128], r8 .endr ldrh r11, [r10], #2 sub r2, r2, r8, lsl #3 sub r9, r9, #8 add r2, r2, #2*8 bl inv_dct_8h_x8_neon .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vrshr.s16 \i, \i, #2 .endr transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 vst1.16 {q8, q9}, [r6, :128]! cmp r3, r11 vst1.16 {q10, q11}, [r6, :128]! vst1.16 {q12, q13}, [r6, :128]! vst1.16 {q14, q15}, [r6, :128]! bge 1b cmp r9, #0 beq 3f vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r9, r9, #8 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4 add r6, r0, #(\i) add r7, sp, #(\i*2) mov r8, #8*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 idct_dc 32, 8, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 .irp i, 0, 4 add r6, sp, #(\i*32*2) add r7, r2, #(\i*2) .if \i > 0 cmp r3, #10 blt 1f .endif mov r8, #8*2 bl inv_txfm_horz_dct_32x4_neon .endr b 2f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr 2: mov r8, #2*32 mov r9, #0 1: add r6, r0, r9 add r7, sp, r9, lsl #1 // #(\i*2) .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r7, :128], r8 .endr add r9, r9, #8 bl inv_dct_8h_x8_neon cmp r9, #32 load_add_store_8x8 r6, r7 blt 1b add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld1.16 {d0, d1, d2}, [r12, :64]! 
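        // The eight constants now in d0-d1 appear to be the idct64_coeffs
        // entries stored pre-multiplied by 8, so the vqrdmulh sequence below
        // scales each input by about c/4096, the same 12-bit fixed-point scale
        // as the vmull + vqrshrn #12 pairs, just without widening to 32 bit.
        // (Sketch; assumes r12 was pointed at idct64_coeffs by the dct64
        // caller.)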
vqrdmulh.s16 d23, d16, d0[1] // t63a vqrdmulh.s16 d16, d16, d0[0] // t32a vqrdmulh.s16 d22, d17, d0[2] // t62a vqrdmulh.s16 d17, d17, d0[3] // t33a vqrdmulh.s16 d21, d18, d1[1] // t61a vqrdmulh.s16 d18, d18, d1[0] // t34a vqrdmulh.s16 d20, d19, d1[2] // t60a vqrdmulh.s16 d19, d19, d1[3] // t35a vqadd.s16 d24, d16, d17 // t32 vqsub.s16 d25, d16, d17 // t33 vqsub.s16 d26, d19, d18 // t34 vqadd.s16 d27, d19, d18 // t35 vqadd.s16 d28, d20, d21 // t60 vqsub.s16 d29, d20, d21 // t61 vqsub.s16 d30, d23, d22 // t62 vqadd.s16 d31, d23, d22 // t63 vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a vneg.s32 q2, q2 // t34a vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a vqrshrn.s32 d26, q2, #12 // t34a vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a vqrshrn.s32 d29, q3, #12 // t61a vqrshrn.s32 d25, q4, #12 // t33a vqrshrn.s32 d30, q2, #12 // t62a vqadd.s16 d16, d24, d27 // t32a vqsub.s16 d19, d24, d27 // t35a vqadd.s16 d17, d25, d26 // t33 vqsub.s16 d18, d25, d26 // t34 vqsub.s16 d20, d31, d28 // t60a vqadd.s16 d23, d31, d28 // t63a vqsub.s16 d21, d30, d29 // t61 vqadd.s16 d22, d30, d29 // t62 vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 vqrshrn.s32 d21, q2, #12 // t61a vqrshrn.s32 d18, q3, #12 // t34a vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 vqrshrn.s32 d20, q4, #12 // t60 vqrshrn.s32 d19, q2, #12 // t35 vst1.16 {d16, d17, d18, d19}, [r6, :128]! vst1.16 {d20, d21, d22, d23}, [r6, :128]! bx lr endfunc function inv_dct64_step2_neon movrel_local r12, idct_coeffs vld1.16 {d0}, [r12, :64] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a vldr d16, [r6, #2*4*0] // t32a vldr d17, [r9, #2*4*8] // t39a vldr d18, [r9, #2*4*0] // t63a vldr d19, [r6, #2*4*8] // t56a vldr d20, [r6, #2*4*16] // t40a vldr d21, [r9, #2*4*24] // t47a vldr d22, [r9, #2*4*16] // t55a vldr d23, [r6, #2*4*24] // t48a vqadd.s16 d24, d16, d17 // t32 vqsub.s16 d25, d16, d17 // t39 vqadd.s16 d26, d18, d19 // t63 vqsub.s16 d27, d18, d19 // t56 vqsub.s16 d28, d21, d20 // t40 vqadd.s16 d29, d21, d20 // t47 vqadd.s16 d30, d23, d22 // t48 vqsub.s16 d31, d23, d22 // t55 vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a vqrshrn.s32 d25, q2, #12 // t56a vqrshrn.s32 d27, q3, #12 // t39a vneg.s32 q4, q4 // t40a vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a vqrshrn.s32 d31, q4, #12 // t40a vqrshrn.s32 d28, q2, #12 // t55a vqadd.s16 d16, d24, d29 // t32a vqsub.s16 d19, d24, d29 // t47a vqadd.s16 d17, d27, d31 // t39 vqsub.s16 d18, d27, d31 // t40 vqsub.s16 d20, d26, d30 // t48a vqadd.s16 d23, d26, d30 // t63a vqsub.s16 d21, d25, d28 // t55 vqadd.s16 d22, d25, d28 // t56 vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 vqrshrn.s32 d18, q2, #12 // t40a vqrshrn.s32 d21, q3, #12 // t55a vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 vqrshrn.s32 d19, q4, #12 // t47 vqrshrn.s32 d20, q2, #12 // t48 vstr d16, [r6, #2*4*0] // t32a vstr d17, [r9, #2*4*0] // t39 vstr d18, [r6, #2*4*8] // t40a vstr d19, [r9, #2*4*8] // t47 vstr d20, [r6, #2*4*16] // t48 vstr d21, [r9, #2*4*16] // t55a vstr d22, [r6, #2*4*24] // t56 vstr d23, [r9, #2*4*24] // t63a add r6, r6, #2*4 sub r9, r9, #2*4 cmp r6, r9 blt 1b bx lr endfunc .macro 
load8 src, strd, zero, clear .irp i, d16, d17, d18, d19, d20, d21, d22, d23 .if \clear vld1.16 {\i}, [\src, :64] vst1.16 {\zero}, [\src, :64], \strd .else vld1.16 {\i}, [\src, :64], \strd .endif .endr .endm .macro store16 dst vst1.16 {q8, q9}, [\dst, :128]! vst1.16 {q10, q11}, [\dst, :128]! vst1.16 {q12, q13}, [\dst, :128]! vst1.16 {q14, q15}, [\dst, :128]! .endm .macro clear_upper8 .irp i, q12, q13, q14, q15 vmov.i16 \i, #0 .endr .endm .macro vmov_if reg, val, cond .if \cond vmov.i16 \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond movw \gpr, \val vdup.16 \reg, \gpr .endif .endm .macro vst1_if regs, dst, dstalign, cond .if \cond vst1.16 \regs, \dst, \dstalign .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 mov r6, sp push {r10-r11,lr} lsl r8, r8, #2 movdup_if d0, r12, #2896*8, \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 add r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct_4h_x16_neon store16 r6 movdup_if d0, r12, #2896*8, \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 lsr r8, r8, #1 sub r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct32_odd_4h_x16_neon add r10, r6, #8*15 sub r6, r6, #8*16 mov r9, #-8 .macro store_addsub r0, r1, r2, r3 vld1.16 {d2}, [r6, :64]! vld1.16 {d3}, [r6, :64]! vqadd.s16 d6, d2, \r0 vqsub.s16 \r0, d2, \r0 vld1.16 {d4}, [r6, :64]! vqadd.s16 d7, d3, \r1 vqsub.s16 \r1, d3, \r1 vld1.16 {d5}, [r6, :64]! vqadd.s16 d2, d4, \r2 sub r6, r6, #8*4 vqsub.s16 \r2, d4, \r2 vst1.16 {d6}, [r6, :64]! vst1.16 {\r0}, [r10, :64], r9 vqadd.s16 d3, d5, \r3 vqsub.s16 \r3, d5, \r3 vst1.16 {d7}, [r6, :64]! vst1.16 {\r1}, [r10, :64], r9 vst1.16 {d2}, [r6, :64]! vst1.16 {\r2}, [r10, :64], r9 vst1.16 {d3}, [r6, :64]! 
vst1.16 {\r3}, [r10, :64], r9 .endm store_addsub d31, d30, d29, d28 store_addsub d27, d26, d25, d24 store_addsub d23, d22, d21, d20 store_addsub d19, d18, d17, d16 .purgem store_addsub add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 add r10, r7, r8, lsl #3 // offset 8 sub r9, r9, r8 // offset 15 sub r11, r10, r8 // offset 7 vld1.16 {d16}, [r7, :64] // in1 (offset 0) vld1.16 {d17}, [r9, :64] // in31 (offset 15) vld1.16 {d18}, [r10, :64] // in17 (offset 8) vld1.16 {d19}, [r11, :64] // in15 (offset 7) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear add r7, r7, r8, lsl #2 // offset 4 sub r9, r9, r8, lsl #2 // offset 11 sub r10, r7, r8 // offset 3 add r11, r9, r8 // offset 12 vld1.16 {d16}, [r10, :64] // in7 (offset 3) vld1.16 {d17}, [r11, :64] // in25 (offset 12) vld1.16 {d18}, [r9, :64] // in23 (offset 11) vld1.16 {d19}, [r7, :64] // in9 (offset 4) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear sub r10, r10, r8, lsl #1 // offset 1 sub r9, r9, r8, lsl #1 // offset 9 add r10, r10, r8 // offset 2 add r9, r9, r8 // offset 10 add r7, r7, r8 // offset 5 add r11, r11, r8 // offset 13 vld1.16 d16, [r10, :64] // in5 (offset 2) vld1.16 d17, [r11, :64] // in27 (offset 13) vld1.16 d18, [r9, :64] // in21 (offset 10) vld1.16 d19, [r7, :64] // in11 (offset 5) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, #2896*8, \scale vmov_if d7, #0, \clear sub r10, r10, r8 // offset 1 sub r9, r9, r8 // offset 9 add r11, r11, r8 // offset 14 add r7, r7, r8 // offset 6 vld1.16 d16, [r10, :64] // in3 (offset 1) vld1.16 d17, [r11, :64] // in29 (offset 14) vld1.16 d18, [r9, :64] // in19 (offset 9) vld1.16 d19, [r7, :64] // in13 (offset 6) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon sub r6, r6, #2*4*32 add r9, r6, #2*4*7 bl inv_dct64_step2_neon pop {r10-r11,pc} endfunc .endm def_dct64_func def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x4_neon vdup.16 q3, r9 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, #2*56 push {r10-r11,lr} mov r10, #2*64 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q15, q14, d31, d30, d29, d28 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q13, q12, d27, d26, d25, d24 .macro store_addsub src0, src1, src2, src3 vqsub.s16 d3, \src0, \src1 vqsub.s16 d2, \src2, \src3 vqadd.s16 d0, \src0, \src1 vqadd.s16 d1, \src2, \src3 vrshl.s16 q1, q1, q3 vrshl.s16 q0, q0, q3 vrev64.16 q1, q1 vst1.16 {q0}, [r6, :128], r10 vst1.16 {q1}, [r9, :128], r10 .endm store_addsub d16, d31, d20, d27 store_addsub d17, d30, d21, d26 store_addsub d18, d29, d22, d25 store_addsub d19, d28, d23, d24 .purgem store_addsub sub r6, r6, r10, lsl #2 sub r9, r9, r10, lsl #2 add r6, r6, #16 sub r9, r9, #16 cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_vert_dct_4x64_neon lsl r8, r8, #1 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, r1, lsl #6 sub r9, r9, r1 push {r10-r11,lr} neg r10, r1 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 .macro add_dest_addsub src0, src1, src2, src3 vld1.32 {d0[0]}, [r6, :32], r1 vld1.32 {d1[0]}, [r9, :32], r10 vqadd.s16 d4, \src0, \src1 vld1.32 {d0[1]}, [r6, :32] vqadd.s16 d5, \src2, \src3 vld1.32 {d1[1]}, [r9, :32] vqsub.s16 d6, \src0, \src1 vqsub.s16 d7, \src2, \src3 sub r6, r6, r1 sub r9, r9, r10 vrshr.s16 q2, q2, #4 vrshr.s16 q3, q3, #4 vaddw.u8 q2, q2, d0 vaddw.u8 q3, q3, d1 vqmovun.s16 d0, q2 vqmovun.s16 d1, q3 vst1.32 {d0[0]}, [r6, :32], r1 vst1.32 {d1[0]}, [r9, :32], r10 vst1.32 {d0[1]}, [r6, :32], r1 vst1.32 {d1[1]}, [r9, :32], r10 .endm add_dest_addsub d16, d31, d17, d30 add_dest_addsub d18, d29, d19, d28 add_dest_addsub d20, d27, d21, d26 add_dest_addsub d22, d25, d23, d24 .purgem add_dest_addsub cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_clear_4h_x64_neon add r6, r5, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_clear_scale_4h_x64_neon add r6, r5, #(\i*64*2) mov r9, #-1 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i) add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 push {r4-r11,lr} vpush {q4} sub_sp_align 32*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r7, r5, #(\i*2) mov r8, #32*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 32*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 64*16*2+64*4*2 add r4, sp, #64*4*2 movrel_local r10, eob_16x32 .irp i, 0, 4, 8, 12 add r6, r4, #(\i*64*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*2) mov r8, #16*2 bl inv_txfm_dct_clear_4h_x64_neon add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x4_neon .if \i < 12 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: movrel_local r5, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i) add r7, r4, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 64*16*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 push {r4-r11,lr} vpush {q4} sub_sp_align 16*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_4h_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r5, #(\i*16*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 28 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*2) mov r8, #32*2 bl inv_txfm_horz_16x4_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r7, r5, #(\i*2) mov r8, #16*2 bl inv_txfm_dct_4h_x64_neon add r6, r0, #(\i) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 16*32*2+64*4*2 vpop {q4} pop {r4-r11,pc} endfunc rav1e-0.7.1/src/arm/32/itx16.S000064400000000000000000003613531046102023000135530ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/

#include "src/arm/asm.S"
#include "util.S"

// The exported functions in this file have the following signature:
// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);

// Most of the functions use the following register layout:
// r0-r3   external parameters
// r4      function pointer to first transform
// r5      function pointer to second transform
// r6      output parameter for helper function
// r7      input parameter for helper function
// r8      input stride for helper function
// r9      scratch variable for helper functions
// r10-r11 pointer to list of eob thresholds, eob threshold value,
//         scratch variables within helper functions (backed up)

// The SIMD registers most often use the following layout:
// d0-d3   multiplication coefficients
// d4-d7   scratch registers
// d8-d15  unused in some transforms, used for scratch registers in others
// d16-d31 inputs/outputs of transforms

// Potential further optimizations that are left unimplemented for now:
// - Trying to keep multiplication coefficients in registers across multiple
//   transform functions. (The register layout is designed to potentially
//   allow this.)
// - Use a simplified version of the transforms themselves for cases where
//   we know a significant number of inputs are zero. E.g. if the eob value
//   indicates only a quarter of input values are set, for idct16 and up,
//   a significant amount of calculation can be skipped, at the cost of more
//   code duplication and special casing.

// A macro for cases where a thumb mov can express the constant in one
// instruction, while arm mode requires two separate movw+movt pairs.
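// For instance, "mov_const r12, 2896*8*(1<<16)" as used below loads
// 0x5a800000. That value appears to be encodable as a Thumb-2 modified
// immediate (an 8-bit pattern at an odd rotation, which Thumb-2 allows),
// but ARM-mode immediates only permit even rotations, hence the
// movw+movt fallback in that configuration.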
.macro mov_const reg, val #if CONFIG_THUMB mov.w \reg, #\val #else movw \reg, #((\val) & 0xffff) movt \reg, #(((\val) >> 16) & 0xffff) #endif .endm const idct_coeffs, align=4 // idct4 .int 2896, 2896*8*(1<<16), 1567, 3784 // idct8 .int 799, 4017, 3406, 2276 // idct16 .int 401, 4076, 3166, 2598 .int 1931, 3612, 3920, 1189 // idct32 .int 201, 4091, 3035, 2751 .int 1751, 3703, 3857, 1380 .int 995, 3973, 3513, 2106 .int 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) .int 4076, 401, 4017, 799 .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) .int -3166, -2598, -799, -4017 .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) .int 3612, 1931, 2276, 3406 .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) .int -3920, -1189, -3406, -2276 endconst const iadst4_coeffs, align=4 .int 1321, 3803, 2482, 3344 endconst const iadst8_coeffs, align=4 .int 4076, 401, 3612, 1931 .int 2598, 3166, 1189, 3920 // idct_coeffs .int 2896, 0, 1567, 3784 endconst const iadst16_coeffs, align=4 .int 4091, 201, 3973, 995 .int 3703, 1751, 3290, 2440 .int 2751, 3035, 2106, 3513 .int 1380, 3857, 601, 4052 endconst .macro vmul_vmla d0, s0, s1, c0, c1 vmul.i32 \d0, \s0, \c0 vmla.i32 \d0, \s1, \c1 .endm .macro vmul_vmls d0, s0, s1, c0, c1 vmul.i32 \d0, \s0, \c0 vmls.i32 \d0, \s1, \c1 .endm .macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 vqrdmulh.s32 \r0, \r0, \c vqrdmulh.s32 \r1, \r1, \c .ifnb \r2 vqrdmulh.s32 \r2, \r2, \c vqrdmulh.s32 \r3, \r3, \c .endif .ifnb \r4 vqrdmulh.s32 \r4, \r4, \c vqrdmulh.s32 \r5, \r5, \c vqrdmulh.s32 \r6, \r6, \c vqrdmulh.s32 \r7, \r7, \c .endif .endm .macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 .ifnb \load vld1.16 {\load}, [\src, :128], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \addsrc vqadd.s16 \adddst, \adddst, \addsrc .endif .ifnb \max vmax.s16 \max, \max, q6 .endif .ifnb \min vmin.s16 \min, \min, q7 .endif .ifnb \store vst1.16 {\store}, [\dst, :128], r1 .endif .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff load_add_store q0, q8, , , , , , \dst, \src, \shiftbits load_add_store q1, q9, , , , , , \dst, \src, \shiftbits load_add_store q2, q10, q0, q8, , , , \dst, \src, \shiftbits load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits load_add_store q4, q12, q2, q10, q9, q8, , \dst, \src, \shiftbits load_add_store q5, q13, q3, q11, q10, q9, q8, \dst, \src, \shiftbits load_add_store q0, q14, q4, q12, q11, q10, q9, \dst, \src, \shiftbits load_add_store q1, q15, q5, q13, q12, q11, q10, \dst, \src, \shiftbits load_add_store , , q0, q14, q13, q12, q11, \dst, \src, \shiftbits load_add_store , , q1, q15, q14, q13, q12, \dst, \src, \shiftbits load_add_store , , , , q15, q14, q13, \dst, \src, \shiftbits load_add_store , , , , , q15, q14, \dst, \src, \shiftbits load_add_store , , , , , , q15, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff load_add_store q0, q8, , , , , , \dst, \src, \shiftbits load_add_store q1, q9, , , , , , \dst, \src, \shiftbits load_add_store q2, 
q10, q0, q8, , , , \dst, \src, \shiftbits load_add_store q3, q11, q1, q9, q8, , , \dst, \src, \shiftbits load_add_store , , q2, q10, q9, q8, , \dst, \src, \shiftbits load_add_store , , q3, q11, q10, q9, q8, \dst, \src, \shiftbits load_add_store , , , , q11, q10, q9, \dst, \src, \shiftbits load_add_store , , , , , q11, q10, \dst, \src, \shiftbits load_add_store , , , , , , q11, \dst, \src, \shiftbits .endm .macro load_add_store4 load1, load2, shift, addsrc, adddst, max, min, store1, store2, dst, src, shiftbits=4 .ifnb \load1 vld1.16 {\load1}, [\src, :64], r1 .endif .ifnb \shift vrshr.s16 \shift, \shift, #\shiftbits .endif .ifnb \load2 vld1.16 {\load2}, [\src, :64], r1 .endif .ifnb \addsrc vqadd.s16 \adddst, \adddst, \addsrc .endif .ifnb \max vmax.s16 \max, \max, q6 .endif .ifnb \store1 vst1.16 {\store1}, [\dst, :64], r1 .endif .ifnb \min vmin.s16 \min, \min, q7 .endif .ifnb \store2 vst1.16 {\store2}, [\dst, :64], r1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src load_add_store4 d2, d3, q9, , , , , , , \dst, \src load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src load_add_store4 d8, d9, q12, q2, q10, q9, q8, , , \dst, \src load_add_store4 d10, d11, q13, q3, q11, q10, q9, d16, d17, \dst, \src load_add_store4 d0, d1, q14, q4, q12, q11, q10, d18, d19, \dst, \src load_add_store4 d2, d3, q15, q5, q13, q12, q11, d20, d21, \dst, \src load_add_store4 , , , q0, q14, q13, q12, d22, d23, \dst, \src load_add_store4 , , , q1, q15, q14, q13, d24, d25, \dst, \src load_add_store4 , , , , , q15, q14, d26, d27, \dst, \src load_add_store4 , , , , , , q15, d28, d29, \dst, \src load_add_store4 , , , , , , , d30, d31, \dst, \src .endm .macro load_add_store_4x8 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits load_add_store4 d2, d3, q9, , , , , , , \dst, \src, \shiftbits load_add_store4 d4, d5, q10, q0, q8, , , , , \dst, \src, \shiftbits load_add_store4 d6, d7, q11, q1, q9, q8, , , , \dst, \src, \shiftbits load_add_store4 , , , q2, q10, q9, q8, , , \dst, \src, \shiftbits load_add_store4 , , , q3, q11, q10, q9, d16, d17, \dst, \src, \shiftbits load_add_store4 , , , , , q11, q10, d18, d19, \dst, \src, \shiftbits load_add_store4 , , , , , , q11, d20, d21, \dst, \src, \shiftbits load_add_store4 , , , , , , , d22, d23, \dst, \src, \shiftbits .endm .macro load_add_store_4x4 dst, src, shiftbits=4 mov \src, \dst vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff mov \src, \dst load_add_store4 d0, d1, q8, , , , , , , \dst, \src, \shiftbits load_add_store4 d2, d3, q9, q0, q8, , , , , \dst, \src, \shiftbits load_add_store4 , , , q1, q9, q8, , , , \dst, \src, \shiftbits load_add_store4 , , , , , q9, q8, , , \dst, \src, \shiftbits load_add_store4 , , , , , , q9, d16, d17, \dst, \src, \shiftbits load_add_store4 , , , , , , , d18, d19, \dst, \src, \shiftbits .endm .macro idct_dc w, h, shift cmp r3, #0 bne 1f vmov.i16 q14, #0 mov_const r12, 2896*8*(1<<16) vld1.32 {d24[], d25[]}, [r2, :32] vdup.32 d0, r12 vqrdmulh.s32 q13, q12, d0[0] vst1.32 {d28[0]}, [r2, :32] .if (\w == 2*\h) || (2*\w == \h) vqrdmulh.s32 q13, q13, d0[0] .endif .if \shift > 0 vqrshrn.s32 d24, q13, #\shift vqrshrn.s32 d25, q13, #\shift .else vqmovn.s32 d24, q13 vqmovn.s32 d25, q13 .endif vqrdmulh.s16 q12, q12, d0[1] mov r3, #\h vrshr.s16 q12, q12, #4 b 
idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {d0}, [r0, :64], r1 vld1.16 {d1}, [r0, :64], r1 vld1.16 {d2}, [r0, :64], r1 vld1.16 {d3}, [r0, :64], r1 subs r3, r3, #4 vqadd.s16 q0, q0, q12 sub r0, r0, r1, lsl #2 vqadd.s16 q1, q1, q12 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmin.s16 q0, q0, q15 vst1.16 {d0}, [r0, :64], r1 vmin.s16 q1, q1, q15 vst1.16 {d1}, [r0, :64], r1 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d3}, [r0, :64], r1 bgt 1b bx lr endfunc function idct_dc_w8_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0}, [r0, :128], r1 subs r3, r3, #4 vld1.16 {q1}, [r0, :128], r1 vqadd.s16 q0, q0, q12 vld1.16 {q2}, [r0, :128], r1 vqadd.s16 q1, q1, q12 vld1.16 {q3}, [r0, :128], r1 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, r1, lsl #2 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vst1.16 {q0}, [r0, :128], r1 vmin.s16 q2, q2, q15 vst1.16 {q1}, [r0, :128], r1 vmin.s16 q3, q3, q15 vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w16_neon vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128], r1 subs r3, r3, #2 vld1.16 {q2, q3}, [r0, :128], r1 vqadd.s16 q0, q0, q12 vqadd.s16 q1, q1, q12 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, r1, lsl #1 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vst1.16 {q0, q1}, [r0, :128], r1 vmin.s16 q3, q3, q15 vst1.16 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w32_neon sub r1, r1, #32 vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.16 {q2, q3}, [r0, :128] vqadd.s16 q0, q0, q12 vqadd.s16 q1, q1, q12 vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 sub r0, r0, #32 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vst1.16 {q0, q1}, [r0, :128]! vmin.s16 q3, q3, q15 vst1.16 {q2, q3}, [r0, :128], r1 bgt 1b bx lr endfunc function idct_dc_w64_neon sub r1, r1, #96 vmvn.i16 q15, #0xfc00 // 0x3ff 1: vld1.16 {q0, q1}, [r0, :128]! subs r3, r3, #1 vld1.16 {q2, q3}, [r0, :128]! vqadd.s16 q0, q0, q12 vld1.16 {q8, q9}, [r0, :128]! vqadd.s16 q1, q1, q12 vld1.16 {q10, q11}, [r0, :128] vqadd.s16 q2, q2, q12 vqadd.s16 q3, q3, q12 vqadd.s16 q8, q8, q12 vqadd.s16 q9, q9, q12 vqadd.s16 q10, q10, q12 vqadd.s16 q11, q11, q12 sub r0, r0, #96 vmax.s16 q0, q0, q14 vmax.s16 q1, q1, q14 vmax.s16 q2, q2, q14 vmax.s16 q3, q3, q14 vmax.s16 q8, q8, q14 vmax.s16 q9, q9, q14 vmax.s16 q10, q10, q14 vmax.s16 q11, q11, q14 vmin.s16 q0, q0, q15 vmin.s16 q1, q1, q15 vmin.s16 q2, q2, q15 vmin.s16 q3, q3, q15 vmin.s16 q8, q8, q15 vst1.16 {q0, q1}, [r0, :128]! vmin.s16 q9, q9, q15 vst1.16 {q2, q3}, [r0, :128]! vmin.s16 q10, q10, q15 vst1.16 {q8, q9}, [r0, :128]! 
vmin.s16 q11, q11, q15 vst1.16 {q10, q11}, [r0, :128], r1 bgt 1b bx lr endfunc .macro iwht4 vadd.i32 q8, q8, q9 vsub.i32 q13, q10, q11 vsub.i32 q12, q8, q13 vshr.s32 q12, q12, #1 vsub.i32 q10, q12, q9 vsub.i32 q9, q12, q11 vadd.i32 q11, q13, q10 vsub.i32 q8, q8, q9 .endm .macro idct_4s_x4 r0, r1, r2, r3 vmul_vmla q4, \r1, \r3, d1[1], d1[0] vmul_vmla q2, \r0, \r2, d0[0], d0[0] vmul_vmls q3, \r1, \r3, d1[0], d1[1] vmul_vmls q5, \r0, \r2, d0[0], d0[0] vrshr.s32 q4, q4, #12 vrshr.s32 q2, q2, #12 vrshr.s32 q3, q3, #12 vrshr.s32 q5, q5, #12 vqadd.s32 \r0, q2, q4 vqsub.s32 \r3, q2, q4 vqadd.s32 \r1, q5, q3 vqsub.s32 \r2, q5, q3 .endm .macro idct_2s_x4 r0, r1, r2, r3 vmul_vmla d6, \r1, \r3, d1[1], d1[0] vmul_vmla d4, \r0, \r2, d0[0], d0[0] vmul_vmls d5, \r1, \r3, d1[0], d1[1] vmul_vmls d7, \r0, \r2, d0[0], d0[0] vrshr.s32 d6, d6, #12 vrshr.s32 d4, d4, #12 vrshr.s32 d5, d5, #12 vrshr.s32 d7, d7, #12 vqadd.s32 \r0, d4, d6 vqsub.s32 \r3, d4, d6 vqadd.s32 \r1, d7, d5 vqsub.s32 \r2, d7, d5 .endm function inv_dct_4s_x4_neon movrel_local r12, idct_coeffs vld1.32 {d0, d1}, [r12, :128] idct_4s_x4 q8, q9, q10, q11 bx lr endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel_local r12, iadst4_coeffs vld1.32 {d0, d1}, [r12, :128] vsub.i32 q1, q8, q10 vmul.i32 q2, q8, d0[0] vmla.i32 q2, q10, d0[1] vmla.i32 q2, q11, d1[0] vmul.i32 q4, q9, d1[1] vadd.i32 q1, q1, q11 vmul.i32 q3, q8, d1[0] vmls.i32 q3, q10, d0[0] vmls.i32 q3, q11, d0[1] vadd.i32 \o3, q2, q3 vmul.i32 \o2, q1, d1[1] vadd.i32 \o0, q2, q4 vadd.i32 \o1, q3, q4 vsub.i32 \o3, \o3, q4 vrshr.s32 \o0, \o0, #12 vrshr.s32 \o2, \o2, #12 vrshr.s32 \o1, \o1, #12 vrshr.s32 \o3, \o3, #12 .endm function inv_adst_4s_x4_neon iadst_4x4 q8, q9, q10, q11 bx lr endfunc function inv_flipadst_4s_x4_neon iadst_4x4 q11, q10, q9, q8 bx lr endfunc function inv_identity_4s_x4_neon mov r12, #0 movt r12, #(5793-4096)*8 vdup.32 d0, r12 vqrdmulh.s32 q1, q8, d0[0] vqrdmulh.s32 q2, q9, d0[0] vqrdmulh.s32 q3, q10, d0[0] vqrdmulh.s32 q4, q11, d0[0] vqadd.s32 q8, q8, q1 vqadd.s32 q9, q9, q2 vqadd.s32 q10, q10, q3 vqadd.s32 q11, q11, q4 bx lr endfunc function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 push {r4-r5,lr} vpush {q4-q5} vmov.i16 q14, #0 vmov.i16 q15, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q14, q15}, [r2, :128]! vshr.s16 q8, q8, #2 vld1.32 {q10, q11}, [r2, :128] vshr.s16 q9, q9, #2 vshr.s16 q10, q10, #2 vshr.s16 q11, q11, #2 iwht4 vst1.32 {q14, q15}, [r2, :128] transpose_4x4s q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 iwht4 vld1.16 {d0}, [r0, :64], r1 vqmovn.s32 d16, q8 vld1.16 {d1}, [r0, :64], r1 vqmovn.s32 d17, q9 vld1.16 {d2}, [r0, :64], r1 vqmovn.s32 d18, q10 vld1.16 {d3}, [r0, :64], r1 vqmovn.s32 d19, q11 b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon vmov.i16 q14, #0 vmov.i16 q15, #0 vld1.32 {q8, q9}, [r2, :128] vst1.16 {q14, q15}, [r2, :128]! 
vld1.32 {q10, q11}, [r2, :128] vst1.16 {q14, q15}, [r2, :128] blx r4 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 vqmovn.s32 d18, q10 vqmovn.s32 d19, q11 transpose_4x4h q8, q9, d16, d17, d18, d19 blx r5 vld1.16 {d0}, [r0, :64], r1 vld1.16 {d1}, [r0, :64], r1 vrshr.s16 q8, q8, #4 vld1.16 {d2}, [r0, :64], r1 vrshr.s16 q9, q9, #4 vld1.16 {d3}, [r0, :64], r1 L(itx_4x4_end): vmvn.i16 q15, #0xfc00 // 0x3ff sub r0, r0, r1, lsl #2 vqadd.s16 q8, q8, q0 vqadd.s16 q9, q9, q1 vmax.s16 q8, q8, q14 vmax.s16 q9, q9, q14 vmin.s16 q8, q8, q15 vmin.s16 q9, q9, q15 vst1.16 {d16}, [r0, :64], r1 vst1.16 {d17}, [r0, :64], r1 vst1.16 {d18}, [r0, :64], r1 vst1.16 {d19}, [r0, :64], r1 vpop {q4-q5} pop {r4-r5,pc} endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 push {r4-r5,lr} vpush {q4-q5} .ifc \txfm1\()_\txfm2, dct_dct cmp r3, #0 bne 1f vmov.i16 q14, #0 mov_const r12, 2896*8*(1<<16) vld1.32 {d16[], d17[]}, [r2, :32] vdup.32 d4, r12 vst1.32 {d28[0]}, [r2, :32] vqrdmulh.s32 q8, q8, d4[0] vld1.16 {d0}, [r0, :64], r1 vqmovn.s32 d20, q8 vqmovn.s32 d21, q8 vld1.16 {d1}, [r0, :64], r1 vqrdmulh.s16 q10, q10, d4[1] vld1.16 {d2}, [r0, :64], r1 vrshr.s16 q8, q10, #4 vld1.16 {d3}, [r0, :64], r1 vrshr.s16 q9, q10, #4 b L(itx_4x4_end) 1: .endif movrel_local r4, inv_\txfm1\()_4s_x4_neon movrel r5, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4s_x4 \r0, \r2, \r4, \r6 vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 vmin.s32 \r, \r, q5 .endr .irp r, \r0, \r2, \r4, \r6 vmax.s32 \r, \r, q4 .endr vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a vrshr.s32 \r1, q2, #12 // t4a vrshr.s32 \r7, q3, #12 // t7a vrshr.s32 \r3, q6, #12 // t5a vrshr.s32 \r5, q7, #12 // t6a vqadd.s32 q2, \r1, \r3 // t4 vqsub.s32 \r1, \r1, \r3 // t5a vqadd.s32 q3, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a .irp r, q2, \r1, q3, \r3 vmin.s32 \r, \r, q5 .endr .irp r, q2, \r1, q3, \r3 vmax.s32 \r, \r, q4 .endr vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 q7, q7, #12 // t5 vrshr.s32 q5, q6, #12 // t6 vqsub.s32 \r7, \r0, q3 // out7 vqadd.s32 \r0, \r0, q3 // out0 vqadd.s32 \r1, \r2, q5 // out1 vqsub.s32 q6, \r2, q5 // out6 vqadd.s32 \r2, \r4, q7 // out2 vqsub.s32 \r5, \r4, q7 // out5 vqadd.s32 \r3, \r6, q2 // out3 vqsub.s32 \r4, \r6, q2 // out4 vmov \r6, q6 // out6 .endm .macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7 idct_2s_x4 \r0, \r2, \r4, \r6 vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 vmin.s32 \r, \r, d9 .endr .irp r, \r0, \r2, \r4, \r6 vmax.s32 \r, \r, d8 .endr vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a vmul_vmla d7, \r5, \r3, 
d3[1], d3[0] // -> t6a vrshr.s32 \r1, d4, #12 // t4a vrshr.s32 \r7, d5, #12 // t7a vrshr.s32 \r3, d6, #12 // t5a vrshr.s32 \r5, d7, #12 // t6a vqadd.s32 d4, \r1, \r3 // t4 vqsub.s32 \r1, \r1, \r3 // t5a vqadd.s32 d5, \r7, \r5 // t7 vqsub.s32 \r3, \r7, \r5 // t6a .irp r, d4, \r1, d5, \r3 vmin.s32 \r, \r, d9 .endr .irp r, d4, \r1, d5, \r3 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5 vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6 vrshr.s32 d6, d6, #12 // t5 vrshr.s32 d7, d7, #12 // t6 vqsub.s32 \r7, \r0, d5 // out7 vqadd.s32 \r0, \r0, d5 // out0 vqadd.s32 \r1, \r2, d7 // out1 vqsub.s32 d7, \r2, d7 // out6 vqadd.s32 \r2, \r4, d6 // out2 vqsub.s32 \r5, \r4, d6 // out5 vqadd.s32 \r3, \r6, d4 // out3 vqsub.s32 \r4, \r6, d4 // out4 vmov \r6, d7 // out6 .endm function inv_dct_4s_x8_neon movrel_local r12, idct_coeffs vld1.32 {q0, q1}, [r12, :128] idct_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 bx lr endfunc .macro iadst_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7 movrel_local r12, iadst8_coeffs vld1.32 {q0, q1}, [r12, :128]! vmul_vmla q2, q15, q8, d0[0], d0[1] vmul_vmls q3, q15, q8, d0[1], d0[0] vmul_vmla q4, q13, q10, d1[0], d1[1] vrshr.s32 q8, q2, #12 // t0a vrshr.s32 q15, q3, #12 // t1a vmul_vmls q5, q13, q10, d1[1], d1[0] vmul_vmla q6, q11, q12, d2[0], d2[1] vrshr.s32 q10, q4, #12 // t2a vrshr.s32 q13, q5, #12 // t3a vmul_vmls q7, q11, q12, d2[1], d2[0] vmul_vmla q2, q9, q14, d3[0], d3[1] vrshr.s32 q12, q6, #12 // t4a vrshr.s32 q11, q7, #12 // t5a vmul_vmls q3, q9, q14, d3[1], d3[0] vrshr.s32 q14, q2, #12 // t6a vrshr.s32 q9, q3, #12 // t7a vld1.32 {q0}, [r12] vqadd.s32 q2, q8, q12 // t0 vqsub.s32 q3, q8, q12 // t4 vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vqadd.s32 q4, q15, q11 // t1 vqsub.s32 q5, q15, q11 // t5 vqadd.s32 q6, q10, q14 // t2 vqsub.s32 q7, q10, q14 // t6 vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 q10, q13, q9 // t3 vqsub.s32 q11, q13, q9 // t7 .irp r, q2, q3, q4, q5, q6, q7, q10, q11 vmin.s32 \r, \r, q12 .endr .irp r, q2, q3, q4, q5, q6, q7, q10, q11 vmax.s32 \r, \r, q14 .endr vmul_vmla q8, q3, q5, d1[1], d1[0] vmul_vmls q13, q3, q5, d1[0], d1[1] vmul_vmls q14, q11, q7, d1[1], d1[0] vrshr.s32 q3, q8, #12 // t4a vrshr.s32 q5, q13, #12 // t5a vmul_vmla q8, q11, q7, d1[0], d1[1] vrshr.s32 q7, q14, #12 // t6a vrshr.s32 q11, q8, #12 // t7a vqadd.s32 \r0, q2, q6 // out0 vqsub.s32 q2, q2, q6 // t2 vqadd.s32 \r7, q4, q10 // out7 vqsub.s32 q4, q4, q10 // t3 vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqadd.s32 \r1, q3, q7 // out1 vqsub.s32 q3, q3, q7 // t6 vqadd.s32 \r6, q5, q11 // out6 vqsub.s32 q5, q5, q11 // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. 
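// (The t2/t3/t6/t7 intermediates held in q2-q5 are still clamped just below,
// presumably because they feed further 32-bit multiplies before that final
// downshift and narrowing.)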
.irp r, q2, q4, q3, q5 vmin.s32 \r, \r, q12 .endr .irp r, q2, q4, q3, q5 vmax.s32 \r, \r, q10 .endr vqneg.s32 \r7, \r7 // out7 vqneg.s32 \r1, \r1 // out1 vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12) vmul_vmls q6, q2, q4, d0[0], d0[0] // -> out4 (q12 or q11) vmul_vmls q12, q3, q5, d0[0], d0[0] // -> out5 (q13 or q10) vrshr.s32 q2, q10, #12 // out3 vmul_vmla q10, q3, q5, d0[0], d0[0] // -> out2 (q10 or q13) vrshr.s32 q3, q12, #12 // out5 vrshr.s32 \r2, q10, #12 // out2 (q10 or q13) vrshr.s32 \r4, q6, #12 // out4 (q12 or q11) vqneg.s32 \r3, q2 // out3 vqneg.s32 \r5, q3 // out5 .endm function inv_adst_4s_x8_neon iadst_4s_x8 q8, q9, q10, q11, q12, q13, q14, q15 bx lr endfunc function inv_flipadst_4s_x8_neon iadst_4s_x8 q15, q14, q13, q12, q11, q10, q9, q8 bx lr endfunc function inv_identity_4s_x8_neon vqshl.s32 q8, q8, #1 vqshl.s32 q9, q9, #1 vqshl.s32 q10, q10, #1 vqshl.s32 q11, q11, #1 vqshl.s32 q12, q12, #1 vqshl.s32 q13, q13, #1 vqshl.s32 q14, q14, #1 vqshl.s32 q15, q15, #1 bx lr endfunc function inv_txfm_add_8x8_neon vmov.i32 q0, #0 mov r7, #8*4 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q12, #1 vqrshrn.s32 d18, q9, #1 vqrshrn.s32 d19, q13, #1 vqrshrn.s32 d20, q10, #1 vqrshrn.s32 d21, q14, #1 vqrshrn.s32 d22, q11, #1 vqrshrn.s32 d23, q15, #1 cmp r3, r10 transpose_4x8h q8, q9, q10, q11 blt 1f sub r2, r2, r7, lsl #3 vpush {q8-q11} add r2, r2, #16 vmov.i32 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr blx r4 vqrshrn.s32 d31, q15, #1 vqrshrn.s32 d30, q11, #1 vqrshrn.s32 d29, q14, #1 vqrshrn.s32 d28, q10, #1 vqrshrn.s32 d27, q13, #1 vqrshrn.s32 d26, q9, #1 vqrshrn.s32 d25, q12, #1 vqrshrn.s32 d24, q8, #1 vpop {q8-q11} transpose_4x8h q12, q13, q14, q15 b 2f 1: vmov.i16 q12, #0 vmov.i16 q13, #0 vmov.i16 q14, #0 vmov.i16 q15, #0 2: blx r5 load_add_store_8x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,r10,pc} endfunc .macro def_fn_8x8 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif push {r4-r5,r7,r10,lr} vpush {q4-q7} mov r10, #\eob_half movrel_local r4, inv_\txfm1\()_4s_x8_neon movrel r5, X(inv_\txfm2\()_8h_x8_neon) b inv_txfm_add_8x8_neon endfunc .endm def_fn_8x8 dct, dct, 10 def_fn_8x8 identity, identity, 10 def_fn_8x8 dct, adst, 10 def_fn_8x8 dct, flipadst, 10 def_fn_8x8 dct, identity, 4 def_fn_8x8 adst, dct, 10 def_fn_8x8 adst, adst, 10 def_fn_8x8 adst, flipadst, 10 def_fn_8x8 flipadst, dct, 10 def_fn_8x8 flipadst, adst, 10 def_fn_8x8 flipadst, flipadst, 10 def_fn_8x8 identity, dct, 4 def_fn_8x8 adst, identity, 4 def_fn_8x8 flipadst, identity, 4 def_fn_8x8 identity, adst, 4 def_fn_8x8 identity, flipadst, 4 function inv_txfm_add_8x4_neon mov_const r12, 2896*8*(1<<16) vmov.i32 q0, #0 vmov.i32 q1, #0 vld1.16 {q8, q9}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vdup.32 d4, r12 vld1.16 {q10, q11}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q12, q13}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! vld1.16 {q14, q15}, [r2, :128] vst1.16 {q0, q1}, [r2, :128]! 
scale_input d4[0], q8, q9, q10, q11, q12, q13, q14, q15 blx r4 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 vqmovn.s32 d18, q10 vqmovn.s32 d19, q11 vqmovn.s32 d20, q12 vqmovn.s32 d21, q13 vqmovn.s32 d22, q14 vqmovn.s32 d23, q15 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 vswp d17, d20 vswp d19, d21 vswp d18, d20 vswp d21, d22 blx r5 load_add_store_8x4 r0, r7 vpop {q4-q7} pop {r4-r5,r7,r10,pc} endfunc function inv_txfm_add_4x8_neon mov_const r12, 2896*8*(1<<16) vmov.i32 q0, #0 cmp r3, r10 mov r7, #32 blt 1f add r2, r2, #16 vdup.32 d2, r12 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr scale_input d2[0], q8, q9, q10, q11 sub r2, r2, r7, lsl #2 blx r4 sub r2, r2, #16 vqmovn.s32 d24, q8 vqmovn.s32 d25, q9 vqmovn.s32 d26, q10 vqmovn.s32 d27, q11 transpose_4x4h q12, q13, d24, d25, d26, d27 b 2f 1: vmov.i16 q12, #0 vmov.i16 q13, #0 2: mov_const r12, 2896*8*(1<<16) vmov.i32 q0, #0 vdup.32 d2, r12 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r7 .endr scale_input d2[0], q8, q9, q10, q11 blx r4 vqmovn.s32 d16, q8 vqmovn.s32 d17, q9 vqmovn.s32 d18, q10 vqmovn.s32 d19, q11 transpose_4x4h q8, q9, d16, d17, d18, d19 vmov q10, q12 vmov q11, q13 blx r5 load_add_store_4x8 r0, r7 vpop {q4-q7} pop {r4-r5,r7,r10,pc} endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif push {r4-r5,r7,r10,lr} vpush {q4-q7} movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon .if \w == 4 mov r10, #\eob_half .endif movrel r5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct, 13 def_fn_48 \w, \h, identity, identity, 13 def_fn_48 \w, \h, dct, adst, 13 def_fn_48 \w, \h, dct, flipadst, 13 def_fn_48 \w, \h, dct, identity, 4 def_fn_48 \w, \h, adst, dct, 13 def_fn_48 \w, \h, adst, adst, 13 def_fn_48 \w, \h, adst, flipadst, 13 def_fn_48 \w, \h, flipadst, dct, 13 def_fn_48 \w, \h, flipadst, adst, 13 def_fn_48 \w, \h, flipadst, flipadst, 13 def_fn_48 \w, \h, identity, dct, 16 def_fn_48 \w, \h, adst, identity, 4 def_fn_48 \w, \h, flipadst, identity, 4 def_fn_48 \w, \h, identity, adst, 16 def_fn_48 \w, \h, identity, flipadst, 16 .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_dct_2s_x16_neon movrel_local r12, idct_coeffs vld1.32 {q0, q1}, [r12, :128]! 
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30 // idct_8 leaves the row_clip_max/min constants in d9 and d8 .irp r, d16, d18, d20, d22, d24, d26, d28, d30 vmin.s32 \r, \r, d9 .endr .irp r, d16, d18, d20, d22, d24, d26, d28, d30 vmax.s32 \r, \r, d8 .endr vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #32 vmul_vmls d4, d17, d31, d0[0], d0[1] // -> t8a vmul_vmla d5, d17, d31, d0[1], d0[0] // -> t15a vmul_vmls d6, d25, d23, d1[0], d1[1] // -> t9a vrshr.s32 d17, d4, #12 // t8a vrshr.s32 d31, d5, #12 // t15a vmul_vmla d4, d25, d23, d1[1], d1[0] // -> t14a vmul_vmls d5, d21, d27, d2[0], d2[1] // -> t10a vrshr.s32 d23, d6, #12 // t9a vrshr.s32 d25, d4, #12 // t14a vmul_vmla d6, d21, d27, d2[1], d2[0] // -> t13a vmul_vmls d4, d29, d19, d3[0], d3[1] // -> t11a vrshr.s32 d21, d5, #12 // t10a vrshr.s32 d27, d6, #12 // t13a vmul_vmla d5, d29, d19, d3[1], d3[0] // -> t12a vrshr.s32 d19, d4, #12 // t11a vrshr.s32 d29, d5, #12 // t12a vld1.32 {q0}, [r12, :128] vqsub.s32 d4, d17, d23 // t9 vqadd.s32 d17, d17, d23 // t8 vqsub.s32 d5, d31, d25 // t14 vqadd.s32 d31, d31, d25 // t15 vqsub.s32 d23, d19, d21 // t10 vqadd.s32 d19, d19, d21 // t11 vqadd.s32 d25, d29, d27 // t12 vqsub.s32 d29, d29, d27 // t13 .irp r, d4, d17, d5, d31, d23, d19, d25, d29 vmin.s32 \r, \r, d9 .endr .irp r, d4, d17, d5, d31, d23, d19, d25, d29 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a vrshr.s32 d21, d6, #12 // t9a vrshr.s32 d27, d7, #12 // t14a vmul_vmls d6, d29, d23, d1[0], d1[1] // -> t13a vmul_vmla d7, d29, d23, d1[1], d1[0] // -> t10a vrshr.s32 d29, d6, #12 // t13a vneg.s32 d7, d7 vrshr.s32 d23, d7, #12 // t10a vqsub.s32 d4, d17, d19 // t11a vqadd.s32 d17, d17, d19 // t8a vqsub.s32 d5, d31, d25 // t12a vqadd.s32 d31, d31, d25 // t15a vqadd.s32 d19, d21, d23 // t9 vqsub.s32 d21, d21, d23 // t10 vqsub.s32 d25, d27, d29 // t13 vqadd.s32 d27, d27, d29 // t14 .irp r, d4, d17, d5, d31, d19, d21, d25, d27 vmin.s32 \r, \r, d9 .endr .irp r, d4, d17, d5, d31, d19, d21, d25, d27 vmax.s32 \r, \r, d8 .endr vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11 vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12 vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a vrshr.s32 d6, d6, #12 // t11 vrshr.s32 d7, d7, #12 // t12 vmul_vmla d5, d25, d21, d0[0], d0[0] // -> t13a vrshr.s32 d4, d4, #12 // t10a vrshr.s32 d5, d5, #12 // t13a vqadd.s32 d8, d16, d31 // out0 vqsub.s32 d31, d16, d31 // out15 vmov d16, d8 vqadd.s32 d23, d30, d17 // out7 vqsub.s32 d9, d30, d17 // out8 vqadd.s32 d17, d18, d27 // out1 vqsub.s32 d30, d18, d27 // out14 vqadd.s32 d18, d20, d5 // out2 vqsub.s32 d29, d20, d5 // out13 vqadd.s32 d5, d28, d19 // out6 vqsub.s32 d25, d28, d19 // out9 vqadd.s32 d19, d22, d7 // out3 vqsub.s32 d28, d22, d7 // out12 vqadd.s32 d20, d24, d6 // out4 vqsub.s32 d27, d24, d6 // out11 vqadd.s32 d21, d26, d4 // out5 vqsub.s32 d26, d26, d4 // out10 vmov d24, d9 vmov d22, d5 bx lr endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel_local r12, iadst16_coeffs vld1.32 {q0, q1}, [r12, :128]! 
vmul_vmla d4, d31, d16, d0[0], d0[1] // -> t0 vmul_vmls d6, d31, d16, d0[1], d0[0] // -> t1 vmul_vmla d8, d29, d18, d1[0], d1[1] // -> t2 vrshr.s32 d16, d4, #12 // t0 vrshr.s32 d31, d6, #12 // t1 vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t3 vmul_vmla d6, d27, d20, d2[0], d2[1] // -> t4 vrshr.s32 d18, d8, #12 // t2 vrshr.s32 d29, d4, #12 // t3 vmul_vmls d8, d27, d20, d2[1], d2[0] // -> t5 vmul_vmla d4, d25, d22, d3[0], d3[1] // -> t6 vrshr.s32 d20, d6, #12 // t4 vrshr.s32 d27, d8, #12 // t5 vmul_vmls d6, d25, d22, d3[1], d3[0] // -> t7 vld1.32 {q0, q1}, [r12, :128] movrel_local r12, idct_coeffs vmul_vmla d8, d23, d24, d0[0], d0[1] // -> t8 vrshr.s32 d22, d4, #12 // t6 vrshr.s32 d25, d6, #12 // t7 vmul_vmls d4, d23, d24, d0[1], d0[0] // -> t9 vmul_vmla d6, d21, d26, d1[0], d1[1] // -> t10 vrshr.s32 d23, d8, #12 // t8 vrshr.s32 d24, d4, #12 // t9 vmul_vmls d8, d21, d26, d1[1], d1[0] // -> t11 vmul_vmla d4, d19, d28, d2[0], d2[1] // -> t12 vrshr.s32 d21, d6, #12 // t10 vrshr.s32 d26, d8, #12 // t11 vmul_vmls d6, d19, d28, d2[1], d2[0] // -> t13 vmul_vmla d8, d17, d30, d3[0], d3[1] // -> t14 vrshr.s32 d19, d4, #12 // t12 vrshr.s32 d28, d6, #12 // t13 vmul_vmls d4, d17, d30, d3[1], d3[0] // -> t15 vrshr.s32 d17, d8, #12 // t14 vrshr.s32 d30, d4, #12 // t15 vld1.32 {q0, q1}, [r12, :128] vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqsub.s32 d5, d16, d23 // t8a vqadd.s32 d16, d16, d23 // t0a vqsub.s32 d7, d31, d24 // t9a vqadd.s32 d31, d31, d24 // t1a vqadd.s32 d23, d18, d21 // t2a vqsub.s32 d18, d18, d21 // t10a vqadd.s32 d24, d29, d26 // t3a vqsub.s32 d29, d29, d26 // t11a vqadd.s32 d21, d20, d19 // t4a vqsub.s32 d20, d20, d19 // t12a vqadd.s32 d26, d27, d28 // t5a vqsub.s32 d27, d27, d28 // t13a vqadd.s32 d19, d22, d17 // t6a vqsub.s32 d22, d22, d17 // t14a vqadd.s32 d28, d25, d30 // t7a vqsub.s32 d25, d25, d30 // t15a .irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 vmin.s32 \r, \r, d11 .endr .irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25 vmax.s32 \r, \r, d10 .endr vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8 vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9 vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10 vrshr.s32 d17, d4, #12 // t8 vrshr.s32 d30, d6, #12 // t9 vmul_vmls d4, d18, d29, d3[0], d3[1] // -> t11 vmul_vmls d6, d27, d20, d2[1], d2[0] // -> t12 vrshr.s32 d18, d8, #12 // t10 vrshr.s32 d29, d4, #12 // t11 vmul_vmla d8, d27, d20, d2[0], d2[1] // -> t13 vmul_vmls d4, d25, d22, d3[1], d3[0] // -> t14 vrshr.s32 d27, d6, #12 // t12 vrshr.s32 d20, d8, #12 // t13 vmul_vmla d6, d25, d22, d3[0], d3[1] // -> t15 vrshr.s32 d25, d4, #12 // t14 vrshr.s32 d22, d6, #12 // t15 vqsub.s32 d2, d16, d21 // t4 vqadd.s32 d16, d16, d21 // t0 vqsub.s32 d3, d31, d26 // t5 vqadd.s32 d31, d31, d26 // t1 vqadd.s32 d21, d23, d19 // t2 vqsub.s32 d23, d23, d19 // t6 vqadd.s32 d26, d24, d28 // t3 vqsub.s32 d24, d24, d28 // t7 vqadd.s32 d19, d17, d27 // t8a vqsub.s32 d17, d17, d27 // t12a vqadd.s32 d28, d30, d20 // t9a vqsub.s32 d30, d30, d20 // t13a vqadd.s32 d27, d18, d25 // t10a vqsub.s32 d18, d18, d25 // t14a vqadd.s32 d20, d29, d22 // t11a vqsub.s32 d29, d29, d22 // t15a .irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 vmin.s32 \r, \r, d11 .endr .irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29 vmax.s32 \r, \r, d10 .endr vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a vmul_vmls d6, d2, d3, 
d1[0], d1[1] // -> t5a vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a vrshr.s32 d22, d4, #12 // t4a vrshr.s32 d25, d6, #12 // t5a vmul_vmla d4, d24, d23, d1[0], d1[1] // -> t7a vmul_vmla d6, d17, d30, d1[1], d1[0] // -> t12 vrshr.s32 d24, d8, #12 // t6a vrshr.s32 d23, d4, #12 // t7a vmul_vmls d8, d17, d30, d1[0], d1[1] // -> t13 vmul_vmls d4, d29, d18, d1[1], d1[0] // -> t14 vrshr.s32 d17, d6, #12 // t12 vmul_vmla d6, d29, d18, d1[0], d1[1] // -> t15 vrshr.s32 d29, d8, #12 // t13 vrshr.s32 d30, d4, #12 // t14 vrshr.s32 d18, d6, #12 // t15 vqsub.s32 d2, d16, d21 // t2a .ifc \o0, d16 vqadd.s32 \o0, d16, d21 // out0 vqsub.s32 d21, d31, d26 // t3a vqadd.s32 \o15,d31, d26 // out15 .else vqadd.s32 d4, d16, d21 // out0 vqsub.s32 d21, d31, d26 // t3a vqadd.s32 \o15,d31, d26 // out15 vmov \o0, d4 .endif vqsub.s32 d3, d29, d18 // t15a vqadd.s32 \o13,d29, d18 // out13 vqadd.s32 \o2, d17, d30 // out2 vqsub.s32 d26, d17, d30 // t14a vqadd.s32 \o1, d19, d27 // out1 vqsub.s32 d27, d19, d27 // t10 vqadd.s32 \o14,d28, d20 // out14 vqsub.s32 d20, d28, d20 // t11 vqadd.s32 \o3, d22, d24 // out3 vqsub.s32 d22, d22, d24 // t6 vqadd.s32 \o12,d25, d23 // out12 vqsub.s32 d23, d25, d23 // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. .irp r, d2, d21, d3, d26, d27, d20, d22, d23 vmin.s32 \r, \r, d11 .endr .irp r, d2, d21, d3, d26, d27, d20, d22, d23 vmax.s32 \r, \r, d10 .endr vqneg.s32 \o15, \o15 // out15 vqneg.s32 \o13,\o13 // out13 vqneg.s32 \o1, \o1 // out1 vqneg.s32 \o3, \o3 // out3 vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) vmul_vmla d4, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) vmul_vmla d6, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) vrshr.s32 d24, d24, #12 // out8 vrshr.s32 d4, d4, #12 // out7 vrshr.s32 d5, d6, #12 // out5 vmul_vmls d8, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) vmul_vmla d2, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) vrshr.s32 d26, d8, #12 // out10 vmul_vmls d8, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) vmul_vmla d22, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) vmul_vmls d6, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) vrshr.s32 \o4, d2, #12 // out4 vrshr.s32 d7, d6, #12 // out9 vrshr.s32 d6, d8, #12 // out11 vrshr.s32 \o6, d22, #12 // out6 .ifc \o8, d23 vmov \o8, d24 vmov \o10,d26 .endif vqneg.s32 \o7, d4 // out7 vqneg.s32 \o5, d5 // out5 vqneg.s32 \o11,d6 // out11 vqneg.s32 \o9, d7 // out9 .endm function inv_adst_2s_x16_neon iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 bx lr endfunc function inv_flipadst_2s_x16_neon iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 bx lr endfunc function inv_identity_2s_x16_neon mov r12, #0 movt r12, #2*(5793-4096)*8 vdup.32 d0, r12 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q1, \i, d0[0] vqadd.s32 \i, \i, \i vqadd.s32 \i, \i, q1 .endr bx lr endfunc .macro identity_8x4_shift1 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q2, \i, \c vrshr.s32 q2, q2, #1 vqadd.s32 \i, \i, q2 .endr .endm .macro identity_8x4 c .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vqrdmulh.s32 q2, \i, \c vqadd.s32 \i, \i, \i vqadd.s32 \i, \i, q2 .endr .endm .macro def_horz_16 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x2_neon push {lr} vmov.i32 d7, #0 .if \scale mov_const r12, 2896*8*(1<<16) vdup.32 d1, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr .if 
\scale scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif blx r4 vqrshrn.s32 d16, q8, #\shift vqrshrn.s32 d17, q9, #\shift vqrshrn.s32 d18, q10, #\shift vqrshrn.s32 d19, q11, #\shift vqrshrn.s32 d20, q12, #\shift vqrshrn.s32 d21, q13, #\shift vqrshrn.s32 d22, q14, #\shift vqrshrn.s32 d23, q15, #\shift vuzp.16 q8, q9 vuzp.16 q10, q11 .irp i, q8, q10, q9, q11 vst1.16 {\i}, [r6, :128]! .endr pop {pc} endfunc .endm def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_4x16_neon push {lr} .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr blx r5 load_add_store_4x16 r6, r7 pop {pc} endfunc function inv_txfm_add_16x16_neon sub_sp_align 512 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, sp, #(\i*16*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 14 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #16*4 bl inv_txfm_horz_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc const eob_16x16 .short 3, 10, 21, 36, 55, 78, 105, 256 endconst const eob_16x16_identity .short 2, 4, 6, 8, 10, 12, 14, 256 endconst .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif push {r4-r11,lr} vpush {q4-q7} movrel_local r4, inv_\txfm1\()_2s_x16_neon movrel r5, X(inv_\txfm2\()_4h_x16_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel_local r10, eob_16x16 .else movrel_local r10, eob_16x16_identity .endif .else .ifc \txfm2, identity movrel_local r10, eob_16x16_identity .else movrel_local r10, eob_16x16 .endif .endif b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct def_fn_16x16 identity, identity def_fn_16x16 dct, adst def_fn_16x16 dct, flipadst def_fn_16x16 dct, identity def_fn_16x16 adst, dct def_fn_16x16 adst, adst def_fn_16x16 adst, flipadst def_fn_16x16 flipadst, dct def_fn_16x16 flipadst, adst def_fn_16x16 flipadst, flipadst def_fn_16x16 identity, dct function inv_txfm_add_16x4_neon cmp r3, r10 mov r11, #16 blt 1f add r6, r2, #8 vmov.i32 d4, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r6, :64] vst1.32 {d4}, [r6, :64], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 vqrshrn.s32 d20, q12, #1 vqrshrn.s32 d21, q13, #1 vqrshrn.s32 d22, q14, #1 vqrshrn.s32 d23, q15, #1 vuzp.16 q8, q9 mov r6, sp vuzp.16 q10, q11 vpush {q8-q11} b 2f 1: vmov.i16 q8, #0 vmov.i16 q9, #0 mov r6, sp vpush {q8-q9} vpush {q8-q9} 2: vmov.i32 d4, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r2, :64] vst1.32 {d4}, [r2, :64], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 vqrshrn.s32 d20, q12, #1 vqrshrn.s32 d21, q13, #1 vqrshrn.s32 d22, q14, #1 vqrshrn.s32 d23, q15, #1 vuzp.16 q8, q9 mov r6, sp vuzp.16 q10, q11 vmov q12, q10 vmov q13, q11 vpop {q10-q11} blx r5 mov r6, r0 load_add_store_8x4 r6, r7 vpop {q10-q11} vmov q8, q12 vmov q9, q13 blx r5 add r6, r0, #16 load_add_store_8x4 r6, r7 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_4x16_neon ldrh r9, [r10, #4] mov r11, 
#64 cmp r3, r9 ldrh r9, [r10, #2] blt 1f add r6, r2, #48 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 .endr blx r4 vqrshrn.s32 d28, q8, #1 vqrshrn.s32 d29, q9, #1 vqrshrn.s32 d30, q10, #1 vqrshrn.s32 d31, q11, #1 transpose_4x4h q14, q15, d28, d29, d30, d31 b 2f 1: vmov.i16 q14, #0 vmov.i16 q15, #0 2: cmp r3, r9 ldrh r9, [r10] blt 1f add r6, r2, #32 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 .endr blx r4 vqrshrn.s32 d24, q8, #1 vqrshrn.s32 d25, q9, #1 vqrshrn.s32 d26, q10, #1 vqrshrn.s32 d27, q11, #1 transpose_4x4h q12, q13, d24, d25, d26, d27 b 2f 1: vmov.i16 q12, #0 vmov.i16 q13, #0 2: cmp r3, r9 blt 1f add r6, r2, #16 vmov.i32 q2, #0 .irp i, q8, q9, q10, q11 vld1.32 {\i}, [r6, :128] vst1.32 {q2}, [r6, :128], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 transpose_4x4h q8, q9, d16, d17, d18, d19 b 2f 1: vmov.i16 q8, #0 vmov.i16 q9, #0 2: vmov.i16 q2, #0 vpush {q8-q9} .irp i, q8, q9, q10, q11 vld1.16 {\i}, [r2, :128] vst1.16 {q2}, [r2, :128], r11 .endr blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 transpose_4x4h q8, q9, d16, d17, d18, d19 vpop {q10-q11} blx r5 load_add_store_4x16 r0, r6 vpop {q4-q7} pop {r4-r11,pc} endfunc const eob_4x16 .short 13, 29, 45, 64 endconst const eob_4x16_identity1 .short 16, 32, 48, 64 endconst const eob_4x16_identity2 .short 4, 8, 12, 64 endconst .macro def_fn_416 w, h, txfm1, txfm2, eob_16x4 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif push {r4-r11,lr} vpush {q4-q7} .if \w == 4 movrel_local r4, inv_\txfm1\()_4s_x\w\()_neon movrel r5, X(inv_\txfm2\()_4h_x\h\()_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel_local r10, eob_4x16 .else movrel_local r10, eob_4x16_identity1 .endif .else .ifc \txfm2, identity movrel_local r10, eob_4x16_identity2 .else movrel_local r10, eob_4x16 .endif .endif .else mov r10, #\eob_16x4 movrel_local r4, inv_\txfm1\()_2s_x\w\()_neon movrel r5, X(inv_\txfm2\()_8h_x\h\()_neon) .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct, 3 def_fn_416 \w, \h, identity, identity, 3 def_fn_416 \w, \h, dct, adst, 3 def_fn_416 \w, \h, dct, flipadst, 3 def_fn_416 \w, \h, dct, identity, 2 def_fn_416 \w, \h, adst, dct, 3 def_fn_416 \w, \h, adst, adst, 3 def_fn_416 \w, \h, adst, flipadst, 3 def_fn_416 \w, \h, flipadst, dct, 3 def_fn_416 \w, \h, flipadst, adst, 3 def_fn_416 \w, \h, flipadst, flipadst, 3 def_fn_416 \w, \h, identity, dct, 2 def_fn_416 \w, \h, adst, identity, 2 def_fn_416 \w, \h, flipadst, identity, 2 def_fn_416 \w, \h, identity, adst, 2 def_fn_416 \w, \h, identity, flipadst, 2 .endm def_fns_416 4, 16 def_fns_416 16, 4 function inv_txfm_add_16x8_neon sub_sp_align 256 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6 add r6, sp, #(\i*16*2) .if \i > 0 mov r8, #(8 - \i) cmp r3, r11 blt 1f .if \i < 6 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #8*4 bl inv_txfm_horz_scale_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 8 add r7, sp, #(\i*2) mov r8, #32 .irp j, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\j}, [r7, :128], r8 .endr blx r5 add r6, r0, #(\i*2) load_add_store_8x8 r6, r7 .endr add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_8x16_neon add r10, r10, #2 sub_sp_align 256 ldrh r11, [r10], #4 .irp i, 0, 4, 8, 12 add r6, sp, #(\i*8*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 12 ldrh r11, [r10], #4 .endif .endif add r7, r2, #(\i*4) mov r8, #16*4 mov_const r12, 2896*8*(1<<16) vmov.i32 q2, #0 vdup.32 d0, r12 .irp j, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\j}, [r7, :128] vst1.32 {q2}, [r7, :128], r8 .endr scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 blx r4 vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q9, #1 vqrshrn.s32 d18, q10, #1 vqrshrn.s32 d19, q11, #1 vqrshrn.s32 d20, q12, #1 vqrshrn.s32 d21, q13, #1 vqrshrn.s32 d22, q14, #1 vqrshrn.s32 d23, q15, #1 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 .irp j, d16, d20, d17, d21, d18, d22, d19, d23 vst1.16 {\j}, [r6, :64]! .endr .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #4 .rept 2 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #16 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 256 vpop {q4-q7} pop {r4-r11,pc} endfunc const eob_8x16 .short 3, 10, 21, 43, 59, 75, 91, 128 endconst const eob_8x16_identity1 .short 2, 4, 6, 64, 80, 96, 112, 128 endconst const eob_8x16_identity2 .short 2, 4, 6, 8, 10, 12, 14, 128 endconst .macro def_fn_816 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif push {r4-r11,lr} vpush {q4-q7} .if \w == 8 movrel_local r4, inv_\txfm1\()_4s_x8_neon movrel r5, X(inv_\txfm2\()_4h_x16_neon) .else movrel_local r4, inv_\txfm1\()_2s_x16_neon movrel r5, X(inv_\txfm2\()_8h_x8_neon) .endif .ifc \txfm1, identity .ifc \txfm2, identity movrel_local r10, eob_8x16 .else movrel_local r10, eob_8x16_identity1 .endif .else .ifc \txfm2, identity movrel_local r10, eob_8x16_identity2 .else movrel_local r10, eob_8x16 .endif .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct def_fn_816 \w, \h, identity, identity def_fn_816 \w, \h, dct, adst def_fn_816 \w, \h, dct, flipadst def_fn_816 \w, \h, dct, identity def_fn_816 \w, \h, adst, dct def_fn_816 \w, \h, adst, adst def_fn_816 \w, \h, adst, flipadst def_fn_816 \w, \h, flipadst, dct def_fn_816 \w, \h, flipadst, adst def_fn_816 \w, \h, flipadst, flipadst def_fn_816 \w, \h, identity, dct def_fn_816 \w, \h, adst, identity def_fn_816 \w, \h, flipadst, identity def_fn_816 \w, \h, identity, adst def_fn_816 \w, \h, identity, flipadst .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_2s_x16_neon movrel_local r12, idct_coeffs, 4*16 vld1.32 {q0, q1}, [r12, :128]! 
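        // First stage of the odd half of the 32-point inverse DCT, two columns
        // at a time: each input pair (d16/d31, d24/d23, ...) is rotated by a
        // cosine pair from the idct_coeffs just loaded into q0/q1, then
        // rounding-shifted right by 12.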
vmul_vmls d4, d16, d31, d0[0], d0[1] // -> t16a vmul_vmla d6, d16, d31, d0[1], d0[0] // -> t31a vmul_vmls d8, d24, d23, d1[0], d1[1] // -> t17a vrshr.s32 d16, d4, #12 // t16a vrshr.s32 d31, d6, #12 // t31a vmul_vmla d4, d24, d23, d1[1], d1[0] // -> t30a vmul_vmls d6, d20, d27, d2[0], d2[1] // -> t18a vrshr.s32 d24, d8, #12 // t17a vrshr.s32 d23, d4, #12 // t30a vmul_vmla d8, d20, d27, d2[1], d2[0] // -> t29a vmul_vmls d4, d28, d19, d3[0], d3[1] // -> t19a vrshr.s32 d20, d6, #12 // t18a vrshr.s32 d27, d8, #12 // t29a vmul_vmla d6, d28, d19, d3[1], d3[0] // -> t28a vld1.32 {q0, q1}, [r12, :128] sub r12, r12, #4*24 vmul_vmls d8, d18, d29, d0[0], d0[1] // -> t20a vrshr.s32 d28, d4, #12 // t19a vrshr.s32 d19, d6, #12 // t28a vmul_vmla d4, d18, d29, d0[1], d0[0] // -> t27a vmul_vmls d6, d26, d21, d1[0], d1[1] // -> t21a vrshr.s32 d18, d8, #12 // t20a vrshr.s32 d29, d4, #12 // t27a vmul_vmla d8, d26, d21, d1[1], d1[0] // -> t26a vmul_vmls d4, d22, d25, d2[0], d2[1] // -> t22a vrshr.s32 d26, d6, #12 // t21a vrshr.s32 d21, d8, #12 // t26a vmul_vmla d6, d22, d25, d2[1], d2[0] // -> t25a vmul_vmls d8, d30, d17, d3[0], d3[1] // -> t23a vrshr.s32 d22, d4, #12 // t22a vrshr.s32 d25, d6, #12 // t25a vmul_vmla d4, d30, d17, d3[1], d3[0] // -> t24a vrshr.s32 d30, d8, #12 // t23a vrshr.s32 d17, d4, #12 // t24a vld1.32 {q0, q1}, [r12, :128] vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 vqsub.s32 d5, d16, d24 // t17 vqadd.s32 d16, d16, d24 // t16 vqsub.s32 d7, d31, d23 // t30 vqadd.s32 d31, d31, d23 // t31 vqsub.s32 d24, d28, d20 // t18 vqadd.s32 d28, d28, d20 // t19 vqadd.s32 d23, d18, d26 // t20 vqsub.s32 d18, d18, d26 // t21 vqsub.s32 d20, d30, d22 // t22 vqadd.s32 d30, d30, d22 // t23 vqadd.s32 d26, d17, d25 // t24 vqsub.s32 d17, d17, d25 // t25 vqsub.s32 d22, d29, d21 // t26 vqadd.s32 d29, d29, d21 // t27 vqadd.s32 d25, d19, d27 // t28 vqsub.s32 d19, d19, d27 // t29 .irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 vmin.s32 \r, \r, d11 .endr .irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a vrshr.s32 d21, d4, #12 // t17a vrshr.s32 d27, d6, #12 // t30a vneg.s32 d8, d8 // -> t18a vmul_vmls d5, d19, d24, d2[0], d2[1] // -> t29a vmul_vmls d4, d22, d18, d3[0], d3[1] // -> t21a vrshr.s32 d19, d8, #12 // t18a vrshr.s32 d24, d5, #12 // t29a vmul_vmla d6, d22, d18, d3[1], d3[0] // -> t26a vmul_vmla d8, d17, d20, d3[1], d3[0] // -> t22a vrshr.s32 d22, d4, #12 // t21a vrshr.s32 d18, d6, #12 // t26a vneg.s32 d8, d8 // -> t22a vmul_vmls d5, d17, d20, d3[0], d3[1] // -> t25a vrshr.s32 d17, d8, #12 // t22a vrshr.s32 d20, d5, #12 // t25a vqsub.s32 d2, d27, d24 // t29 vqadd.s32 d27, d27, d24 // t30 vqsub.s32 d3, d21, d19 // t18 vqadd.s32 d21, d21, d19 // t17 vqsub.s32 d24, d16, d28 // t19a vqadd.s32 d16, d16, d28 // t16a vqsub.s32 d19, d30, d23 // t20a vqadd.s32 d30, d30, d23 // t23a vqsub.s32 d28, d17, d22 // t21 vqadd.s32 d17, d17, d22 // t22 vqadd.s32 d23, d26, d29 // t24a vqsub.s32 d26, d26, d29 // t27a vqadd.s32 d22, d20, d18 // t25 vqsub.s32 d20, d20, d18 // t26 vqsub.s32 d29, d31, d25 // t28a vqadd.s32 d31, d31, d25 // t31a .irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31 vmin.s32 \r, \r, d11 .endr .irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, 
d26, d22, d20, d29, d31 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19 vrshr.s32 d18, d4, #12 // t18a vrshr.s32 d25, d6, #12 // t29a vmul_vmla d5, d29, d24, d1[1], d1[0] // -> t28 vmul_vmla d4, d26, d19, d1[1], d1[0] // -> t20 vrshr.s32 d29, d8, #12 // t19 vrshr.s32 d24, d5, #12 // t28 vneg.s32 d4, d4 // -> t20 vmul_vmls d6, d26, d19, d1[0], d1[1] // -> t27 vmul_vmla d8, d20, d28, d1[1], d1[0] // -> t21a vrshr.s32 d26, d4, #12 // t20 vrshr.s32 d19, d6, #12 // t27 vneg.s32 d8, d8 // -> t21a vmul_vmls d5, d20, d28, d1[0], d1[1] // -> t26a vrshr.s32 d20, d8, #12 // t21a vrshr.s32 d28, d5, #12 // t26a vqsub.s32 d2, d16, d30 // t23 vqadd.s32 d16, d16, d30 // t16 = out16 vqsub.s32 d3, d31, d23 // t24 vqadd.s32 d31, d31, d23 // t31 = out31 vqsub.s32 d23, d21, d17 // t22a vqadd.s32 d17, d21, d17 // t17a = out17 vqadd.s32 d30, d27, d22 // t30a = out30 vqsub.s32 d21, d27, d22 // t25a vqsub.s32 d27, d18, d20 // t21 vqadd.s32 d18, d18, d20 // t18 = out18 vqadd.s32 d4, d29, d26 // t19a = out19 vqsub.s32 d26, d29, d26 // t20a vqadd.s32 d29, d25, d28 // t29 = out29 vqsub.s32 d25, d25, d28 // t26 vqadd.s32 d28, d24, d19 // t28a = out28 vqsub.s32 d24, d24, d19 // t27a vmov d19, d4 // out19 .irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 vmin.s32 \r, \r, d11 .endr .irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24 vmax.s32 \r, \r, d10 .endr vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20 vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27 vrshr.s32 d20, d4, #12 // t20 vrshr.s32 d22, d6, #12 // t27 vmul_vmla d4, d25, d27, d0[0], d0[0] // -> t26a vmul_vmls d6, d25, d27, d0[0], d0[0] // -> t21a vmov d27, d22 // t27 vrshr.s32 d26, d4, #12 // t26a vmul_vmls d24, d21, d23, d0[0], d0[0] // -> t22 vmul_vmla d4, d21, d23, d0[0], d0[0] // -> t25 vrshr.s32 d21, d6, #12 // t21a vrshr.s32 d22, d24, #12 // t22 vrshr.s32 d25, d4, #12 // t25 vmul_vmls d4, d3, d2, d0[0], d0[0] // -> t23a vmul_vmla d6, d3, d2, d0[0], d0[0] // -> t24a vrshr.s32 d23, d4, #12 // t23a vrshr.s32 d24, d6, #12 // t24a bx lr endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x2_neon push {lr} vmov.i32 d7, #0 lsl r8, r8, #1 .if \scale mov_const r12, 2896*8*(1<<16) vdup.32 d0, r12 .endif .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .if \scale scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct_2s_x16_neon // idct_16 leaves the row_clip_max/min constants in d9 and d8, // but here we want to use full q registers for clipping. vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmin.s32 \r, \r, q3 .endr .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmax.s32 \r, \r, q2 .endr vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 vtrn.32 d22, d23 vtrn.32 d24, d25 vtrn.32 d26, d27 vtrn.32 d28, d29 vtrn.32 d30, d31 .macro store1 r0, r1, r2, r3 vst1.16 {\r0}, [r6, :64]! vst1.16 {\r1}, [r6, :64]! vst1.16 {\r2}, [r6, :64]! vst1.16 {\r3}, [r6, :64]! 
.endm store1 d16, d18, d20, d22 store1 d24, d26, d28, d30 store1 d17, d19, d21, d23 store1 d25, d27, d29, d31 .purgem store1 sub r6, r6, #64*2 vmov.i32 d7, #0 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.32 {\i}, [r7, :64] vst1.32 {d7}, [r7, :64], r8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in d0[1] scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 .endif bl inv_dct32_odd_2s_x16_neon vtrn.32 d31, d30 vtrn.32 d29, d28 vtrn.32 d27, d26 vtrn.32 d25, d24 vtrn.32 d23, d22 vtrn.32 d21, d20 vtrn.32 d19, d18 vtrn.32 d17, d16 .macro store2 r0, r1, r2, r3, r4, r5, r6, r7, shift vld1.32 {q0, q1}, [r6, :128]! vld1.32 {q2, q3}, [r6, :128] sub r6, r6, #32 vqsub.s32 d15, d0, \r0 vqadd.s32 d0, d0, \r0 vqsub.s32 d14, d1, \r1 vqadd.s32 d1, d1, \r1 vqsub.s32 d13, d2, \r2 vqadd.s32 d2, d2, \r2 vqsub.s32 d12, d3, \r3 vqadd.s32 d3, d3, \r3 vqsub.s32 d11, d4, \r4 vqadd.s32 d4, d4, \r4 vqsub.s32 d10, d5, \r5 vqadd.s32 d5, d5, \r5 vqsub.s32 d9, d6, \r6 vqadd.s32 d6, d6, \r6 vqsub.s32 d8, d7, \r7 vqadd.s32 d7, d7, \r7 vqrshrn.s32 d0, q0, #\shift vqrshrn.s32 d1, q1, #\shift vqrshrn.s32 d2, q2, #\shift vqrshrn.s32 d3, q3, #\shift vqrshrn.s32 d4, q4, #\shift vqrshrn.s32 d5, q5, #\shift vqrshrn.s32 d6, q6, #\shift vqrshrn.s32 d7, q7, #\shift vrev32.16 q2, q2 vrev32.16 q3, q3 vst1.16 {q0, q1}, [r6, :128]! vst1.16 {q2, q3}, [r6, :128]! .endm store2 d31, d29, d27, d25, d23, d21, d19, d17, \shift store2 d30, d28, d26, d24, d22, d20, d18, d16, \shift .purgem store2 pop {pc} endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_4x32_neon push {r10-r11,lr} lsl r8, r8, #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 bl X(inv_dct_4h_x16_neon) .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vst1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 add r7, r7, r8, lsr #1 .irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 vld1.16 {\i}, [r7, :64], r8 .endr sub r7, r7, r8, lsl #4 sub r7, r7, r8, lsr #1 bl X(inv_dct32_odd_4h_x16_neon) neg r9, r8 mov r10, r6 vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff .macro combine r0, r1, r2, r3, op, stride vld1.16 {d4}, [r7, :64], \stride vld1.16 {d0}, [r10, :64], r1 vld1.16 {d5}, [r7, :64], \stride vld1.16 {d1}, [r10, :64], r1 \op\().s16 d4, d4, \r0 vld1.16 {d6}, [r7, :64], \stride vld1.16 {d2}, [r10, :64], r1 \op\().s16 d5, d5, \r1 vld1.16 {d3}, [r10, :64], r1 vrshr.s16 q2, q2, #4 \op\().s16 d6, d6, \r2 vld1.16 {d7}, [r7, :64], \stride vqadd.s16 q0, q0, q2 \op\().s16 d7, d7, \r3 vmax.s16 q0, q0, q6 vrshr.s16 q3, q3, #4 vmin.s16 q0, q0, q7 vqadd.s16 q1, q1, q3 vst1.16 {d0}, [r6, :64], r1 vmax.s16 q1, q1, q6 vst1.16 {d1}, [r6, :64], r1 vmin.s16 q1, q1, q7 vst1.16 {d2}, [r6, :64], r1 vst1.16 {d3}, [r6, :64], r1 .endm combine d31, d30, d29, d28, vqadd, r8 combine d27, d26, d25, d24, vqadd, r8 combine d23, d22, d21, d20, vqadd, r8 combine d19, d18, d17, d16, vqadd, r8 sub r7, r7, r8 combine d16, d17, d18, d19, vqsub, r9 combine d20, d21, d22, d23, vqsub, r9 combine d24, d25, d26, d27, vqsub, r9 combine d28, d29, d30, d31, vqsub, r9 .purgem combine pop {r10-r11,pc} endfunc const eob_32x32 .short 3, 10, 21, 36, 55, 78, 105, 136, 171, 210, 253, 300, 351, 406, 465, 1024 endconst const eob_16x32 .short 3, 10, 21, 36, 55, 78, 105, 151, 183, 215, 247, 279, 311, 343, 375, 512 
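        // (These eob_* tables are thresholds: while looping over the row strips
        // of the first pass, the eob is compared against the next entry, and
        // once it falls below that entry the remaining strips are zero-filled
        // instead of being transformed.)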
endconst const eob_16x32_shortside .short 3, 10, 21, 36, 55, 78, 105, 512 endconst const eob_8x32 .short 3, 10, 21, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 256 endconst function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 push {r4-r7,lr} vpush {q6-q7} movrel_local r5, eob_32x32, 2 mov r6, #4*32 1: mov r12, #0 movrel_local r4, eob_32x32, 6 2: vmov.i32 q0, #0 add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r6 .endr vqmovn.s32 d16, q8 vqmovn.s32 d17, q12 vqmovn.s32 d18, q9 vqmovn.s32 d19, q13 vqmovn.s32 d20, q10 vqmovn.s32 d21, q14 vqmovn.s32 d22, q11 vqmovn.s32 d23, q15 transpose_4x8h q8, q9, q10, q11 load_add_store_8x4 r0, r7, shiftbits=2 ldrh lr, [r4], #8 sub r0, r0, r1, lsl #2 cmp r3, lr add r0, r0, #2*8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12, lsl #1 add r0, r0, r1, lsl #2 mls r2, r6, r12, r2 add r2, r2, #4*4 b 1b 9: vpop {q6-q7} pop {r4-r7,pc} endfunc .macro shift_8_regs op, shift .irp i, q8, q9, q10, q11, q12, q13, q14, q15 \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 push {r4-r9,lr} vpush {q6-q7} mov r9, #0 mov_const r8, 2896*8*(1<<16) movt r9, #2*(5793-4096)*8 movrel_local r5, eob_16x32\hshort, 2 mov r6, #4*\h 1: mov r12, #0 movrel_local r4, eob_16x32\wshort, 6 2: vdup.i32 d0, r8 vmov.i32 q1, #0 vmov.32 d0[1], r9 add r12, r12, #8 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q1}, [r2, :128], r6 .endr scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 .if \w == 16 // 16x32 identity_8x4_shift1 d0[1] .else // 32x16 shift_8_regs vqshl.s32, 1 identity_8x4 d0[1] .endif vqmovn.s32 d16, q8 vqmovn.s32 d17, q12 vqmovn.s32 d18, q9 vqmovn.s32 d19, q13 vqmovn.s32 d20, q10 vqmovn.s32 d21, q14 vqmovn.s32 d22, q11 vqmovn.s32 d23, q15 transpose_4x8h q8, q9, q10, q11 .if \w == 16 load_add_store_8x4 r0, r7, shiftbits=2 .else load_add_store_8x4 r0, r7, shiftbits=4 .endif ldrh lr, [r4], #8 sub r0, r0, r1, lsl #2 cmp r3, lr add r0, r0, #2*8 bge 2b ldrh lr, [r5], #4 cmp r3, lr blt 9f sub r0, r0, r12, lsl #1 add r0, r0, r1, lsl #2 mls r2, r6, r12, r2 add r2, r2, #4*4 b 1b 9: vpop {q6-q7} pop {r4-r9,pc} endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 push {r4-r5,lr} vpush {q6-q7} movrel_local r4, eob_8x32, 2 mov r12, #4*\h 1: ldrh lr, [r4], #4 .if \w == 8 vmov.i32 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r12 .endr vqrshrn.s32 d16, q8, #1 vqrshrn.s32 d17, q12, #1 vqrshrn.s32 d18, q9, #1 vqrshrn.s32 d19, q13, #1 vqrshrn.s32 d20, q10, #1 vqrshrn.s32 d21, q14, #1 vqrshrn.s32 d22, q11, #1 vqrshrn.s32 d23, q15, #1 transpose_4x8h q8, q9, q10, q11 cmp r3, lr load_add_store_8x4 r0, r5, shiftbits=2 blt 9f sub r2, r2, r12, lsl #3 add r2, r2, #4*4 .else vmov.i32 q0, #0 vmov.i32 q1, #0 vld1.32 {q8, q9}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q10, q11}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q12, q13}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vld1.32 {q14, q15}, [r2, :128] vst1.32 {q0, q1}, [r2, :128], r12 vqmovn.s32 d16, q8 vqmovn.s32 d17, q10 vqmovn.s32 d20, q9 vqmovn.s32 d21, q11 vqmovn.s32 d18, q12 vqmovn.s32 d19, q14 vqmovn.s32 d22, q13 vqmovn.s32 d23, q15 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, 
d21, d22, d23 cmp r3, lr load_add_store_4x8 r0, r5, shiftbits=3 blt 9f sub r0, r0, r1, lsl #3 add r0, r0, #2*4 .endif b 1b 9: vpop {q6-q7} pop {r4-r5,pc} endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 idct_dc 32, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 2048 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, sp, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 2048 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 idct_dc 16, 32, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_2s_x16_neon .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, sp, #(\i*16*2) add r7, r2, #(\i*4) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif mov r8, #4*32 bl inv_txfm_horz_scale_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #16*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 1024 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 idct_dc 32, 16, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 1024 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel r5, X(inv_dct_4h_x16_neon) .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, sp, #(\i*32*2) add r7, r2, #(\i*4) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .if \i < 14 ldrh r11, [r10], #2 .endif .endif mov r8, #4*16 bl inv_txfm_horz_scale_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #32*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 1024 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 idct_dc 8, 32, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 512 movrel_local r10, eob_8x32, 2 mov r8, #4*32 mov r9, #32 mov r6, sp 1: vmov.i32 q0, #0 .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.32 {\i}, [r2, :128] vst1.32 {q0}, [r2, :128], r8 .endr ldrh r11, [r10], #4 sub r2, r2, r8, lsl #3 sub r9, r9, #4 add r2, r2, #4*4 bl inv_dct_4s_x8_neon vqrshrn.s32 d16, q8, #2 vqrshrn.s32 d18, q9, #2 vqrshrn.s32 d20, q10, #2 vqrshrn.s32 d22, q11, #2 vqrshrn.s32 d17, q12, #2 vqrshrn.s32 d19, q13, #2 vqrshrn.s32 d21, q14, #2 vqrshrn.s32 d23, q15, #2 transpose_4x8h q8, q9, q10, q11 vst1.16 {q8, q9}, [r6, :128]! cmp r3, r11 vst1.16 {q10, q11}, [r6, :128]! bge 1b cmp r9, #0 beq 3f vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r9, r9, #4 .rept 2 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4 add r6, r0, #(\i*2) add r7, sp, #(\i*2) mov r8, #8*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 idct_dc 32, 8, 2 push {r4-r11,lr} vpush {q4-q7} movrel_local r10, eob_8x32 sub_sp_align 512 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6 add r6, sp, #(\i*32*2) add r7, r2, #(\i*4) .if \i > 0 cmp r3, r11 mov r8, #(8 - \i) blt 1f .if \i < 6 ldrh r11, [r10], #2 .endif .endif mov r8, #8*4 bl inv_txfm_horz_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: mov r8, #2*32 mov r9, #0 1: add r6, r0, r9, lsl #1 add r7, sp, r9, lsl #1 // #(\i*2) .irp i, q8, q9, q10, q11, q12, q13, q14, q15 vld1.16 {\i}, [r7, :128], r8 .endr add r9, r9, #8 bl X(inv_dct_8h_x8_neon) cmp r9, #32 load_add_store_8x8 r6, r7 blt 1b add_sp_align 512 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vld1.32 {q0, q1}, [r12, :128]! vqrdmulh.s32 d23, d16, d0[1] // t63a vqrdmulh.s32 d16, d16, d0[0] // t32a vqrdmulh.s32 d22, d17, d1[0] // t62a vqrdmulh.s32 d17, d17, d1[1] // t33a vqrdmulh.s32 d21, d18, d2[1] // t61a vqrdmulh.s32 d18, d18, d2[0] // t34a vqrdmulh.s32 d20, d19, d3[0] // t60a vqrdmulh.s32 d19, d19, d3[1] // t35a vld1.32 {q0}, [r12, :128]! vqadd.s32 d24, d16, d17 // t32 vqsub.s32 d25, d16, d17 // t33 vqsub.s32 d26, d19, d18 // t34 vqadd.s32 d27, d19, d18 // t35 vqadd.s32 d28, d20, d21 // t60 vqsub.s32 d29, d20, d21 // t61 vqsub.s32 d30, d23, d22 // t62 vqadd.s32 d31, d23, d22 // t63 .irp r, q12, q13, q14, q15 vmin.s32 \r, \r, q5 .endr .irp r, q12, q13, q14, q15 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a vneg.s32 d4, d4 // t34a vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a vrshr.s32 d26, d4, #12 // t34a vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a vrshr.s32 d29, d6, #12 // t61a vrshr.s32 d25, d7, #12 // t33a vrshr.s32 d30, d4, #12 // t62a vqadd.s32 d16, d24, d27 // t32a vqsub.s32 d19, d24, d27 // t35a vqadd.s32 d17, d25, d26 // t33 vqsub.s32 d18, d25, d26 // t34 vqsub.s32 d20, d31, d28 // t60a vqadd.s32 d23, d31, d28 // t63a vqsub.s32 d21, d30, d29 // t61 vqadd.s32 d22, d30, d29 // t62 .irp r, q8, q9, q10, q11 vmin.s32 \r, \r, q5 .endr .irp r, q8, q9, q10, q11 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60 vrshr.s32 d21, d4, #12 // t61a vrshr.s32 d18, d6, #12 // t34a vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35 vrshr.s32 d20, d7, #12 // t60 vrshr.s32 d19, d4, #12 // t35 vst1.32 {d16, d17, d18, d19}, [r6, :128]! vst1.32 {d20, d21, d22, d23}, [r6, :128]! 
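        // One group of eight t32..t63 intermediates (for 2 columns) is now in
        // the scratch buffer; inv_dct64_step2_neon cross-combines the four
        // groups written by successive calls.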
bx lr endfunc function inv_dct64_step2_neon movrel_local r12, idct_coeffs vld1.32 {q0}, [r12, :128] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a vldr d16, [r6, #4*2*0] // t32a vldr d17, [r9, #4*2*8] // t39a vldr d18, [r9, #4*2*0] // t63a vldr d19, [r6, #4*2*8] // t56a vldr d20, [r6, #4*2*16] // t40a vldr d21, [r9, #4*2*24] // t47a vldr d22, [r9, #4*2*16] // t55a vldr d23, [r6, #4*2*24] // t48a vqadd.s32 d24, d16, d17 // t32 vqsub.s32 d25, d16, d17 // t39 vqadd.s32 d26, d18, d19 // t63 vqsub.s32 d27, d18, d19 // t56 vqsub.s32 d28, d21, d20 // t40 vqadd.s32 d29, d21, d20 // t47 vqadd.s32 d30, d23, d22 // t48 vqsub.s32 d31, d23, d22 // t55 .irp r, q12, q13, q14, q15 vmin.s32 \r, \r, q5 .endr .irp r, q12, q13, q14, q15 vmax.s32 \r, \r, q4 .endr vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a vrshr.s32 d25, d4, #12 // t56a vrshr.s32 d27, d6, #12 // t39a vneg.s32 d7, d7 // t40a vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a vrshr.s32 d31, d7, #12 // t40a vrshr.s32 d28, d4, #12 // t55a vqadd.s32 d16, d24, d29 // t32a vqsub.s32 d19, d24, d29 // t47a vqadd.s32 d17, d27, d31 // t39 vqsub.s32 d18, d27, d31 // t40 vqsub.s32 d20, d26, d30 // t48a vqadd.s32 d23, d26, d30 // t63a vqsub.s32 d21, d25, d28 // t55 vqadd.s32 d22, d25, d28 // t56 .irp r, q8, q9, q10, q11 vmin.s32 \r, \r, q5 .endr .irp r, q8, q9, q10, q11 vmax.s32 \r, \r, q4 .endr vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47 vrshr.s32 d18, d4, #12 // t40a vrshr.s32 d21, d6, #12 // t55a vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48 vrshr.s32 d19, d7, #12 // t47 vrshr.s32 d20, d4, #12 // t48 vstr d16, [r6, #4*2*0] // t32a vstr d17, [r9, #4*2*0] // t39 vstr d18, [r6, #4*2*8] // t40a vstr d19, [r9, #4*2*8] // t47 vstr d20, [r6, #4*2*16] // t48 vstr d21, [r9, #4*2*16] // t55a vstr d22, [r6, #4*2*24] // t56 vstr d23, [r9, #4*2*24] // t63a add r6, r6, #4*2 sub r9, r9, #4*2 cmp r6, r9 blt 1b bx lr endfunc .macro load8 src, strd, zero, clear .irp i, d16, d17, d18, d19, d20, d21, d22, d23 .if \clear vld1.32 {\i}, [\src, :64] vst1.32 {\zero}, [\src, :64], \strd .else vld1.32 {\i}, [\src, :64], \strd .endif .endr .endm .macro store16 dst vst1.32 {q8, q9}, [\dst, :128]! vst1.32 {q10, q11}, [\dst, :128]! vst1.32 {q12, q13}, [\dst, :128]! vst1.32 {q14, q15}, [\dst, :128]! .endm .macro clear_upper8 .irp i, q12, q13, q14, q15 vmov.i32 \i, #0 .endr .endm .macro vmov_if reg, val, cond .if \cond vmov.i32 \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond mov_const \gpr, \val vdup.32 \reg, \gpr .endif .endm .macro vst1_if regs, dst, dstalign, cond .if \cond vst1.32 \regs, \dst, \dstalign .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_2s_x64_neon mov r6, sp push {r10-r11,lr} lsl r8, r8, #2 movdup_if d0, r12, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 add r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct_2s_x16_neon // idct_16 leaves the row_clip_max/min constants in d9 and d8, // but here we want to use full q registers for clipping. 
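        // The resulting clip range is [-0x20000, 0x1ffff], i.e. the row
        // intermediates are kept within a signed 18-bit range.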
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmin.s32 \r, \r, q3 .endr .irp r, q8, q9, q10, q11, q12, q13, q14, q15 vmax.s32 \r, \r, q2 .endr store16 r6 movdup_if d0, r12, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear load8 r7, r8, d7, \clear clear_upper8 sub r7, r7, r8, lsl #3 lsr r8, r8, #1 sub r7, r7, r8, lsr #1 scale_if \scale, d0[0], q8, q9, q10, q11 bl inv_dct32_odd_2s_x16_neon add r10, r6, #8*15 sub r6, r6, #8*16 mov r9, #-8 vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 vld1.32 {d2}, [r6, :64]! vld1.32 {d3}, [r6, :64]! vqadd.s32 d6, d2, \r0 vqsub.s32 \r0, d2, \r0 vld1.32 {d4}, [r6, :64]! vqadd.s32 d7, d3, \r1 vqsub.s32 \r1, d3, \r1 vmin.s32 d6, d6, d1 vmin.s32 \r0, \r0, d1 vld1.32 {d5}, [r6, :64]! vqadd.s32 d2, d4, \r2 sub r6, r6, #8*4 vmax.s32 d6, d6, d0 vmax.s32 \r0, \r0, d0 vqsub.s32 \r2, d4, \r2 vmin.s32 d7, d7, d1 vmin.s32 \r1, \r1, d1 vst1.32 {d6}, [r6, :64]! vst1.32 {\r0}, [r10, :64], r9 vmin.s32 d2, d2, d1 vmin.s32 \r2, \r2, d1 vmax.s32 d7, d7, d0 vmax.s32 \r1, \r1, d0 vqadd.s32 d3, d5, \r3 vqsub.s32 \r3, d5, \r3 vmax.s32 d2, d2, d0 vmax.s32 \r2, \r2, d0 vmin.s32 d3, d3, d1 vmin.s32 \r3, \r3, d1 vst1.32 {d7}, [r6, :64]! vst1.32 {\r1}, [r10, :64], r9 vmax.s32 d3, d3, d0 vmax.s32 \r3, \r3, d0 vst1.32 {d2}, [r6, :64]! vst1.32 {\r2}, [r10, :64], r9 vst1.32 {d3}, [r6, :64]! vst1.32 {\r3}, [r10, :64], r9 .endm store_addsub d31, d30, d29, d28 store_addsub d27, d26, d25, d24 store_addsub d23, d22, d21, d20 store_addsub d19, d18, d17, d16 .purgem store_addsub add r6, r6, #2*4*16 movrel_local r12, idct64_coeffs vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000 movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r9, r7, r8, lsl #4 // offset 16 add r10, r7, r8, lsl #3 // offset 8 sub r9, r9, r8 // offset 15 sub r11, r10, r8 // offset 7 vld1.32 {d16}, [r7, :64] // in1 (offset 0) vld1.32 {d17}, [r9, :64] // in31 (offset 15) vld1.32 {d18}, [r10, :64] // in17 (offset 8) vld1.32 {d19}, [r11, :64] // in15 (offset 7) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear add r7, r7, r8, lsl #2 // offset 4 sub r9, r9, r8, lsl #2 // offset 11 sub r10, r7, r8 // offset 3 add r11, r9, r8 // offset 12 vld1.32 {d16}, [r10, :64] // in7 (offset 3) vld1.32 {d17}, [r11, :64] // in25 (offset 12) vld1.32 {d18}, [r9, :64] // in23 (offset 11) vld1.32 {d19}, [r7, :64] // in9 (offset 4) vst1_if {d7}, [r7, :64], \clear vst1_if {d7}, [r9, :64], \clear vst1_if {d7}, [r10, :64], \clear vst1_if {d7}, [r11, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear sub r10, r10, r8, lsl #1 // offset 1 sub r9, r9, r8, lsl #1 // offset 9 add r10, r10, r8 // offset 2 add r9, r9, r8 // offset 10 add r7, r7, r8 // offset 5 add r11, r11, r8 // offset 13 vld1.32 d16, [r10, :64] // in5 (offset 2) vld1.32 d17, [r11, :64] // in27 (offset 13) vld1.32 d18, [r9, :64] // in21 (offset 10) vld1.32 d19, [r7, :64] // in11 (offset 5) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if 
d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon movdup_if d0, lr, 2896*8*(1<<16), \scale vmov_if d7, #0, \clear sub r10, r10, r8 // offset 1 sub r9, r9, r8 // offset 9 add r11, r11, r8 // offset 14 add r7, r7, r8 // offset 6 vld1.32 d16, [r10, :64] // in3 (offset 1) vld1.32 d17, [r11, :64] // in29 (offset 14) vld1.32 d18, [r9, :64] // in19 (offset 9) vld1.32 d19, [r7, :64] // in13 (offset 6) vst1_if d7, [r10, :64], \clear vst1_if d7, [r11, :64], \clear vst1_if d7, [r9, :64], \clear vst1_if d7, [r7, :64], \clear scale_if \scale, d0[0], q8, q9 bl inv_dct64_step1_neon sub r6, r6, #2*4*32 add r9, r6, #2*4*7 bl inv_dct64_step2_neon pop {r10-r11,pc} endfunc .endm def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x2_neon vdup.32 q4, r9 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, #2*56 push {r10-r11,lr} mov r10, #2*64 mov r11, #-2*4*4 1: vld1.32 {d16, d17, d18, d19}, [r7, :128]! vld1.32 {d28, d29, d30, d31}, [r8, :128], r11 vld1.32 {d20, d21, d22, d23}, [r7, :128]! vld1.32 {d24, d25, d26, d27}, [r8, :128], r11 vtrn.32 d16, d17 vtrn.32 d18, d19 vtrn.32 d20, d21 vtrn.32 d22, d23 vtrn.32 d31, d30 vtrn.32 d29, d28 vtrn.32 d27, d26 vtrn.32 d25, d24 .macro store_addsub src0, src1, src2, src3, src4, src5, src6, src7 vqsub.s32 d7, \src0, \src1 vqsub.s32 d6, \src2, \src3 vqsub.s32 d5, \src4, \src5 vqsub.s32 d4, \src6, \src7 vqadd.s32 d0, \src0, \src1 vqadd.s32 d1, \src2, \src3 vqadd.s32 d2, \src4, \src5 vqadd.s32 d3, \src6, \src7 vrshl.s32 q3, q3, q4 vrshl.s32 q2, q2, q4 vrshl.s32 q0, q0, q4 vrshl.s32 q1, q1, q4 vqmovn.s32 d7, q3 vqmovn.s32 d6, q2 vqmovn.s32 d0, q0 vqmovn.s32 d1, q1 vrev32.16 q3, q3 vst1.16 {q0}, [r6, :128], r10 vst1.16 {q3}, [r9, :128], r10 .endm store_addsub d16, d31, d18, d29, d20, d27, d22, d25 store_addsub d17, d30, d19, d28, d21, d26, d23, d24 .purgem store_addsub sub r6, r6, r10, lsl #1 sub r9, r9, r10, lsl #1 add r6, r6, #16 sub r9, r9, #16 cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_vert_dct_4x64_neon lsl r8, r8, #1 mov r7, sp add r8, sp, #2*4*(64 - 4) add r9, r6, r1, lsl #6 sub r9, r9, r1 push {r10-r11,lr} neg r10, r1 mov r11, #-2*4*4 1: vld1.16 {d16, d17, d18, d19}, [r7, :128]! vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
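        // r7 walks forward from the start of the column buffer while r8 walks
        // backward from its end, so each add_dest_addsub below pairs buffer
        // row i with row 63-i: their sum is added to destination row i and
        // their difference to destination row 63-i.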
vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 vmov.i16 q6, #0 vmvn.i16 q7, #0xfc00 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 vld1.16 {d0}, [r6, :64], r1 vld1.16 {d1}, [r9, :64], r10 vqadd.s16 d4, \src0, \src1 vld1.16 {d2}, [r6, :64] vqsub.s16 d5, \src0, \src1 vld1.16 {d3}, [r9, :64] vqadd.s16 d6, \src2, \src3 vqsub.s16 d7, \src2, \src3 sub r6, r6, r1 sub r9, r9, r10 vrshr.s16 q2, q2, #4 vrshr.s16 q3, q3, #4 vqadd.s16 q2, q2, q0 vqadd.s16 q3, q3, q1 vmax.s16 q2, q2, q6 vmax.s16 q3, q3, q6 vmin.s16 q2, q2, q7 vmin.s16 q3, q3, q7 vst1.16 {d4}, [r6, :64], r1 vst1.16 {d5}, [r9, :64], r10 vst1.16 {d6}, [r6, :64], r1 vst1.16 {d7}, [r9, :64], r10 .endm add_dest_addsub d16, d31, d17, d30 add_dest_addsub d18, d29, d19, d28 add_dest_addsub d20, d27, d21, d26 add_dest_addsub d22, d25, d23, d24 .purgem add_dest_addsub cmp r7, r8 blt 1b pop {r10-r11,pc} endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_dct_clear_2s_x64_neon add r6, r5, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 30 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r7, r5, #(\i*2) mov r8, #64*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 idct_dc 64, 32, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*64*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_dct_clear_scale_2s_x64_neon add r6, r5, #(\i*64*2) mov r9, #-1 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 30 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i*2) add r7, r5, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_dct_4x32_neon .endr add_sp_align 64*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 idct_dc 32, 64, 1 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 32*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_32x32 ldrh r11, [r10], #2 .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*32*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_scale_dct_32x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 4 vst1.16 {q2, q3}, [r6, :128]! 
.endr bgt 2b 3: .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r7, r5, #(\i*2) mov r8, #32*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 32*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 idct_dc 64, 16, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 64*16*2+64*4*2 add r4, sp, #64*4*2 movrel_local r10, eob_16x32 .irp i, 0, 2, 4, 6, 8, 10, 12, 14 add r6, r4, #(\i*64*2) .if \i > 0 mov r8, #(16 - \i) cmp r3, r11 blt 1f .endif add r7, r2, #(\i*4) mov r8, #16*4 bl inv_txfm_dct_clear_2s_x64_neon add r6, r4, #(\i*64*2) mov r9, #-2 // shift bl inv_txfm_horz_dct_64x2_neon .if \i < 8 ldrh r11, [r10], #2 .endif .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 8 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: movrel r5, X(inv_dct_4h_x16_neon) .irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 add r6, r0, #(\i*2) add r7, r4, #(\i*2) mov r8, #64*2 bl inv_txfm_add_vert_4x16_neon .endr add_sp_align 64*16*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 idct_dc 16, 64, 2 push {r4-r11,lr} vpush {q4-q7} sub_sp_align 16*32*2+64*4*2 add r5, sp, #64*4*2 movrel_local r10, eob_16x32 ldrh r11, [r10], #2 movrel_local r4, inv_dct_2s_x16_neon .irp i, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 add r6, r5, #(\i*16*2) .if \i > 0 mov r8, #(32 - \i) cmp r3, r11 blt 1f .if \i < 30 ldrh r11, [r10], #2 .endif .endif add r7, r2, #(\i*4) mov r8, #32*4 bl inv_txfm_horz_16x2_neon .endr b 3f 1: vmov.i16 q2, #0 vmov.i16 q3, #0 2: subs r8, r8, #2 .rept 2 vst1.16 {q2, q3}, [r6, :128]! .endr bgt 2b 3: .irp i, 0, 4, 8, 12 add r7, r5, #(\i*2) mov r8, #16*2 bl X(inv_txfm_dct_4h_x64_neon) add r6, r0, #(\i*2) bl inv_txfm_add_vert_dct_4x64_neon .endr add_sp_align 16*32*2+64*4*2 vpop {q4-q7} pop {r4-r11,pc} endfunc rav1e-0.7.1/src/arm/32/loopfilter.S000064400000000000000000000771631046102023000147620ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro loop_filter wd function lpf_8_wd\wd\()_neon vabd.u8 d0, d22, d23 // abs(p1 - p0) vabd.u8 d1, d25, d24 // abs(q1 - q0) vabd.u8 d2, d23, d24 // abs(p0 - q0) vabd.u8 d3, d22, d25 // abs(p1 - q1) .if \wd >= 6 vabd.u8 d4, d21, d22 // abs(p2 - p1) vabd.u8 d5, d26, d25 // abs(q2 - q1) .endif .if \wd >= 8 vabd.u8 d6, d20, d21 // abs(p3 - p2) vabd.u8 d7, d27, d26 // abs(q3 - q3) .endif .if \wd >= 6 vmax.u8 d4, d4, d5 .endif vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2 .if \wd >= 8 vmax.u8 d6, d6, d7 .endif vshr.u8 d3, d3, #1 .if \wd >= 8 vmax.u8 d4, d4, d6 .endif .if \wd >= 6 vand d4, d4, d14 .endif vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 vmax.u8 d4, d0, d4 vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E vand d1, d1, d2 // fm vand d1, d1, d13 // fm && wd >= 4 .if \wd >= 6 vand d14, d14, d1 // fm && wd > 4 .endif .if \wd >= 16 vand d15, d15, d1 // fm && wd == 16 .endif vmov r10, r11, d1 orrs r10, r10, r11 beq 9f // if (!fm || wd < 4) return; .if \wd >= 6 vmov.i8 d10, #1 vabd.u8 d2, d21, d23 // abs(p2 - p0) vabd.u8 d3, d22, d23 // abs(p1 - p0) vabd.u8 d4, d25, d24 // abs(q1 - q0) vabd.u8 d5, d26, d24 // abs(q2 - q0) .if \wd >= 8 vabd.u8 d6, d20, d23 // abs(p3 - p0) vabd.u8 d7, d27, d24 // abs(q3 - q0) .endif vmax.u8 d2, d2, d3 vmax.u8 d4, d4, d5 .if \wd >= 8 vmax.u8 d6, d6, d7 .endif vmax.u8 d2, d2, d4 .if \wd >= 8 vmax.u8 d2, d2, d6 .endif .if \wd == 16 vabd.u8 d3, d17, d23 // abs(p6 - p0) vabd.u8 d4, d18, d23 // abs(p5 - p0) vabd.u8 d5, d19, d23 // abs(p4 - p0) .endif vcge.u8 d2, d10, d2 // flat8in .if \wd == 16 vabd.u8 d6, d28, d24 // abs(q4 - q0) vabd.u8 d7, d29, d24 // abs(q5 - q0) vabd.u8 d8, d30, d24 // abs(q6 - q0) .endif vand d14, d2, d14 // flat8in && fm && wd > 4 vbic d1, d1, d14 // fm && wd >= 4 && !flat8in .if \wd == 16 vmax.u8 d3, d3, d4 vmax.u8 d5, d5, d6 .endif vmov r10, r11, d1 .if \wd == 16 vmax.u8 d7, d7, d8 vmax.u8 d3, d3, d5 vmax.u8 d3, d3, d7 vcge.u8 d3, d10, d3 // flat8out .endif orrs r10, r10, r11 .if \wd == 16 vand d15, d15, d3 // flat8out && fm && wd == 16 vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out .endif beq 1f // skip wd == 4 case .endif vsubl.u8 q1, d22, d25 // p1 - q1 vcgt.u8 d0, d0, d12 // hev vqmovn.s16 d2, q1 vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) vbic d0, d1, d0 // (fm && wd >= 4 && !hev) vsubl.u8 q1, d24, d23 vmov.i16 q3, #3 vmul.i16 q1, q1, q3 vmov.i8 d6, #4 vaddw.s8 q1, q1, d4 vmov.i8 d7, #3 vqmovn.s16 d2, q1 // f vqadd.s8 d4, d6, d2 // imin(f + 4, 127) vqadd.s8 d5, d7, d2 // imin(f + 3, 127) vshr.s8 d4, d4, #3 // f1 vshr.s8 d5, d5, #3 // f2 vmovl.u8 q1, d23 // p0 vmovl.u8 q3, d24 // q0 vaddw.s8 q1, q1, d5 vsubw.s8 q3, q3, d4 vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1 vqmovun.s16 d2, q1 // out p0 vqmovun.s16 d6, q3 // out q0 vbit d23, d2, d1 // if (fm && wd >= 4) vmovl.u8 q1, d22 // p1 vbit d24, d6, d1 // if (fm && wd >= 4) vmovl.u8 q3, d25 // q1 vaddw.s8 q1, q1, d4 vsubw.s8 q3, q3, d4 vqmovun.s16 d2, q1 // out p1 vqmovun.s16 d6, q3 // out q1 vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 vmov r10, r11, d14 orrs r10, r10, r11 beq 2f // skip if there's no flat8in vaddl.u8 q0, d21, d21 // p2 * 2 vaddl.u8 q1, d21, d22 // p2 + p1 
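        // Flat path for wd == 6: the four outputs p1, p0, q0, q1 are rounded
        // 3-bit-shifted weighted sums over p2..q2, built incrementally from
        // the pairwise sums formed here.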
vaddl.u8 q2, d22, d23 // p1 + p0 vaddl.u8 q3, d23, d24 // p0 + q0 vadd.i16 q4, q0, q1 vadd.i16 q5, q2, q3 vaddl.u8 q6, d24, d25 // q0 + q1 vadd.i16 q4, q4, q5 vsub.i16 q6, q6, q0 vaddl.u8 q5, d25, d26 // q1 + q2 vrshrn.i16 d0, q4, #3 // out p1 vadd.i16 q4, q4, q6 vsub.i16 q5, q5, q1 vaddl.u8 q6, d26, d26 // q2 + q2 vrshrn.i16 d1, q4, #3 // out p0 vadd.i16 q4, q4, q5 vsub.i16 q6, q6, q2 vrshrn.i16 d2, q4, #3 // out q0 vbit d22, d0, d14 // p1 if (flat8in) vadd.i16 q4, q4, q6 vbit d23, d1, d14 // p0 if (flat8in) vrshrn.i16 d3, q4, #3 // out q1 vbit d24, d2, d14 // q0 if (flat8in) vbit d25, d3, d14 // q1 if (flat8in) .elseif \wd >= 8 vmov r10, r11, d14 orrs r10, r10, r11 .if \wd == 8 beq 8f // skip if there's no flat8in .else beq 2f // skip if there's no flat8in .endif vaddl.u8 q0, d20, d21 // p3 + p2 vaddl.u8 q1, d22, d25 // p1 + q1 vaddl.u8 q2, d20, d22 // p3 + p1 vaddl.u8 q3, d23, d26 // p0 + q2 vadd.i16 q4, q0, q0 // 2 * (p3 + p2) vaddw.u8 q4, q4, d23 // + p0 vaddw.u8 q4, q4, d24 // + q0 vadd.i16 q4, q4, q2 // + p3 + p1 vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2 vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1 vrshrn.i16 d10, q4, #3 // out p2 vadd.i16 q4, q4, q1 vaddl.u8 q0, d20, d23 // p3 + p0 vaddl.u8 q1, d24, d27 // q0 + q3 vrshrn.i16 d11, q4, #3 // out p1 vadd.i16 q4, q4, q3 vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0 vaddl.u8 q2, d21, d24 // p2 + q0 vaddl.u8 q3, d25, d27 // q1 + q3 vrshrn.i16 d12, q4, #3 // out p0 vadd.i16 q4, q4, q1 vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0 vaddl.u8 q0, d22, d25 // p1 + q1 vaddl.u8 q1, d26, d27 // q2 + q3 vrshrn.i16 d13, q4, #3 // out q0 vadd.i16 q4, q4, q3 vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1 vrshrn.i16 d0, q4, #3 // out q1 vadd.i16 q4, q4, q1 vbit d21, d10, d14 vbit d22, d11, d14 vbit d23, d12, d14 vrshrn.i16 d1, q4, #3 // out q2 vbit d24, d13, d14 vbit d25, d0, d14 vbit d26, d1, d14 .endif 2: .if \wd == 16 vmov r10, r11, d15 orrs r10, r10, r11 bne 1f // check if flat8out is needed vmov r10, r11, d14 orrs r10, r10, r11 beq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: vaddl.u8 q1, d17, d17 // p6 + p6 vaddl.u8 q2, d17, d18 // p6 + p5 vaddl.u8 q3, d17, d19 // p6 + p4 vaddl.u8 q4, d17, d20 // p6 + p3 vadd.i16 q6, q1, q2 vadd.i16 q5, q3, q4 vaddl.u8 q3, d17, d21 // p6 + p2 vadd.i16 q6, q6, q5 vaddl.u8 q4, d17, d22 // p6 + p1 vaddl.u8 q5, d18, d23 // p5 + p0 vadd.i16 q3, q3, q4 vaddl.u8 q4, d19, d24 // p4 + q0 vadd.i16 q6, q6, q3 vadd.i16 q5, q5, q4 vaddl.u8 q3, d20, d25 // p3 + q1 vadd.i16 q6, q6, q5 vsub.i16 q3, q3, q1 vaddl.u8 q1, d21, d26 // p2 + q2 vrshrn.i16 d0, q6, #4 // out p5 vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1) vsub.i16 q1, q1, q2 vaddl.u8 q2, d22, d27 // p1 + q3 vaddl.u8 q3, d17, d19 // p6 + p4 vrshrn.i16 d1, q6, #4 // out p4 vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2) vsub.i16 q2, q2, q3 vaddl.u8 q3, d23, d28 // p0 + q4 vaddl.u8 q4, d17, d20 // p6 + p3 vrshrn.i16 d2, q6, #4 // out p3 vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3) vsub.i16 q3, q3, q4 vaddl.u8 q4, d24, d29 // q0 + q5 vaddl.u8 q2, d17, d21 // p6 + p2 vrshrn.i16 d3, q6, #4 // out p2 vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4) vsub.i16 q4, q4, q2 vaddl.u8 q3, d25, d30 // q1 + q6 vaddl.u8 q5, d17, d22 // p6 + p1 vrshrn.i16 d4, q6, #4 // out p1 vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5) vsub.i16 q3, q3, q5 vaddl.u8 q4, d26, d30 // q2 + q6 vbif d0, d18, d15 // out p5 vaddl.u8 q5, d18, d23 // p5 + p0 vrshrn.i16 d5, q6, #4 // out p0 vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6) vsub.i16 q4, q4, q5 
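        // flat8out (wd == 16) keeps a running sum in q6: each vrshrn produces
        // one output as (sum + 8) >> 4, after which the sum is advanced by
        // subtracting its oldest pair and adding the next one, as the
        // comments on the vadd instructions note.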
vaddl.u8 q5, d27, d30 // q3 + q6 vbif d1, d19, d15 // out p4 vaddl.u8 q9, d19, d24 // p4 + q0 vrshrn.i16 d6, q6, #4 // out q0 vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6) vsub.i16 q5, q5, q9 vaddl.u8 q4, d28, d30 // q4 + q6 vbif d2, d20, d15 // out p3 vaddl.u8 q9, d20, d25 // p3 + q1 vrshrn.i16 d7, q6, #4 // out q1 vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6) vsub.i16 q9, q4, q9 vaddl.u8 q5, d29, d30 // q5 + q6 vbif d3, d21, d15 // out p2 vaddl.u8 q10, d21, d26 // p2 + q2 vrshrn.i16 d8, q6, #4 // out q2 vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6) vsub.i16 q5, q5, q10 vaddl.u8 q9, d30, d30 // q6 + q6 vbif d4, d22, d15 // out p1 vaddl.u8 q10, d22, d27 // p1 + q3 vrshrn.i16 d9, q6, #4 // out q3 vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6) vsub.i16 q9, q9, q10 vbif d5, d23, d15 // out p0 vrshrn.i16 d10, q6, #4 // out q4 vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6) vrshrn.i16 d11, q6, #4 // out q5 vbif d6, d24, d15 // out q0 vbif d7, d25, d15 // out q1 vbif d8, d26, d15 // out q2 vbif d9, d27, d15 // out q3 vbif d10, d28, d15 // out q4 vbif d11, d29, d15 // out q5 .endif bx lr .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels bx r8 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels bx r9 .endif 9: // Return directly without writing back any pixels bx r12 endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_8_wd16 adr r8, 7f + CONFIG_THUMB adr r9, 8f + CONFIG_THUMB bl lpf_8_wd16_neon .endm .macro lpf_8_wd8 adr r9, 8f + CONFIG_THUMB bl lpf_8_wd8_neon .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon .endm function lpf_v_4_8_neon mov r12, lr sub r10, r0, r1, lsl #1 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 lpf_8_wd4 sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_4_8_neon mov r12, lr sub r10, r0, #2 add r0, r10, r1, lsl #2 vld1.32 {d22[0]}, [r10], r1 vld1.32 {d22[1]}, [r0], r1 vld1.32 {d23[0]}, [r10], r1 vld1.32 {d23[1]}, [r0], r1 vld1.32 {d24[0]}, [r10], r1 vld1.32 {d24[1]}, [r0], r1 vld1.32 {d25[0]}, [r10], r1 vld1.32 {d25[1]}, [r0], r1 add r0, r0, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 lpf_8_wd4 sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_6_8_neon mov r12, lr sub r10, r0, r1, lsl #1 sub r10, r10, r1 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 lpf_8_wd6 sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_6_8_neon mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #2 vld1.8 {d20}, [r10], r1 vld1.8 {d24}, [r0], r1 vld1.8 {d21}, [r10], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d22}, [r10], r1 vld1.8 
{d26}, [r0], r1 vld1.8 {d23}, [r10], r1 vld1.8 {d27}, [r0], r1 add r0, r0, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 lpf_8_wd6 sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_8_8_neon mov r12, lr sub r10, r0, r1, lsl #2 vld1.8 {d20}, [r10, :64], r1 // p3 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d26}, [r0, :64], r1 // q2 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d27}, [r0, :64], r1 // q3 sub r0, r0, r1, lsl #2 lpf_8_wd8 sub r10, r0, r1, lsl #1 sub r10, r10, r1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d25}, [r0, :64], r1 // q1 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_8_8_neon mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #2 vld1.8 {d20}, [r10], r1 vld1.8 {d24}, [r0], r1 vld1.8 {d21}, [r10], r1 vld1.8 {d25}, [r0], r1 vld1.8 {d22}, [r10], r1 vld1.8 {d26}, [r0], r1 vld1.8 {d23}, [r10], r1 vld1.8 {d27}, [r0], r1 add r0, r0, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 lpf_8_wd8 sub r10, r0, r1, lsl #3 sub r10, r10, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 add r0, r10, r1, lsl #2 vst1.8 {d20}, [r10], r1 vst1.8 {d24}, [r0], r1 vst1.8 {d21}, [r10], r1 vst1.8 {d25}, [r0], r1 vst1.8 {d22}, [r10], r1 vst1.8 {d26}, [r0], r1 vst1.8 {d23}, [r10], r1 vst1.8 {d27}, [r0], r1 add r0, r0, #4 bx r12 8: sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc function lpf_v_16_8_neon mov r12, lr sub r10, r0, r1, lsl #3 add r10, r10, r1 vld1.8 {d17}, [r10, :64], r1 // p6 vld1.8 {d24}, [r0, :64], r1 // q0 vld1.8 {d18}, [r10, :64], r1 // p5 vld1.8 {d25}, [r0, :64], r1 // q1 vld1.8 {d19}, [r10, :64], r1 // p4 vld1.8 {d26}, [r0, :64], r1 // q2 vld1.8 {d20}, [r10, :64], r1 // p3 vld1.8 {d27}, [r0, :64], r1 // q3 vld1.8 {d21}, [r10, :64], r1 // p2 vld1.8 {d28}, [r0, :64], r1 // q4 vld1.8 {d22}, [r10, :64], r1 // p1 vld1.8 {d29}, [r0, :64], r1 // q5 vld1.8 {d23}, [r10, :64], r1 // p0 vld1.8 {d30}, [r0, :64], r1 // q6 sub r0, r0, r1, lsl #3 add r0, r0, r1 lpf_8_wd16 sub r10, r0, r1, lsl #2 sub r10, r10, r1, lsl #1 vst1.8 {d0}, [r10, :64], r1 // p5 vst1.8 {d6}, [r0, :64], r1 // q0 vst1.8 {d1}, [r10, :64], r1 // p4 vst1.8 {d7}, [r0, :64], r1 // q1 vst1.8 {d2}, [r10, :64], r1 // p3 vst1.8 {d8}, [r0, :64], r1 // q2 vst1.8 {d3}, [r10, :64], r1 // p2 vst1.8 {d9}, [r0, :64], r1 // q3 vst1.8 {d4}, [r10, :64], r1 // p1 vst1.8 {d10}, [r0, :64], r1 // q4 vst1.8 {d5}, [r10, :64], r1 // p0 vst1.8 {d11}, [r0, :64], r1 // q5 sub r0, r0, r1, lsl #2 sub r0, 
r0, r1, lsl #1 bx r12 7: sub r10, r0, r1 sub r10, r10, r1, lsl #1 vst1.8 {d21}, [r10, :64], r1 // p2 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d25}, [r0, :64], r1 // q1 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.8 {d22}, [r10, :64], r1 // p1 vst1.8 {d24}, [r0, :64], r1 // q0 vst1.8 {d23}, [r10, :64], r1 // p0 vst1.8 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_16_8_neon mov r12, lr sub r10, r0, #8 vld1.8 {d16}, [r10, :64], r1 vld1.8 {d24}, [r0, :64], r1 vld1.8 {d17}, [r10, :64], r1 vld1.8 {d25}, [r0, :64], r1 vld1.8 {d18}, [r10, :64], r1 vld1.8 {d26}, [r0, :64], r1 vld1.8 {d19}, [r10, :64], r1 vld1.8 {d27}, [r0, :64], r1 vld1.8 {d20}, [r10, :64], r1 vld1.8 {d28}, [r0, :64], r1 vld1.8 {d21}, [r10, :64], r1 vld1.8 {d29}, [r0, :64], r1 vld1.8 {d22}, [r10, :64], r1 vld1.8 {d30}, [r0, :64], r1 vld1.8 {d23}, [r10, :64], r1 vld1.8 {d31}, [r0, :64], r1 transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31 lpf_8_wd16 sub r0, r0, r1, lsl #3 sub r10, r0, #8 transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5 transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31 vst1.8 {d16}, [r10, :64], r1 vst1.8 {d6}, [r0, :64], r1 vst1.8 {d17}, [r10, :64], r1 vst1.8 {d7}, [r0, :64], r1 vst1.8 {d0}, [r10, :64], r1 vst1.8 {d8}, [r0, :64], r1 vst1.8 {d1}, [r10, :64], r1 vst1.8 {d9}, [r0, :64], r1 vst1.8 {d2}, [r10, :64], r1 vst1.8 {d10}, [r0, :64], r1 vst1.8 {d3}, [r10, :64], r1 vst1.8 {d11}, [r0, :64], r1 vst1.8 {d4}, [r10, :64], r1 vst1.8 {d30}, [r0, :64], r1 vst1.8 {d5}, [r10, :64], r1 vst1.8 {d31}, [r0, :64], r1 bx r12 7: sub r10, r0, r1, lsl #3 sub r10, r10, #4 transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27 add r0, r10, r1, lsl #2 vst1.8 {d20}, [r10], r1 vst1.8 {d24}, [r0], r1 vst1.8 {d21}, [r10], r1 vst1.8 {d25}, [r0], r1 vst1.8 {d22}, [r10], r1 vst1.8 {d26}, [r0], r1 vst1.8 {d23}, [r10], r1 vst1.8 {d27}, [r0], r1 add r0, r0, #4 bx r12 8: sub r10, r0, r1, lsl #3 sub r10, r10, #2 transpose_4x8b q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #2 vst1.32 {d22[0]}, [r10], r1 vst1.32 {d22[1]}, [r0], r1 vst1.32 {d23[0]}, [r10], r1 vst1.32 {d23[1]}, [r0], r1 vst1.32 {d24[0]}, [r10], r1 vst1.32 {d24[1]}, [r0], r1 vst1.32 {d25[0]}, [r10], r1 vst1.32 {d25[1]}, [r0], r1 add r0, r0, #2 bx r12 endfunc // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av1FilterLUT *lut, const int w) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [r2] // vmask[0], vmask[1] .ifc \type, y ldr r2, [r2, #8] // vmask[2] .endif add r5, r5, #128 // Move to sharp part of lut .ifc \type, y orr r7, r7, r2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub r4, r3, r4, lsl #2 .else sub r3, r3, #4 lsl r4, r4, #2 .endif orr r6, r6, r7 // vmask[0] |= vmask[1] 1: tst r6, #0x03 .ifc \dir, v vld1.8 {d0}, [r4]! vld1.8 {d1}, [r3]! 
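        // dir == v: d0 = two l[] entries from the row of blocks above,
        // d1 = the two entries for the current row; if the current level is 0,
        // the neighbouring one is substituted by the vbif further down.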
.else vld2.32 {d0[0], d1[0]}, [r3], r4 vld2.32 {d0[1], d1[1]}, [r3], r4 .endif beq 7f // if (!(vm & bits)) continue; vld1.8 {d5[]}, [r5] // sharp[0] add r5, r5, #8 vmov.i32 d2, #0xff vdup.32 d13, r6 // vmask[0] vand d0, d0, d2 // Keep only lowest byte in each 32 bit word vand d1, d1, d2 vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0] vmov.i8 d4, #1 vld1.8 {d6[]}, [r5] // sharp[1] sub r5, r5, #8 vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0] vtst.32 d2, d1, d2 // L != 0 vmul.i32 d1, d1, d4 // L .ifc \type, y vdup.32 d15, r2 // vmask[2] .endif vdup.32 d14, r7 // vmask[1] vmov r10, r11, d2 orrs r10, r10, r11 beq 7f // if (!L) continue; vneg.s8 d5, d5 // -sharp[0] movrel_local r10, word_12 vshr.u8 d12, d1, #4 // H vld1.32 {d16}, [r10, :64] vshl.s8 d3, d1, d5 // L >> sharp[0] .ifc \type, y vtst.32 d15, d15, d16 // if (vmask[2] & bits) .endif vmov.i8 d7, #2 vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1]) vadd.i8 d0, d1, d7 // L + 2 vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I vadd.u8 d0, d0, d0 // 2*(L + 2) vtst.32 d14, d14, d16 // if (vmask[1] & bits) vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E vtst.32 d13, d13, d16 // if (vmask[0] & bits) vand d13, d13, d2 // vmask[0] &= L != 0 .ifc \type, y tst r2, #0x03 beq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif tst r7, #0x03 beq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_8_neon .else // wd6 bl lpf_\dir\()_6_8_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_8_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment r0. // If the whole function is skipped, increment it here instead. add r0, r0, r1, lsl #3 .else 7: .endif 8: lsrs r6, r6, #2 // vmask[0] >>= 2 lsr r7, r7, #2 // vmask[1] >>= 2 .ifc \type, y lsr r2, r2, #2 // vmask[2] >>= 2 .endif .ifc \dir, v add r0, r0, #8 .else // For dir h, r0 is returned incremented .endif bne 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_12, align=4 .word 1, 2 endconst rav1e-0.7.1/src/arm/32/loopfilter16.S000064400000000000000000000777251046102023000151350ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro loop_filter wd function lpf_4_wd\wd\()_neon vabd.u16 d0, d22, d23 // abs(p1 - p0) vabd.u16 d1, d25, d24 // abs(q1 - q0) vabd.u16 d2, d23, d24 // abs(p0 - q0) vabd.u16 d3, d22, d25 // abs(p1 - q1) .if \wd >= 6 vabd.u16 d4, d21, d22 // abs(p2 - p1) vabd.u16 d5, d26, d25 // abs(q2 - q1) .endif .if \wd >= 8 vabd.u16 d6, d20, d21 // abs(p3 - p2) vabd.u16 d7, d27, d26 // abs(q3 - q3) .endif .if \wd >= 6 vmax.u16 d4, d4, d5 .endif vqadd.u16 d2, d2, d2 // abs(p0 - q0) * 2 .if \wd >= 8 vmax.u16 d6, d6, d7 .endif vshr.u16 d3, d3, #1 .if \wd >= 8 vmax.u16 d4, d4, d6 .endif vmax.u16 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0)) vqadd.u16 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 vmax.u16 d4, d0, d4 vcge.u16 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else vcge.u16 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif vcge.u16 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E vand d1, d1, d2 // fm && wd >= 4 (implicit) .if \wd >= 6 vmov d14, d1 // fm && wd > 4 (implicit) .endif .if \wd >= 16 vmov d15, d1 // fm && wd == 16 (implicit) .endif vmov r10, r11, d1 orrs r10, r10, r11 beq 9f // if (!fm || wd < 4) return; .if \wd >= 6 vmov.i16 d10, #1 vabd.u16 d2, d21, d23 // abs(p2 - p0) vabd.u16 d3, d22, d23 // abs(p1 - p0) vabd.u16 d4, d25, d24 // abs(q1 - q0) vabd.u16 d5, d26, d24 // abs(q2 - q0) vdup.16 d9, r9 // bitdepth_min_8 .if \wd >= 8 vabd.u16 d6, d20, d23 // abs(p3 - p0) vabd.u16 d7, d27, d24 // abs(q3 - q0) .endif vmax.u16 d2, d2, d3 vmax.u16 d4, d4, d5 .if \wd >= 8 vmax.u16 d6, d6, d7 .endif vmax.u16 d2, d2, d4 vshl.u16 d10, d10, d9 // F = 1 << bitdepth_min_8 .if \wd >= 8 vmax.u16 d2, d2, d6 .endif .if \wd == 16 vabd.u16 d3, d17, d23 // abs(p6 - p0) vabd.u16 d4, d18, d23 // abs(p5 - p0) vabd.u16 d5, d19, d23 // abs(p4 - p0) .endif vcge.u16 d2, d10, d2 // flat8in .if \wd == 16 vabd.u16 d6, d28, d24 // abs(q4 - q0) vabd.u16 d7, d29, d24 // abs(q5 - q0) vabd.u16 d8, d30, d24 // abs(q6 - q0) .endif vand d14, d2, d14 // flat8in && fm && wd > 4 vbic d1, d1, d14 // fm && wd >= 4 && !flat8in .if \wd == 16 vmax.u16 d3, d3, d4 vmax.u16 d5, d5, d6 .endif vmov r10, r11, d1 .if \wd == 16 vmax.u16 d7, d7, d8 vmax.u16 d3, d3, d5 vmax.u16 d3, d3, d7 vcge.u16 d3, d10, d3 // flat8out .endif orrs r10, r10, r11 .if \wd == 16 vand d15, d15, d3 // flat8out && fm && wd == 16 vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16 vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out .endif beq 1f // skip wd == 4 case .endif vdup.16 d3, r8 // bitdepth_max vsub.u16 d2, d22, d25 // p1 - q1 vshr.u16 d3, d3, #1 // 128 << bitdepth_min_8 - 1 vcgt.u16 d0, d0, d12 // hev vmvn d9, d3 // - 128 * (1 << bitdepth_min_8) vmin.s16 d2, d2, d3 // iclip_diff(p1 - q1) vmax.s16 d2, d2, d9 // iclip_diff(p1 - q1) vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1) vsub.u16 d2, d24, d23 vmov.i16 d6, #3 vbic d0, d1, d0 // (fm && wd >= 4 && !hev) vmul.i16 d2, d2, d6 vmov.i16 d7, #4 vadd.i16 d2, d2, d4 vmin.s16 d2, d2, d3 // f = iclip_diff() vmax.s16 d2, d2, d9 // f = iclip_diff() vqadd.s16 d4, d7, d2 // f + 4 vqadd.s16 d5, d6, d2 // f + 3 vmin.s16 d4, d4, d3 // imin(f + 4, 128 << bitdepth_min_8 - 1) vmin.s16 d5, d5, d3 // imin(f + 3, 128 << bitdepth_min_8 - 1) vshr.s16 d4, d4, #3 // f1 vshr.s16 d5, d5, #3 // f2 vmov.i16 d9, #0 vdup.16 d3, r8 // bitdepth_max vqadd.s16 d2, d23, d5 // p0 + f2 vqsub.s16 d6, d24, d4 // q0 - f1 vrshr.s16 d4, d4, #1 // (f1 + 1) >> 1 vmin.s16 d2, d2, d3 // out p0 = iclip_pixel() vmin.s16 d6, d6, d3 // 
out q0 = iclip_pixel() vmax.s16 d2, d2, d9 // out p0 = iclip_pixel() vmax.s16 d6, d6, d9 // out q0 = iclip_pixel() vbit d23, d2, d1 // if (fm && wd >= 4) vbit d24, d6, d1 // if (fm && wd >= 4) vqadd.s16 d2, d22, d4 // p1 + f vqsub.s16 d6, d25, d4 // q1 - f vmin.s16 d2, d2, d3 // out p1 = iclip_pixel() vmin.s16 d6, d6, d3 // out q1 = iclip_pixel() vmax.s16 d2, d2, d9 // out p1 = iclip_pixel() vmax.s16 d6, d6, d9 // out q1 = iclip_pixel() vbit d22, d2, d0 // if (fm && wd >= 4 && !hev) vbit d25, d6, d0 // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 vmov r10, r11, d14 orrs r10, r10, r11 beq 2f // skip if there's no flat8in vadd.i16 d0, d21, d21 // p2 * 2 vadd.i16 d2, d21, d22 // p2 + p1 vadd.i16 d4, d22, d23 // p1 + p0 vadd.i16 d6, d23, d24 // p0 + q0 vadd.i16 d8, d0, d2 vadd.i16 d10, d4, d6 vadd.i16 d12, d24, d25 // q0 + q1 vadd.i16 d8, d8, d10 vsub.i16 d12, d12, d0 vadd.i16 d10, d25, d26 // q1 + q2 vrshr.u16 d0, d8, #3 // out p1 vadd.i16 d8, d8, d12 vsub.i16 d10, d10, d2 vadd.i16 d12, d26, d26 // q2 + q2 vrshr.u16 d1, d8, #3 // out p0 vadd.i16 d8, d8, d10 vsub.i16 d12, d12, d4 vrshr.u16 d2, d8, #3 // out q0 vbit d22, d0, d14 // p1 if (flat8in) vadd.i16 d8, d8, d12 vbit d23, d1, d14 // p0 if (flat8in) vrshr.u16 d3, d8, #3 // out q1 vbit d24, d2, d14 // q0 if (flat8in) vbit d25, d3, d14 // q1 if (flat8in) .elseif \wd >= 8 vmov r10, r11, d14 orrs r10, r10, r11 .if \wd == 8 beq 8f // skip if there's no flat8in .else beq 2f // skip if there's no flat8in .endif vadd.i16 d0, d20, d21 // p3 + p2 vadd.i16 d2, d22, d25 // p1 + q1 vadd.i16 d4, d20, d22 // p3 + p1 vadd.i16 d6, d23, d26 // p0 + q2 vadd.i16 d8, d0, d0 // 2 * (p3 + p2) vadd.i16 d9, d23, d24 // p0 + q0 vadd.i16 d8, d8, d4 // + p3 + p1 vsub.i16 d2, d2, d0 // p1 + q1 - p3 - p2 vadd.i16 d8, d8, d9 // + p0 + q0 vsub.i16 d6, d6, d4 // p0 + q2 - p3 - p1 vrshr.u16 d10, d8, #3 // out p2 vadd.i16 d8, d8, d2 vadd.i16 d0, d20, d23 // p3 + p0 vadd.i16 d2, d24, d27 // q0 + q3 vrshr.u16 d11, d8, #3 // out p1 vadd.i16 d8, d8, d6 vsub.i16 d2, d2, d0 // q0 + q3 - p3 - p0 vadd.i16 d4, d21, d24 // p2 + q0 vadd.i16 d6, d25, d27 // q1 + q3 vrshr.u16 d12, d8, #3 // out p0 vadd.i16 d8, d8, d2 vsub.i16 d6, d6, d4 // q1 + q3 - p2 - q0 vadd.i16 d0, d22, d25 // p1 + q1 vadd.i16 d2, d26, d27 // q2 + q3 vrshr.u16 d13, d8, #3 // out q0 vadd.i16 d8, d8, d6 vsub.i16 d2, d2, d0 // q2 + q3 - p1 - q1 vrshr.u16 d0, d8, #3 // out q1 vadd.i16 d8, d8, d2 vbit d21, d10, d14 vbit d22, d11, d14 vbit d23, d12, d14 vrshr.u16 d1, d8, #3 // out q2 vbit d24, d13, d14 vbit d25, d0, d14 vbit d26, d1, d14 .endif 2: .if \wd == 16 vmov r10, r11, d15 orrs r10, r10, r11 bne 1f // check if flat8out is needed vmov r10, r11, d14 orrs r10, r10, r11 beq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: vadd.i16 d2, d17, d17 // p6 + p6 vadd.i16 d4, d17, d18 // p6 + p5 vadd.i16 d6, d17, d19 // p6 + p4 vadd.i16 d8, d17, d20 // p6 + p3 vadd.i16 d12, d2, d4 vadd.i16 d10, d6, d8 vadd.i16 d6, d17, d21 // p6 + p2 vadd.i16 d12, d12, d10 vadd.i16 d8, d17, d22 // p6 + p1 vadd.i16 d10, d18, d23 // p5 + p0 vadd.i16 d6, d6, d8 vadd.i16 d8, d19, d24 // p4 + q0 vadd.i16 d12, d12, d6 vadd.i16 d10, d10, d8 vadd.i16 d6, d20, d25 // p3 + q1 vadd.i16 d12, d12, d10 vsub.i16 d6, d6, d2 vadd.i16 d2, d21, d26 // p2 + q2 vrshr.u16 d0, d12, #4 // out p5 vadd.i16 d12, d12, d6 // - (p6 + p6) + (p3 + q1) vsub.i16 d2, d2, d4 vadd.i16 d4, d22, d27 // p1 + q3 vadd.i16 d6, d17, d19 // p6 + p4 vrshr.u16 d1, d12, #4 // out p4 vadd.i16 d12, d12, d2 // - (p6 + p5) + (p2 
+ q2) vsub.i16 d4, d4, d6 vadd.i16 d6, d23, d28 // p0 + q4 vadd.i16 d8, d17, d20 // p6 + p3 vrshr.u16 d2, d12, #4 // out p3 vadd.i16 d12, d12, d4 // - (p6 + p4) + (p1 + q3) vsub.i16 d6, d6, d8 vadd.i16 d8, d24, d29 // q0 + q5 vadd.i16 d4, d17, d21 // p6 + p2 vrshr.u16 d3, d12, #4 // out p2 vadd.i16 d12, d12, d6 // - (p6 + p3) + (p0 + q4) vsub.i16 d8, d8, d4 vadd.i16 d6, d25, d30 // q1 + q6 vadd.i16 d10, d17, d22 // p6 + p1 vrshr.u16 d4, d12, #4 // out p1 vadd.i16 d12, d12, d8 // - (p6 + p2) + (q0 + q5) vsub.i16 d6, d6, d10 vadd.i16 d8, d26, d30 // q2 + q6 vbif d0, d18, d15 // out p5 vadd.i16 d10, d18, d23 // p5 + p0 vrshr.u16 d5, d12, #4 // out p0 vadd.i16 d12, d12, d6 // - (p6 + p1) + (q1 + q6) vsub.i16 d8, d8, d10 vadd.i16 d10, d27, d30 // q3 + q6 vbif d1, d19, d15 // out p4 vadd.i16 d18, d19, d24 // p4 + q0 vrshr.u16 d6, d12, #4 // out q0 vadd.i16 d12, d12, d8 // - (p5 + p0) + (q2 + q6) vsub.i16 d10, d10, d18 vadd.i16 d8, d28, d30 // q4 + q6 vbif d2, d20, d15 // out p3 vadd.i16 d18, d20, d25 // p3 + q1 vrshr.u16 d7, d12, #4 // out q1 vadd.i16 d12, d12, d10 // - (p4 + q0) + (q3 + q6) vsub.i16 d18, d8, d18 vadd.i16 d10, d29, d30 // q5 + q6 vbif d3, d21, d15 // out p2 vadd.i16 d20, d21, d26 // p2 + q2 vrshr.u16 d8, d12, #4 // out q2 vadd.i16 d12, d12, d18 // - (p3 + q1) + (q4 + q6) vsub.i16 d10, d10, d20 vadd.i16 d18, d30, d30 // q6 + q6 vbif d4, d22, d15 // out p1 vadd.i16 d20, d22, d27 // p1 + q3 vrshr.u16 d9, d12, #4 // out q3 vadd.i16 d12, d12, d10 // - (p2 + q2) + (q5 + q6) vsub.i16 d18, d18, d20 vbif d5, d23, d15 // out p0 vrshr.u16 d10, d12, #4 // out q4 vadd.i16 d12, d12, d18 // - (p1 + q3) + (q6 + q6) vrshr.u16 d11, d12, #4 // out q5 vbif d6, d24, d15 // out q0 vbif d7, d25, d15 // out q1 vbif d8, d26, d15 // out q2 vbif d9, d27, d15 // out q3 vbif d10, d28, d15 // out q4 vbif d11, d29, d15 // out q5 .endif bx lr .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels bx r6 .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels bx r7 .endif 9: // Return directly without writing back any pixels bx r12 endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_4_wd16 adr r6, 7f + CONFIG_THUMB adr r7, 8f + CONFIG_THUMB bl lpf_4_wd16_neon .endm .macro lpf_4_wd8 adr r7, 8f + CONFIG_THUMB bl lpf_4_wd8_neon .endm .macro lpf_4_wd6 bl lpf_4_wd6_neon .endm .macro lpf_4_wd4 bl lpf_4_wd4_neon .endm function lpf_v_4_4_neon mov r12, lr sub r10, r0, r1, lsl #1 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 lpf_4_wd4 sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_4_4_neon mov r12, lr sub r10, r0, #4 add r0, r10, r1, lsl #1 vld1.16 {d22}, [r10], r1 vld1.16 {d24}, [r0], r1 vld1.16 {d23}, [r10], r1 vld1.16 {d25}, [r0], r1 add r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 lpf_4_wd4 sub r10, r0, r1, lsl #2 sub r10, r10, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 add r0, r10, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_6_4_neon mov r12, lr sub r10, r0, r1, lsl #1 sub r10, r10, r1 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d25}, [r0, :64], r1 // 
q1 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 lpf_4_wd6 sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_6_4_neon mov r12, lr sub r10, r0, #8 vld1.16 {d20}, [r10, :64], r1 vld1.16 {d24}, [r0, :64], r1 vld1.16 {d21}, [r10, :64], r1 vld1.16 {d25}, [r0, :64], r1 vld1.16 {d22}, [r10, :64], r1 vld1.16 {d26}, [r0, :64], r1 vld1.16 {d23}, [r10, :64], r1 vld1.16 {d27}, [r0, :64], r1 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 lpf_4_wd6 sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_8_4_neon mov r12, lr sub r10, r0, r1, lsl #2 vld1.16 {d20}, [r10, :64], r1 // p3 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d25}, [r0, :64], r1 // q1 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d26}, [r0, :64], r1 // q2 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d27}, [r0, :64], r1 // q3 sub r0, r0, r1, lsl #2 lpf_4_wd8 sub r10, r0, r1, lsl #1 sub r10, r10, r1 vst1.16 {d21}, [r10, :64], r1 // p2 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d25}, [r0, :64], r1 // q1 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_8_4_neon mov r12, lr sub r10, r0, #8 vld1.16 {d20}, [r10, :64], r1 vld1.16 {d24}, [r0, :64], r1 vld1.16 {d21}, [r10, :64], r1 vld1.16 {d25}, [r0, :64], r1 vld1.16 {d22}, [r10, :64], r1 vld1.16 {d26}, [r0, :64], r1 vld1.16 {d23}, [r10, :64], r1 vld1.16 {d27}, [r0, :64], r1 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 lpf_4_wd8 sub r0, r0, r1, lsl #2 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 sub r10, r0, #8 vst1.16 {d20}, [r10, :64], r1 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d21}, [r10, :64], r1 vst1.16 {d25}, [r0, :64], r1 vst1.16 {d22}, [r10, :64], r1 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d23}, [r10, :64], r1 vst1.16 {d27}, [r0, :64], r1 bx r12 8: sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc function lpf_v_16_4_neon mov r12, lr sub r10, r0, r1, lsl #3 add r10, r10, r1 vld1.16 {d17}, [r10, :64], r1 // p6 vld1.16 {d24}, [r0, :64], r1 // q0 vld1.16 {d18}, [r10, :64], r1 // p5 vld1.16 {d25}, [r0, :64], r1 // q1 vld1.16 {d19}, [r10, :64], r1 // p4 vld1.16 {d26}, [r0, :64], r1 // q2 vld1.16 {d20}, [r10, :64], r1 // p3 vld1.16 {d27}, [r0, :64], r1 // q3 vld1.16 {d21}, [r10, :64], r1 // p2 vld1.16 {d28}, [r0, :64], r1 // q4 vld1.16 {d22}, [r10, :64], r1 // p1 vld1.16 {d29}, [r0, :64], r1 // q5 vld1.16 {d23}, [r10, :64], r1 // p0 vld1.16 {d30}, [r0, :64], r1 // q6 sub r0, r0, r1, lsl #3 add r0, r0, r1 lpf_4_wd16 sub r10, r0, r1, lsl #2 sub r10, r10, r1, lsl #1 vst1.16 {d0}, [r10, :64], r1 // p5 vst1.16 {d6}, [r0, :64], 
r1 // q0 vst1.16 {d1}, [r10, :64], r1 // p4 vst1.16 {d7}, [r0, :64], r1 // q1 vst1.16 {d2}, [r10, :64], r1 // p3 vst1.16 {d8}, [r0, :64], r1 // q2 vst1.16 {d3}, [r10, :64], r1 // p2 vst1.16 {d9}, [r0, :64], r1 // q3 vst1.16 {d4}, [r10, :64], r1 // p1 vst1.16 {d10}, [r0, :64], r1 // q4 vst1.16 {d5}, [r10, :64], r1 // p0 vst1.16 {d11}, [r0, :64], r1 // q5 sub r0, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 bx r12 7: sub r10, r0, r1 sub r10, r10, r1, lsl #1 vst1.16 {d21}, [r10, :64], r1 // p2 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d25}, [r0, :64], r1 // q1 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d26}, [r0, :64], r1 // q2 sub r0, r0, r1, lsl #1 sub r0, r0, r1 bx r12 8: sub r10, r0, r1, lsl #1 vst1.16 {d22}, [r10, :64], r1 // p1 vst1.16 {d24}, [r0, :64], r1 // q0 vst1.16 {d23}, [r10, :64], r1 // p0 vst1.16 {d25}, [r0, :64], r1 // q1 sub r0, r0, r1, lsl #1 bx r12 endfunc function lpf_h_16_4_neon mov r12, lr sub r10, r0, #16 sub r0, r0, #8 vld1.16 {d16}, [r10, :64], r1 vld1.16 {d20}, [r0, :64], r1 vld1.16 {d17}, [r10, :64], r1 vld1.16 {d21}, [r0, :64], r1 vld1.16 {d18}, [r10, :64], r1 vld1.16 {d22}, [r0, :64], r1 vld1.16 {d19}, [r10, :64], r1 vld1.16 {d23}, [r0, :64], r1 sub r10, r10, r1, lsl #2 sub r0, r0, r1, lsl #2 add r10, r10, #16 add r0, r0, #16 vld1.16 {d24}, [r10, :64], r1 vld1.16 {d28}, [r0, :64], r1 vld1.16 {d25}, [r10, :64], r1 vld1.16 {d29}, [r0, :64], r1 vld1.16 {d26}, [r10, :64], r1 vld1.16 {d30}, [r0, :64], r1 vld1.16 {d27}, [r10, :64], r1 vld1.16 {d31}, [r0, :64], r1 sub r0, r0, #8 transpose_4x4h q8, q9, d16, d17, d18, d19 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 transpose_4x4h q14, q15, d28, d29, d30, d31 lpf_4_wd16 sub r0, r0, r1, lsl #2 transpose_4x4h q8, q0, d16, d17, d0, d1 transpose_4x4h q1, q2, d2, d3, d4, d5 transpose_4x4h q3, q4, d6, d7, d8, d9 transpose_4x4h q5, q15, d10, d11, d30, d31 sub r10, r0, #16 sub r0, r0, #8 vst1.16 {d16}, [r10, :64], r1 vst1.16 {d2}, [r0, :64], r1 vst1.16 {d17}, [r10, :64], r1 vst1.16 {d3}, [r0, :64], r1 vst1.16 {d0}, [r10, :64], r1 vst1.16 {d4}, [r0, :64], r1 vst1.16 {d1}, [r10, :64], r1 vst1.16 {d5}, [r0, :64], r1 sub r10, r10, r1, lsl #2 sub r0, r0, r1, lsl #2 add r10, r10, #16 add r0, r0, #16 vst1.16 {d6}, [r10, :64], r1 vst1.16 {d10}, [r0, :64], r1 vst1.16 {d7}, [r10, :64], r1 vst1.16 {d11}, [r0, :64], r1 vst1.16 {d8}, [r10, :64], r1 vst1.16 {d30}, [r0, :64], r1 vst1.16 {d9}, [r10, :64], r1 vst1.16 {d31}, [r0, :64], r1 sub r0, r0, #8 bx r12 7: sub r0, r0, r1, lsl #2 transpose_4x4h q10, q11, d20, d21, d22, d23 transpose_4x4h q12, q13, d24, d25, d26, d27 sub r10, r0, #8 vst1.16 {d20}, [r10, :64], r1 vst1.16 {d24}, [r0, :64], r1 vst1.16 {d21}, [r10, :64], r1 vst1.16 {d25}, [r0, :64], r1 vst1.16 {d22}, [r10, :64], r1 vst1.16 {d26}, [r0, :64], r1 vst1.16 {d23}, [r10, :64], r1 vst1.16 {d27}, [r0, :64], r1 bx r12 8: sub r0, r0, #4 transpose_4x4h q11, q12, d22, d23, d24, d25 sub r10, r0, r1, lsl #2 sub r0, r0, r1, lsl #1 vst1.16 {d22}, [r10], r1 vst1.16 {d24}, [r0], r1 vst1.16 {d23}, [r10], r1 vst1.16 {d25}, [r0], r1 add r0, r0, #4 bx r12 endfunc // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av1FilterLUT *lut, const int w, // const int bitdepth_max) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldr r8, [sp, #112] // bitdepth_max; the 'w' parameter isn't loaded 
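// Worked example for the bitdepth_min_8 computation below: with 10 bpc input,
// bitdepth_max = 0x3ff, so clz gives 22 and 24 - 22 = 2; with 12 bpc input,
// 24 - clz(0xfff) = 4.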
sub sp, sp, #8 clz r9, r8 rsb r9, r9, #24 // bitdepth_min_8 ldrd r6, r7, [r2] // vmask[0], vmask[1] .ifc \type, y ldr r2, [r2, #8] // vmask[2] .endif add r5, r5, #128 // Move to sharp part of lut .ifc \type, y orr r7, r7, r2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub r4, r3, r4, lsl #2 .else sub r3, r3, #4 lsl r4, r4, #2 .endif orr r6, r6, r7 // vmask[0] |= vmask[1] 1: tst r6, #0x01 strd r6, r7, [sp] .ifc \dir, v ldrb r10, [r4], #4 ldrb r11, [r3], #4 .else ldrb r10, [r3] ldrb r11, [r3, #4] add r3, r3, r4 .endif beq 7f // if (!(vm & bits)) continue; orrs r12, r10, r11 vdup.16 d31, r9 // bitdepth_min_8 beq 7f // if (!(l[0][0] | l[offset][0])) continue; cmp r11, #0 // Check for nonzero values in l[0][0] ldrb r6, [r5], #8 // sharp[0] it eq moveq r11, r10 // if (!l[0][0]) L = l[offset][0] ldrb r12, [r5] // sharp[1] lsr r6, r11, r6 // L >> sharp[0] sub r5, r5, #8 cmp r12, r6 lsr r10, r11, #4 // H add r11, r11, #2 // L + 2 it lt movlt r6, r12 // imin(L >> sharp[0], sharp[1]) add r11, r11, r11 // 2*(L + 2) cmp r6, #1 lsl r10, r10, r9 // H << bitdepth_min_8 it lt movlt r6, #1 // imax(imin(), 1) = limit = I vdup.16 d12, r10 // H << bitdepth_min_8 add r11, r11, r6 // 2*(L + 2) + limit = E lsl r6, r6, r9 // I << bitdepth_min_8 lsl r11, r11, r9 // E << bitdepth_min_8 vdup.16 d11, r6 // I << bitdepth_min_8 vdup.16 d10, r11 // E << bitdepth_min_8 .ifc \type, y tst r2, #0x01 beq 2f // wd16 bl lpf_\dir\()_16_4_neon b 8f 2: .endif tst r7, #0x01 beq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_4_neon .else // wd6 bl lpf_\dir\()_6_4_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_4_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment r0. // If the whole function is skipped, increment it here instead. add r0, r0, r1, lsl #2 .else 7: .endif 8: ldrd r6, r7, [sp] .ifc \type, y lsr r2, r2, #1 // vmask[2] >>= 1 .endif .ifc \dir, v add r0, r0, #8 .else // For dir h, r0 is returned incremented .endif lsrs r6, r6, #1 // vmask[0] >>= 1 lsr r7, r7, #1 // vmask[1] >>= 1 bne 1b add sp, sp, #8 vpop {q4-q7} pop {r4-r11,pc} endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv rav1e-0.7.1/src/arm/32/looprestoration.S000064400000000000000000000717351046102023000160450ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[8], intptr_t w, // int h, enum LrEdgeFlags edges); function wiener_filter_h_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] mov r8, r5 vld1.16 {q0}, [r4, :128] movw r9, #(1 << 14) - (1 << 2) vdup.16 q14, r9 vmov.s16 q15, #2048 // Calculate mid_stride add r10, r5, #7 bic r10, r10, #7 lsl r10, r10, #1 // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 // Subtract the aligned width from mid_stride add r11, r5, #7 bic r11, r11, #7 sub r10, r10, r11, lsl #1 // Subtract the number of pixels read from the source stride add r11, r11, #8 sub r3, r3, r11 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r1, #0 bne 0f // left == NULL sub r2, r2, #3 sub lr, lr, #3 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add r3, r3, #3 1: // Loop vertically vld1.8 {q2}, [r2]! vld1.8 {q9}, [lr]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r1, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.32 {d3[1]}, [r1]! // Move r2/lr back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub r2, r2, #3 sub lr, lr, #3 vld1.32 {d17[1]}, [r1]! vext.8 q2, q1, q2, #13 vext.8 q9, q8, q9, #13 b 2f 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q2 to have 3x the first byte at the front. vdup.8 q1, d4[0] vdup.8 q8, d18[0] // Move r2 back to account for the last 3 bytes we loaded before, // which we shifted out. sub r2, r2, #3 sub lr, lr, #3 vext.8 q2, q1, q2, #13 vext.8 q9, q8, q9, #13 2: vmovl.u8 q1, d4 vmovl.u8 q2, d5 vmovl.u8 q8, d18 vmovl.u8 q9, d19 tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub r9, r5, #14 ldrb r11, [r2, r9] ldrb r9, [lr, r9] // Fill q12/q13 with the right padding pixel vdup.16 q12, r11 vdup.16 q13, r9 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+3 pixels valid in q1-q2. 
For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. movrel_local r4, right_ext_mask, -6 sub r4, r4, r5, lsl #1 vld1.8 {q10, q11}, [r4] vbit q1, q12, q10 vbit q2, q12, q11 vbit q8, q13, q10 vbit q9, q13, q11 4: // Loop horizontally vext.8 q11, q1, q2, #4 vext.8 q5, q1, q2, #8 vext.8 q10, q1, q2, #2 vext.8 q6, q1, q2, #10 vext.8 q7, q1, q2, #12 vext.8 q4, q1, q2, #6 vadd.i16 q5, q5, q11 vadd.i16 q6, q6, q10 vadd.i16 q7, q7, q1 vmul.s16 q3, q4, d0[3] vmla.s16 q3, q5, d1[0] vmla.s16 q3, q6, d1[1] vmla.s16 q3, q7, d1[2] vext.8 q4, q8, q9, #4 vext.8 q6, q8, q9, #8 vext.8 q11, q8, q9, #2 vext.8 q7, q8, q9, #10 vadd.i16 q6, q6, q4 vext.8 q4, q8, q9, #12 vext.8 q5, q8, q9, #6 vadd.i16 q7, q7, q11 vadd.i16 q4, q4, q8 vmul.s16 q10, q5, d0[3] vmla.s16 q10, q6, d1[0] vmla.s16 q10, q7, d1[1] vmla.s16 q10, q4, d1[2] vext.8 q1, q1, q2, #6 vext.8 q8, q8, q9, #6 vshl.s16 q1, q1, #7 vshl.s16 q8, q8, #7 vsub.s16 q1, q1, q14 vsub.s16 q8, q8, q14 vqadd.s16 q3, q3, q1 vqadd.s16 q10, q10, q8 vshr.s16 q3, q3, #3 vshr.s16 q10, q10, #3 vadd.s16 q3, q3, q15 vadd.s16 q10, q10, q15 subs r5, r5, #8 vst1.16 {q3}, [r0, :128]! vst1.16 {q10}, [r12, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q1, q2 vmov q8, q9 vld1.8 {d4}, [r2]! vld1.8 {d18}, [lr]! vmovl.u8 q2, d4 vmovl.u8 q9, d18 bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r10 add r12, r12, r10 add r2, r2, r3 add lr, lr, r3 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, // const int16_t fv[8], enum LrEdgeFlags edges, // ptrdiff_t mid_stride); function wiener_filter_v_8bpc_neon, export=1 push {r4-r7,lr} vpush {q4-q6} ldrd r4, r5, [sp, #68] ldrd r6, r7, [sp, #76] mov lr, r4 vld1.16 {q0}, [r5, :128] // Calculate the number of rows to move back when looping vertically mov r12, r4 tst r6, #4 // LR_HAVE_TOP beq 0f sub r2, r2, r7, lsl #1 add r12, r12, #2 0: tst r6, #8 // LR_HAVE_BOTTOM beq 1f add r12, r12, #2 1: // Start of horizontal loop; start one vertical filter slice. // Load rows into q8-q11 and pad properly. tst r6, #4 // LR_HAVE_TOP vld1.16 {q8}, [r2, :128], r7 beq 2f // LR_HAVE_TOP vld1.16 {q10}, [r2, :128], r7 vmov q9, q8 vld1.16 {q11}, [r2, :128], r7 b 3f 2: // !LR_HAVE_TOP vmov q9, q8 vmov q10, q8 vmov q11, q8 3: cmp r4, #4 blt 5f // Start filtering normally; fill in q12-q14 with unique rows. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vld1.16 {q14}, [r2, :128], r7 4: .macro filter compare subs r4, r4, #1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
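// The 7-tap vertical filter below takes advantage of the symmetric Wiener taps:
// mirrored row pairs (q10+q12, q9+q13, q8+q14) are summed first, so each half
// of the vector needs only one vmull (centre row, d0[3]) plus three vmlal, and
// the 32-bit sums are narrowed with a rounding shift of 11 and saturated to
// 8-bit pixels.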
vadd.i16 q4, q10, q12 vadd.i16 q5, q9, q13 vadd.i16 q6, q8, q14 vmull.s16 q2, d22, d0[3] vmlal.s16 q2, d8, d1[0] vmlal.s16 q2, d10, d1[1] vmlal.s16 q2, d12, d1[2] vmull.s16 q3, d23, d0[3] vmlal.s16 q3, d9, d1[0] vmlal.s16 q3, d11, d1[1] vmlal.s16 q3, d13, d1[2] vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vqmovun.s16 d4, q2 vst1.8 {d4}, [r0, :64], r1 .if \compare cmp r4, #4 .else ble 9f .endif vmov q8, q9 vmov q9, q10 vmov q10, q11 vmov q11, q12 vmov q12, q13 vmov q13, q14 .endm filter 1 blt 7f vld1.16 {q14}, [r2, :128], r7 b 4b 5: // Less than 4 rows in total; not all of q12-q13 are filled yet. tst r6, #8 // LR_HAVE_BOTTOM beq 6f // LR_HAVE_BOTTOM cmp r4, #2 // We load at least 2 rows in all cases. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 bgt 53f // 3 rows in total beq 52f // 2 rows in total 51: // 1 row in total, q11 already loaded, load edge into q12-q14. vmov q13, q12 b 8f 52: // 2 rows in total, q11 already loaded, load q12 with content data // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vmov q15, q14 b 8f 53: // 3 rows in total, q11 already loaded, load q12 and q13 with content // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 6: // !LR_HAVE_BOTTOM cmp r4, #2 bgt 63f // 3 rows in total beq 62f // 2 rows in total 61: // 1 row in total, q11 already loaded, pad that into q12-q14. vmov q12, q11 vmov q13, q11 vmov q14, q11 b 8f 62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. vld1.16 {q12}, [r2, :128], r7 vmov q13, q12 vmov q14, q12 vmov q15, q12 b 8f 63: // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vmov q14, q13 vmov q15, q13 vmov q1, q13 b 8f 7: // All registers up to q13 are filled already, 3 valid rows left. // < 4 valid rows left; fill in padding and filter the last // few rows. tst r6, #8 // LR_HAVE_BOTTOM beq 71f // LR_HAVE_BOTTOM; load 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 71: // !LR_HAVE_BOTTOM, pad 3 rows vmov q14, q13 vmov q15, q13 vmov q1, q13 8: // At this point, all registers up to q14-15,q1 are loaded with // edge/padding (depending on how many rows are left). filter 0 // This branches to 9f when done vmov q14, q15 vmov q15, q1 b 8b 9: // End of one vertical slice. subs r3, r3, #8 ble 0f // Move pointers back up to the top and loop horizontally. mls r0, r1, lr, r0 mls r2, r7, r12, r2 add r0, r0, #8 add r2, r2, #16 mov r4, lr b 1b 0: vpop {q4-q6} pop {r4-r7,pc} .purgem filter endfunc #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_h_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] add r5, r5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add r10, r0, #(4*SUM_STRIDE) // sumsq add r11, r1, #(2*SUM_STRIDE) // sum add r12, r3, r4 // src lsl r4, r4, #1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. 
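// The next three instructions round the width up to the multiple of 8 that is
// actually stored per horizontal pass, lr = (w + 7) & ~7, and turn r9 into
// 2*2*SUM_STRIDE - 2*lr: the leftover increment that moves the int16 sum
// pointers to the next pair of rows once the horizontal loop has advanced by
// lr elements (the int32 sumsq pointers use r9 scaled by 2).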
add lr, r5, #7 bic lr, lr, #7 sub r9, r9, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Subtract the number of pixels read from the input from the stride add lr, lr, #8 sub r4, r4, lr // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r2, #0 bne 0f // left == NULL sub r3, r3, #2 sub r12, r12, #2 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 2 pixels from the src pointer, // but shift it as if we had done that. add r4, r4, #2 1: // Loop vertically vld1.8 {q0}, [r3]! vld1.8 {q4}, [r12]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r2, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.32 {d3[]}, [r2]! // Move r3/r12 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub r3, r3, #2 sub r12, r12, #2 vld1.32 {d11[]}, [r2]! vext.8 q0, q1, q0, #14 vext.8 q4, q5, q4, #14 b 2f 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q0 to have 2x the first byte at the front. vdup.8 q1, d0[0] vdup.8 q5, d8[0] // Move r3 back to account for the last 2 bytes we loaded before, // which we shifted out. sub r3, r3, #2 sub r12, r12, #2 vext.8 q0, q1, q0, #14 vext.8 q4, q5, q4, #14 2: vmull.u8 q1, d0, d0 vmull.u8 q2, d1, d1 vmull.u8 q5, d8, d8 vmull.u8 q6, d9, d9 tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub lr, r5, #(2 + 16 - 2 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel vdup.8 q14, r11 vdup.8 q15, lr // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #10 bge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. // Insert padding in q0/4.b[w] onwards movrel_local lr, right_ext_mask sub lr, lr, r5 vld1.8 {q13}, [lr] vbit q0, q14, q13 vbit q4, q15, q13 // Update the precalculated squares vmull.u8 q1, d0, d0 vmull.u8 q2, d1, d1 vmull.u8 q5, d8, d8 vmull.u8 q6, d9, d9 4: // Loop horizontally vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d8, d9, #1 vext.8 d19, d8, d9, #2 vaddl.u8 q3, d0, d16 vaddw.u8 q3, q3, d17 vaddl.u8 q7, d8, d18 vaddw.u8 q7, q7, d19 vext.8 q8, q1, q2, #2 vext.8 q9, q1, q2, #4 vext.8 q10, q5, q6, #2 vext.8 q11, q5, q6, #4 vaddl.u16 q12, d2, d16 vaddl.u16 q13, d3, d17 vaddw.u16 q12, q12, d18 vaddw.u16 q13, q13, d19 vaddl.u16 q8, d10, d20 vaddl.u16 q9, d11, d21 vaddw.u16 q8, q8, d22 vaddw.u16 q9, q9, d23 subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q8, q9}, [r10, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! vld1.8 {d14}, [r12]! vmov q1, q2 vmov q5, q6 vext.8 q0, q0, q3, #8 vext.8 q4, q4, q7, #8 vmull.u8 q2, d6, d6 vmull.u8 q6, d14, d14 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r9, lsl #1 add r10, r10, r9, lsl #1 add r1, r1, r9 add r11, r11, r9 add r3, r3, r4 add r12, r12, r4 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_h_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] add r5, r5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add r10, r0, #(4*SUM_STRIDE) // sumsq add r11, r1, #(2*SUM_STRIDE) // sum add r12, r3, r4 // src lsl r4, r4, #1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. add lr, r5, #7 bic lr, lr, #7 sub r9, r9, lr, lsl #1 add lr, lr, #8 sub r4, r4, lr // Store the width for the vertical loop mov r8, r5 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r2, #0 bne 0f // left == NULL sub r3, r3, #3 sub r12, r12, #3 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add r4, r4, #3 1: // Loop vertically vld1.8 {q0}, [r3]! vld1.8 {q4}, [r12]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r2, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.32 {d3[]}, [r2]! // Move r3/r12 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub r3, r3, #3 sub r12, r12, #3 vld1.32 {d11[]}, [r2]! vext.8 q0, q1, q0, #13 vext.8 q4, q5, q4, #13 b 2f 0: // !LR_HAVE_LEFT, fill q1 with the leftmost byte // and shift q0 to have 3x the first byte at the front. vdup.8 q1, d0[0] vdup.8 q5, d8[0] // Move r3 back to account for the last 3 bytes we loaded before, // which we shifted out. sub r3, r3, #3 sub r12, r12, #3 vext.8 q0, q1, q0, #13 vext.8 q4, q5, q4, #13 2: vmull.u8 q1, d0, d0 vmull.u8 q2, d1, d1 vmull.u8 q5, d8, d8 vmull.u8 q6, d9, d9 tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub lr, r5, #(2 + 16 - 3 + 1) ldrb r11, [r3, lr] ldrb lr, [r12, lr] // Fill q14/q15 with the right padding pixel vdup.8 q14, r11 vdup.8 q15, lr // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in q0/4.b[w+1] onwards; fuse the +1 into the // buffer pointer. 
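// How the masked padding below works: right_ext_mask is preceded by 32 zero
// bytes and followed by 32 0xff bytes, so loading 16 bytes from
// right_ext_mask - 1 - r5 yields 0x00 in lanes 0..r5 and 0xff above that;
// the vbit instructions then overwrite only the lanes past the last valid
// pixel with the replicated right-edge pixel held in q14/q15.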
movrel_local lr, right_ext_mask, -1 sub lr, lr, r5 vld1.8 {q13}, [lr] vbit q0, q14, q13 vbit q4, q15, q13 // Update the precalculated squares vmull.u8 q1, d0, d0 vmull.u8 q2, d1, d1 vmull.u8 q5, d8, d8 vmull.u8 q6, d9, d9 4: // Loop horizontally vext.8 d16, d0, d1, #1 vext.8 d17, d0, d1, #2 vext.8 d18, d0, d1, #3 vext.8 d19, d0, d1, #4 vext.8 d20, d8, d9, #1 vext.8 d21, d8, d9, #2 vext.8 d22, d8, d9, #3 vext.8 d23, d8, d9, #4 vaddl.u8 q3, d0, d16 vaddl.u8 q12, d17, d18 vaddl.u8 q7, d8, d20 vaddl.u8 q13, d21, d22 vaddw.u8 q3, q3, d19 vaddw.u8 q7, q7, d23 vadd.u16 q3, q3, q12 vadd.u16 q7, q7, q13 vext.8 q8, q1, q2, #2 vext.8 q9, q1, q2, #4 vext.8 q10, q1, q2, #6 vext.8 q11, q1, q2, #8 vaddl.u16 q12, d2, d16 vaddl.u16 q13, d3, d17 vaddl.u16 q8, d18, d20 vaddl.u16 q9, d19, d21 vaddw.u16 q12, q12, d22 vaddw.u16 q13, q13, d23 vadd.i32 q12, q12, q8 vadd.i32 q13, q13, q9 vext.8 q8, q5, q6, #2 vext.8 q9, q5, q6, #4 vext.8 q10, q5, q6, #6 vext.8 q11, q5, q6, #8 vaddl.u16 q1, d10, d16 vaddl.u16 q5, d11, d17 vaddl.u16 q8, d18, d20 vaddl.u16 q9, d19, d21 vaddw.u16 q1, q1, d22 vaddw.u16 q5, q5, d23 vadd.i32 q10, q1, q8 vadd.i32 q11, q5, q9 subs r5, r5, #8 vst1.16 {q3}, [r1, :128]! vst1.16 {q7}, [r11, :128]! vst1.32 {q12, q13}, [r0, :128]! vst1.32 {q10, q11}, [r10, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vld1.8 {d6}, [r3]! vld1.8 {d14}, [r12]! vmov q1, q2 vmov q5, q6 vext.8 q0, q0, q3, #8 vext.8 q4, q4, q7, #8 vmull.u8 q2, d6, d6 vmull.u8 q6, d14, d14 bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r9, lsl #1 add r10, r10, r9, lsl #1 add r1, r1, r9 add r11, r11, r9 add r3, r3, r4 add r12, r12, r4 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc sgr_funcs 8 rav1e-0.7.1/src/arm/32/looprestoration16.S000064400000000000000000000735721046102023000162150ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4], // const pixel *src, ptrdiff_t stride, // const int16_t fh[7], const intptr_t w, // int h, enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter_h_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] ldr r8, [sp, #116] // bitdepth_max vld1.16 {q0}, [r4, :128] clz r8, r8 vmov.i32 q14, #1 sub r9, r8, #38 // -(bitdepth + 6) sub r8, r8, #25 // -round_bits_h neg r9, r9 // bitdepth + 6 vdup.32 q1, r9 vdup.32 q13, r8 // -round_bits_h vmov.i16 q15, #8192 vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6) mov r8, r5 // Calculate mid_stride add r10, r5, #7 bic r10, r10, #7 lsl r10, r10, #1 // Set up pointers for reading/writing alternate rows add r12, r0, r10 lsl r10, r10, #1 add lr, r2, r3 lsl r3, r3, #1 // Subtract the aligned width from mid_stride add r11, r5, #7 bic r11, r11, #7 sub r10, r10, r11, lsl #1 // Subtract the number of pixels read from the source stride add r11, r11, #8 sub r3, r3, r11, lsl #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r1, #0 bne 0f // left == NULL sub r2, r2, #6 sub lr, lr, #6 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add r3, r3, #6 1: // Loop vertically vld1.16 {q2, q3}, [r2]! vld1.16 {q4, q5}, [lr]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r1, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.16 {d3}, [r1]! // Move r2/lr back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub r2, r2, #6 sub lr, lr, #6 vld1.16 {d13}, [r1]! vext.8 q3, q2, q3, #10 vext.8 q2, q1, q2, #10 vext.8 q5, q4, q5, #10 vext.8 q4, q6, q4, #10 b 2f 0: // !LR_HAVE_LEFT, fill q1 with the leftmost pixel // and shift q2/q3 to have 3x the first pixel at the front. vdup.16 q1, d4[0] vdup.16 q6, d8[0] // Move r2 back to account for the last 3 pixels we loaded before, // which we shifted out. sub r2, r2, #6 sub lr, lr, #6 vext.8 q3, q2, q3, #10 vext.8 q2, q1, q2, #10 vext.8 q5, q4, q5, #10 vext.8 q4, q6, q4, #10 2: tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub r9, r5, #14 lsl r9, r9, #1 ldrh r11, [r2, r9] ldrh r9, [lr, r9] // Fill q11/q12 with the right padding pixel vdup.16 q11, r11 vdup.16 q12, r9 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
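// Same right_ext_mask selection trick as in the 8 bpc filters, scaled for
// 16-bit lanes: the fused offset (-6 = -3*2) and the width (r5 << 1) are in
// bytes, and two q registers of mask cover all 16 halfwords of q2/q3 (and
// q4/q5) for the vbit selects below.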
movrel_local r4, right_ext_mask, -6 sub r4, r4, r5, lsl #1 vld1.8 {q9, q10}, [r4] vbit q2, q11, q9 vbit q3, q11, q10 vbit q4, q12, q9 vbit q5, q12, q10 4: // Loop horizontally vext.8 q7, q2, q3, #4 vext.8 q8, q2, q3, #8 vext.8 q6, q2, q3, #2 vext.8 q9, q2, q3, #10 vadd.i16 q8, q8, q7 vadd.i16 q9, q9, q6 vext.8 q6, q2, q3, #12 vext.8 q7, q2, q3, #6 vadd.i16 q2, q2, q6 vmull.s16 q6, d14, d0[3] vmlal.s16 q6, d16, d1[0] vmlal.s16 q6, d18, d1[1] vmlal.s16 q6, d4, d1[2] vmull.s16 q7, d15, d0[3] vmlal.s16 q7, d17, d1[0] vmlal.s16 q7, d19, d1[1] vmlal.s16 q7, d5, d1[2] vext.8 q8, q4, q5, #4 vext.8 q10, q4, q5, #8 vext.8 q9, q4, q5, #2 vext.8 q2, q4, q5, #10 vadd.i16 q10, q10, q8 vadd.i16 q2, q2, q9 vext.8 q8, q4, q5, #12 vext.8 q9, q4, q5, #6 vadd.i16 q4, q4, q8 vmull.s16 q8, d18, d0[3] vmlal.s16 q8, d20, d1[0] vmlal.s16 q8, d4, d1[1] vmlal.s16 q8, d8, d1[2] vmull.s16 q9, d19, d0[3] vmlal.s16 q9, d21, d1[0] vmlal.s16 q9, d5, d1[1] vmlal.s16 q9, d9, d1[2] vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1 vadd.i32 q6, q6, q14 vadd.i32 q7, q7, q14 vadd.i32 q8, q8, q14 vadd.i32 q9, q9, q14 vrshl.s32 q6, q6, q13 vrshl.s32 q7, q7, q13 vrshl.s32 q8, q8, q13 vrshl.s32 q9, q9, q13 vqmovun.s32 d12, q6 vqmovun.s32 d13, q7 vqmovun.s32 d14, q8 vqmovun.s32 d15, q9 vmin.u16 q6, q6, q10 vmin.u16 q7, q7, q10 vsub.i16 q6, q6, q15 vsub.i16 q7, q7, q15 subs r5, r5, #8 vst1.16 {q6}, [r0, :128]! vst1.16 {q7}, [r12, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q2, q3 vmov q4, q5 vld1.16 {q3}, [r2]! vld1.16 {q5}, [lr]! bne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r10 add r12, r12, r10 add r2, r2, r3 add lr, lr, r3 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride, // const int16_t *mid, int w, int h, // const int16_t fv[7], enum LrEdgeFlags edges, // ptrdiff_t mid_stride, const int bitdepth_max); function wiener_filter_v_16bpc_neon, export=1 push {r4-r7,lr} vpush {q4-q5} ldrd r4, r5, [sp, #52] ldrd r6, r7, [sp, #60] ldr lr, [sp, #68] // bitdepth_max vld1.16 {q0}, [r5, :128] vdup.16 q5, lr clz lr, lr sub lr, lr, #11 // round_bits_v vdup.32 q4, lr mov lr, r4 vneg.s32 q4, q4 // -round_bits_v // Calculate the number of rows to move back when looping vertically mov r12, r4 tst r6, #4 // LR_HAVE_TOP beq 0f sub r2, r2, r7, lsl #1 add r12, r12, #2 0: tst r6, #8 // LR_HAVE_BOTTOM beq 1f add r12, r12, #2 1: // Start of horizontal loop; start one vertical filter slice. // Load rows into q8-q11 and pad properly. tst r6, #4 // LR_HAVE_TOP vld1.16 {q8}, [r2, :128], r7 beq 2f // LR_HAVE_TOP vld1.16 {q10}, [r2, :128], r7 vmov q9, q8 vld1.16 {q11}, [r2, :128], r7 b 3f 2: // !LR_HAVE_TOP vmov q9, q8 vmov q10, q8 vmov q11, q8 3: cmp r4, #4 blt 5f // Start filtering normally; fill in q12-q14 with unique rows. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vld1.16 {q14}, [r2, :128], r7 4: .macro filter compare subs r4, r4, #1 // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
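// Unlike the 8 bpc vertical filter, all seven rows (q8-q14) are multiplied
// individually against d0[0]-d1[2] (no pairwise pre-add of mirrored rows,
// presumably to avoid overflowing the 16-bit intermediates); the 32-bit sums
// are then rounded by the variable round_bits_v (vrshl with the negated count
// in q4) and clamped to bitdepth_max (q5).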
vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d20, d0[2] vmlal.s16 q2, d22, d0[3] vmlal.s16 q2, d24, d1[0] vmlal.s16 q2, d26, d1[1] vmlal.s16 q2, d28, d1[2] vmull.s16 q3, d17, d0[0] vmlal.s16 q3, d19, d0[1] vmlal.s16 q3, d21, d0[2] vmlal.s16 q3, d23, d0[3] vmlal.s16 q3, d25, d1[0] vmlal.s16 q3, d27, d1[1] vmlal.s16 q3, d29, d1[2] vrshl.s32 q2, q2, q4 // round_bits_v vrshl.s32 q3, q3, q4 vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q5 // bitdepth_max vst1.16 {q2}, [r0, :128], r1 .if \compare cmp r4, #4 .else ble 9f .endif vmov q8, q9 vmov q9, q10 vmov q10, q11 vmov q11, q12 vmov q12, q13 vmov q13, q14 .endm filter 1 blt 7f vld1.16 {q14}, [r2, :128], r7 b 4b 5: // Less than 4 rows in total; not all of q12-q13 are filled yet. tst r6, #8 // LR_HAVE_BOTTOM beq 6f // LR_HAVE_BOTTOM cmp r4, #2 // We load at least 2 rows in all cases. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 bgt 53f // 3 rows in total beq 52f // 2 rows in total 51: // 1 row in total, q11 already loaded, load edge into q12-q14. vmov q13, q12 b 8f 52: // 2 rows in total, q11 already loaded, load q12 with content data // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vmov q15, q14 b 8f 53: // 3 rows in total, q11 already loaded, load q12 and q13 with content // and 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 6: // !LR_HAVE_BOTTOM cmp r4, #2 bgt 63f // 3 rows in total beq 62f // 2 rows in total 61: // 1 row in total, q11 already loaded, pad that into q12-q14. vmov q12, q11 vmov q13, q11 vmov q14, q11 b 8f 62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15. vld1.16 {q12}, [r2, :128], r7 vmov q13, q12 vmov q14, q12 vmov q15, q12 b 8f 63: // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1. vld1.16 {q12}, [r2, :128], r7 vld1.16 {q13}, [r2, :128], r7 vmov q14, q13 vmov q15, q13 vmov q1, q13 b 8f 7: // All registers up to q13 are filled already, 3 valid rows left. // < 4 valid rows left; fill in padding and filter the last // few rows. tst r6, #8 // LR_HAVE_BOTTOM beq 71f // LR_HAVE_BOTTOM; load 2 rows of edge. vld1.16 {q14}, [r2, :128], r7 vld1.16 {q15}, [r2, :128], r7 vmov q1, q15 b 8f 71: // !LR_HAVE_BOTTOM, pad 3 rows vmov q14, q13 vmov q15, q13 vmov q1, q13 8: // At this point, all registers up to q14-q15,q1 are loaded with // edge/padding (depending on how many rows are left). filter 0 // This branches to 9f when done vmov q14, q15 vmov q15, q1 b 8b 9: // End of one vertical slice. subs r3, r3, #8 ble 0f // Move pointers back up to the top and loop horizontally. mls r0, r1, lr, r0 mls r2, r7, r12, r2 add r0, r0, #16 add r2, r2, #16 mov r4, lr b 1b 0: vpop {q4-q5} pop {r4-r7,pc} .purgem filter endfunc #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" // void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_h_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] add r5, r5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add r10, r0, #(4*SUM_STRIDE) // sumsq add r11, r1, #(2*SUM_STRIDE) // sum add r12, r3, r4 // src lsl r4, r4, #1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. 
add lr, r5, #7 bic lr, lr, #7 sub r9, r9, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Subtract the number of pixels read from the input from the stride add lr, lr, #8 sub r4, r4, lr, lsl #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r2, #0 bne 0f // left == NULL sub r3, r3, #4 sub r12, r12, #4 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 2 pixels from the src pointer, // but shift it as if we had done that. add r4, r4, #4 1: // Loop vertically vld1.16 {q0, q1}, [r3]! vld1.16 {q4, q5}, [r12]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r2, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.16 {d5}, [r2]! // Move r3/r12 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. sub r3, r3, #4 sub r12, r12, #4 vld1.16 {d13}, [r2]! vext.8 q1, q0, q1, #12 vext.8 q0, q2, q0, #12 vext.8 q5, q4, q5, #12 vext.8 q4, q6, q4, #12 b 2f 0: // !LR_HAVE_LEFT, fill q2 with the leftmost pixel // and shift q0 to have 2x the first byte at the front. vdup.16 q2, d0[0] vdup.16 q6, d8[0] // Move r3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub r3, r3, #4 sub r12, r12, #4 vext.8 q1, q0, q1, #12 vext.8 q0, q2, q0, #12 vext.8 q5, q4, q5, #12 vext.8 q4, q6, q4, #12 2: tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub lr, r5, #(2 + 16 - 2 + 1) lsl lr, lr, #1 ldrh r11, [r3, lr] ldrh lr, [r12, lr] // Fill q14/q15 with the right padding pixel vdup.16 q14, r11 vdup.16 q15, lr // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #10 bge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. // Insert padding in q0/1.h[w] onwards movrel_local lr, right_ext_mask sub lr, lr, r5, lsl #1 vld1.8 {q12, q13}, [lr] vbit q0, q14, q12 vbit q1, q14, q13 vbit q4, q15, q12 vbit q5, q15, q13 4: // Loop horizontally vext.8 q8, q0, q1, #2 vext.8 q10, q4, q5, #2 vext.8 q9, q0, q1, #4 vext.8 q11, q4, q5, #4 vadd.i16 q2, q0, q8 vadd.i16 q3, q4, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 vmull.u16 q6, d0, d0 vmlal.u16 q6, d16, d16 vmlal.u16 q6, d18, d18 vmull.u16 q12, d8, d8 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d22, d22 vmull.u16 q7, d1, d1 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmull.u16 q13, d9, d9 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 subs r5, r5, #8 vst1.16 {q2}, [r1, :128]! vst1.16 {q3}, [r11, :128]! vst1.32 {q6, q7}, [r0, :128]! vst1.32 {q12, q13}, [r10, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q0, q1 vmov q4, q5 vld1.16 {q1}, [r3]! vld1.16 {q5}, [r12]! bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r9, lsl #1 add r10, r10, r9, lsl #1 add r1, r1, r9 add r11, r11, r9 add r3, r3, r4 add r12, r12, r4 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_h_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] add r5, r5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add r10, r0, #(4*SUM_STRIDE) // sumsq add r11, r1, #(2*SUM_STRIDE) // sum add r12, r3, r4 // src lsl r4, r4, #1 mov r9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. add lr, r5, #7 bic lr, lr, #7 sub r9, r9, lr, lsl #1 add lr, lr, #8 sub r4, r4, lr, lsl #1 // Store the width for the vertical loop mov r8, r5 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst r7, #1 // LR_HAVE_LEFT beq 2f // LR_HAVE_LEFT cmp r2, #0 bne 0f // left == NULL sub r3, r3, #6 sub r12, r12, #6 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add r4, r4, #6 1: // Loop vertically vld1.16 {q0, q1}, [r3]! vld1.16 {q4, q5}, [r12]! tst r7, #1 // LR_HAVE_LEFT beq 0f cmp r2, #0 beq 2f // LR_HAVE_LEFT, left != NULL vld1.16 {d5}, [r2]! // Move r3/r12 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub r3, r3, #6 sub r12, r12, #6 vld1.16 {d13}, [r2]! vext.8 q1, q0, q1, #10 vext.8 q0, q2, q0, #10 vext.8 q5, q4, q5, #10 vext.8 q4, q6, q4, #10 b 2f 0: // !LR_HAVE_LEFT, fill q2 with the leftmost pixel // and shift q0 to have 3x the first pixel at the front. vdup.16 q2, d0[0] vdup.16 q6, d8[0] // Move r3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub r3, r3, #6 sub r12, r12, #6 vext.8 q1, q0, q1, #10 vext.8 q0, q2, q0, #10 vext.8 q5, q4, q5, #10 vext.8 q4, q6, q4, #10 2: tst r7, #2 // LR_HAVE_RIGHT bne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub lr, r5, #(2 + 16 - 3 + 1) lsl lr, lr, #1 ldrh r11, [r3, lr] ldrh lr, [r12, lr] // Fill q14/q15 with the right padding pixel vdup.16 q14, r11 vdup.16 q15, lr // Restore r11 after using it for a temporary value add r11, r1, #(2*SUM_STRIDE) 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp r5, #11 bge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the // buffer pointer. 
movrel_local lr, right_ext_mask, -2 sub lr, lr, r5, lsl #1 vld1.8 {q12, q13}, [lr] vbit q0, q14, q12 vbit q1, q14, q13 vbit q4, q15, q12 vbit q5, q15, q13 4: // Loop horizontally vext.8 q8, q0, q1, #2 vext.8 q10, q4, q5, #2 vext.8 q9, q0, q1, #4 vext.8 q11, q4, q5, #4 vadd.i16 q2, q0, q8 vadd.i16 q3, q4, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 vmull.u16 q6, d0, d0 vmlal.u16 q6, d16, d16 vmlal.u16 q6, d18, d18 vmull.u16 q12, d8, d8 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d22, d22 vmull.u16 q7, d1, d1 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmull.u16 q13, d9, d9 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 vext.8 q8, q0, q1, #6 vext.8 q10, q4, q5, #6 vext.8 q9, q0, q1, #8 vext.8 q11, q4, q5, #8 vadd.i16 q2, q2, q8 vadd.i16 q3, q3, q10 vadd.i16 q2, q2, q9 vadd.i16 q3, q3, q11 vmlal.u16 q6, d16, d16 vmlal.u16 q6, d1, d1 vmlal.u16 q12, d20, d20 vmlal.u16 q12, d9, d9 vmlal.u16 q7, d17, d17 vmlal.u16 q7, d19, d19 vmlal.u16 q13, d21, d21 vmlal.u16 q13, d23, d23 subs r5, r5, #8 vst1.16 {q2}, [r1, :128]! vst1.16 {q3}, [r11, :128]! vst1.32 {q6, q7}, [r0, :128]! vst1.32 {q12, q13}, [r10, :128]! ble 9f tst r7, #2 // LR_HAVE_RIGHT vmov q0, q1 vmov q4, q5 vld1.16 {q1}, [r3]! vld1.16 {q5}, [r12]! bne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: subs r6, r6, #2 ble 0f // Jump to the next row and loop horizontally add r0, r0, r9, lsl #1 add r10, r10, r9, lsl #1 add r1, r1, r9 add r11, r11, r9 add r3, r3, r4 add r12, r12, r4 mov r5, r8 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc sgr_funcs 16 rav1e-0.7.1/src/arm/32/looprestoration_common.S000064400000000000000000000410341046102023000174020ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #define SUM_STRIDE (384+16) // void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_v_neon, export=1 push {r4-r9,lr} ldr r4, [sp, #28] add r12, r3, #2 // Number of output rows to move back mov lr, r3 // Number of input rows to move back add r2, r2, #2 // Actual summed width mov r7, #(4*SUM_STRIDE) // sumsq stride mov r8, #(2*SUM_STRIDE) // sum stride sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride sub r1, r1, #(2*SUM_STRIDE) // sum -= stride tst r4, #4 // LR_HAVE_TOP beq 0f // If have top, read from row -2. sub r5, r0, #(4*SUM_STRIDE) sub r6, r1, #(2*SUM_STRIDE) add lr, lr, #2 b 1f 0: // !LR_HAVE_TOP // If we don't have top, read from row 0 even if // we start writing to row -1. add r5, r0, #(4*SUM_STRIDE) add r6, r1, #(2*SUM_STRIDE) 1: tst r4, #8 // LR_HAVE_BOTTOM beq 1f // LR_HAVE_BOTTOM add r3, r3, #2 // Sum all h+2 lines with the main loop add lr, lr, #2 1: mov r9, r3 // Backup of h for next loops 1: // Start of horizontal loop; start one vertical filter slice. // Start loading rows into q8-q13 and q0-q2 taking top // padding into consideration. tst r4, #4 // LR_HAVE_TOP vld1.32 {q8, q9}, [r5, :128], r7 vld1.16 {q0}, [r6, :128], r8 beq 2f // LR_HAVE_TOP vld1.32 {q10, q11}, [r5, :128], r7 vld1.16 {q1}, [r6, :128], r8 vld1.32 {q12, q13}, [r5, :128], r7 vld1.16 {q2}, [r6, :128], r8 b 3f 2: // !LR_HAVE_TOP vmov q10, q8 vmov q11, q9 vmov q1, q0 vmov q12, q8 vmov q13, q9 vmov q2, q0 3: subs r3, r3, #1 .macro add3 vadd.i32 q8, q8, q10 vadd.i32 q9, q9, q11 vadd.i16 q0, q0, q1 vadd.i32 q8, q8, q12 vadd.i32 q9, q9, q13 vadd.i16 q0, q0, q2 vst1.32 {q8, q9}, [r0, :128], r7 vst1.16 {q0}, [r1, :128], r8 .endm add3 vmov q8, q10 vmov q9, q11 vmov q0, q1 vmov q10, q12 vmov q11, q13 vmov q1, q2 ble 4f vld1.32 {q12, q13}, [r5, :128], r7 vld1.16 {q2}, [r6, :128], r8 b 3b 4: tst r4, #8 // LR_HAVE_BOTTOM bne 5f // !LR_HAVE_BOTTOM // Produce two more rows, extending the already loaded rows. add3 vmov q8, q10 vmov q9, q11 vmov q0, q1 add3 5: // End of one vertical slice. subs r2, r2, #8 ble 0f // Move pointers back up to the top and loop horizontally. // Input pointers mls r5, r7, lr, r5 mls r6, r8, lr, r6 // Output pointers mls r0, r7, r12, r0 mls r1, r8, r12, r1 add r0, r0, #32 add r1, r1, #16 add r5, r5, #32 add r6, r6, #16 mov r3, r9 b 1b 0: pop {r4-r9,pc} .purgem add3 endfunc // void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_v_neon, export=1 push {r4-r9,lr} vpush {q5-q7} ldr r4, [sp, #76] add r12, r3, #2 // Number of output rows to move back mov lr, r3 // Number of input rows to move back add r2, r2, #8 // Actual summed width mov r7, #(4*SUM_STRIDE) // sumsq stride mov r8, #(2*SUM_STRIDE) // sum stride sub r0, r0, #(4*SUM_STRIDE) // sumsq -= stride sub r1, r1, #(2*SUM_STRIDE) // sum -= stride tst r4, #4 // LR_HAVE_TOP beq 0f // If have top, read from row -2. sub r5, r0, #(4*SUM_STRIDE) sub r6, r1, #(2*SUM_STRIDE) add lr, lr, #2 b 1f 0: // !LR_HAVE_TOP // If we don't have top, read from row 0 even if // we start writing to row -1. 
add r5, r0, #(4*SUM_STRIDE) add r6, r1, #(2*SUM_STRIDE) 1: tst r4, #8 // LR_HAVE_BOTTOM beq 0f // LR_HAVE_BOTTOM add r3, r3, #2 // Handle h+2 lines with the main loop add lr, lr, #2 b 1f 0: // !LR_HAVE_BOTTOM sub r3, r3, #1 // Handle h-1 lines with the main loop 1: mov r9, r3 // Backup of h for next loops 1: // Start of horizontal loop; start one vertical filter slice. // Start loading rows into q6-q15 and q0-q3,q5 taking top // padding into consideration. tst r4, #4 // LR_HAVE_TOP vld1.32 {q6, q7}, [r5, :128], r7 vld1.16 {q0}, [r6, :128], r8 beq 2f // LR_HAVE_TOP vld1.32 {q10, q11}, [r5, :128], r7 vld1.16 {q2}, [r6, :128], r8 vmov q8, q6 vmov q9, q7 vmov q1, q0 vld1.32 {q12, q13}, [r5, :128], r7 vld1.16 {q3}, [r6, :128], r8 b 3f 2: // !LR_HAVE_TOP vmov q8, q6 vmov q9, q7 vmov q1, q0 vmov q10, q6 vmov q11, q7 vmov q2, q0 vmov q12, q6 vmov q13, q7 vmov q3, q0 3: cmp r3, #0 beq 4f vld1.32 {q14, q15}, [r5, :128], r7 vld1.16 {q5}, [r6, :128], r8 3: // Start of vertical loop subs r3, r3, #2 .macro add5 vadd.i32 q6, q6, q8 vadd.i32 q7, q7, q9 vadd.i16 q0, q0, q1 vadd.i32 q6, q6, q10 vadd.i32 q7, q7, q11 vadd.i16 q0, q0, q2 vadd.i32 q6, q6, q12 vadd.i32 q7, q7, q13 vadd.i16 q0, q0, q3 vadd.i32 q6, q6, q14 vadd.i32 q7, q7, q15 vadd.i16 q0, q0, q5 vst1.32 {q6, q7}, [r0, :128], r7 vst1.16 {q0}, [r1, :128], r8 .endm add5 .macro shift2 vmov q6, q10 vmov q7, q11 vmov q0, q2 vmov q8, q12 vmov q9, q13 vmov q1, q3 vmov q10, q14 vmov q11, q15 vmov q2, q5 .endm shift2 add r0, r0, r7 add r1, r1, r8 ble 5f vld1.32 {q12, q13}, [r5, :128], r7 vld1.16 {q3}, [r6, :128], r8 vld1.32 {q14, q15}, [r5, :128], r7 vld1.16 {q5}, [r6, :128], r8 b 3b 4: // h == 1, !LR_HAVE_BOTTOM. // Pad the last row with the only content row, and add. vmov q14, q12 vmov q15, q13 vmov q5, q3 add5 shift2 add r0, r0, r7 add r1, r1, r8 add5 b 6f 5: tst r4, #8 // LR_HAVE_BOTTOM bne 6f // !LR_HAVE_BOTTOM cmp r3, #0 bne 5f // The intended three edge rows left; output the one at h-2 and // the past edge one at h. vld1.32 {q12, q13}, [r5, :128], r7 vld1.16 {q3}, [r6, :128], r8 // Pad the past-edge row from the last content row. vmov q14, q12 vmov q15, q13 vmov q5, q3 add5 shift2 add r0, r0, r7 add r1, r1, r8 // The last two rows are already padded properly here. add5 b 6f 5: // r3 == -1, two rows left, output one. // Pad the last two rows from the mid one. vmov q12, q10 vmov q13, q11 vmov q3, q2 vmov q14, q10 vmov q15, q11 vmov q5, q2 add5 add r0, r0, r7 add r1, r1, r8 b 6f 6: // End of one vertical slice. subs r2, r2, #8 ble 0f // Move pointers back up to the top and loop horizontally. 
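        // Reminder (hedged reading): mls rd, rn, rm, ra computes ra - rn*rm,
        // so each mls below steps a pointer back by the number of rows it
        // advanced times its row stride; the adds that follow then move one
        // 8-column block to the right. In element terms, roughly:
        //   sumsq -= rows_back * SUM_STRIDE;  sumsq += 8;   // int32_t *
        //   sum   -= rows_back * SUM_STRIDE;  sum   += 8;   // int16_t *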
// Input pointers mls r5, r7, lr, r5 mls r6, r8, lr, r6 // Output pointers mls r0, r7, r12, r0 mls r1, r8, r12, r1 add r0, r0, #32 add r1, r1, #16 add r5, r5, #32 add r6, r6, #16 mov r3, r9 b 1b 0: vpop {q5-q7} pop {r4-r9,pc} .purgem add5 endfunc // void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, // const int w, const int h, const int strength, // const int bitdepth_max); // void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, // const int w, const int h, const int strength, // const int bitdepth_max); function sgr_calc_ab1_neon, export=1 push {r4-r7,lr} vpush {q4-q7} ldrd r4, r5, [sp, #84] add r3, r3, #2 // h += 2 clz r6, r5 vmov.i32 q15, #9 // n movw r5, #455 mov lr, #SUM_STRIDE b sgr_calc_ab_neon endfunc function sgr_calc_ab2_neon, export=1 push {r4-r7,lr} vpush {q4-q7} ldrd r4, r5, [sp, #84] add r3, r3, #3 // h += 3 clz r6, r5 asr r3, r3, #1 // h /= 2 vmov.i32 q15, #25 // n mov r5, #164 mov lr, #(2*SUM_STRIDE) endfunc function sgr_calc_ab_neon movrel r12, X(sgr_x_by_x) sub r6, r6, #24 // -bitdepth_min_8 vld1.8 {q8, q9}, [r12, :128]! add r7, r6, r6 // -2*bitdepth_min_8 vmov.i8 q11, #5 vmov.i8 d10, #55 // idx of last 5 vld1.8 {q10}, [r12, :128] vmov.i8 d11, #72 // idx of last 4 vmov.i8 d12, #101 // idx of last 3 vmov.i8 d13, #169 // idx of last 2 vmov.i8 d14, #254 // idx of last 1 vmov.i8 d15, #32 // elements consumed in first vtbl add r2, r2, #2 // w += 2 add r12, r2, #7 bic r12, r12, #7 // aligned w sub r12, lr, r12 // increment between rows vdup.32 q12, r4 sub r0, r0, #(4*(SUM_STRIDE)) sub r1, r1, #(2*(SUM_STRIDE)) mov r4, r2 // backup of w vsub.i8 q8, q8, q11 vsub.i8 q9, q9, q11 vsub.i8 q10, q10, q11 1: vld1.32 {q0, q1}, [r0, :128] // a vld1.16 {q2}, [r1, :128] // b vdup.32 q13, r7 // -2*bitdepth_min_8 vdup.16 q14, r6 // -bitdepth_min_8 subs r2, r2, #8 vrshl.s32 q0, q0, q13 vrshl.s32 q1, q1, q13 vrshl.s16 q4, q2, q14 vmul.i32 q0, q0, q15 // a * n vmul.i32 q1, q1, q15 // a * n vmull.u16 q3, d8, d8 // b * b vmull.u16 q4, d9, d9 // b * b vqsub.u32 q0, q0, q3 // imax(a * n - b * b, 0) vqsub.u32 q1, q1, q4 // imax(a * n - b * b, 0) vmul.i32 q0, q0, q12 // p * s vmul.i32 q1, q1, q12 // p * s vqshrn.u32 d0, q0, #16 vqshrn.u32 d1, q1, #16 vqrshrn.u16 d0, q0, #4 // imin(z, 255) vcgt.u8 d2, d0, d10 // = -1 if sgr_x_by_x[d0] < 5 vcgt.u8 d3, d0, d11 // = -1 if sgr_x_by_x[d0] < 4 vtbl.8 d1, {q8, q9}, d0 vcgt.u8 d6, d0, d12 // = -1 if sgr_x_by_x[d0] < 3 vsub.i8 d9, d0, d15 // indices for vtbx vcgt.u8 d7, d0, d13 // = -1 if sgr_x_by_x[d0] < 2 vadd.i8 d2, d2, d3 vtbx.8 d1, {q10}, d9 vcgt.u8 d8, d0, d14 // = -1 if sgr_x_by_x[d0] < 1 vadd.i8 d6, d6, d7 vadd.i8 d8, d8, d22 vadd.i8 d2, d2, d6 vadd.i8 d1, d1, d8 vadd.i8 d1, d1, d2 vmovl.u8 q0, d1 // x vmov.i16 q13, #256 vdup.32 q14, r5 // one_by_x vmull.u16 q1, d0, d4 // x * BB[i] vmull.u16 q2, d1, d5 // x * BB[i] vmul.i32 q1, q1, q14 // x * BB[i] * sgr_one_by_x vmul.i32 q2, q2, q14 // x * BB[i] * sgr_one_by_x vrshr.s32 q1, q1, #12 // AA[i] vrshr.s32 q2, q2, #12 // AA[i] vsub.i16 q0, q13, q0 // 256 - x vst1.32 {q1, q2}, [r0, :128]! vst1.16 {q0}, [r1, :128]! bgt 1b subs r3, r3, #1 ble 0f add r0, r0, r12, lsl #2 add r1, r1, r12, lsl #1 mov r2, r4 b 1b 0: vpop {q4-q7} pop {r4-r7,pc} endfunc rav1e-0.7.1/src/arm/32/looprestoration_tmpl.S000064400000000000000000000534221046102023000170720ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #define FILTER_OUT_STRIDE 384 .macro sgr_funcs bpc // void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); function sgr_finish_filter1_\bpc\()bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldr r6, [sp, #108] sub r7, r3, #(4*SUM_STRIDE) add r8, r3, #(4*SUM_STRIDE) sub r9, r4, #(2*SUM_STRIDE) add r10, r4, #(2*SUM_STRIDE) mov r11, #SUM_STRIDE mov r12, #FILTER_OUT_STRIDE add lr, r5, #3 bic lr, lr, #3 // Aligned width .if \bpc == 8 sub r2, r2, lr .else sub r2, r2, lr, lsl #1 .endif sub r12, r12, lr sub r11, r11, lr sub r11, r11, #4 // We read 4 extra elements from both a and b mov lr, r5 vmov.i16 q14, #3 vmov.i32 q15, #3 1: vld1.16 {q0}, [r9, :128]! vld1.16 {q1}, [r4, :128]! vld1.16 {q2}, [r10, :128]! vld1.32 {q8, q9}, [r7, :128]! vld1.32 {q10, q11}, [r3, :128]! vld1.32 {q12, q13}, [r8, :128]! 2: subs r5, r5, #4 vext.8 d6, d0, d1, #2 // -stride vext.8 d7, d2, d3, #2 // 0 vext.8 d8, d4, d5, #2 // +stride vext.8 d9, d0, d1, #4 // +1-stride vext.8 d10, d2, d3, #4 // +1 vext.8 d11, d4, d5, #4 // +1+stride vadd.i16 d2, d2, d6 // -1, -stride vadd.i16 d7, d7, d8 // 0, +stride vadd.i16 d0, d0, d9 // -1-stride, +1-stride vadd.i16 d2, d2, d7 vadd.i16 d4, d4, d11 // -1+stride, +1+stride vadd.i16 d2, d2, d10 // +1 vadd.i16 d0, d0, d4 vext.8 q3, q8, q9, #4 // -stride vshl.i16 d2, d2, #2 vext.8 q4, q8, q9, #8 // +1-stride vext.8 q5, q10, q11, #4 // 0 vext.8 q6, q10, q11, #8 // +1 vmla.i16 d2, d0, d28 // * 3 -> a vadd.i32 q3, q3, q10 // -stride, -1 vadd.i32 q8, q8, q4 // -1-stride, +1-stride vadd.i32 q5, q5, q6 // 0, +1 vadd.i32 q8, q8, q12 // -1+stride vadd.i32 q3, q3, q5 vext.8 q7, q12, q13, #4 // +stride vext.8 q10, q12, q13, #8 // +1+stride .if \bpc == 8 vld1.32 {d24[0]}, [r1, :32]! // src .else vld1.16 {d24}, [r1, :64]! // src .endif vadd.i32 q3, q3, q7 // +stride vadd.i32 q8, q8, q10 // +1+stride vshl.i32 q3, q3, #2 vmla.i32 q3, q8, q15 // * 3 -> b .if \bpc == 8 vmovl.u8 q12, d24 // src .endif vmov d0, d1 vmlal.u16 q3, d2, d24 // b + a * src vmov d2, d3 vrshrn.i32 d6, q3, #9 vmov d4, d5 vst1.16 {d6}, [r0]! ble 3f vmov q8, q9 vmov q10, q11 vmov q12, q13 vld1.16 {d1}, [r9, :64]! 
vld1.16 {d3}, [r4, :64]! vld1.16 {d5}, [r10, :64]! vld1.32 {q9}, [r7, :128]! vld1.32 {q11}, [r3, :128]! vld1.32 {q13}, [r8, :128]! b 2b 3: subs r6, r6, #1 ble 0f mov r5, lr add r0, r0, r12, lsl #1 add r1, r1, r2 add r3, r3, r11, lsl #2 add r7, r7, r11, lsl #2 add r8, r8, r11, lsl #2 add r4, r4, r11, lsl #1 add r9, r9, r11, lsl #1 add r10, r10, r11, lsl #1 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); function sgr_finish_filter2_\bpc\()bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldr r6, [sp, #108] add r7, r3, #(4*(SUM_STRIDE)) sub r3, r3, #(4*(SUM_STRIDE)) add r8, r4, #(2*(SUM_STRIDE)) sub r4, r4, #(2*(SUM_STRIDE)) mov r9, #(2*SUM_STRIDE) mov r10, #FILTER_OUT_STRIDE add r11, r5, #7 bic r11, r11, #7 // Aligned width .if \bpc == 8 sub r2, r2, r11 .else sub r2, r2, r11, lsl #1 .endif sub r10, r10, r11 sub r9, r9, r11 sub r9, r9, #4 // We read 4 extra elements from a sub r12, r9, #4 // We read 8 extra elements from b mov lr, r5 1: vld1.16 {q0, q1}, [r4, :128]! vld1.16 {q2, q3}, [r8, :128]! vld1.32 {q8, q9}, [r3, :128]! vld1.32 {q11, q12}, [r7, :128]! vld1.32 {q10}, [r3, :128]! vld1.32 {q13}, [r7, :128]! 2: vmov.i16 q14, #5 vmov.i16 q15, #6 subs r5, r5, #8 vext.8 q4, q0, q1, #4 // +1-stride vext.8 q5, q2, q3, #4 // +1+stride vext.8 q6, q0, q1, #2 // -stride vext.8 q7, q2, q3, #2 // +stride vadd.i16 q0, q0, q4 // -1-stride, +1-stride vadd.i16 q5, q2, q5 // -1+stride, +1+stride vadd.i16 q2, q6, q7 // -stride, +stride vadd.i16 q0, q0, q5 vext.8 q4, q8, q9, #8 // +1-stride vext.8 q5, q9, q10, #8 vext.8 q6, q11, q12, #8 // +1+stride vext.8 q7, q12, q13, #8 vmul.i16 q0, q0, q14 // * 5 vmla.i16 q0, q2, q15 // * 6 vadd.i32 q4, q4, q8 // -1-stride, +1-stride vadd.i32 q5, q5, q9 vadd.i32 q6, q6, q11 // -1+stride, +1+stride vadd.i32 q7, q7, q12 vadd.i32 q4, q4, q6 vadd.i32 q5, q5, q7 vext.8 q6, q8, q9, #4 // -stride vext.8 q7, q9, q10, #4 vext.8 q8, q11, q12, #4 // +stride vext.8 q11, q12, q13, #4 .if \bpc == 8 vld1.8 {d4}, [r1, :64]! .else vld1.8 {q2}, [r1, :128]! .endif vmov.i32 q14, #5 vmov.i32 q15, #6 vadd.i32 q6, q6, q8 // -stride, +stride vadd.i32 q7, q7, q11 vmul.i32 q4, q4, q14 // * 5 vmla.i32 q4, q6, q15 // * 6 vmul.i32 q5, q5, q14 // * 5 vmla.i32 q5, q7, q15 // * 6 .if \bpc == 8 vmovl.u8 q2, d4 .endif vmlal.u16 q4, d0, d4 // b + a * src vmlal.u16 q5, d1, d5 // b + a * src vmov q0, q1 vrshrn.i32 d8, q4, #9 vrshrn.i32 d9, q5, #9 vmov q2, q3 vst1.16 {q4}, [r0, :128]! ble 3f vmov q8, q10 vmov q11, q13 vld1.16 {q1}, [r4, :128]! vld1.16 {q3}, [r8, :128]! vld1.32 {q9, q10}, [r3, :128]! vld1.32 {q12, q13}, [r7, :128]! b 2b 3: subs r6, r6, #1 ble 0f mov r5, lr add r0, r0, r10, lsl #1 add r1, r1, r2 add r3, r3, r9, lsl #2 add r7, r7, r9, lsl #2 add r4, r4, r12, lsl #1 add r8, r8, r12, lsl #1 vld1.32 {q8, q9}, [r3, :128]! vld1.16 {q0, q1}, [r4, :128]! vld1.32 {q10}, [r3, :128]! vmov.i16 q12, #5 vmov.i16 q13, #6 4: subs r5, r5, #8 vext.8 q3, q0, q1, #4 // +1 vext.8 q2, q0, q1, #2 // 0 vadd.i16 q0, q0, q3 // -1, +1 vext.8 q4, q8, q9, #4 // 0 vext.8 q5, q9, q10, #4 vext.8 q6, q8, q9, #8 // +1 vext.8 q7, q9, q10, #8 vmul.i16 q2, q2, q13 // * 6 vmla.i16 q2, q0, q12 // * 5 -> a .if \bpc == 8 vld1.8 {d22}, [r1, :64]! .else vld1.16 {q11}, [r1, :128]! 
.endif vadd.i32 q8, q8, q6 // -1, +1 vadd.i32 q9, q9, q7 .if \bpc == 8 vmovl.u8 q11, d22 .endif vmul.i32 q4, q4, q15 // * 6 vmla.i32 q4, q8, q14 // * 5 -> b vmul.i32 q5, q5, q15 // * 6 vmla.i32 q5, q9, q14 // * 5 -> b vmlal.u16 q4, d4, d22 // b + a * src vmlal.u16 q5, d5, d23 vmov q0, q1 vrshrn.i32 d8, q4, #8 vrshrn.i32 d9, q5, #8 vmov q8, q10 vst1.16 {q4}, [r0, :128]! ble 5f vld1.16 {q1}, [r4, :128]! vld1.32 {q9, q10}, [r3, :128]! b 4b 5: subs r6, r6, #1 ble 0f mov r5, lr sub r3, r3, r11, lsl #2 // Rewind r3/r4 to where they started sub r4, r4, r11, lsl #1 add r0, r0, r10, lsl #1 add r1, r1, r2 sub r3, r3, #16 sub r4, r4, #16 b 1b 0: vpop {q4-q7} pop {r4-r11,pc} endfunc // void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int w, const int h, // const int wt, const int bitdepth_max); function sgr_weighted1_\bpc\()bpc_neon, export=1 push {r4-r9,lr} ldrd r4, r5, [sp, #28] ldrd r6, r7, [sp, #36] .if \bpc == 16 ldr r8, [sp, #44] .endif vdup.16 d31, r7 cmp r6, #2 .if \bpc == 16 vdup.16 q14, r8 .endif add r9, r0, r1 add r12, r2, r3 add lr, r4, #2*FILTER_OUT_STRIDE mov r7, #(4*FILTER_OUT_STRIDE) lsl r1, r1, #1 lsl r3, r3, #1 add r8, r5, #7 bic r8, r8, #7 // Aligned width .if \bpc == 8 sub r1, r1, r8 sub r3, r3, r8 .else sub r1, r1, r8, lsl #1 sub r3, r3, r8, lsl #1 .endif sub r7, r7, r8, lsl #1 mov r8, r5 blt 2f 1: .if \bpc == 8 vld1.8 {d0}, [r2, :64]! vld1.8 {d16}, [r12, :64]! .else vld1.16 {q0}, [r2, :128]! vld1.16 {q8}, [r12, :128]! .endif vld1.16 {q1}, [r4, :128]! vld1.16 {q9}, [lr, :128]! subs r5, r5, #8 .if \bpc == 8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u .else vshl.i16 q0, q0, #4 // u vshl.i16 q8, q8, #4 // u .endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q9, q9, q8 // t1 - u vshll.u16 q2, d0, #7 // u << 7 vshll.u16 q3, d1, #7 // u << 7 vshll.u16 q10, d16, #7 // u << 7 vshll.u16 q11, d17, #7 // u << 7 vmlal.s16 q2, d2, d31 // v vmlal.s16 q3, d3, d31 // v vmlal.s16 q10, d18, d31 // v vmlal.s16 q11, d19, d31 // v .if \bpc == 8 vrshrn.i32 d4, q2, #11 vrshrn.i32 d5, q3, #11 vrshrn.i32 d20, q10, #11 vrshrn.i32 d21, q11, #11 vqmovun.s16 d4, q2 vqmovun.s16 d20, q10 vst1.8 {d4}, [r0, :64]! vst1.8 {d20}, [r9, :64]! .else vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vqrshrun.s32 d20, q10, #11 vqrshrun.s32 d21, q11, #11 vmin.u16 q2, q2, q14 vmin.u16 q10, q10, q14 vst1.16 {q2}, [r0, :128]! vst1.16 {q10}, [r9, :128]! .endif bgt 1b sub r6, r6, #2 cmp r6, #1 blt 0f mov r5, r8 add r0, r0, r1 add r9, r9, r1 add r2, r2, r3 add r12, r12, r3 add r4, r4, r7 add lr, lr, r7 beq 2f b 1b 2: .if \bpc == 8 vld1.8 {d0}, [r2, :64]! .else vld1.16 {q0}, [r2, :128]! .endif vld1.16 {q1}, [r4, :128]! subs r5, r5, #8 .if \bpc == 8 vshll.u8 q0, d0, #4 // u .else vshl.i16 q0, q0, #4 // u .endif vsub.i16 q1, q1, q0 // t1 - u vshll.u16 q2, d0, #7 // u << 7 vshll.u16 q3, d1, #7 // u << 7 vmlal.s16 q2, d2, d31 // v vmlal.s16 q3, d3, d31 // v .if \bpc == 8 vrshrn.i32 d4, q2, #11 vrshrn.i32 d5, q3, #11 vqmovun.s16 d2, q2 vst1.8 {d2}, [r0, :64]! .else vqrshrun.s32 d4, q2, #11 vqrshrun.s32 d5, q3, #11 vmin.u16 q2, q2, q14 vst1.16 {q2}, [r0, :128]! 
.endif bgt 2b 0: pop {r4-r9,pc} endfunc // void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, // const int16_t wt[2], const int bitdepth_max); function sgr_weighted2_\bpc\()bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] .if \bpc == 8 ldr r8, [sp, #52] .else ldrd r8, r9, [sp, #52] .endif cmp r7, #2 add r10, r0, r1 add r11, r2, r3 add r12, r4, #2*FILTER_OUT_STRIDE add lr, r5, #2*FILTER_OUT_STRIDE vld2.16 {d30[], d31[]}, [r8] // wt[0], wt[1] .if \bpc == 16 vdup.16 q14, r9 .endif mov r8, #4*FILTER_OUT_STRIDE lsl r1, r1, #1 lsl r3, r3, #1 add r9, r6, #7 bic r9, r9, #7 // Aligned width .if \bpc == 8 sub r1, r1, r9 sub r3, r3, r9 .else sub r1, r1, r9, lsl #1 sub r3, r3, r9, lsl #1 .endif sub r8, r8, r9, lsl #1 mov r9, r6 blt 2f 1: .if \bpc == 8 vld1.8 {d0}, [r2, :64]! vld1.8 {d16}, [r11, :64]! .else vld1.16 {q0}, [r2, :128]! vld1.16 {q8}, [r11, :128]! .endif vld1.16 {q1}, [r4, :128]! vld1.16 {q9}, [r12, :128]! vld1.16 {q2}, [r5, :128]! vld1.16 {q10}, [lr, :128]! subs r6, r6, #8 .if \bpc == 8 vshll.u8 q0, d0, #4 // u vshll.u8 q8, d16, #4 // u .else vshl.i16 q0, q0, #4 // u vshl.i16 q8, q8, #4 // u .endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q2, q2, q0 // t2 - u vsub.i16 q9, q9, q8 // t1 - u vsub.i16 q10, q10, q8 // t2 - u vshll.u16 q3, d0, #7 // u << 7 vshll.u16 q0, d1, #7 // u << 7 vshll.u16 q11, d16, #7 // u << 7 vshll.u16 q8, d17, #7 // u << 7 vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) vmlal.s16 q11, d18, d30 // wt[0] * (t1 - u) vmlal.s16 q11, d20, d31 // wt[1] * (t2 - u) vmlal.s16 q8, d19, d30 // wt[0] * (t1 - u) vmlal.s16 q8, d21, d31 // wt[1] * (t2 - u) .if \bpc == 8 vrshrn.i32 d6, q3, #11 vrshrn.i32 d7, q0, #11 vrshrn.i32 d22, q11, #11 vrshrn.i32 d23, q8, #11 vqmovun.s16 d6, q3 vqmovun.s16 d22, q11 vst1.8 {d6}, [r0, :64]! vst1.8 {d22}, [r10, :64]! .else vqrshrun.s32 d6, q3, #11 vqrshrun.s32 d7, q0, #11 vqrshrun.s32 d22, q11, #11 vqrshrun.s32 d23, q8, #11 vmin.u16 q3, q3, q14 vmin.u16 q11, q11, q14 vst1.16 {q3}, [r0, :128]! vst1.16 {q11}, [r10, :128]! .endif bgt 1b subs r7, r7, #2 cmp r7, #1 blt 0f mov r6, r9 add r0, r0, r1 add r10, r10, r1 add r2, r2, r3 add r11, r11, r3 add r4, r4, r8 add r12, r12, r8 add r5, r5, r8 add lr, lr, r8 beq 2f b 1b 2: .if \bpc == 8 vld1.8 {d0}, [r2, :64]! .else vld1.16 {q0}, [r2, :128]! .endif vld1.16 {q1}, [r4, :128]! vld1.16 {q2}, [r5, :128]! subs r6, r6, #8 .if \bpc == 8 vshll.u8 q0, d0, #4 // u .else vshl.i16 q0, q0, #4 // u .endif vsub.i16 q1, q1, q0 // t1 - u vsub.i16 q2, q2, q0 // t2 - u vshll.u16 q3, d0, #7 // u << 7 vshll.u16 q0, d1, #7 // u << 7 vmlal.s16 q3, d2, d30 // wt[0] * (t1 - u) vmlal.s16 q3, d4, d31 // wt[1] * (t2 - u) vmlal.s16 q0, d3, d30 // wt[0] * (t1 - u) vmlal.s16 q0, d5, d31 // wt[1] * (t2 - u) .if \bpc == 8 vrshrn.i32 d6, q3, #11 vrshrn.i32 d7, q0, #11 vqmovun.s16 d6, q3 vst1.8 {d6}, [r0, :64]! .else vqrshrun.s32 d6, q3, #11 vqrshrun.s32 d7, q0, #11 vmin.u16 q3, q3, q14 vst1.16 {q3}, [r0, :128]! .endif bgt 1b 0: pop {r4-r11,pc} endfunc .endm rav1e-0.7.1/src/arm/32/mc.S000064400000000000000000003517621046102023000132020ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2018, Martin Storsjo * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" .macro avg dst0, dst1, t0, t1, t2, t3 vld1.16 {\t0,\t1}, [r2, :128]! vld1.16 {\t2,\t3}, [r3, :128]! vadd.i16 \t0, \t0, \t2 vadd.i16 \t1, \t1, \t3 vqrshrun.s16 \dst0, \t0, #5 vqrshrun.s16 \dst1, \t1, #5 .endm .macro w_avg dst0, dst1, t0, t1, t2, t3 vld1.16 {\t0,\t1}, [r2, :128]! vld1.16 {\t2,\t3}, [r3, :128]! vsub.i16 \t0, \t2, \t0 vsub.i16 \t1, \t3, \t1 vqdmulh.s16 \t0, \t0, q15 vqdmulh.s16 \t1, \t1, q15 vadd.i16 \t0, \t2, \t0 vadd.i16 \t1, \t3, \t1 vqrshrun.s16 \dst0, \t0, #4 vqrshrun.s16 \dst1, \t1, #4 .endm .macro mask dst0, dst1, t0, t1, t2, t3 vld1.8 {q14}, [lr, :128]! vld1.16 {\t0,\t1}, [r2, :128]! vmul.i8 q14, q14, q15 vld1.16 {\t2,\t3}, [r3, :128]! 
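        // Net effect of the arithmetic below, ignoring the slight rounding
        // difference introduced by vqdmulh (a reading of the code, not a
        // normative formula):
        //   dst = clip_u8(((tmp1 * m + tmp2 * (64 - m)) / 64 + 8) >> 4)
        // q15 is preloaded with 254 in the mask case of bidir_fn, so the i8
        // multiply above leaves (-2*m) & 0xff in q14; widening by << 8 makes
        // that -512*m as s16, and vqdmulh by it gives roughly
        // -(tmp2 - tmp1) * m / 64, which is then added back onto tmp2.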
vshll.i8 q13, d28, #8 vshll.i8 q14, d29, #8 vsub.i16 \t0, \t2, \t0 vsub.i16 \t1, \t3, \t1 vqdmulh.s16 \t0, \t0, q13 vqdmulh.s16 \t1, \t1, q14 vadd.i16 \t0, \t2, \t0 vadd.i16 \t1, \t3, \t1 vqrshrun.s16 \dst0, \t0, #4 vqrshrun.s16 \dst1, \t1, #4 .endm .macro bidir_fn type function \type\()_8bpc_neon, export=1 push {r4-r6,lr} ldrd r4, r5, [sp, #16] clz r4, r4 .ifnc \type, avg ldr lr, [sp, #24] .endif .ifc \type, w_avg vdup.s16 q15, lr vneg.s16 q15, q15 vshl.i16 q15, q15, #11 .endif .ifc \type, mask vmov.i8 q15, #256-2 .endif adr r12, L(\type\()_tbl) sub r4, r4, #24 ldr r4, [r12, r4, lsl #2] \type d16, d17, q0, q1, q2, q3 add r12, r12, r4 bx r12 .align 2 L(\type\()_tbl): .word 1280f - L(\type\()_tbl) + CONFIG_THUMB .word 640f - L(\type\()_tbl) + CONFIG_THUMB .word 320f - L(\type\()_tbl) + CONFIG_THUMB .word 160f - L(\type\()_tbl) + CONFIG_THUMB .word 80f - L(\type\()_tbl) + CONFIG_THUMB .word 4f - L(\type\()_tbl) + CONFIG_THUMB 4: add r6, r0, r1 lsl r1, r1, #1 cmp r5, #4 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 beq 0f \type d18, d19, q0, q1, q2, q3 cmp r5, #8 vst1.32 {d18[0]}, [r0, :32], r1 vst1.32 {d18[1]}, [r6, :32], r1 vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d19[1]}, [r6, :32], r1 beq 0f \type d16, d17, q0, q1, q2, q3 vst1.32 {d16[0]}, [r0, :32], r1 vst1.32 {d16[1]}, [r6, :32], r1 \type d18, d19, q0, q1, q2, q3 vst1.32 {d17[0]}, [r0, :32], r1 vst1.32 {d17[1]}, [r6, :32], r1 vst1.32 {d18[0]}, [r0, :32], r1 vst1.32 {d18[1]}, [r6, :32], r1 vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d19[1]}, [r6, :32], r1 pop {r4-r6,pc} 80: add r6, r0, r1 lsl r1, r1, #1 8: vst1.8 {d16}, [r0, :64], r1 \type d18, d19, q0, q1, q2, q3 vst1.8 {d17}, [r6, :64], r1 vst1.8 {d18}, [r0, :64], r1 subs r5, r5, #4 vst1.8 {d19}, [r6, :64], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 8b 160: add r6, r0, r1 lsl r1, r1, #1 16: \type d18, d19, q0, q1, q2, q3 vst1.8 {q8}, [r0, :128], r1 \type d20, d21, q0, q1, q2, q3 vst1.8 {q9}, [r6, :128], r1 \type d22, d23, q0, q1, q2, q3 vst1.8 {q10}, [r0, :128], r1 subs r5, r5, #4 vst1.8 {q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 16b 320: add r6, r0, r1 lsl r1, r1, #1 32: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d22, d23, q0, q1, q2, q3 subs r5, r5, #2 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 32b 640: add r6, r0, #32 64: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 \type d22, d23, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d16, d17, q0, q1, q2, q3 vst1.8 {q10, q11}, [r6, :128], r1 \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128], r1 \type d22, d23, q0, q1, q2, q3 subs r5, r5, #2 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 64b 1280: sub r1, r1, #32 add r6, r0, #64 128: \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 \type d22, d23, q0, q1, q2, q3 vst1.8 {q8, q9}, [r0, :128]! \type d16, d17, q0, q1, q2, q3 vst1.8 {q10, q11}, [r0, :128], r1 \type d18, d19, q0, q1, q2, q3 \type d20, d21, q0, q1, q2, q3 vst1.8 {q8, q9}, [r6, :128]! 
\type d22, d23, q0, q1, q2, q3 subs r5, r5, #1 vst1.8 {q10, q11}, [r6, :128], r1 ble 0f \type d16, d17, q0, q1, q2, q3 b 128b 0: pop {r4-r6,pc} endfunc .endm bidir_fn avg bidir_fn w_avg bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 push {r4-r9,lr} ldrd r4, r5, [sp, #28] ldrd r6, r7, [sp, #36] clz r8, r4 adr r9, L(w_mask_\type\()_tbl) sub r8, r8, #24 ldr r8, [r9, r8, lsl #2] add r9, r9, r8 movw r12, #6903 vdup.16 q14, r12 .if \type == 444 vmov.i8 q15, #64 .elseif \type == 422 vdup.8 d0, r7 // d0[] <- sign vmov.i8 d30, #129 vsub.i8 d30, d30, d0 // 129 - sign .elseif \type == 420 vdup.16 q0, r7 // d0[] <- sign vmov.i16 q15, #256 vsub.i16 q15, q15, q0 // 256 - sign .endif add r12, r0, r1 lsl r1, r1, #1 bx r9 .align 2 L(w_mask_\type\()_tbl): .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 640f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 320f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB .word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB 4: vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once) vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once) subs r5, r5, #4 vsub.i16 q8, q2, q0 // tmp2-tmp1 vsub.i16 q9, q3, q1 vabd.s16 q10, q0, q2 // (abs(tmp1[x] - tmp2[x])) vabd.s16 q11, q1, q3 vqsub.u16 q10, q14, q10 // 6903 - abs () vqsub.u16 q11, q14, q11 vshr.s16 q10, q10, #8 // 64-m = (6903 - abs()) >> 8 vshr.s16 q11, q11, #8 vshl.s16 q12, q10, #9 // (64-m)<<9 vshl.s16 q13, q11, #9 vqdmulh.s16 q12, q12, q8 // ((tmp2-tmp1)*(64-m)<<9)>>15 vqdmulh.s16 q13, q13, q9 vadd.i16 q12, q12, q0 // (((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1 vadd.i16 q13, q13, q1 vqrshrun.s16 d24, q12, #4 // (((((tmp2-tmp1)*(64-m)<<9)>>15) + tmp1) + 8) >> 4 vqrshrun.s16 d25, q13, #4 .if \type == 444 vmovn.u16 d20, q10 // 64 - m vmovn.u16 d21, q11 vsub.i8 q10, q15, q10 // m vst1.8 {d20, d21}, [r6, :128]! .elseif \type == 422 vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition) vpadd.s16 d21, d22, d23 vmovn.s16 d6, q10 vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1 vst1.8 {d6}, [r6, :64]! .elseif \type == 420 vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition) vadd.s16 d21, d22, d23 vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d20[0]}, [r6, :32]! .endif vst1.32 {d24[0]}, [r0, :32], r1 vst1.32 {d24[1]}, [r12, :32], r1 vst1.32 {d25[0]}, [r0, :32], r1 vst1.32 {d25[1]}, [r12, :32], r1 bgt 4b pop {r4-r9,pc} 8: vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2 vld1.16 {d4, d5, d6, d7}, [r3, :128]! 
// tmp2y1, tmp2y2 subs r5, r5, #2 vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1 vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2 vabd.s16 q10, q0, q2 // abs(tmp1y1 - tmp2y1) vabd.s16 q11, q1, q3 // abs(tmp1y2 - tmp2y2) vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) vqsub.u16 q11, q14, q11 // 6903 - abs(tmp1y2 - tmp2y2) vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 vshr.s16 q11, q11, #8 // 64 - my2 = 6903 - abs(tmp1y2 - tmp2y2) >> 8 vshl.s16 q12, q10, #9 // (64 - my1) << 9 vshl.s16 q13, q11, #9 // (64 - my2) << 9 vqdmulh.s16 q12, q12, q8 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 vqdmulh.s16 q13, q13, q9 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 vadd.s16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 vadd.s16 q13, q13, q1 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 vqrshrun.s16 d25, q13, #4 // (((((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 .if \type == 444 vmovn.u16 d20, q10 // 64 - m vmovn.u16 d21, q11 vsub.i8 q10, q15, q10 // m vst1.8 {d20, d21}, [r6, :128]! .elseif \type == 422 vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2) vmovn.s16 d20, q10 vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1 vst1.8 {d20}, [r6, :64]! .elseif \type == 420 vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition) vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d20[0]}, [r6, :32]! .endif vst1.16 {d24}, [r0, :64], r1 vst1.16 {d25}, [r12, :64], r1 bgt 8b pop {r4-r9,pc} 1280: 640: 320: 160: sub r1, r1, r4 .if \type == 444 add lr, r6, r4 .elseif \type == 422 add lr, r6, r4, lsr #1 .endif add r9, r3, r4, lsl #1 add r7, r2, r4, lsl #1 161: mov r8, r4 16: vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1 vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1 vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2 subs r8, r8, #16 vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1 vsub.i16 q3, q3, q1 vabs.s16 q10, q2 // abs(tm2y1 - tmp1y1) vabs.s16 q11, q3 vqsub.u16 q10, q14, q10 // 6903 - abs(tmp1y1 - tmp2y1) vqsub.u16 q11, q14, q11 vshr.s16 q10, q10, #8 // 64 - my1 = 6903 - abs(tmp1y1 - tmp2y1) >> 8 vshr.s16 q11, q11, #8 vshl.s16 q12, q10, #9 // (64 - my1) << 9 vshl.s16 q13, q11, #9 vqdmulh.s16 q12, q12, q2 // ((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15 vqdmulh.s16 q13, q13, q3 vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1 vadd.i16 q13, q13, q1 vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2h2 .if \type == 444 vmovn.u16 d20, q10 // 64 - my1 vmovn.u16 d21, q11 vsub.i8 q10, q15, q10 // my1 vst1.8 {d20, d21}, [r6, :128]! .elseif \type == 422 vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition) vpadd.s16 d21, d22, d23 vmovn.s16 d20, q10 vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1 vst1.8 {d20}, [r6, :64]! .endif vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4 vqrshrun.s16 d25, q13, #4 vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2 vsub.i16 q1, q1, q9 vst1.16 {d24, d25}, [r0, :128]! 
// store dsty1 vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2) vabs.s16 q3, q1 vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2) vqsub.u16 q3, q14, q3 vshr.s16 q2, q2, #8 // (6903 - abs(tmp2y2 - tmp1y2)) >> 8 vshr.s16 q3, q3, #8 vshl.s16 q12, q2, #9 // (64 - my2) << 9 vshl.s16 q13, q3, #9 .if \type == 444 vmovn.u16 d4, q2 // 64 - my2 vmovn.u16 d5, q3 vsub.i8 q2, q15, q2 // my2 vst1.8 {d4, d5}, [lr, :128]! .elseif \type == 422 vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition) vpadd.s16 d5, d6, d7 vmovn.s16 d4, q2 vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1 vst1.8 {d4}, [lr, :64]! .elseif \type == 420 vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition) vadd.s16 q11, q11, q3 vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition) vpadd.s16 d21, d22, d23 vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.8 {d20}, [r6, :64]! .endif vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15 vqdmulh.s16 q13, q13, q1 vadd.i16 q12, q12, q8 // (((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15) + tmp1y2 vadd.i16 q13, q13, q9 vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4 vqrshrun.s16 d25, q13, #4 vst1.16 {d24, d25}, [r12, :128]! // store dsty2 bgt 16b subs r5, r5, #2 add r2, r2, r4, lsl #1 add r3, r3, r4, lsl #1 add r7, r7, r4, lsl #1 add r9, r9, r4, lsl #1 .if \type == 444 add r6, r6, r4 add lr, lr, r4 .elseif \type == 422 add r6, r6, r4, lsr #1 add lr, lr, r4, lsr #1 .endif add r0, r0, r1 add r12, r12, r1 bgt 161b pop {r4-r9,pc} endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_8bpc_neon, export=1 push {r4-r5,lr} ldrd r4, r5, [sp, #12] clz lr, r3 adr r3, L(blend_tbl) sub lr, lr, #26 ldr lr, [r3, lr, lsl #2] add r3, r3, lr bx r3 .align 2 L(blend_tbl): .word 320f - L(blend_tbl) + CONFIG_THUMB .word 160f - L(blend_tbl) + CONFIG_THUMB .word 80f - L(blend_tbl) + CONFIG_THUMB .word 40f - L(blend_tbl) + CONFIG_THUMB 40: vmov.i8 d22, #64 add r12, r0, r1 lsl r1, r1, #1 4: vld1.u8 {d2}, [r5, :64]! vld1.u8 {d1}, [r2, :64]! vld1.32 {d0[]}, [r0, :32] subs r4, r4, #2 vld1.32 {d0[1]}, [r12, :32] vsub.i8 d3, d22, d2 vmull.u8 q8, d1, d2 vmlal.u8 q8, d0, d3 vrshrn.i16 d20, q8, #6 vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r12, :32], r1 bgt 4b pop {r4-r5,pc} 80: vmov.i8 d16, #64 add r12, r0, r1 lsl r1, r1, #1 8: vld1.u8 {q1}, [r5, :128]! vld1.u8 {q2}, [r2, :128]! vld1.u8 {d0}, [r0, :64] vsub.i8 d17, d16, d2 vld1.u8 {d1}, [r12, :64] subs r4, r4, #2 vsub.i8 d18, d16, d3 vmull.u8 q3, d2, d4 vmlal.u8 q3, d0, d17 vmull.u8 q10, d3, d5 vmlal.u8 q10, d1, d18 vrshrn.i16 d22, q3, #6 vrshrn.i16 d23, q10, #6 vst1.u8 {d22}, [r0, :64], r1 vst1.u8 {d23}, [r12, :64], r1 bgt 8b pop {r4-r5,pc} 160: vmov.i8 q12, #64 add r12, r0, r1 lsl r1, r1, #1 16: vld1.u8 {q1, q2}, [r5, :128]! vld1.u8 {q8, q9}, [r2, :128]! vld1.u8 {q0}, [r0, :128] subs r4, r4, #2 vsub.i8 q15, q12, q1 vld1.u8 {q13}, [r12, :128] vmull.u8 q3, d16, d2 vmlal.u8 q3, d0, d30 vmull.u8 q14, d17, d3 vmlal.u8 q14, d1, d31 vsub.i8 q15, q12, q2 vrshrn.i16 d20, q3, #6 vrshrn.i16 d21, q14, #6 vmull.u8 q3, d18, d4 vmlal.u8 q3, d26, d30 vmull.u8 q14, d19, d5 vmlal.u8 q14, d27, d31 vrshrn.i16 d22, q3, #6 vrshrn.i16 d23, q14, #6 vst1.u8 {q10}, [r0, :128], r1 vst1.u8 {q11}, [r12, :128], r1 bgt 16b pop {r4-r5,pc} 320: vmov.i8 q10, #64 32: vld1.u8 {q2, q3}, [r5, :128]! vld1.u8 {q8, q9}, [r2, :128]! 
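        // Per byte, each blend step below computes (hedged reading):
        //   dst = (tmp * m + dst * (64 - m) + 32) >> 6
        // with q10 holding 64 and vrshrn providing the rounded shift; the
        // narrower widths above follow the same formula.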
vld1.u8 {q0, q1}, [r0, :128] subs r4, r4, #1 vsub.i8 q11, q10, q2 vmull.u8 q15, d16, d4 vmlal.u8 q15, d0, d22 vmull.u8 q14, d17, d5 vmlal.u8 q14, d1, d23 vsub.i8 q11, q10, q3 vrshrn.i16 d24, q15, #6 vrshrn.i16 d25, q14, #6 vmull.u8 q15, d18, d6 vmlal.u8 q15, d2, d22 vmull.u8 q14, d19, d7 vmlal.u8 q14, d3, d23 vrshrn.i16 d26, q15, #6 vrshrn.i16 d27, q14, #6 vst1.u8 {q12, q13}, [r0, :128], r1 bgt 32b pop {r4-r5,pc} endfunc function blend_h_8bpc_neon, export=1 push {r4-r5,lr} ldr r4, [sp, #12] movrel r5, X(obmc_masks) add r5, r5, r4 sub r4, r4, r4, lsr #2 clz lr, r3 adr r12, L(blend_h_tbl) sub lr, lr, #24 ldr lr, [r12, lr, lsl #2] add r12, r12, lr bx r12 .align 2 L(blend_h_tbl): .word 1280f - L(blend_h_tbl) + CONFIG_THUMB .word 640f - L(blend_h_tbl) + CONFIG_THUMB .word 320f - L(blend_h_tbl) + CONFIG_THUMB .word 160f - L(blend_h_tbl) + CONFIG_THUMB .word 80f - L(blend_h_tbl) + CONFIG_THUMB .word 40f - L(blend_h_tbl) + CONFIG_THUMB .word 20f - L(blend_h_tbl) + CONFIG_THUMB 20: vmov.i8 d22, #64 add r12, r0, r1 lsl r1, r1, #1 2: vld1.16 {d2[], d3[]}, [r5, :16]! vld1.32 {d1[]}, [r2, :32]! subs r4, r4, #2 vld1.16 {d0[]}, [r0, :16] vzip.8 d2, d3 vsub.i8 d4, d22, d2 vld1.16 {d0[1]}, [r12, :16] vmull.u8 q8, d1, d2 vmlal.u8 q8, d0, d4 vrshrn.i16 d20, q8, #6 vst1.16 {d20[0]}, [r0, :16], r1 vst1.16 {d20[1]}, [r12, :16], r1 bgt 2b pop {r4-r5,pc} 40: vmov.i8 d22, #64 add r12, r0, r1 lsl r1, r1, #1 4: vld2.u8 {d2[], d3[]}, [r5, :16]! vld1.u8 {d1}, [r2, :64]! subs r4, r4, #2 vext.u8 d2, d2, d3, #4 vld1.32 {d0[]}, [r0, :32] vsub.i8 d6, d22, d2 vld1.32 {d0[1]}, [r12, :32] vmull.u8 q8, d1, d2 vmlal.u8 q8, d0, d6 vrshrn.i16 d20, q8, #6 vst1.32 {d20[0]}, [r0, :32], r1 vst1.32 {d20[1]}, [r12, :32], r1 bgt 4b pop {r4-r5,pc} 80: vmov.i8 q8, #64 add r12, r0, r1 lsl r1, r1, #1 8: vld2.u8 {d2[], d3[]}, [r5, :16]! vld1.u8 {d4, d5}, [r2, :128]! vld1.u8 {d0}, [r0, :64] vsub.i8 q9, q8, q1 vld1.u8 {d1}, [r12, :64] subs r4, r4, #2 vmull.u8 q3, d2, d4 vmlal.u8 q3, d0, d18 vmull.u8 q10, d3, d5 vmlal.u8 q10, d1, d19 vrshrn.i16 d22, q3, #6 vrshrn.i16 d23, q10, #6 vst1.u8 {d22}, [r0, :64], r1 vst1.u8 {d23}, [r12, :64], r1 bgt 8b pop {r4-r5,pc} 160: vmov.i8 q12, #64 add r12, r0, r1 lsl r1, r1, #1 16: vld2.u8 {d28[], d29[]}, [r5, :16]! vld1.u8 {d2, d3, d4, d5}, [r2, :128]! vsub.i8 q15, q12, q14 vld1.u8 {q0}, [r0, :128] subs r4, r4, #2 vld1.u8 {q13}, [r12, :128] vmull.u8 q3, d2, d28 vmlal.u8 q3, d0, d30 vmull.u8 q8, d3, d28 vmlal.u8 q8, d1, d30 vrshrn.i16 d18, q3, #6 vrshrn.i16 d19, q8, #6 vmull.u8 q3, d4, d29 vmlal.u8 q3, d26, d31 vmull.u8 q8, d5, d29 vmlal.u8 q8, d27, d31 vrshrn.i16 d20, q3, #6 vrshrn.i16 d21, q8, #6 vst1.u8 {q9}, [r0, :128], r1 vst1.u8 {q10}, [r12, :128], r1 bgt 16b pop {r4-r5,pc} 320: 640: 1280: vmov.i8 d20, #64 sub r1, r1, r3 321: vld1.u8 {d6[]}, [r5]! vsub.i8 d7, d20, d6 mov r12, r3 32: vld1.u8 {q8, q9}, [r2, :128]! vld1.u8 {q0, q1}, [r0, :128] vmull.u8 q15, d16, d6 vmlal.u8 q15, d0, d7 vmull.u8 q14, d17, d6 vmlal.u8 q14, d1, d7 vrshrn.i16 d0, q15, #6 vrshrn.i16 d1, q14, #6 vmull.u8 q15, d18, d6 vmlal.u8 q15, d2, d7 vmull.u8 q14, d19, d6 vmlal.u8 q14, d3, d7 vrshrn.i16 d2, q15, #6 vrshrn.i16 d3, q14, #6 subs r12, r12, #32 vst1.u8 {q0, q1}, [r0, :128]! 
bgt 32b add r0, r0, r1 subs r4, r4, #1 bgt 321b pop {r4-r5,pc} endfunc function blend_v_8bpc_neon, export=1 push {r4,lr} ldr r4, [sp, #8] movrel lr, X(obmc_masks) add lr, lr, r3 clz r12, r3 adr r3, L(blend_v_tbl) sub r12, r12, #26 ldr r12, [r3, r12, lsl #2] add r3, r3, r12 bx r3 .align 2 L(blend_v_tbl): .word 320f - L(blend_v_tbl) + CONFIG_THUMB .word 160f - L(blend_v_tbl) + CONFIG_THUMB .word 80f - L(blend_v_tbl) + CONFIG_THUMB .word 40f - L(blend_v_tbl) + CONFIG_THUMB .word 20f - L(blend_v_tbl) + CONFIG_THUMB 20: vmov.i8 d22, #64 vld1.8 {d2[]}, [lr] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d3, d22, d2 2: vld1.16 {d1[0]}, [r2, :16]! vld1.8 {d0[]}, [r0] subs r4, r4, #2 vld1.8 {d1[1]}, [r2] vld1.8 {d0[1]}, [r12] vmull.u8 q2, d1, d2 vmlal.u8 q2, d0, d3 vrshrn.i16 d6, q2, #6 add r2, r2, #2 vst1.8 {d6[0]}, [r0], r1 vst1.8 {d6[1]}, [r12], r1 bgt 2b pop {r4,pc} 40: vmov.i8 d22, #64 vld1.32 {d4[]}, [lr, :32] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d5, d22, d4 sub r1, r1, #2 4: vld1.u8 {d2}, [r2, :64]! vld1.32 {d0[]}, [r0, :32] vld1.32 {d0[1]}, [r12, :32] subs r4, r4, #2 vmull.u8 q3, d2, d4 vmlal.u8 q3, d0, d5 vrshrn.i16 d20, q3, #6 vst1.16 {d20[0]}, [r0, :16]! vst1.16 {d20[2]}, [r12, :16]! vst1.8 {d20[2]}, [r0], r1 vst1.8 {d20[6]}, [r12], r1 bgt 4b pop {r4,pc} 80: vmov.i8 d16, #64 vld1.u8 {d2}, [lr, :64] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 d17, d16, d2 sub r1, r1, #4 8: vld1.u8 {d4, d5}, [r2, :128]! vld1.u8 {d0}, [r0, :64] vld1.u8 {d1}, [r12, :64] subs r4, r4, #2 vmull.u8 q3, d2, d4 vmlal.u8 q3, d0, d17 vmull.u8 q10, d2, d5 vmlal.u8 q10, d1, d17 vrshrn.i16 d22, q3, #6 vrshrn.i16 d23, q10, #6 vst1.32 {d22[0]}, [r0, :32]! vst1.32 {d23[0]}, [r12, :32]! vst1.16 {d22[2]}, [r0, :16], r1 vst1.16 {d23[2]}, [r12, :16], r1 bgt 8b pop {r4,pc} 160: vmov.i8 q12, #64 vld1.u8 {q14}, [lr, :128] add r12, r0, r1 lsl r1, r1, #1 vsub.i8 q11, q12, q14 sub r1, r1, #8 16: vld1.u8 {q1, q2}, [r2, :128]! vld1.u8 {q0}, [r0, :128] subs r4, r4, #2 vld1.u8 {q13}, [r12, :128] vmull.u8 q3, d2, d28 vmlal.u8 q3, d0, d22 vmull.u8 q8, d3, d29 vmlal.u8 q8, d1, d23 vrshrn.i16 d18, q3, #6 vrshrn.i16 d19, q8, #6 vmull.u8 q3, d4, d28 vmlal.u8 q3, d26, d22 vmull.u8 q8, d5, d29 vmlal.u8 q8, d27, d23 vrshrn.i16 d20, q3, #6 vrshrn.i16 d21, q8, #6 vst1.u8 {d18}, [r0, :64]! vst1.u8 {d20}, [r12, :64]! vst1.32 {d19[0]}, [r0, :32], r1 vst1.32 {d21[0]}, [r12, :32], r1 bgt 16b pop {r4,pc} 320: vmov.i8 q10, #64 vld1.u8 {q2, q3}, [lr, :128] vsub.i8 q11, q10, q2 vsub.i8 d24, d20, d6 32: vld1.u8 {q8, q9}, [r2, :128]! vld1.u8 {d0, d1, d2}, [r0, :64] subs r4, r4, #1 vmull.u8 q15, d16, d4 vmlal.u8 q15, d0, d22 vmull.u8 q14, d17, d5 vmlal.u8 q14, d1, d23 vrshrn.i16 d0, q15, #6 vrshrn.i16 d1, q14, #6 vmull.u8 q15, d18, d6 vmlal.u8 q15, d2, d24 vrshrn.i16 d2, q15, #6 vst1.u8 {d0, d1, d2}, [r0, :64], r1 bgt 32b pop {r4,pc} endfunc // This has got the same signature as the put_8tap functions, // assumes that the caller has loaded the h argument into r5, // and assumes that r8 is set to (clz(w)-24). 
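// Hedged scalar sketch of the plain 8bpc put path below: it just copies h
// rows of w one-byte pixels, dispatching on width through L(put_tbl).
//   for (int y = 0; y < h; y++)
//       memcpy(&dst[y * dst_stride], &src[y * src_stride], w);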
function put_neon adr r9, L(put_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(put_tbl): .word 1280f - L(put_tbl) + CONFIG_THUMB .word 640f - L(put_tbl) + CONFIG_THUMB .word 32f - L(put_tbl) + CONFIG_THUMB .word 160f - L(put_tbl) + CONFIG_THUMB .word 8f - L(put_tbl) + CONFIG_THUMB .word 4f - L(put_tbl) + CONFIG_THUMB .word 2f - L(put_tbl) + CONFIG_THUMB 2: vld1.16 {d0[]}, [r2], r3 vld1.16 {d1[]}, [r2], r3 subs r5, r5, #2 vst1.16 {d0[0]}, [r0, :16], r1 vst1.16 {d1[0]}, [r0, :16], r1 bgt 2b pop {r4-r11,pc} 4: vld1.32 {d0[]}, [r2], r3 vld1.32 {d1[]}, [r2], r3 subs r5, r5, #2 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d1[0]}, [r0, :32], r1 bgt 4b pop {r4-r11,pc} 8: vld1.8 {d0}, [r2], r3 vld1.8 {d1}, [r2], r3 subs r5, r5, #2 vst1.8 {d0}, [r0, :64], r1 vst1.8 {d1}, [r0, :64], r1 bgt 8b pop {r4-r11,pc} 160: add r8, r0, r1 lsl r1, r1, #1 add r9, r2, r3 lsl r3, r3, #1 16: vld1.8 {q0}, [r2], r3 vld1.8 {q1}, [r9], r3 subs r5, r5, #2 vst1.8 {q0}, [r0, :128], r1 vst1.8 {q1}, [r8, :128], r1 bgt 16b pop {r4-r11,pc} 32: vld1.8 {q0, q1}, [r2], r3 subs r5, r5, #1 vst1.8 {q0, q1}, [r0, :128], r1 bgt 32b pop {r4-r11,pc} 640: sub r1, r1, #32 sub r3, r3, #32 64: vld1.8 {q0, q1}, [r2]! vst1.8 {q0, q1}, [r0, :128]! vld1.8 {q2, q3}, [r2], r3 subs r5, r5, #1 vst1.8 {q2, q3}, [r0, :128], r1 bgt 64b pop {r4-r11,pc} 1280: sub r1, r1, #96 sub r3, r3, #96 128: vld1.8 {q8, q9}, [r2]! vst1.8 {q8, q9}, [r0, :128]! vld1.8 {q10, q11}, [r2]! vst1.8 {q10, q11}, [r0, :128]! vld1.8 {q12, q13}, [r2]! vst1.8 {q12, q13}, [r0, :128]! vld1.8 {q14, q15}, [r2], r3 subs r5, r5, #1 vst1.8 {q14, q15}, [r0, :128], r1 bgt 128b pop {r4-r11,pc} endfunc // This has got the same signature as the put_8tap functions, // assumes that the caller has loaded the h argument into r4, // and assumes that r8 is set to (clz(w)-24), and r7 to w*2. function prep_neon adr r9, L(prep_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(prep_tbl): .word 1280f - L(prep_tbl) + CONFIG_THUMB .word 640f - L(prep_tbl) + CONFIG_THUMB .word 320f - L(prep_tbl) + CONFIG_THUMB .word 160f - L(prep_tbl) + CONFIG_THUMB .word 8f - L(prep_tbl) + CONFIG_THUMB .word 4f - L(prep_tbl) + CONFIG_THUMB 4: vld1.32 {d0[]}, [r1], r2 vld1.32 {d2[]}, [r1], r2 subs r4, r4, #2 vshll.u8 q0, d0, #4 vshll.u8 q1, d2, #4 vst1.16 {d1, d2}, [r0, :64]! bgt 4b pop {r4-r11,pc} 8: vld1.8 {d0}, [r1], r2 vld1.8 {d2}, [r1], r2 subs r4, r4, #2 vshll.u8 q0, d0, #4 vshll.u8 q1, d2, #4 vst1.16 {q0, q1}, [r0, :128]! bgt 8b pop {r4-r11,pc} 160: add r9, r1, r2 lsl r2, r2, #1 add r8, r0, r7 lsl r7, r7, #1 16: vld1.8 {q2}, [r1], r2 vld1.8 {q3}, [r9], r2 subs r4, r4, #2 vshll.u8 q0, d4, #4 vshll.u8 q1, d5, #4 vshll.u8 q2, d6, #4 vshll.u8 q3, d7, #4 vst1.16 {q0, q1}, [r0, :128], r7 vst1.16 {q2, q3}, [r8, :128], r7 bgt 16b pop {r4-r11,pc} 320: add r8, r0, r3 32: vld1.8 {q0, q1}, [r1], r2 subs r4, r4, #2 vshll.u8 q8, d0, #4 vshll.u8 q9, d1, #4 vld1.8 {q2, q3}, [r1], r2 vshll.u8 q10, d2, #4 vshll.u8 q11, d3, #4 vshll.u8 q12, d4, #4 vst1.16 {q8, q9}, [r0, :128], r7 vshll.u8 q13, d5, #4 vst1.16 {q10, q11}, [r8, :128], r7 vshll.u8 q14, d6, #4 vst1.16 {q12, q13}, [r0, :128], r7 vshll.u8 q15, d7, #4 vst1.16 {q14, q15}, [r8, :128], r7 bgt 32b pop {r4-r11,pc} 640: sub r2, r2, #32 add r8, r0, #32 mov r6, #64 64: vld1.8 {q0, q1}, [r1]! 
subs r4, r4, #1 vshll.u8 q8, d0, #4 vshll.u8 q9, d1, #4 vld1.8 {q2, q3}, [r1], r2 vshll.u8 q10, d2, #4 vshll.u8 q11, d3, #4 vshll.u8 q12, d4, #4 vst1.16 {q8, q9}, [r0, :128], r6 vshll.u8 q13, d5, #4 vshll.u8 q14, d6, #4 vst1.16 {q10, q11}, [r8, :128], r6 vshll.u8 q15, d7, #4 vst1.16 {q12, q13}, [r0, :128], r6 vst1.16 {q14, q15}, [r8, :128], r6 bgt 64b pop {r4-r11,pc} 1280: sub r2, r2, #96 add r8, r0, #32 mov r6, #64 128: vld1.8 {q0, q1}, [r1]! vld1.8 {q2, q3}, [r1]! vshll.u8 q10, d0, #4 vshll.u8 q11, d1, #4 vshll.u8 q12, d2, #4 vshll.u8 q13, d3, #4 vshll.u8 q14, d4, #4 vshll.u8 q15, d5, #4 vld1.8 {q8, q9}, [r1]! vst1.16 {q10, q11}, [r0, :128], r6 vst1.16 {q12, q13}, [r8, :128], r6 vshll.u8 q0, d6, #4 vshll.u8 q1, d7, #4 vshll.u8 q2, d16, #4 vshll.u8 q3, d17, #4 vshll.u8 q8, d18, #4 vshll.u8 q9, d19, #4 vld1.8 {q10, q11}, [r1], r2 vst1.16 {q14, q15}, [r0, :128], r6 vst1.16 {q0, q1}, [r8, :128], r6 vshll.u8 q12, d20, #4 vshll.u8 q13, d21, #4 vshll.u8 q14, d22, #4 vshll.u8 q15, d23, #4 subs r4, r4, #1 vst1.16 {q2, q3}, [r0, :128], r6 vst1.16 {q8, q9}, [r8, :128], r6 vst1.16 {q12, q13}, [r0, :128], r6 vst1.16 {q14, q15}, [r8, :128], r6 bgt 128b pop {r4-r11,pc} endfunc .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 vld1.\wd {\d0[]}, [\s0], \strd vld1.\wd {\d1[]}, [\s1], \strd .ifnb \d2 vld1.\wd {\d2[]}, [\s0], \strd vld1.\wd {\d3[]}, [\s1], \strd .endif .ifnb \d4 vld1.\wd {\d4[]}, [\s0], \strd .endif .ifnb \d5 vld1.\wd {\d5[]}, [\s1], \strd .endif .ifnb \d6 vld1.\wd {\d6[]}, [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 vld1.8 {\d0}, [\s0], \strd vld1.8 {\d1}, [\s1], \strd .ifnb \d2 vld1.8 {\d2}, [\s0], \strd vld1.8 {\d3}, [\s1], \strd .endif .ifnb \d4 vld1.8 {\d4}, [\s0], \strd .endif .ifnb \d5 vld1.8 {\d5}, [\s1], \strd .endif .ifnb \d6 vld1.8 {\d6}, [\s0], \strd .endif .endm .macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro interleave_1_16 r0, r1, r2, r3, r4 vext.8 \r0, \r0, \r1, #6 vext.8 \r1, \r1, \r2, #6 .ifnb \r3 vext.8 \r2, \r2, \r3, #6 vext.8 \r3, \r3, \r4, #6 .endif .endm .macro interleave_1_32 r0, r1, r2, r3, r4 vext.8 \r0, \r0, \r1, #4 vext.8 \r1, \r1, \r2, #4 .ifnb \r3 vext.8 \r2, \r2, \r3, #4 vext.8 \r3, \r3, \r4, #4 .endif .endm .macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6 vmovl.u8 \q0, \d0 vmovl.u8 \q1, \d1 .ifnb \q2 vmovl.u8 \q2, \d2 vmovl.u8 \q3, \d3 .endif .ifnb \q4 vmovl.u8 \q4, \d4 .endif .ifnb \q5 vmovl.u8 \q5, \d5 .endif .ifnb \q6 vmovl.u8 \q6, \d6 .endif .endm .macro mul_mla_4 d, s0, s1, s2, s3 vmul.s16 \d, \s0, d0[0] vmla.s16 \d, \s1, d0[1] vmla.s16 \d, \s2, d0[2] vmla.s16 \d, \s3, d0[3] .endm .macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 vmul.s16 \d0, \s0, d0[0] vmla.s16 \d0, \s1, d0[1] vmla.s16 \d0, \s2, d0[2] vmla.s16 \d0, \s3, d0[3] vmla.s16 \d0, \s4, d1[0] vmla.s16 \d0, \s5, d1[1] vmla.s16 \d0, \s6, d1[2] vmla.s16 \d0, \s7, d1[3] .endm .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 vmul.s16 \d0, \s0, d0[0] vmla.s16 \d0, \s1, d0[1] vmla.s16 \d0, \s2, d0[2] vmla.s16 \d0, \s3, d0[3] vmla.s16 \d0, \s4, d1[0] vmla.s16 \d0, \s5, d1[1] vmla.s16 \d0, \s6, d1[2] vmla.s16 \d0, \s7, d1[3] vmul.s16 \d1, \s1, d0[0] vmla.s16 \d1, \s2, d0[1] vmla.s16 \d1, \s3, d0[2] vmla.s16 \d1, \s4, d0[3] vmla.s16 \d1, \s5, d1[0] vmla.s16 \d1, \s6, d1[1] vmla.s16 \d1, \s7, d1[2] 
vmla.s16 \d1, \s8, d1[3] .endm .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 vmul.s16 \d0, \s0, d0[0] vmla.s16 \d0, \s1, d0[1] vmla.s16 \d0, \s2, d0[2] vmla.s16 \d0, \s3, d0[3] vmla.s16 \d0, \s4, d1[0] vmla.s16 \d0, \s5, d1[1] vmla.s16 \d0, \s6, d1[2] vmla.s16 \d0, \s7, d1[3] vmul.s16 \d1, \s2, d0[0] vmla.s16 \d1, \s3, d0[1] vmla.s16 \d1, \s4, d0[2] vmla.s16 \d1, \s5, d0[3] vmla.s16 \d1, \s6, d1[0] vmla.s16 \d1, \s7, d1[1] vmla.s16 \d1, \s8, d1[2] vmla.s16 \d1, \s9, d1[3] .endm .macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3 vqrshrun.s16 \d0, \q0, #\shift .ifnb \q1 vqrshrun.s16 \d1, \q1, #\shift .endif .ifnb \q2 vqrshrun.s16 \d2, \q2, #\shift vqrshrun.s16 \d3, \q3, #\shift .endif .endm .macro vrshr_s16 shift, r0, r1, r2, r3 vrshr.s16 \r0, \r0, #\shift .ifnb \r1 vrshr.s16 \r1, \r1, #\shift .endif .ifnb \r2 vrshr.s16 \r2, \r2, #\shift vrshr.s16 \r3, \r3, #\shift .endif .endm .macro st_16 strd, reg, lanes vst1.16 {\reg[0]}, [r0, :16], \strd vst1.16 {\reg[1]}, [r8, :16], \strd .if \lanes > 2 vst1.16 {\reg[2]}, [r0, :16], \strd vst1.16 {\reg[3]}, [r8, :16], \strd .endif .endm .macro st_32 strd, r0, r1 vst1.32 {\r0[0]}, [r0, :32], \strd vst1.32 {\r0[1]}, [r8, :32], \strd .ifnb \r1 vst1.32 {\r1[0]}, [r0, :32], \strd vst1.32 {\r1[1]}, [r8, :32], \strd .endif .endm .macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 vst1.8 {\r0}, [r0, \align], \strd vst1.8 {\r1}, [r8, \align], \strd .ifnb \r2 vst1.8 {\r2}, [r0, \align], \strd vst1.8 {\r3}, [r8, \align], \strd .endif .ifnb \r4 vst1.8 {\r4}, [r0, \align], \strd vst1.8 {\r5}, [r8, \align], \strd vst1.8 {\r6}, [r0, \align], \strd vst1.8 {\r7}, [r8, \align], \strd .endif .endm .macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3 .ifc \type, put vqrshrun_s16 6, \q0, \d0, \q1, \d2 st_32 \strd, \d0, \d2 .else vrshr_s16 2, \q0, \q1 st_reg \strd, :64, \d0, \d1, \d2, \d3 .endif .endm .macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3 .ifc \type, put vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 st_reg \strd, :64, \d0, \d1, \d2, \d3 .else vrshr_s16 2, \q0, \q1, \q2, \q3 st_reg \strd, :128,\q0, \q1, \q2, \q3 .endif .endm .macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3 .ifc \type, put vqrshrun.s16 \d0, \q0, #6 vqrshrun.s16 \d1, \q1, #6 vqrshrun.s16 \d4, \q2, #6 vqrshrun.s16 \d5, \q3, #6 st_reg \strd, :128, \q0, \q2 .else vrshr_s16 2, \q0, \q1, \q2, \q3 vst1.16 {\q0, \q1}, [r0, :128], \strd vst1.16 {\q2, \q3}, [r8, :128], \strd .endif .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_8bpc_neon, export=1 push {r4-r11,lr} movw r8, \type_h movw r9, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. 
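// Hedged note on the constants below: each appears to pack two row offsets
// into the mc_subpel_filters table (15 entries per filter set). Bits 7..13
// hold the set used for w > 4 and bits 0..6 the set used for w <= 4, which
// is why SHARP and REGULAR end up sharing the same narrow-block set. The
// multiply by 0x4081 further down replicates the subpel position into the
// same bit fields, so a single add combines position and filter type, and
// ubfx/and later extract whichever field applies.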
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, r10 mul \my, \my, r10 add \mx, \mx, r8 // mx, 8tap_h, 4tap_h add \my, \my, r9 // my, 8tap_v, 4tap_v .ifc \type, prep lsl \d_strd, \w, #1 .endif clz r8, \w tst \mx, #(0x7f << 14) sub r8, r8, #24 movrel r10, X(mc_subpel_filters), -8 bne L(\type\()_8tap_h) tst \my, #(0x7f << 14) bne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx r9, \mx, #7, #7 and \mx, \mx, #0x7f it gt movgt \mx, r9 tst \my, #(0x7f << 14) add \mx, r10, \mx, lsl #3 bne L(\type\()_8tap_hv) adr r9, L(\type\()_8tap_h_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_h_tbl): .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 2: vld1.8 {d4}, [\src], \s_strd vld1.8 {d6}, [\sr2], \s_strd vmovl.u8 q2, d4 vmovl.u8 q3, d6 vext.8 d5, d4, d5, #2 vext.8 d7, d6, d7, #2 subs \h, \h, #2 vtrn.32 d4, d6 vtrn.32 d5, d7 vmul.s16 d2, d4, d0[0] vmla.s16 d2, d5, d0[1] vmla.s16 d2, d6, d0[2] vmla.s16 d2, d7, d0[3] vrshr.s16 d2, d2, #2 vqrshrun.s16 d2, q1, #4 vst1.16 {d2[0]}, [\dst, :16], \d_strd vst1.16 {d2[1]}, [\ds2, :16], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 4: vld1.8 {d16}, [\src], \s_strd vld1.8 {d24}, [\sr2], \s_strd vmovl.u8 q8, d16 vmovl.u8 q12, d24 vext.8 d18, d16, d17, #2 vext.8 d20, d16, d17, #4 vext.8 d22, d16, d17, #6 vext.8 d26, d24, d25, #2 vext.8 d28, d24, d25, #4 vext.8 d30, d24, d25, #6 subs \h, \h, #2 vmul.s16 d4, d16, d0[0] vmla.s16 d4, d18, d0[1] vmla.s16 d4, d20, d0[2] vmla.s16 d4, d22, d0[3] vmul.s16 d5, d24, d0[0] vmla.s16 d5, d26, d0[1] vmla.s16 d5, d28, d0[2] vmla.s16 d5, d30, d0[3] vrshr.s16 q2, q2, #2 .ifc \type, put vqrshrun.s16 d4, q2, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd .endif bgt 4b pop {r4-r11,pc} 80: // 8xN h vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 8: vld1.8 {q8}, [\src], \s_strd vld1.8 {q12}, [\sr2], \s_strd vmovl.u8 q9, d17 vmovl.u8 q8, d16 vmovl.u8 q13, d25 vmovl.u8 q12, d24 vmul.s16 q10, q8, d0[0] vmul.s16 q14, q12, 
d0[0] .irpc i, 1234567 vext.8 q11, q8, q9, #(2*\i) vext.8 q15, q12, q13, #(2*\i) .if \i < 4 vmla.s16 q10, q11, d0[\i] vmla.s16 q14, q15, d0[\i] .else vmla.s16 q10, q11, d1[\i-4] vmla.s16 q14, q15, d1[\i-4] .endif .endr subs \h, \h, #2 vrshr.s16 q10, q10, #2 vrshr.s16 q14, q14, #2 .ifc \type, put vqrshrun.s16 d20, q10, #4 vqrshrun.s16 d28, q14, #4 vst1.8 {d20}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q10}, [\dst, :128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h // This could be done without touching q4-q6, by using only // one temporary for vext in the loop. That's slower on A7 and A53, // (but surprisingly, marginally faster on A8 and A73). vpush {q4-q6} vld1.8 {d0}, [\mx, :64] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 sub \s_strd, \s_strd, \w sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w .endif 161: vld1.8 {d16, d17, d18}, [\src]! vld1.8 {d24, d25, d26}, [\sr2]! mov \mx, \w vmovl.u8 q10, d18 vmovl.u8 q9, d17 vmovl.u8 q8, d16 vmovl.u8 q14, d26 vmovl.u8 q13, d25 vmovl.u8 q12, d24 16: vmul.s16 q1, q8, d0[0] vmul.s16 q2, q9, d0[0] vmul.s16 q3, q12, d0[0] vmul.s16 q4, q13, d0[0] .irpc i, 1234567 vext.8 q5, q8, q9, #(2*\i) vext.8 q6, q9, q10, #(2*\i) vext.8 q11, q12, q13, #(2*\i) vext.8 q15, q13, q14, #(2*\i) .if \i < 4 vmla.s16 q1, q5, d0[\i] vmla.s16 q2, q6, d0[\i] vmla.s16 q3, q11, d0[\i] vmla.s16 q4, q15, d0[\i] .else vmla.s16 q1, q5, d1[\i-4] vmla.s16 q2, q6, d1[\i-4] vmla.s16 q3, q11, d1[\i-4] vmla.s16 q4, q15, d1[\i-4] .endif .endr vrshr.s16 q1, q1, #2 vrshr.s16 q2, q2, #2 vrshr.s16 q3, q3, #2 vrshr.s16 q4, q4, #2 subs \mx, \mx, #16 .ifc \type, put vqrshrun.s16 d2, q1, #4 vqrshrun.s16 d3, q2, #4 vqrshrun.s16 d4, q3, #4 vqrshrun.s16 d5, q4, #4 vst1.8 {q1}, [\dst, :128]! vst1.8 {q2}, [\ds2, :128]! .else vst1.16 {q1, q2}, [\dst, :128]! vst1.16 {q3, q4}, [\ds2, :128]! .endif ble 9f vmov q8, q10 vmov q12, q14 vld1.8 {d18, d19}, [\src]! vld1.8 {d26, d27}, [\sr2]! 
vmovl.u8 q10, d19 vmovl.u8 q9, d18 vmovl.u8 q14, d27 vmovl.u8 q13, d26 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b vpop {q4-q6} pop {r4-r11,pc} L(\type\()_8tap_v): cmp \h, #4 ubfx r9, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r9 add \my, r10, \my, lsl #3 adr r9, L(\type\()_8tap_v_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_v_tbl): .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put bgt 28f cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 // 2x2 v load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_16 d1, d2, d3, d4, d5 bgt 24f vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 mul_mla_4 d6, d16, d18, d20, d22 vqrshrun_s16 6, q3, d6 st_16 \d_strd, d6, 2 pop {r4-r11,pc} 24: // 2x4 v load_16 \sr2, \src, \s_strd, d6, d7 interleave_1_16 d5, d6, d7 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6 vmov d17, d20 vmov d19, d22 vmov d21, d24 vmov d23, d26 mul_mla_4 q3, q8, q9, q10, q11 vqrshrun_s16 6, q3, d6 st_16 \d_strd, d6, 4 pop {r4-r11,pc} 28: // 2x6, 2x8, 2x12, 2x16 v vpush {q4-q7} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14 interleave_1_16 d2, d4, d6, d8, d10 interleave_1_16 d10, d12, d14 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12 vmov d3, d6 vmov d5, d8 vmov d7, d10 vmov d9, d12 216: subs \h, \h, #4 load_16 \sr2, \src, \s_strd, d16, d18, d20, d22 interleave_1_16 d14, d16, d18, d20, d22 vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20 vmov d11, d14 vmov d13, d16 vmov d15, d18 vmov d17, d20 mul_mla_8_0 q1, q1, q2, q3, q4, q5, q6, q7, q8 vqrshrun_s16 6, q1, d2 st_16 \d_strd, d2, 4 ble 0f cmp \h, #2 vmov q1, q5 vmov q2, q6 vmov q3, q7 vmov q4, q8 vmov q5, q9 vmov q6, q10 vmov d14, d22 beq 26f b 216b 26: load_16 \sr2, \src, \s_strd, d16, d18 interleave_1_16 d14, d16, d18 vmovl_u8 q7, d14, q8, d16 vmov d11, d14 vmov d13, d16 mul_mla_8_0 d2, d2, d4, d6, d8, d10, d12, d14, d16 vqrshrun_s16 6, q1, d2 st_16 \d_strd, d2, 2 0: vpop {q4-q7} pop {r4-r11,pc} .endif 40: bgt 480f // 4x2, 4x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_32 d1, d2, d3, d4, d5 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4 mul_mla_4 q3, q8, q9, q10, q11 shift_store_4 \type, \d_strd, q3, d6, d7 ble 0f load_32 \sr2, \src, \s_strd, d6, d7 interleave_1_32 d5, d6, d7 vmovl_u8 q12, d5, q13, d6 mul_mla_4 q3, q10, q11, q12, q13 shift_store_4 \type, \d_strd, q3, d6, d7 0: pop {r4-r11,pc} 480: // 4x6, 4x8, 4x12, 4x16 v vpush {q4} vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, 
d20 interleave_1_32 d2, d4, d6 interleave_1_32 d6, d8, d16, d18, d20 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18 48: subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d22, d24, d26, d28 interleave_1_32 d20, d22, d24, d26, d28 vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26 mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13 shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5 ble 0f load_32 \sr2, \src, \s_strd, d30, d2 subs \h, \h, #2 interleave_1_32 d28, d30, d2 vmovl_u8 q14, d28, q15, d30 mul_mla_8_0 q8, q8, q9, q10, q11, q12, q13, q14, q15 shift_store_4 \type, \d_strd, q8, d16, d17 ble 0f load_32 \sr2, \src, \s_strd, d4, d6 subs \h, \h, #2 interleave_1_32 d2, d4, d6 vmovl_u8 q1, d2, q2, d4 mul_mla_8_0 q9, q10, q11, q12, q13, q14, q15, q1, q2 shift_store_4 \type, \d_strd, q9, d18, d19 ble 0f subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d8, d16, d18, d20 interleave_1_32 d6, d8, d16, d18, d20 vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18 mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9 shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27 bgt 48b 0: vpop {q4} pop {r4-r11,pc} 80: bgt 880f // 8x2, 8x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5 mul_mla_4 q1, q8, q9, q10, q11 mul_mla_4 q2, q9, q10, q11, q12 shift_store_8 \type, \d_strd, q1, d2, q2, d4 ble 0f load_reg \sr2, \src, \s_strd, d6, d7 vmovl_u8 q13, d6, q14, d7 mul_mla_4 q1, q10, q11, q12, q13 mul_mla_4 q2, q11, q12, q13, q14 shift_store_8 \type, \d_strd, q1, d2, q2, d4 0: pop {r4-r11,pc} 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: 1280: vpush {q4} vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20 vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20 88: subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d22, d24 vmovl_u8 q11, d22, q12, d24 mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12 shift_store_8 \type, \d_strd, q1, d2, q2, d4 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d26, d28 vmovl_u8 q13, d26, q14, d28 mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14 shift_store_8 \type, \d_strd, q3, d6, q4, d8 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d30, d2 vmovl_u8 q15, d30, q1, d2 mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1 shift_store_8 \type, \d_strd, q8, d16, q9, d18 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, d4, d6 vmovl_u8 q2, d4, q3, d6 mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3 shift_store_8 \type, \d_strd, q10, d20, q11, d22 ble 9f subs \h, \h, #4 load_reg \sr2, \src, \s_strd, d8, d16, d18, d20 vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20 mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8 mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10 shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30 bgt 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: vpop {q4} pop {r4-r11,pc} 160: bgt 1680b // 16x2, 16x4 v add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 cmp \h, #2 load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15 vmovl.u8 q1, d22 vmovl.u8 q2, d24 vmovl.u8 q3, d26 vmovl.u8 q8, d28 vmovl.u8 q9, d30 vmovl.u8 q11, d23 vmovl.u8 q12, d25 vmovl.u8 q13, d27 vmovl.u8 q14, d29 vmovl.u8 q15, d31 mul_mla_4 q1, q1, q2, q3, q8 mul_mla_4 q10, q2, q3, q8, q9 mul_mla_4 q2, q11, q12, q13, q14 mul_mla_4 q11, q12, q13, q14, q15 shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11 ble 0f load_reg \sr2, \src, \s_strd, q10, q11 vmovl.u8 q1, d20 vmovl.u8 q10, d21 vmovl.u8 q12, d22 vmovl.u8 q11, d23 mul_mla_4 q2, q3, q8, q9, q1 mul_mla_4 q3, q13, q14, q15, q10 mul_mla_4 q13, q8, q9, q1, q12 mul_mla_4 q14, q14, q15, q10, q11 shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14 0: pop {r4-r11,pc} L(\type\()_8tap_hv): cmp \h, #4 ubfx r9, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r9 add \my, r10, \my, lsl #3 adr r9, L(\type\()_8tap_hv_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_8tap_hv_tbl): .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB 20: .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 280f add \my, \my, #2 vld1.32 {d2[]}, [\my] // 2x2, 2x4 hv sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 
vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d26}, [\src], \s_strd vmovl.u8 q13, d26 vext.8 q14, q13, q13, #2 vmul.s16 d26, d26, d0 vmul.s16 d28, d28, d0 vpadd.s16 d26, d26, d28 vpadd.s16 d26, d26, d26 vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vmov d17, d26 vext.8 d16, d16, d26, #4 2: bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d26, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 subs \h, \h, #2 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 vmov d17, d26 b 2b 280: // 2x8, 2x16, 2x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d26}, [\src], \s_strd vmovl.u8 q13, d26 vext.8 q14, q13, q13, #2 vmul.s16 d26, d26, d0 vmul.s16 d28, d28, d0 vpadd.s16 d26, d26, d28 vpadd.s16 d26, d26, d26 vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vmov d17, d26 vext.8 d16, d16, d26, #4 bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d26, #4 vmov d19, d26 bl L(\type\()_8tap_filter_2) vext.8 d20, d19, d26, #4 vmov d21, d26 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d26, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d19, d2[3] vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] vmlal.s16 q2, d26, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqmovun.s16 d4, q2 subs \h, \h, #2 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd ble 0f vmov d16, d18 vmov d17, d19 vmov d18, d20 vmov d19, d21 vmov d20, d22 vmov d21, d26 b 28b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_2): vld1.8 {d28}, [\sr2], \s_strd vld1.8 {d30}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vmovl.u8 q13, d28 vmovl.u8 q14, d29 vmov d27, d28 vmovl.u8 q14, d30 vmovl.u8 q15, d31 vtrn.32 d26, d28 vtrn.32 d27, d30 vmul.s16 d26, d26, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d28, d0[2] vmla.s16 d26, d30, d0[3] vrshr.s16 d26, d26, #2 vext.8 d27, d26, d26, #4 bx lr .endif 40: add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 480f add \my, \my, #2 vld1.32 {d2[]}, [\my] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 // 4x2, 4x4 hv vld1.8 {d30}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d31, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d31, d0[3] vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_4) vmov d17, d26 vmov d18, d27 4: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d26, d2[3] vmull.s16 q3, d17, d2[0] vmlal.s16 q3, d18, d2[1] vmlal.s16 q3, d26, d2[2] vmlal.s16 q3, d27, d2[3] vqrshrn.s32 d4, q2, #\shift_hv vqrshrn.s32 d6, q3, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d4, q2 vqmovun.s16 d6, q3 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d6[0]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d6}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 vmov d17, d26 vmov d18, d27 b 4b 480: // 4x8, 4x16, 4x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl 
\d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.8 {d30}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d31, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d31, d0[3] vrshr.s16 d16, d26, #2 bl L(\type\()_8tap_filter_4) vmov d17, d26 vmov d18, d27 bl L(\type\()_8tap_filter_4) vmov d19, d26 vmov d20, d27 bl L(\type\()_8tap_filter_4) vmov d21, d26 vmov d22, d27 48: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d19, d2[3] vmlal.s16 q2, d20, d3[0] vmlal.s16 q2, d21, d3[1] vmlal.s16 q2, d22, d3[2] vmlal.s16 q2, d26, d3[3] vmull.s16 q3, d17, d2[0] vmlal.s16 q3, d18, d2[1] vmlal.s16 q3, d19, d2[2] vmlal.s16 q3, d20, d2[3] vmlal.s16 q3, d21, d3[0] vmlal.s16 q3, d22, d3[1] vmlal.s16 q3, d26, d3[2] vmlal.s16 q3, d27, d3[3] vqrshrn.s32 d4, q2, #\shift_hv vqrshrn.s32 d6, q3, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d4, q2 vqmovun.s16 d6, q3 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d6[0]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d6}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 vmov d17, d19 vmov d18, d20 vmov d19, d21 vmov d20, d22 vmov d21, d26 vmov d22, d27 b 48b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_4): vld1.8 {d30}, [\sr2], \s_strd vld1.8 {d31}, [\src], \s_strd vmovl.u8 q14, d30 vext.8 d27, d28, d29, #2 vext.8 d30, d28, d29, #4 vext.8 d1, d28, d29, #6 vmul.s16 d26, d28, d0[0] vmla.s16 d26, d27, d0[1] vmla.s16 d26, d30, d0[2] vmla.s16 d26, d1, d0[3] vmovl.u8 q14, d31 vext.8 d30, d28, d29, #2 vext.8 d31, d28, d29, #4 vext.8 d1, d28, d29, #6 vmul.s16 d27, d28, d0[0] vmla.s16 d27, d30, d0[1] vmla.s16 d27, d31, d0[2] vmla.s16 d27, d1, d0[3] vrshr.s16 d26, d26, #2 vrshr.s16 d27, d27, #2 bx lr 80: 160: 320: bgt 880f vpush {q4-q7} add \my, \my, #2 vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #3 sub \src, \src, \s_strd vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.8 {q14}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vrshr.s16 q3, q10, #2 bl L(\type\()_8tap_filter_8) vmov q4, q10 vmov q5, q11 8: bl L(\type\()_8tap_filter_8) vmull.s16 q12, d6, d2[0] vmull.s16 q13, d7, d2[0] vmull.s16 q14, d8, d2[0] vmull.s16 q15, d9, d2[0] vmlal.s16 q12, d8, d2[1] vmlal.s16 q13, d9, d2[1] vmlal.s16 q14, d10, d2[1] vmlal.s16 q15, d11, d2[1] vmlal.s16 q12, d10, d2[2] vmlal.s16 q13, d11, d2[2] vmlal.s16 q14, d20, d2[2] vmlal.s16 q15, d21, d2[2] vmlal.s16 q12, d20, d2[3] vmlal.s16 q13, d21, d2[3] vmlal.s16 q14, d22, d2[3] vmlal.s16 q15, d23, d2[3] vqrshrn.s32 d24, q12, #\shift_hv vqrshrn.s32 d25, q13, #\shift_hv vqrshrn.s32 d28, q14, #\shift_hv vqrshrn.s32 d29, q15, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d24, q12 vqmovun.s16 d28, q14 vst1.8 {d24}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q12}, [\dst, :128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif ble 9f vmov q3, q5 vmov q4, q10 vmov q5, q11 b 8b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #8 .ifc 
\type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\mx, :64] vld1.8 {d2}, [\my, :64] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.8 {q14}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vrshr.s16 q3, q10, #2 bl L(\type\()_8tap_filter_8) vmov q4, q10 vmov q5, q11 bl L(\type\()_8tap_filter_8) vmov q6, q10 vmov q7, q11 bl L(\type\()_8tap_filter_8) vmov q8, q10 vmov q9, q11 88: bl L(\type\()_8tap_filter_8) vmull.s16 q12, d6, d2[0] vmull.s16 q13, d7, d2[0] vmull.s16 q14, d8, d2[0] vmull.s16 q15, d9, d2[0] vmlal.s16 q12, d8, d2[1] vmlal.s16 q13, d9, d2[1] vmlal.s16 q14, d10, d2[1] vmlal.s16 q15, d11, d2[1] vmlal.s16 q12, d10, d2[2] vmlal.s16 q13, d11, d2[2] vmlal.s16 q14, d12, d2[2] vmlal.s16 q15, d13, d2[2] vmlal.s16 q12, d12, d2[3] vmlal.s16 q13, d13, d2[3] vmlal.s16 q14, d14, d2[3] vmlal.s16 q15, d15, d2[3] vmlal.s16 q12, d14, d3[0] vmlal.s16 q13, d15, d3[0] vmlal.s16 q14, d16, d3[0] vmlal.s16 q15, d17, d3[0] vmlal.s16 q12, d16, d3[1] vmlal.s16 q13, d17, d3[1] vmlal.s16 q14, d18, d3[1] vmlal.s16 q15, d19, d3[1] vmlal.s16 q12, d18, d3[2] vmlal.s16 q13, d19, d3[2] vmlal.s16 q14, d20, d3[2] vmlal.s16 q15, d21, d3[2] vmlal.s16 q12, d20, d3[3] vmlal.s16 q13, d21, d3[3] vmlal.s16 q14, d22, d3[3] vmlal.s16 q15, d23, d3[3] vqrshrn.s32 d24, q12, #\shift_hv vqrshrn.s32 d25, q13, #\shift_hv vqrshrn.s32 d28, q14, #\shift_hv vqrshrn.s32 d29, q15, #\shift_hv subs \h, \h, #2 .ifc \type, put vqmovun.s16 d24, q12 vqmovun.s16 d28, q14 vst1.8 {d24}, [\dst, :64], \d_strd vst1.8 {d28}, [\ds2, :64], \d_strd .else vst1.16 {q12}, [\dst, :128], \d_strd vst1.16 {q14}, [\ds2, :128], \d_strd .endif ble 9f vmov q3, q5 vmov q4, q6 vmov q5, q7 vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: vpop {q4-q7} pop {r4-r11,pc} L(\type\()_8tap_filter_8): vld1.8 {q14}, [\sr2], \s_strd vld1.8 {q15}, [\src], \s_strd vmovl.u8 q12, d28 vmovl.u8 q13, d29 vmul.s16 q10, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q10, q14, d1[\i-4] .endr vmovl.u8 q12, d30 vmovl.u8 q13, d31 vmul.s16 q11, q12, d0[0] .irpc i, 123 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q11, q14, d0[\i] .endr .irpc i, 4567 vext.8 q14, q12, q13, #(2*\i) vmla.s16 q11, q14, d1[\i-4] .endr vrshr.s16 q10, q10, #2 vrshr.s16 q11, q11, #2 bx lr endfunc function \type\()_bilin_8bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] vdup.8 d1, \mx vdup.8 d3, \my rsb r8, \mx, #16 rsb r9, \my, #16 vdup.8 d0, r8 vdup.8 d2, r9 .ifc \type, prep lsl \d_strd, \w, #1 .endif clz r8, \w cmp \mx, #0 sub r8, r8, #24 bne L(\type\()_bilin_h) cmp \my, #0 bne L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cmp \my, #0 bne L(\type\()_bilin_hv) adr r9, L(\type\()_bilin_h_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 
L(\type\()_bilin_h_tbl): .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: vld1.32 {d4[]}, [\src], \s_strd vld1.32 {d6[]}, [\sr2], \s_strd vext.8 d5, d4, d4, #1 vext.8 d7, d6, d6, #1 vtrn.16 q2, q3 subs \h, \h, #2 vmull.u8 q3, d4, d0 vmlal.u8 q3, d5, d1 vqrshrn.u16 d4, q3, #4 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: vld1.8 {d4}, [\src], \s_strd vld1.8 {d6}, [\sr2], \s_strd vext.8 d5, d4, d4, #1 vext.8 d7, d6, d6, #1 vtrn.32 q2, q3 subs \h, \h, #2 vmull.u8 q3, d4, d0 vmlal.u8 q3, d5, d1 .ifc \type, put vqrshrn.u16 d4, q3, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d6}, [\dst, :64], \d_strd vst1.16 {d7}, [\ds2, :64], \d_strd .endif bgt 4b pop {r4-r11,pc} 80: // 8xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: vld1.8 {q8}, [\src], \s_strd vld1.8 {q10}, [\sr2], \s_strd vext.8 q9, q8, q8, #1 vext.8 q11, q10, q10, #1 subs \h, \h, #2 vmull.u8 q8, d16, d0 vmull.u8 q10, d20, d0 vmlal.u8 q8, d18, d1 vmlal.u8 q10, d22, d1 .ifc \type, put vqrshrn.u16 d16, q8, #4 vqrshrn.u16 d18, q10, #4 vst1.8 {d16}, [\dst, :64], \d_strd vst1.8 {d18}, [\ds2, :64], \d_strd .else vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q10}, [\ds2, :128], \d_strd .endif bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w .endif 161: vld1.8 {d16}, [\src]! vld1.8 {d22}, [\sr2]! mov \mx, \w 16: vld1.8 {d17,d18}, [\src]! vld1.8 {d23,d24}, [\sr2]! vext.8 q10, q8, q9, #1 vext.8 q13, q11, q12, #1 vmull.u8 q2, d16, d0 vmull.u8 q3, d17, d0 vmull.u8 q14, d22, d0 vmull.u8 q15, d23, d0 vmlal.u8 q2, d20, d1 vmlal.u8 q3, d21, d1 vmlal.u8 q14, d26, d1 vmlal.u8 q15, d27, d1 subs \mx, \mx, #16 .ifc \type, put vqrshrn.u16 d4, q2, #4 vqrshrn.u16 d5, q3, #4 vqrshrn.u16 d28, q14, #4 vqrshrn.u16 d29, q15, #4 vst1.8 {q2}, [\dst, :128]! vst1.8 {q14}, [\ds2, :128]! .else vst1.16 {q2, q3}, [\dst, :128]! vst1.16 {q14, q15}, [\ds2, :128]! 
.endif ble 9f vmov d16, d18 vmov d22, d24 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b pop {r4-r11,pc} L(\type\()_bilin_v): cmp \h, #4 adr r9, L(\type\()_bilin_v_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_bilin_v_tbl): .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v vld1.16 {d16[]}, [\src], \s_strd bgt 24f 22: vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #6 vext.8 d17, d17, d18, #6 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 vqrshrn.u16 d4, q2, #4 vst1.16 {d4[0]}, [\dst, :16] vst1.16 {d4[1]}, [\ds2, :16] pop {r4-r11,pc} 24: // 2x4, 2x6, 2x8, ... v vld1.16 {d17[]}, [\sr2], \s_strd vld1.16 {d18[]}, [\src], \s_strd vld1.16 {d19[]}, [\sr2], \s_strd vld1.16 {d20[]}, [\src], \s_strd sub \h, \h, #4 vext.8 d16, d16, d17, #6 vext.8 d17, d17, d18, #6 vext.8 d18, d18, d19, #6 vext.8 d19, d19, d20, #6 vtrn.32 d16, d18 vtrn.32 d17, d19 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 cmp \h, #2 vqrshrn.u16 d4, q2, #4 vst1.16 {d4[0]}, [\dst, :16], \d_strd vst1.16 {d4[1]}, [\ds2, :16], \d_strd vst1.16 {d4[2]}, [\dst, :16], \d_strd vst1.16 {d4[3]}, [\ds2, :16], \d_strd blt 0f vmov d16, d20 beq 22b b 24b 0: pop {r4-r11,pc} .endif 40: // 4xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.32 {d16[]}, [\src], \s_strd 4: vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vmull.u8 q2, d16, d2 vmlal.u8 q2, d17, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d4, q2, #4 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd .else vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d18 b 4b 0: pop {r4-r11,pc} 80: // 8xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {d16}, [\src], \s_strd 8: vld1.8 {d17}, [\sr2], \s_strd vld1.8 {d18}, [\src], \s_strd vmull.u8 q2, d16, d2 vmull.u8 q3, d17, d2 vmlal.u8 q2, d17, d3 vmlal.u8 q3, d18, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d4, q2, #4 vqrshrn.u16 d6, q3, #4 vst1.8 {d4}, [\dst, :64], \d_strd vst1.8 {d6}, [\ds2, :64], \d_strd .else vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd .endif ble 0f vmov d16, d18 b 8b 0: pop {r4-r11,pc} 160: // 16xN, 32xN, ... 
320: 640: 1280: mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {q8}, [\src], \s_strd 2: vld1.8 {q9}, [\sr2], \s_strd vld1.8 {q10}, [\src], \s_strd vmull.u8 q12, d16, d2 vmull.u8 q13, d17, d2 vmull.u8 q14, d18, d2 vmull.u8 q15, d19, d2 vmlal.u8 q12, d18, d3 vmlal.u8 q13, d19, d3 vmlal.u8 q14, d20, d3 vmlal.u8 q15, d21, d3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d24, q12, #4 vqrshrn.u16 d25, q13, #4 vqrshrn.u16 d28, q14, #4 vqrshrn.u16 d29, q15, #4 vst1.8 {q12}, [\dst, :128], \d_strd vst1.8 {q14}, [\ds2, :128], \d_strd .else vst1.16 {q12, q13}, [\dst, :128], \d_strd vst1.16 {q14, q15}, [\ds2, :128], \d_strd .endif ble 9f vmov q8, q10 b 2b 9: subs \w, \w, #16 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 .ifc \type, put add \dst, \dst, #16 .else add \dst, \dst, #32 .endif b 1b 0: pop {r4-r11,pc} L(\type\()_bilin_hv): vmovl.u8 q2, d2 vmovl.u8 q3, d3 adr r9, L(\type\()_bilin_hv_tbl) ldr r8, [r9, r8, lsl #2] add r9, r9, r8 bx r9 .align 2 L(\type\()_bilin_hv_tbl): .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB 20: // 2xN hv .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.32 {d28[]}, [\src], \s_strd vext.8 d29, d28, d28, #1 vmull.u8 q8, d28, d0 vmlal.u8 q8, d29, d1 2: vld1.32 {d28[]}, [\sr2], \s_strd vld1.32 {d30[]}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vtrn.16 d28, d30 vtrn.16 d29, d31 vmull.u8 q9, d28, d0 vmlal.u8 q9, d29, d1 vtrn.32 d16, d18 vmul.u16 d20, d16, d4 vmla.u16 d20, d19, d6 vqrshrn.u16 d20, q10, #8 subs \h, \h, #2 vst1.16 {d20[0]}, [\dst, :16], \d_strd vst1.16 {d20[1]}, [\ds2, :16], \d_strd ble 0f vtrn.32 d19, d16 b 2b 0: pop {r4-r11,pc} .endif 40: // 4xN hv add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {d28}, [\src], \s_strd vext.8 d29, d28, d28, #1 vmull.u8 q8, d28, d0 vmlal.u8 q8, d29, d1 4: vld1.8 {d28}, [\sr2], \s_strd vld1.8 {d30}, [\src], \s_strd vext.8 d29, d28, d28, #1 vext.8 d31, d30, d30, #1 vtrn.32 d28, d30 vtrn.32 d29, d31 vmull.u8 q9, d28, d0 vmlal.u8 q9, d29, d1 vmov d17, d18 vmul.u16 q10, q8, q2 vmla.u16 q10, q9, q3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d20, q10, #8 vst1.32 {d20[0]}, [\dst, :32], \d_strd vst1.32 {d20[1]}, [\ds2, :32], \d_strd .else vrshr.u16 q10, q10, #4 vst1.16 {d20}, [\dst, :64], \d_strd vst1.16 {d21}, [\ds2, :64], \d_strd .endif ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN, 16xN, ... 
hv 160: 320: 640: 1280: mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.8 {q12}, [\src], \s_strd vext.8 q13, q12, q12, #1 vmull.u8 q8, d24, d0 vmlal.u8 q8, d26, d1 2: vld1.8 {q12}, [\sr2], \s_strd vld1.8 {q14}, [\src], \s_strd vext.8 q13, q12, q12, #1 vext.8 q15, q14, q14, #1 vmull.u8 q9, d24, d0 vmlal.u8 q9, d26, d1 vmull.u8 q10, d28, d0 vmlal.u8 q10, d30, d1 vmul.u16 q8, q8, q2 vmla.u16 q8, q9, q3 vmul.u16 q9, q9, q2 vmla.u16 q9, q10, q3 subs \h, \h, #2 .ifc \type, put vqrshrn.u16 d16, q8, #8 vqrshrn.u16 d18, q9, #8 vst1.8 {d16}, [\dst, :64], \d_strd vst1.8 {d18}, [\ds2, :64], \d_strd .else vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q9}, [\ds2, :128], \d_strd .endif ble 9f vmov q8, q10 b 2b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 1b 0: pop {r4-r11,pc} endfunc .endm filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10 filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6 .macro load_filter_ptr src asr r12, \src, #10 add r12, r11, r12, lsl #3 .endm .macro load_filter_coef dst, src, inc add \src, \src, \inc vld1.8 {\dst}, [r12, :64] .endm .macro load_filter_row dst, src, inc load_filter_ptr \src load_filter_coef \dst, \src, \inc .endm function warp_filter_horz_neon load_filter_ptr r5 // filter 0 vld1.16 {q7}, [r2], r3 vmov.i8 q6, #128 load_filter_coef d0, r5, r7 // filter 0 load_filter_row d1, r5, r7 // filter 1 load_filter_row d2, r5, r7 // filter 2 load_filter_ptr r5 // filter 3 veor q7, q7, q6 // subtract by 128 to allow using vmull load_filter_coef d3, r5, r7 // filter 3 vext.8 d12, d14, d15, #1 // filter 1 pixels vext.8 d13, d14, d15, #2 // filter 2 pixels load_filter_ptr r5 // filter 4 vmull.s8 q2, d14, d0 // filter 0 output vmull.s8 q3, d12, d1 // filter 1 output load_filter_coef d0, r5, r7 // filter 4 load_filter_ptr r5 // filter 5 vext.8 d12, d14, d15, #3 // filter 3 pixels vmull.s8 q4, d13, d2 // filter 2 output vext.8 d13, d14, d15, #4 // filter 4 pixels vpadd.i16 d4, d4, d5 // pixel 0 (4x16) vpadd.i16 d5, d6, d7 // pixel 1 (4x16) load_filter_coef d1, r5, r7 // filter 5 load_filter_ptr r5 // filter 6 vmull.s8 q5, d12, d3 // filter 3 output vext.8 d12, d14, d15, #5 // filter 5 pixels vmull.s8 q3, d13, d0 // filter 4 output load_filter_coef d0, r5, r7 // filter 6 vext.8 d13, d14, d15, #6 // filter 6 pixels load_filter_ptr r5 // filter 7 vpadd.i16 d8, d8, d9 // pixel 2 (4x16) vpadd.i16 d9, d10, d11 // pixel 3 (4x16) vmull.s8 q5, d12, d1 // filter 5 output load_filter_coef d1, r5, r7 // filter 7 vext.8 d14, d14, d15, #7 // filter 7 pixels vpadd.i16 d6, d6, d7 // pixel 4 (4x16) vpadd.i16 d10, d10, d11 // pixel 5 (4x16) vmull.s8 q6, d13, d0 // filter 6 output vmull.s8 q7, d14, d1 // filter 7 output sub r5, r5, r7, lsl #3 vpadd.i16 d4, d4, d5 // pixel 0,1 (2x16) vpadd.i16 d5, d8, d9 // pixel 2,3 (2x16) vpadd.i16 d12, d12, d13 // pixel 6 (4x16) vpadd.i16 d14, d14, d15 // pixel 7 (4x16) vpadd.i16 d6, d6, d10 // pixel 4,5 (2x16) vpadd.i16 d10, d12, d14 // pixel 6,7 (2x16) vpadd.i16 d4, d4, d5 // pixel 0-3 vpadd.i16 d5, d6, d10 // pixel 4-7 add r5, r5, r8 bx lr endfunc // void dav1d_warp_affine_8x8_8bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my) 
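// For orientation, a loose scalar sketch of what the warp macro below
// computes (pseudocode only; the shift/offset constants are read off this
// assembly -- 3 after the horizontal pass, then \shift = 11 for put / 7 for
// prep -- so treat them as inferred rather than authoritative):
//
//   int16_t mid[15][8];
//   src -= 3 * src_stride + 3;
//   for (y = 0; y < 15; y++, mx += abcd[1], src += src_stride)
//       for (x = 0, tmx = mx; x < 8; x++, tmx += abcd[0])
//           mid[y][x] = rshr(filter8_row(src + x,
//                            mc_warp_filter[64 + ((tmx + 512) >> 10)]), 3);
//   for (y = 0; y < 8; y++, my += abcd[3], dst += dst_stride)
//       for (x = 0, tmy = my; x < 8; x++, tmy += abcd[2])
//           dst[x] = finish(filter8_col(&mid[y][x],
//                           mc_warp_filter[64 + ((tmy + 512) >> 10)]), shift);
//
// where rshr() is a rounding right shift, filter8_row()/filter8_col() are
// 8-tap dot products along a row/column, and finish() rounds, shifts and (for
// put) saturates to pixel range. The NEON code keeps the eight most recently
// filtered rows in q8-q15 and slides that window down one row per output row.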
.macro warp t, shift function warp_affine_8x8\t\()_8bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldr r6, [sp, #108] ldrd r8, r9, [r4] sxth r7, r8 asr r8, r8, #16 asr r4, r9, #16 sxth r9, r9 mov r10, #8 sub r2, r2, r3, lsl #1 sub r2, r2, r3 sub r2, r2, #3 movrel r11, X(mc_warp_filter), 64*8 .ifnb \t lsl r1, r1, #1 .endif add r5, r5, #512 add r6, r6, #512 bl warp_filter_horz_neon vrshr.s16 q8, q2, #3 bl warp_filter_horz_neon vrshr.s16 q9, q2, #3 bl warp_filter_horz_neon vrshr.s16 q10, q2, #3 bl warp_filter_horz_neon vrshr.s16 q11, q2, #3 bl warp_filter_horz_neon vrshr.s16 q12, q2, #3 bl warp_filter_horz_neon vrshr.s16 q13, q2, #3 bl warp_filter_horz_neon vrshr.s16 q14, q2, #3 1: bl warp_filter_horz_neon vrshr.s16 q15, q2, #3 load_filter_row d8, r6, r9 load_filter_row d9, r6, r9 load_filter_row d10, r6, r9 load_filter_row d11, r6, r9 load_filter_row d12, r6, r9 load_filter_row d13, r6, r9 load_filter_row d14, r6, r9 load_filter_row d15, r6, r9 transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 vmovl.s8 q1, d8 vmovl.s8 q2, d9 vmovl.s8 q3, d10 vmovl.s8 q4, d11 vmovl.s8 q5, d12 vmovl.s8 q6, d13 sub r6, r6, r9, lsl #3 // This ordering of vmull/vmlal is highly beneficial for // Cortex A8/A9/A53 here, but harmful for Cortex A7. vmull.s16 q0, d16, d2 vmlal.s16 q0, d18, d4 vmlal.s16 q0, d20, d6 vmlal.s16 q0, d22, d8 vmlal.s16 q0, d24, d10 vmlal.s16 q0, d26, d12 vmull.s16 q1, d17, d3 vmlal.s16 q1, d19, d5 vmlal.s16 q1, d21, d7 vmlal.s16 q1, d23, d9 vmlal.s16 q1, d25, d11 vmlal.s16 q1, d27, d13 vmovl.s8 q2, d14 vmovl.s8 q3, d15 vmlal.s16 q0, d28, d4 vmlal.s16 q0, d30, d6 vmlal.s16 q1, d29, d5 vmlal.s16 q1, d31, d7 .ifb \t vmov.i16 q7, #128 .else vmov.i16 q7, #0x800 .endif vmov q8, q9 vmov q9, q10 vqrshrn.s32 d0, q0, #\shift vmov q10, q11 vqrshrn.s32 d1, q1, #\shift vmov q11, q12 vadd.i16 q0, q0, q7 vmov q12, q13 .ifb \t vqmovun.s16 d0, q0 .endif vmov q13, q14 vmov q14, q15 subs r10, r10, #1 .ifnb \t vst1.16 {q0}, [r0, :128], r1 .else vst1.8 {d0}, [r0, :64], r1 .endif add r6, r6, r4 bgt 1b vpop {q4-q7} pop {r4-r11,pc} endfunc .endm warp , 11 warp t, 7 // void dav1d_emu_edge_8bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_8bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] ldrd r8, r9, [sp, #52] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub r12, r3, #1 // ih - 1 cmp r5, r3 sub lr, r2, #1 // iw - 1 it lt movlt r12, r5 // min(y, ih - 1) cmp r4, r2 bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) it lt movlt lr, r4 // min(x, iw - 1) bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) mla r8, r12, r9, r8 // ref += iclip() * stride add r8, r8, lr // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add r10, r5, r1 // y + bh neg r5, r5 // -y sub r10, r10, r3 // y + bh - ih sub r12, r1, #1 // bh - 1 cmp r10, r1 bic r5, r5, r5, asr #31 // max(-y, 0) it ge movge r10, r12 // min(y + bh - ih, bh-1) cmp r5, r1 bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) it ge movge r5, r12 // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add r11, r4, r0 // x + bw neg r4, r4 // -x sub r11, r11, r2 // x + bw - iw sub lr, r0, #1 // bw - 1 cmp r11, r0 bic r4, r4, r4, asr #31 // max(-x, 0) it ge movge r11, lr // 
min(x + bw - iw, bw-1) cmp r4, r0 bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) it ge movge r4, lr // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub r1, r1, r5 // bh - top_ext mla r6, r5, r7, r6 sub r2, r0, r4 // bw - left_ext sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext sub r2, r2, r11 // center_w = bw - left_ext - right_ext mov r0, r6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left vld1.8 {d0[], d1[]}, [r8] mov r12, r6 // out = dst mov r3, r4 1: subs r3, r3, #16 vst1.8 {q0}, [r12, :128]! bgt 1b .endif mov lr, r8 add r12, r6, r4 // out = dst + left_ext mov r3, r2 1: vld1.8 {q0, q1}, [lr]! subs r3, r3, #32 .if \need_left vst1.8 {q0, q1}, [r12]! .else vst1.8 {q0, q1}, [r12, :128]! .endif bgt 1b .if \need_right add r3, r8, r2 // in + center_w sub r3, r3, #1 // in + center_w - 1 add r12, r6, r4 // dst + left_ext vld1.8 {d0[], d1[]}, [r3] add r12, r12, r2 // out = dst + left_ext + center_w mov r3, r11 1: subs r3, r3, #16 vst1.8 {q0}, [r12]! bgt 1b .endif subs r1, r1, #1 // center_h-- add r6, r6, r7 add r8, r8, r9 bgt 0b .endm cmp r4, #0 beq 2f // need_left cmp r11, #0 beq 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cmp r11, #0 beq 4f // !need_left + need_right v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cmp r10, #0 // Storing the original dst in r0 overwrote bw, recalculate it here add r2, r2, r4 // center_w + left_ext add r2, r2, r11 // bw = center_w + left_ext + right_ext beq 3f // need_bottom sub r8, r6, r7 // ref = dst - stride mov r4, r2 1: vld1.8 {q0, q1}, [r8, :128]! mov r3, r10 2: subs r3, r3, #1 vst1.8 {q0, q1}, [r6, :128], r7 bgt 2b mls r6, r7, r10, r6 // dst -= bottom_ext * stride subs r4, r4, #32 // bw -= 32 add r6, r6, #32 // dst += 32 bgt 1b 3: cmp r5, #0 beq 3f // need_top mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride 1: vld1.8 {q0, q1}, [r0, :128]! mov r3, r5 2: subs r3, r3, #1 vst1.8 {q0, q1}, [r6, :128], r7 bgt 2b mls r6, r7, r5, r6 // dst -= top_ext * stride subs r2, r2, #32 // bw -= 32 add r6, r6, #32 // dst += 32 bgt 1b 3: pop {r4-r11,pc} endfunc rav1e-0.7.1/src/arm/32/mc16.S000064400000000000000000004014701046102023000133410ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #define PREP_BIAS 8192 .macro avg d0, d00, d01, d1, d10, d11 vld1.16 {q0, q1}, [r2, :128]! vld1.16 {q2, q3}, [r3, :128]! vqadd.s16 q0, q0, q2 vqadd.s16 q1, q1, q3 vmax.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits vmax.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits vqsub.s16 q0, q0, q12 // -2*PREP_BIAS - 1 << intermediate_bits vqsub.s16 q1, q1, q12 // -2*PREP_BIAS - 1 << intermediate_bits vshl.s16 \d0, q0, q13 // -(intermediate_bits+1) vshl.s16 \d1, q1, q13 // -(intermediate_bits+1) .endm .macro w_avg d0, d00, d01, d1, d10, d11 vld1.16 {q0, q1}, [r2, :128]! vld1.16 {q2, q3}, [r3, :128]! // This difference requires a 17 bit range, and all bits are // significant for the following multiplication. vsubl.s16 \d0, d4, d0 vsubl.s16 q0, d5, d1 vsubl.s16 \d1, d6, d2 vsubl.s16 q1, d7, d3 vmul.s32 \d0, \d0, q4 vmul.s32 q0, q0, q4 vmul.s32 \d1, \d1, q4 vmul.s32 q1, q1, q4 vshr.s32 \d0, \d0, #4 vshr.s32 q0, q0, #4 vshr.s32 \d1, \d1, #4 vshr.s32 q1, q1, #4 vaddw.s16 \d0, \d0, d4 vaddw.s16 q0, q0, d5 vaddw.s16 \d1, \d1, d6 vaddw.s16 q1, q1, d7 vmovn.i32 \d00, \d0 vmovn.i32 \d01, q0 vmovn.i32 \d10, \d1 vmovn.i32 \d11, q1 vrshl.s16 \d0, \d0, q13 // -intermediate_bits vrshl.s16 \d1, \d1, q13 // -intermediate_bits vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits vmin.s16 \d0, \d0, q15 // bitdepth_max vmin.s16 \d1, \d1, q15 // bitdepth_max vmax.s16 \d0, \d0, q14 // 0 vmax.s16 \d1, \d1, q14 // 0 .endm .macro mask d0, d00, d01, d1, d10, d11 vld1.8 {q7}, [r6, :128]! vld1.16 {q0, q1}, [r2, :128]! vneg.s8 q7, q7 vld1.16 {q2, q3}, [r3, :128]! 
vmovl.s8 q6, d14 vmovl.s8 q7, d15 vmovl.s16 q4, d12 vmovl.s16 q5, d13 vmovl.s16 q6, d14 vmovl.s16 q7, d15 vsubl.s16 \d0, d4, d0 vsubl.s16 q0, d5, d1 vsubl.s16 \d1, d6, d2 vsubl.s16 q1, d7, d3 vmul.s32 \d0, \d0, q4 vmul.s32 q0, q0, q5 vmul.s32 \d1, \d1, q6 vmul.s32 q1, q1, q7 vshr.s32 \d0, \d0, #6 vshr.s32 q0, q0, #6 vshr.s32 \d1, \d1, #6 vshr.s32 q1, q1, #6 vaddw.s16 \d0, \d0, d4 vaddw.s16 q0, q0, d5 vaddw.s16 \d1, \d1, d6 vaddw.s16 q1, q1, d7 vmovn.i32 \d00, \d0 vmovn.i32 \d01, q0 vmovn.i32 \d10, \d1 vmovn.i32 \d11, q1 vrshl.s16 \d0, \d0, q13 // -intermediate_bits vrshl.s16 \d1, \d1, q13 // -intermediate_bits vadd.s16 \d0, \d0, q12 // PREP_BIAS >> intermediate_bits vadd.s16 \d1, \d1, q12 // PREP_BIAS >> intermediate_bits vmin.s16 \d0, \d0, q15 // bitdepth_max vmin.s16 \d1, \d1, q15 // bitdepth_max vmax.s16 \d0, \d0, q14 // 0 vmax.s16 \d1, \d1, q14 // 0 .endm .macro bidir_fn type, bdmax function \type\()_16bpc_neon, export=1 push {r4-r7,lr} ldrd r4, r5, [sp, #20] ldr r6, [sp, #28] clz r4, r4 .ifnc \type, avg ldr r7, [sp, #32] vmov.i16 q14, #0 vdup.16 q15, r7 // bitdepth_max .endif .ifc \type, w_avg vpush {q4} .endif .ifc \type, mask vpush {q4-q7} .endif clz r7, \bdmax sub r7, r7, #18 // intermediate_bits = clz(bitdepth_max) - 18 .ifc \type, avg mov lr, #1 movw r12, #2*PREP_BIAS lsl lr, lr, r7 // 1 << intermediate_bits neg r12, r12 // -2*PREP_BIAS add r7, r7, #1 sub r12, r12, lr // -2*PREP_BIAS - 1 << intermediate_bits neg r7, r7 // -(intermediate_bits+1) vdup.16 q12, r12 // -2*PREP_BIAS - 1 << intermediate_bits vdup.16 q13, r7 // -(intermediate_bits+1) .else mov r12, #PREP_BIAS lsr r12, r12, r7 // PREP_BIAS >> intermediate_bits neg r7, r7 // -intermediate_bits vdup.16 q12, r12 // PREP_BIAS >> intermediate_bits vdup.16 q13, r7 // -intermediate_bits .endif .ifc \type, w_avg vdup.32 q4, r6 vneg.s32 q4, q4 .endif adr r7, L(\type\()_tbl) sub r4, r4, #24 \type q8, d16, d17, q9, d18, d19 ldr r4, [r7, r4, lsl #2] add r7, r7, r4 bx r7 .align 2 L(\type\()_tbl): .word 1280f - L(\type\()_tbl) + CONFIG_THUMB .word 640f - L(\type\()_tbl) + CONFIG_THUMB .word 320f - L(\type\()_tbl) + CONFIG_THUMB .word 160f - L(\type\()_tbl) + CONFIG_THUMB .word 80f - L(\type\()_tbl) + CONFIG_THUMB .word 40f - L(\type\()_tbl) + CONFIG_THUMB 40: add r7, r0, r1 lsl r1, r1, #1 4: subs r5, r5, #4 vst1.16 {d16}, [r0, :64], r1 vst1.16 {d17}, [r7, :64], r1 vst1.16 {d18}, [r0, :64], r1 vst1.16 {d19}, [r7, :64], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 4b 80: add r7, r0, r1 lsl r1, r1, #1 8: vst1.16 {q8}, [r0, :128], r1 subs r5, r5, #2 vst1.16 {q9}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 8b 160: 16: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #2 vst1.16 {q10, q11}, [r0, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 16b 320: add r7, r0, #32 32: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 32b 640: add r7, r0, #32 mov r12, #64 sub r1, r1, #64 64: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 64b 1280: add r7, r0, #32 mov r12, #64 sub r1, r1, #192 128: \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, 
d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r12 \type q8, d16, d17, q9, d18, d19 vst1.16 {q10, q11}, [r7, :128], r12 \type q10, d20, d21, q11, d22, d23 vst1.16 {q8, q9}, [r0, :128], r1 subs r5, r5, #1 vst1.16 {q10, q11}, [r7, :128], r1 ble 0f \type q8, d16, d17, q9, d18, d19 b 128b 0: .ifc \type, mask vpop {q4-q7} .endif .ifc \type, w_avg vpop {q4} .endif pop {r4-r7,pc} endfunc .endm bidir_fn avg, r6 bidir_fn w_avg, r7 bidir_fn mask, r7 .macro w_mask_fn type function w_mask_\type\()_16bpc_neon, export=1 push {r4-r10,lr} vpush {q4-q7} ldrd r4, r5, [sp, #96] ldrd r6, r7, [sp, #104] ldr r8, [sp, #112] clz r9, r4 adr lr, L(w_mask_\type\()_tbl) vdup.16 q15, r8 // bitdepth_max sub r9, r9, #24 clz r8, r8 // clz(bitdepth_max) ldr r9, [lr, r9, lsl #2] add r9, lr, r9 sub r8, r8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov r10, #PREP_BIAS*64 neg r8, r8 // -sh movw r12, #27615 // (64 + 1 - 38)<> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {q6}, [r6, :128]! .elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.8 {d12}, [r6, :64]! .elseif \type == 420 vadd.i16 d12, d12, d13 // (64 - my1) + (64 - my2) (row wise addition) vadd.i16 d13, d14, d15 vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d12[0]}, [r6, :32]! .endif vst1.16 {d4}, [r0, :64], r1 vst1.16 {d5}, [r12, :64], r1 vst1.16 {d6}, [r0, :64], r1 vst1.16 {d7}, [r12, :64], r1 bgt 4b vpop {q4-q7} pop {r4-r10,pc} 8: vld1.16 {q2, q3}, [r2, :128]! // tmp1 vld1.16 {q4, q5}, [r3, :128]! 
// tmp2 subs r5, r5, #2 vdup.32 q13, r10 // PREP_BIAS*64 vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) vabd.s16 q7, q3, q5 vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) vsubl.s16 q9, d9, d5 vsubl.s16 q10, d10, d6 vsubl.s16 q11, d11, d7 vqsub.u16 q6, q0, q6 // 27615 - abs() vqsub.u16 q7, q0, q7 vshll.s16 q5, d7, #6 // tmp1 << 6 vshll.s16 q4, d6, #6 vshll.s16 q3, d5, #6 vshll.s16 q2, d4, #6 vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {q6}, [r6, :128]! .elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.8 {d12}, [r6, :64]! .elseif \type == 420 vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d12[0]}, [r6, :32]! .endif vst1.16 {q2}, [r0, :128], r1 vst1.16 {q3}, [r12, :128], r1 bgt 8b vpop {q4-q7} pop {r4-r10,pc} 1280: 640: 320: 160: sub r1, r1, r4, lsl #1 .if \type == 444 add lr, r6, r4 .elseif \type == 422 add lr, r6, r4, lsr #1 .endif add r7, r2, r4, lsl #1 add r9, r3, r4, lsl #1 161: mov r8, r4 16: vld1.16 {q2}, [r2, :128]! // tmp1 vld1.16 {q4}, [r3, :128]! // tmp2 vld1.16 {q3}, [r7, :128]! vld1.16 {q5}, [r9, :128]! subs r8, r8, #8 vdup.32 q13, r10 // PREP_BIAS*64 vabd.s16 q6, q2, q4 // abs(tmp1 - tmp2) vabd.s16 q7, q3, q5 vsubl.s16 q8, d8, d4 // tmp2 - tmp1 (requires 17 bit) vsubl.s16 q9, d9, d5 vsubl.s16 q10, d10, d6 vsubl.s16 q11, d11, d7 vqsub.u16 q6, q0, q6 // 27615 - abs() vqsub.u16 q7, q0, q7 vshll.s16 q5, d7, #6 // tmp1 << 6 vshll.s16 q4, d6, #6 vshll.s16 q3, d5, #6 vshll.s16 q2, d4, #6 vshr.u16 q6, q6, #10 // 64-m = (27615 - abs()) >> mask_sh vshr.u16 q7, q7, #10 vadd.i32 q2, q2, q13 // += PREP_BIAS*64 vadd.i32 q3, q3, q13 vadd.i32 q4, q4, q13 vadd.i32 q5, q5, q13 vmovl.u16 q12, d12 vmovl.u16 q13, d13 vmla.i32 q2, q8, q12 // (tmp2-tmp1)*(64-m) vmovl.u16 q12, d14 vmla.i32 q3, q9, q13 vmovl.u16 q13, d15 vmla.i32 q4, q10, q12 vmla.i32 q5, q11, q13 vrshl.s32 q2, q2, q14 // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh vrshl.s32 q3, q3, q14 vrshl.s32 q4, q4, q14 vrshl.s32 q5, q5, q14 vqmovun.s32 d4, q2 // iclip_pixel vqmovun.s32 d5, q3 vqmovun.s32 d6, q4 vqmovun.s32 d7, q5 vmin.u16 q2, q2, q15 // iclip_pixel vmin.u16 q3, q3, q15 // iclip_pixel .if \type == 444 vmovn.i16 d12, q6 // 64 - m vmovn.i16 d13, q7 vsub.i16 q6, q1, q6 // m vst1.8 {d12}, [r6, :64]! vst1.8 {d13}, [lr, :64]! 
.elseif \type == 422 vpadd.i16 d12, d12, d13 // (64 - m) + (64 - n) (column wise addition) vpadd.i16 d13, d14, d15 vmovn.i16 d12, q6 vhsub.u8 d12, d2, d12 // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 vst1.32 {d12[0]}, [r6, :32]! vst1.32 {d12[1]}, [lr, :32]! .elseif \type == 420 vadd.i16 q6, q6, q7 // (64 - my1) + (64 - my2) (row wise addition) vpadd.i16 d12, d12, d13 // (128 - m) + (128 - n) (column wise addition) vsub.i16 d12, d2, d12 // (256 - sign) - ((128 - m) + (128 - n)) vrshrn.i16 d12, q6, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 vst1.32 {d12[0]}, [r6, :32]! .endif vst1.16 {q2}, [r0, :128]! vst1.16 {q3}, [r12, :128]! bgt 16b subs r5, r5, #2 add r2, r2, r4, lsl #1 add r3, r3, r4, lsl #1 add r7, r7, r4, lsl #1 add r9, r9, r4, lsl #1 .if \type == 444 add r6, r6, r4 add lr, lr, r4 .elseif \type == 422 add r6, r6, r4, lsr #1 add lr, lr, r4, lsr #1 .endif add r0, r0, r1 add r12, r12, r1 bgt 161b vpop {q4-q7} pop {r4-r10,pc} endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_16bpc_neon, export=1 push {r4-r5,lr} ldrd r4, r5, [sp, #12] clz lr, r3 adr r3, L(blend_tbl) sub lr, lr, #26 ldr lr, [r3, lr, lsl #2] add r3, r3, lr bx r3 .align 2 L(blend_tbl): .word 320f - L(blend_tbl) + CONFIG_THUMB .word 160f - L(blend_tbl) + CONFIG_THUMB .word 80f - L(blend_tbl) + CONFIG_THUMB .word 40f - L(blend_tbl) + CONFIG_THUMB 40: add r12, r0, r1 lsl r1, r1, #1 4: vld1.8 {d4}, [r5, :64]! vld1.16 {q1}, [r2, :128]! vld1.16 {d0}, [r0, :64] vneg.s8 d4, d4 // -m subs r4, r4, #2 vld1.16 {d1}, [r12, :64] vmovl.s8 q2, d4 vshl.i16 q2, q2, #9 // -m << 9 vsub.i16 q1, q0, q1 // a - b vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 vadd.i16 q0, q0, q1 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d1}, [r12, :64], r1 bgt 4b pop {r4-r5,pc} 80: add r12, r0, r1 lsl r1, r1, #1 8: vld1.8 {q8}, [r5, :128]! vld1.16 {q2, q3}, [r2, :128]! vneg.s8 q9, q8 // -m vld1.16 {q0}, [r0, :128] vld1.16 {q1}, [r12, :128] vmovl.s8 q8, d18 vmovl.s8 q9, d19 vshl.i16 q8, q8, #9 // -m << 9 vshl.i16 q9, q9, #9 vsub.i16 q2, q0, q2 // a - b vsub.i16 q3, q1, q3 subs r4, r4, #2 vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q3, q3, q9 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vst1.16 {q0}, [r0, :128], r1 vst1.16 {q1}, [r12, :128], r1 bgt 8b pop {r4-r5,pc} 160: add r12, r0, r1 lsl r1, r1, #1 16: vld1.8 {q12, q13}, [r5, :128]! vld1.16 {q8, q9}, [r2, :128]! subs r4, r4, #2 vneg.s8 q14, q12 // -m vld1.16 {q0, q1}, [r0, :128] vneg.s8 q15, q13 vld1.16 {q10, q11}, [r2, :128]! vmovl.s8 q12, d28 vmovl.s8 q13, d29 vmovl.s8 q14, d30 vmovl.s8 q15, d31 vld1.16 {q2, q3}, [r12, :128] vshl.i16 q12, q12, #9 // -m << 9 vshl.i16 q13, q13, #9 vshl.i16 q14, q14, #9 vshl.i16 q15, q15, #9 vsub.i16 q8, q0, q8 // a - b vsub.i16 q9, q1, q9 vsub.i16 q10, q2, q10 vsub.i16 q11, q3, q11 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q9, q9, q13 vqrdmulh.s16 q10, q10, q14 vqrdmulh.s16 q11, q11, q15 vadd.i16 q0, q0, q8 vadd.i16 q1, q1, q9 vadd.i16 q2, q2, q10 vst1.16 {q0, q1}, [r0, :128], r1 vadd.i16 q3, q3, q11 vst1.16 {q2, q3}, [r12, :128], r1 bgt 16b pop {r4-r5,pc} 320: add r12, r0, #32 32: vld1.8 {q12, q13}, [r5, :128]! vld1.16 {q8, q9}, [r2, :128]! subs r4, r4, #1 vneg.s8 q14, q12 // -m vld1.16 {q0, q1}, [r0, :128] vneg.s8 q15, q13 vld1.16 {q10, q11}, [r2, :128]! 
vmovl.s8 q12, d28 vmovl.s8 q13, d29 vmovl.s8 q14, d30 vmovl.s8 q15, d31 vld1.16 {q2, q3}, [r12, :128] vshl.i16 q12, q12, #9 // -m << 9 vshl.i16 q13, q13, #9 vshl.i16 q14, q14, #9 vshl.i16 q15, q15, #9 vsub.i16 q8, q0, q8 // a - b vsub.i16 q9, q1, q9 vsub.i16 q10, q2, q10 vsub.i16 q11, q3, q11 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q9, q9, q13 vqrdmulh.s16 q10, q10, q14 vqrdmulh.s16 q11, q11, q15 vadd.i16 q0, q0, q8 vadd.i16 q1, q1, q9 vadd.i16 q2, q2, q10 vst1.16 {q0, q1}, [r0, :128], r1 vadd.i16 q3, q3, q11 vst1.16 {q2, q3}, [r12, :128], r1 bgt 32b pop {r4-r5,pc} endfunc function blend_h_16bpc_neon, export=1 push {r4-r5,lr} ldr r4, [sp, #12] movrel r5, X(obmc_masks) add r5, r5, r4 sub r4, r4, r4, lsr #2 clz lr, r3 adr r12, L(blend_h_tbl) sub lr, lr, #24 ldr lr, [r12, lr, lsl #2] add r12, r12, lr bx r12 .align 2 L(blend_h_tbl): .word 1280f - L(blend_h_tbl) + CONFIG_THUMB .word 640f - L(blend_h_tbl) + CONFIG_THUMB .word 320f - L(blend_h_tbl) + CONFIG_THUMB .word 160f - L(blend_h_tbl) + CONFIG_THUMB .word 80f - L(blend_h_tbl) + CONFIG_THUMB .word 40f - L(blend_h_tbl) + CONFIG_THUMB .word 20f - L(blend_h_tbl) + CONFIG_THUMB 20: add r12, r0, r1 lsl r1, r1, #1 2: vld2.8 {d4[], d5[]}, [r5, :16]! vld1.16 {d2}, [r2, :64]! vext.8 d4, d4, d5, #6 subs r4, r4, #2 vneg.s8 d4, d4 // -m vld1.32 {d0[]}, [r0, :32] vld1.32 {d0[1]}, [r12, :32] vmovl.s8 q2, d4 vshl.i16 d4, d4, #9 // -m << 9 vsub.i16 d2, d0, d2 // a - b vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 vadd.i16 d0, d0, d2 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d0[1]}, [r12, :32], r1 bgt 2b pop {r4-r5,pc} 40: add r12, r0, r1 lsl r1, r1, #1 4: vld2.8 {d4[], d5[]}, [r5, :16]! vld1.16 {q1}, [r2, :128]! vext.8 d4, d4, d5, #4 subs r4, r4, #2 vneg.s8 d4, d4 // -m vld1.16 {d0}, [r0, :64] vld1.16 {d1}, [r12, :64] vmovl.s8 q2, d4 vshl.i16 q2, q2, #9 // -m << 9 vsub.i16 q1, q0, q1 // a - b vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 vadd.i16 q0, q0, q1 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d1}, [r12, :64], r1 bgt 4b pop {r4-r5,pc} 80: add r12, r0, r1 lsl r1, r1, #1 8: vld2.8 {d16[], d17[]}, [r5, :16]! vld1.16 {q2, q3}, [r2, :128]! vneg.s8 q9, q8 // -m vld1.16 {q0}, [r0, :128] subs r4, r4, #2 vmovl.s8 q8, d18 vmovl.s8 q9, d19 vld1.16 {q1}, [r12, :128] vshl.i16 q8, q8, #9 // -m << 9 vshl.i16 q9, q9, #9 vsub.i16 q2, q0, q2 // a - b vsub.i16 q3, q1, q3 vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q3, q3, q9 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vst1.16 {q0}, [r0, :128], r1 vst1.16 {q1}, [r12, :128], r1 bgt 8b pop {r4-r5,pc} 160: add r12, r0, r1 lsl r1, r1, #1 16: vld2.8 {d24[], d25[]}, [r5, :16]! vld1.16 {q8, q9}, [r2, :128]! subs r4, r4, #2 vneg.s8 q13, q12 // -m vld1.16 {q0, q1}, [r0, :128] vmovl.s8 q12, d26 vld1.16 {q10, q11}, [r2, :128]! vmovl.s8 q13, d27 vld1.16 {q2, q3}, [r12, :128] vshl.i16 q12, q12, #9 // -m << 9 vshl.i16 q13, q13, #9 vsub.i16 q8, q0, q8 // a - b vsub.i16 q9, q1, q9 vsub.i16 q10, q2, q10 vsub.i16 q11, q3, q11 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q9, q9, q12 vqrdmulh.s16 q10, q10, q13 vqrdmulh.s16 q11, q11, q13 vadd.i16 q0, q0, q8 vadd.i16 q1, q1, q9 vadd.i16 q2, q2, q10 vadd.i16 q3, q3, q11 vst1.16 {q0, q1}, [r0, :128], r1 vst1.16 {q2, q3}, [r12, :128], r1 bgt 16b pop {r4-r5,pc} 1280: 640: 320: sub r1, r1, r3, lsl #1 321: vld1.8 {d24[]}, [r5]! mov r12, r3 vneg.s8 d24, d24 // -m vmovl.s8 q12, d24 vshl.i16 q12, q12, #9 // -m << 9 32: vld1.16 {q8, q9}, [r2, :128]! vld1.16 {q0, q1}, [r0, :128]! subs r12, r12, #32 vld1.16 {q10, q11}, [r2, :128]! 
vld1.16 {q2, q3}, [r0, :128] vsub.i16 q8, q0, q8 // a - b vsub.i16 q9, q1, q9 vsub.i16 q10, q2, q10 vsub.i16 q11, q3, q11 sub r0, r0, #32 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q9, q9, q12 vqrdmulh.s16 q10, q10, q12 vqrdmulh.s16 q11, q11, q12 vadd.i16 q0, q0, q8 vadd.i16 q1, q1, q9 vadd.i16 q2, q2, q10 vst1.16 {q0, q1}, [r0, :128]! vadd.i16 q3, q3, q11 vst1.16 {q2, q3}, [r0, :128]! bgt 32b subs r4, r4, #1 add r0, r0, r1 bgt 321b pop {r4-r5,pc} endfunc function blend_v_16bpc_neon, export=1 push {r4,lr} ldr r4, [sp, #8] movrel lr, X(obmc_masks) add lr, lr, r3 clz r12, r3 adr r3, L(blend_v_tbl) sub r12, r12, #26 ldr r12, [r3, r12, lsl #2] add r3, r3, r12 bx r3 .align 2 L(blend_v_tbl): .word 320f - L(blend_v_tbl) + CONFIG_THUMB .word 160f - L(blend_v_tbl) + CONFIG_THUMB .word 80f - L(blend_v_tbl) + CONFIG_THUMB .word 40f - L(blend_v_tbl) + CONFIG_THUMB .word 20f - L(blend_v_tbl) + CONFIG_THUMB 20: add r12, r0, r1 lsl r1, r1, #1 vld1.8 {d4[]}, [lr] vneg.s8 d4, d4 // -m vmovl.s8 q2, d4 vshl.i16 d4, d4, #9 // -m << 9 2: vld1.32 {d2[]}, [r2, :32]! vld1.16 {d0[]}, [r0, :16] subs r4, r4, #2 vld1.16 {d2[1]}, [r2, :16] vld1.16 {d0[1]}, [r12, :16] add r2, r2, #4 vsub.i16 d2, d0, d2 // a - b vqrdmulh.s16 d2, d2, d4 // ((a-b)*-m + 32) >> 6 vadd.i16 d0, d0, d2 vst1.16 {d0[0]}, [r0, :16], r1 vst1.16 {d0[1]}, [r12, :16], r1 bgt 2b pop {r4,pc} 40: vld1.32 {d4[]}, [lr, :32] add r12, r0, r1 vneg.s8 d4, d4 // -m lsl r1, r1, #1 vmovl.s8 q2, d4 sub r1, r1, #4 vshl.i16 q2, q2, #9 // -m << 9 4: vld1.16 {q1}, [r2, :128]! vld1.16 {d0}, [r0, :64] vld1.16 {d1}, [r12, :64] subs r4, r4, #2 vsub.i16 q1, q0, q1 // a - b vqrdmulh.s16 q1, q1, q2 // ((a-b)*-m + 32) >> 6 vadd.i16 q0, q0, q1 vst1.32 {d0[0]}, [r0, :32]! vst1.32 {d1[0]}, [r12, :32]! vst1.16 {d0[2]}, [r0, :16], r1 vst1.16 {d1[2]}, [r12, :16], r1 bgt 4b pop {r4,pc} 80: vld1.8 {d16}, [lr, :64] add r12, r0, r1 vneg.s8 d16, d16 // -m lsl r1, r1, #1 vmovl.s8 q8, d16 sub r1, r1, #8 vshl.i16 q8, q8, #9 // -m << 9 8: vld1.16 {q2, q3}, [r2, :128]! vld1.16 {q0}, [r0, :128] vld1.16 {q1}, [r12, :128] subs r4, r4, #2 vsub.i16 q2, q0, q2 // a - b vsub.i16 q3, q1, q3 vqrdmulh.s16 q2, q2, q8 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q3, q3, q8 vadd.i16 q0, q0, q2 vadd.i16 q1, q1, q3 vst1.16 {d0}, [r0, :64]! vst1.16 {d2}, [r12, :64]! vst1.32 {d1[0]}, [r0, :32], r1 vst1.32 {d3[0]}, [r12, :32], r1 bgt 8b pop {r4,pc} 160: vld1.8 {q12}, [lr, :128] add r12, r0, r1 vneg.s8 q13, q12 // -m lsl r1, r1, #1 vmovl.s8 q12, d26 vmovl.s8 q13, d27 vshl.i16 q12, q12, #9 // -m << 9 vshl.i16 d26, d26, #9 16: vld1.16 {q8, q9}, [r2, :128]! vld1.16 {d0, d1, d2}, [r0, :64] subs r4, r4, #2 vld1.16 {q10, q11}, [r2, :128]! vsub.i16 q8, q0, q8 // a - b vld1.16 {d4, d5, d6}, [r12, :64] vsub.i16 d18, d2, d18 vsub.i16 q10, q2, q10 vsub.i16 d22, d6, d22 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 d18, d18, d26 vqrdmulh.s16 q10, q10, q12 vqrdmulh.s16 d22, d22, d26 vadd.i16 q0, q0, q8 vadd.i16 d2, d2, d18 vadd.i16 q2, q2, q10 vst1.16 {d0, d1, d2}, [r0, :64], r1 vadd.i16 d6, d6, d22 vst1.16 {d4, d5, d6}, [r12, :64], r1 bgt 16b pop {r4,pc} 320: vld1.8 {d24, d25, d26}, [lr, :64] vneg.s8 q14, q12 // -m vneg.s8 d30, d26 vmovl.s8 q12, d28 vmovl.s8 q13, d29 vmovl.s8 q14, d30 sub r1, r1, #32 vshl.i16 q12, q12, #9 // -m << 9 vshl.i16 q13, q13, #9 vshl.i16 q14, q14, #9 32: vld1.16 {q8, q9}, [r2, :128]! vld1.16 {q0, q1}, [r0, :128]! 
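// blend_v with w == 32 only loads and stores the leftmost 24 columns of
// each row (16 + 8 pixels); the right quarter of the block keeps the
// original dst pixels and is never touched. The three mask vectors
// d24-d26 were taken from obmc_masks + w above.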
subs r4, r4, #1 vld1.16 {q10}, [r2, :128] vsub.i16 q8, q0, q8 // a - b vld1.16 {q2}, [r0, :128] sub r0, r0, #32 vsub.i16 q9, q1, q9 vsub.i16 q10, q2, q10 vqrdmulh.s16 q8, q8, q12 // ((a-b)*-m + 32) >> 6 vqrdmulh.s16 q9, q9, q13 vqrdmulh.s16 q10, q10, q14 vadd.i16 q0, q0, q8 vadd.i16 q1, q1, q9 vadd.i16 q2, q2, q10 vst1.16 {q0, q1}, [r0, :128]! add r2, r2, #32 vst1.16 {q2}, [r0, :128], r1 bgt 32b pop {r4,pc} endfunc // This has got the same signature as the put_8tap functions, // and assumes that r9 is set to (clz(w)-24). function put_neon adr r10, L(put_tbl) ldr r9, [r10, r9, lsl #2] add r10, r10, r9 bx r10 .align 2 L(put_tbl): .word 1280f - L(put_tbl) + CONFIG_THUMB .word 640f - L(put_tbl) + CONFIG_THUMB .word 320f - L(put_tbl) + CONFIG_THUMB .word 16f - L(put_tbl) + CONFIG_THUMB .word 80f - L(put_tbl) + CONFIG_THUMB .word 4f - L(put_tbl) + CONFIG_THUMB .word 2f - L(put_tbl) + CONFIG_THUMB 2: vld1.32 {d0[]}, [r2], r3 vld1.32 {d1[]}, [r2], r3 subs r5, r5, #2 vst1.32 {d0[0]}, [r0, :32], r1 vst1.32 {d1[1]}, [r0, :32], r1 bgt 2b pop {r4-r11,pc} 4: vld1.16 {d0}, [r2], r3 vld1.16 {d1}, [r2], r3 subs r5, r5, #2 vst1.16 {d0}, [r0, :64], r1 vst1.16 {d1}, [r0, :64], r1 bgt 4b pop {r4-r11,pc} 80: add r8, r0, r1 lsl r1, r1, #1 add r9, r2, r3 lsl r3, r3, #1 8: vld1.16 {q0}, [r2], r3 vld1.16 {q1}, [r9], r3 subs r5, r5, #2 vst1.16 {q0}, [r0, :128], r1 vst1.16 {q1}, [r8, :128], r1 bgt 8b pop {r4-r11,pc} 16: vld1.16 {q0, q1}, [r2], r3 subs r5, r5, #1 vst1.16 {q0, q1}, [r0, :128], r1 bgt 16b pop {r4-r11,pc} 320: sub r1, r1, #32 sub r3, r3, #32 32: vld1.16 {q0, q1}, [r2]! vst1.16 {q0, q1}, [r0, :128]! vld1.16 {q2, q3}, [r2], r3 subs r5, r5, #1 vst1.16 {q2, q3}, [r0, :128], r1 bgt 32b pop {r4-r11,pc} 640: sub r1, r1, #96 sub r3, r3, #96 64: vld1.16 {q8, q9}, [r2]! vst1.16 {q8, q9}, [r0, :128]! vld1.16 {q10, q11}, [r2]! vst1.16 {q10, q11}, [r0, :128]! vld1.16 {q12, q13}, [r2]! vst1.16 {q12, q13}, [r0, :128]! vld1.16 {q14, q15}, [r2], r3 subs r5, r5, #1 vst1.16 {q14, q15}, [r0, :128], r1 bgt 64b pop {r4-r11,pc} 1280: sub r1, r1, #224 sub r3, r3, #224 128: vld1.16 {q8, q9}, [r2]! vst1.16 {q8, q9}, [r0, :128]! vld1.16 {q10, q11}, [r2]! vst1.16 {q10, q11}, [r0, :128]! vld1.16 {q12, q13}, [r2]! vst1.16 {q12, q13}, [r0, :128]! vld1.16 {q14, q15}, [r2]! vst1.16 {q14, q15}, [r0, :128]! vld1.16 {q8, q9}, [r2]! vst1.16 {q8, q9}, [r0, :128]! vld1.16 {q10, q11}, [r2]! vst1.16 {q10, q11}, [r0, :128]! vld1.16 {q12, q13}, [r2]! vst1.16 {q12, q13}, [r0, :128]! vld1.16 {q14, q15}, [r2], r3 subs r5, r5, #1 vst1.16 {q14, q15}, [r0, :128], r1 bgt 128b pop {r4-r11,pc} endfunc // This has got the same signature as the prep_8tap functions, // and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and // r8 to w*2. function prep_neon adr r10, L(prep_tbl) ldr r9, [r10, r9, lsl #2] vdup.16 q15, r7 // intermediate_bits vmov.i16 q14, #PREP_BIAS add r10, r10, r9 bx r10 .align 2 L(prep_tbl): .word 1280f - L(prep_tbl) + CONFIG_THUMB .word 640f - L(prep_tbl) + CONFIG_THUMB .word 320f - L(prep_tbl) + CONFIG_THUMB .word 16f - L(prep_tbl) + CONFIG_THUMB .word 80f - L(prep_tbl) + CONFIG_THUMB .word 40f - L(prep_tbl) + CONFIG_THUMB 40: add r9, r1, r2 lsl r2, r2, #1 4: vld1.16 {d0}, [r1], r2 vld1.16 {d1}, [r9], r2 subs r4, r4, #2 vshl.s16 q0, q0, q15 vsub.i16 q0, q0, q14 vst1.16 {q0}, [r0, :128]! bgt 4b pop {r4-r11,pc} 80: add r9, r1, r2 lsl r2, r2, #1 8: vld1.16 {q0}, [r1], r2 vld1.16 {q1}, [r9], r2 subs r4, r4, #2 vshl.s16 q0, q0, q15 vshl.s16 q1, q1, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vst1.16 {q0, q1}, [r0, :128]! 
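// prep_neon (no subpel filtering): every source pixel is shifted left by
// intermediate_bits (q15) and PREP_BIAS (q14) is subtracted, so the
// intermediate values are centred around zero and fit in signed 16-bit
// lanes; put_neon above is a plain width-specialised copy by comparison.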
bgt 8b pop {r4-r11,pc} 16: vld1.16 {q0, q1}, [r1], r2 vshl.s16 q0, q0, q15 vld1.16 {q2, q3}, [r1], r2 subs r4, r4, #2 vshl.s16 q1, q1, q15 vshl.s16 q2, q2, q15 vshl.s16 q3, q3, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vsub.i16 q2, q2, q14 vst1.16 {q0, q1}, [r0, :128]! vsub.i16 q3, q3, q14 vst1.16 {q2, q3}, [r0, :128]! bgt 16b pop {r4-r11,pc} 320: sub r2, r2, #32 32: vld1.16 {q0, q1}, [r1]! subs r4, r4, #1 vshl.s16 q0, q0, q15 vld1.16 {q2, q3}, [r1], r2 vshl.s16 q1, q1, q15 vshl.s16 q2, q2, q15 vshl.s16 q3, q3, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vsub.i16 q2, q2, q14 vst1.16 {q0, q1}, [r0, :128]! vsub.i16 q3, q3, q14 vst1.16 {q2, q3}, [r0, :128]! bgt 32b pop {r4-r11,pc} 640: sub r2, r2, #96 64: vld1.16 {q0, q1}, [r1]! subs r4, r4, #1 vshl.s16 q0, q0, q15 vld1.16 {q2, q3}, [r1]! vshl.s16 q1, q1, q15 vld1.16 {q8, q9}, [r1]! vshl.s16 q2, q2, q15 vld1.16 {q10, q11}, [r1], r2 vshl.s16 q3, q3, q15 vshl.s16 q8, q8, q15 vshl.s16 q9, q9, q15 vshl.s16 q10, q10, q15 vshl.s16 q11, q11, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vsub.i16 q2, q2, q14 vsub.i16 q3, q3, q14 vsub.i16 q8, q8, q14 vst1.16 {q0, q1}, [r0, :128]! vsub.i16 q9, q9, q14 vst1.16 {q2, q3}, [r0, :128]! vsub.i16 q10, q10, q14 vst1.16 {q8, q9}, [r0, :128]! vsub.i16 q11, q11, q14 vst1.16 {q10, q11}, [r0, :128]! bgt 64b pop {r4-r11,pc} 1280: sub r2, r2, #224 128: vld1.16 {q0, q1}, [r1]! subs r4, r4, #1 vshl.s16 q0, q0, q15 vld1.16 {q2, q3}, [r1]! vshl.s16 q1, q1, q15 vld1.16 {q8, q9}, [r1]! vshl.s16 q2, q2, q15 vld1.16 {q10, q11}, [r1]! vshl.s16 q3, q3, q15 vshl.s16 q8, q8, q15 vshl.s16 q9, q9, q15 vshl.s16 q10, q10, q15 vshl.s16 q11, q11, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vsub.i16 q2, q2, q14 vsub.i16 q3, q3, q14 vsub.i16 q8, q8, q14 vst1.16 {q0, q1}, [r0, :128]! vld1.16 {q0, q1}, [r1]! vsub.i16 q9, q9, q14 vsub.i16 q10, q10, q14 vst1.16 {q2, q3}, [r0, :128]! vld1.16 {q2, q3}, [r1]! vsub.i16 q11, q11, q14 vshl.s16 q0, q0, q15 vst1.16 {q8, q9}, [r0, :128]! vld1.16 {q8, q9}, [r1]! vshl.s16 q1, q1, q15 vshl.s16 q2, q2, q15 vst1.16 {q10, q11}, [r0, :128]! vld1.16 {q10, q11}, [r1], r2 vshl.s16 q3, q3, q15 vshl.s16 q8, q8, q15 vshl.s16 q9, q9, q15 vshl.s16 q10, q10, q15 vshl.s16 q11, q11, q15 vsub.i16 q0, q0, q14 vsub.i16 q1, q1, q14 vsub.i16 q2, q2, q14 vsub.i16 q3, q3, q14 vsub.i16 q8, q8, q14 vst1.16 {q0, q1}, [r0, :128]! vsub.i16 q9, q9, q14 vst1.16 {q2, q3}, [r0, :128]! vsub.i16 q10, q10, q14 vst1.16 {q8, q9}, [r0, :128]! vsub.i16 q11, q11, q14 vst1.16 {q10, q11}, [r0, :128]! 
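// The 128-wide prep path consumes a full row (256 bytes) per iteration,
// interleaving the loads, shifts and stores of the two 64-pixel halves;
// r2 (the source stride) was reduced by 224 on entry so the final
// post-indexed load steps to the start of the next source row.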
bgt 128b pop {r4-r11,pc} endfunc .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 vld1.\wd {\d0[]}, [\s0], \strd vld1.\wd {\d1[]}, [\s1], \strd .ifnb \d2 vld1.\wd {\d2[]}, [\s0], \strd vld1.\wd {\d3[]}, [\s1], \strd .endif .ifnb \d4 vld1.\wd {\d4[]}, [\s0], \strd .endif .ifnb \d5 vld1.\wd {\d5[]}, [\s1], \strd .endif .ifnb \d6 vld1.\wd {\d6[]}, [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 vld1.16 {\d0}, [\s0], \strd vld1.16 {\d1}, [\s1], \strd .ifnb \d2 vld1.16 {\d2}, [\s0], \strd vld1.16 {\d3}, [\s1], \strd .endif .ifnb \d4 vld1.16 {\d4}, [\s0], \strd .endif .ifnb \d5 vld1.16 {\d5}, [\s1], \strd .endif .ifnb \d6 vld1.16 {\d6}, [\s0], \strd .endif .endm .macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5 vld1.16 {\d0, \d1}, [\s0], \strd .ifnb \d2 vld1.16 {\d2, \d3}, [\s1], \strd .endif .ifnb \d4 vld1.16 {\d4, \d5}, [\s0], \strd .endif .endm .macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5 load_regpair \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5 .endm .macro interleave_1_32 r0, r1, r2, r3, r4 vext.8 \r0, \r0, \r1, #4 vext.8 \r1, \r1, \r2, #4 .ifnb \r3 vext.8 \r2, \r2, \r3, #4 vext.8 \r3, \r3, \r4, #4 .endif .endm .macro vmin_u16 c, r0, r1, r2, r3 vmin.u16 \r0, \r0, \c .ifnb \r1 vmin.u16 \r1, \r1, \c .endif .ifnb \r2 vmin.u16 \r2, \r2, \c vmin.u16 \r3, \r3, \c .endif .endm .macro vsub_i16 c, r0, r1, r2, r3 vsub.i16 \r0, \r0, \c .ifnb \r1 vsub.i16 \r1, \r1, \c .endif .ifnb \r2 vsub.i16 \r2, \r2, \c vsub.i16 \r3, \r3, \c .endif .endm .macro vmull_vmlal_4 d, s0, s1, s2, s3 vmull.s16 \d, \s0, d0[0] vmlal.s16 \d, \s1, d0[1] vmlal.s16 \d, \s2, d0[2] vmlal.s16 \d, \s3, d0[3] .endm .macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 vmull.s16 \d, \s0, d0[0] vmlal.s16 \d, \s1, d0[1] vmlal.s16 \d, \s2, d0[2] vmlal.s16 \d, \s3, d0[3] vmlal.s16 \d, \s4, d1[0] vmlal.s16 \d, \s5, d1[1] vmlal.s16 \d, \s6, d1[2] vmlal.s16 \d, \s7, d1[3] .endm .macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3 vqrshrun.s32 \d0, \q0, #\shift .ifnb \q1 vqrshrun.s32 \d1, \q1, #\shift .endif .ifnb \q2 vqrshrun.s32 \d2, \q2, #\shift vqrshrun.s32 \d3, \q3, #\shift .endif .endm .macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3 vmovn.i32 \d0, \q0 .ifnb \q1 vmovn.i32 \d1, \q1 .endif .ifnb \q2 vmovn.i32 \d2, \q2 vmovn.i32 \d3, \q3 .endif .endm .macro vrshl_s32 shift, r0, r1, r2, r3 vrshl.s32 \r0, \r0, \shift vrshl.s32 \r1, \r1, \shift .ifnb \r2 vrshl.s32 \r2, \r2, \shift vrshl.s32 \r3, \r3, \shift .endif .endm .macro vst1_32 strd, r0, r1 vst1.32 {\r0[0]}, [r0, :32], \strd vst1.32 {\r0[1]}, [r9, :32], \strd .ifnb \r1 vst1.32 {\r1[0]}, [r0, :32], \strd vst1.32 {\r1[1]}, [r9, :32], \strd .endif .endm .macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7 vst1.16 {\r0}, [r0, \align], \strd vst1.16 {\r1}, [r9, \align], \strd .ifnb \r2 vst1.16 {\r2}, [r0, \align], \strd vst1.16 {\r3}, [r9, \align], \strd .endif .ifnb \r4 vst1.16 {\r4}, [r0, \align], \strd vst1.16 {\r5}, [r9, \align], \strd vst1.16 {\r6}, [r0, \align], \strd vst1.16 {\r7}, [r9, \align], \strd .endif .endm .macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3 .ifc \type, put vqrshrun_s32 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 vmin_u16 q15, \q0, \q1 .else vrshl_s32 q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits) vmovn_i32 \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3 vsub_i16 q15, \q0, \q1 // PREP_BIAS .endif .endm .macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 
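// finalize (defined above) is the per-type epilogue: for put it narrows
// the 32-bit accumulators with a rounding right shift by 6 (vqrshrun)
// and clamps against bitdepth_max in q15; for prep it applies the
// rounding shift by -(6 - intermediate_bits) from q14, narrows, and
// subtracts the PREP_BIAS held in q15. The shift_store_4/8/16 wrappers
// then store 4, 8 or 16 pixels per row.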
finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 vst1_reg \strd, :64, \d0, \d1, \d2, \d3 .endm .macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 vst1_reg \strd, :128, \q0, \q1 .endm .macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3 finalize \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3 vst1.16 {\q0, \q1}, [r0, :128], \strd .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_16bpc_neon, export=1 push {r4-r11,lr} movw r9, \type_h movw r10, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. #define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2 make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] .ifc \bdmax, r8 ldr r8, [sp, #52] .endif movw r11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, r11 mul \my, \my, r11 add \mx, \mx, r9 // mx, 8tap_h, 4tap_h add \my, \my, r10 // my, 8tap_v, 4tap_v .ifc \type, prep lsl \d_strd, \w, #1 .endif vdup.16 q15, \bdmax // bitdepth_max clz \bdmax, \bdmax clz r9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 tst \mx, #(0x7f << 14) sub r9, r9, #24 add lr, \bdmax, #6 // 6 + intermediate_bits rsb r12, \bdmax, #6 // 6 - intermediate_bits movrel r11, X(mc_subpel_filters), -8 bne L(\type\()_8tap_h) tst \my, #(0x7f << 14) bne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx r10, \mx, #7, #7 and \mx, \mx, #0x7f it gt movgt \mx, r10 tst \my, #(0x7f << 14) add \mx, r11, \mx, lsl #3 bne L(\type\()_8tap_hv) adr r10, L(\type\()_8tap_h_tbl) vdup.32 q14, r12 // 6 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s32 q14, q14 // -(6-intermediate_bits) .ifc \type, put vdup.16 q13, \bdmax // intermediate_bits .else vmov.i16 q13, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s16 q13, q13 // -intermediate_bits .endif bx r10 .align 2 L(\type\()_8tap_h_tbl): .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 2: vld1.16 {q2}, [\src], \s_strd vld1.16 {q3}, [\sr2], \s_strd vext.8 d5, d4, d5, #2 vext.8 d7, d6, d7, #2 subs \h, \h, #2 vtrn.32 d4, d6 vtrn.32 d5, d7 vmull.s16 q1, d4, d0[0] vmlal.s16 q1, d5, d0[1] vmlal.s16 q1, d6, d0[2] vmlal.s16 q1, d7, d0[3] vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) vqmovun.s32 d2, q1 vrshl.s16 d2, d2, d26 // -intermediate_bits vmin.u16 d2, d2, d30 vst1.32 {d2[0]}, [\dst, :32], \d_strd vst1.32 {d2[1]}, [\ds2, :32], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 
4xN h add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 4: vld1.16 {q8}, [\src], \s_strd vld1.16 {q11}, [\sr2], \s_strd vext.8 d18, d16, d17, #2 vext.8 d19, d16, d17, #4 vext.8 d20, d16, d17, #6 vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d21, d22, d23, #6 subs \h, \h, #2 vmull.s16 q2, d16, d0[0] vmlal.s16 q2, d18, d0[1] vmlal.s16 q2, d19, d0[2] vmlal.s16 q2, d20, d0[3] vmull.s16 q3, d22, d0[0] vmlal.s16 q3, d24, d0[1] vmlal.s16 q3, d25, d0[2] vmlal.s16 q3, d21, d0[3] vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) .ifc \type, put vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vrshl.s16 q2, q2, q13 // -intermediate_bits vmin.u16 q2, q2, q15 .else vmovn.s32 d4, q2 vmovn.s32 d5, q3 vsub.i16 q2, q2, q13 // PREP_BIAS .endif vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd bgt 4b pop {r4-r11,pc} 80: 160: 320: 640: 1280: // 8xN, 16xN, 32xN, ... h vpush {q4-q5} vld1.8 {d0}, [\mx, :64] sub \src, \src, #6 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 sub \s_strd, \s_strd, \w, lsl #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, lsl #1 .endif 81: vld1.16 {q8, q9}, [\src]! vld1.16 {q10, q11}, [\sr2]! mov \mx, \w 8: vmull.s16 q1, d16, d0[0] vmull.s16 q2, d17, d0[0] vmull.s16 q3, d20, d0[0] vmull.s16 q4, d21, d0[0] .irpc i, 1234567 vext.8 q12, q8, q9, #(2*\i) vext.8 q5, q10, q11, #(2*\i) .if \i < 4 vmlal.s16 q1, d24, d0[\i] vmlal.s16 q2, d25, d0[\i] vmlal.s16 q3, d10, d0[\i] vmlal.s16 q4, d11, d0[\i] .else vmlal.s16 q1, d24, d1[\i-4] vmlal.s16 q2, d25, d1[\i-4] vmlal.s16 q3, d10, d1[\i-4] vmlal.s16 q4, d11, d1[\i-4] .endif .endr subs \mx, \mx, #8 vrshl.s32 q1, q1, q14 // -(6-intermediate_bits) vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vrshl.s32 q4, q4, q14 // -(6-intermediate_bits) .ifc \type, put vqmovun.s32 d2, q1 vqmovun.s32 d3, q2 vqmovun.s32 d4, q3 vqmovun.s32 d5, q4 vrshl.s16 q1, q1, q13 // -intermediate_bits vrshl.s16 q2, q2, q13 // -intermediate_bits vmin.u16 q1, q1, q15 vmin.u16 q2, q2, q15 .else vmovn.s32 d2, q1 vmovn.s32 d3, q2 vmovn.s32 d4, q3 vmovn.s32 d5, q4 vsub.i16 q1, q1, q13 // PREP_BIAS vsub.i16 q2, q2, q13 // PREP_BIAS .endif vst1.16 {q1}, [\dst, :128]! vst1.16 {q2}, [\ds2, :128]! ble 9f vmov q8, q9 vmov q10, q11 vld1.16 {q9}, [\src]! vld1.16 {q11}, [\sr2]! 
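// Wide horizontal 8-tap loop: q8-q11 hold a sliding window of source
// pixels for the two rows being filtered; the .irpc block above builds
// the seven shifted pixel vectors with vext and accumulates them against
// the filter taps in d0/d1. After each 8 output pixels only one new
// vector per row has to be reloaded (the vmov/vld1 pairs just above).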
b 8b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 81b vpop {q4-q5} pop {r4-r11,pc} L(\type\()_8tap_v): cmp \h, #4 ubfx r10, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r10 add \my, r11, \my, lsl #3 .ifc \type, prep vdup.32 q14, r12 // 6 - intermediate_bits vmov.i16 q15, #PREP_BIAS .endif adr r10, L(\type\()_8tap_v_tbl) ldr r9, [r10, r9, lsl #2] .ifc \type, prep vneg.s32 q14, q14 // -(6-intermediate_bits) .endif add r10, r10, r9 bx r10 .align 2 L(\type\()_8tap_v_tbl): .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put bgt 28f cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 // 2x2 v load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5 interleave_1_32 d1, d2, d3, d4, d5 bgt 24f vmull_vmlal_4 q8, d1, d2, d3, d4 vqrshrun_s32 6, q8, d16 vmin_u16 d30, d16 vst1_32 \d_strd, d16 pop {r4-r11,pc} 24: // 2x4 v load_32 \sr2, \src, \s_strd, d6, d7 interleave_1_32 d5, d6, d7 vmull_vmlal_4 q8, d1, d2, d3, d4 vmull_vmlal_4 q9, d3, d4, d5, d6 vqrshrun_s32 6, q8, d16, q9, d17 vmin_u16 q15, q8 vst1_32 \d_strd, d16, d17 pop {r4-r11,pc} 28: // 2x6, 2x8, 2x12, 2x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vmovl.s8 q0, d0 load_32 \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 interleave_1_32 d2, d3, d4, d5, d6 interleave_1_32 d6, d7, d16 216: subs \h, \h, #4 load_32 \sr2, \src, \s_strd, d17, d18, d19, d20 interleave_1_32 d16, d17, d18, d19, d20 vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 vmull_vmlal_8 q1, d4, d5, d6, d7, d16, d17, d18, d19 vqrshrun_s32 6, q13, d26, q1, d27 vmin_u16 q15, q13 vst1_32 \d_strd, d26, d27 ble 0f cmp \h, #2 vmov q1, q3 vmov q2, q8 vmov q3, q9 vmov d16, d20 beq 26f b 216b 26: load_32 \sr2, \src, \s_strd, d17, d18 interleave_1_32 d16, d17, d18 vmull_vmlal_8 q13, d2, d3, d4, d5, d6, d7, d16, d17 vqrshrun_s32 6, q13, d26 vmin_u16 d30, d26 vst1_32 \d_strd, d26 0: pop {r4-r11,pc} .endif 40: bgt 480f // 4x2, 4x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5 vmull_vmlal_4 q8, d1, d2, d3, d4 vmull_vmlal_4 q9, d2, d3, d4, d5 shift_store_4 \type, \d_strd, q8, q9, d16, d17 ble 0f load_reg \sr2, \src, \s_strd, d6, d7 vmull_vmlal_4 q8, d3, d4, d5, d6 vmull_vmlal_4 q9, d4, d5, d6, d7 shift_store_4 \type, \d_strd, q8, q9, d16, d17 0: pop {r4-r11,pc} 480: // 4x6, 4x8, 4x12, 4x16 v vld1.8 {d0}, [\my, :64] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 48: subs \h, \h, #4 load_reg \sr2, \src, \s_strd, d23, d24, d25, d26 vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 vmull_vmlal_8 q3, d18, d19, d20, d21, d22, d23, d24, d25 
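// Four output rows per iteration: output row y is the 8-tap dot product
// of input rows y..y+7 (held in d16-d26) with the vertical coefficients
// in d0/d1; vmull_vmlal_8 expands to one vmull.s16 plus seven vmlal.s16.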
vmull_vmlal_8 q8, d19, d20, d21, d22, d23, d24, d25, d26 shift_store_4 \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5 ble 0f cmp \h, #2 vmov q8, q10 vmov q9, q11 vmov q10, q12 vmov d22, d26 beq 46f b 48b 46: load_reg \sr2, \src, \s_strd, d23, d24 vmull_vmlal_8 q1, d16, d17, d18, d19, d20, d21, d22, d23 vmull_vmlal_8 q2, d17, d18, d19, d20, d21, d22, d23, d24 shift_store_4 \type, \d_strd, q1, q2, d2, d3 0: pop {r4-r11,pc} 80: bgt 880f // 8x2, 8x4 v cmp \h, #2 add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 load_reg \src, \sr2, \s_strd, q1, q2, q3, q8, q9 vmull_vmlal_4 q10, d2, d4, d6, d16 vmull_vmlal_4 q11, d3, d5, d7, d17 vmull_vmlal_4 q12, d4, d6, d16, d18 vmull_vmlal_4 q13, d5, d7, d17, d19 shift_store_8 \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23 ble 0f load_reg \sr2, \src, \s_strd, q10, q11 vmull_vmlal_4 q1, d6, d16, d18, d20 vmull_vmlal_4 q2, d7, d17, d19, d21 vmull_vmlal_4 q12, d16, d18, d20, d22 vmull_vmlal_4 q13, d17, d19, d21, d23 shift_store_8 \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5 0: pop {r4-r11,pc} 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\my, :64] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_reg \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 88: subs \h, \h, #2 load_reg \sr2, \src, \s_strd, q12, q13 vmull_vmlal_8 q1, d10, d12, d14, d16, d18, d20, d22, d24 vmull_vmlal_8 q2, d11, d13, d15, d17, d19, d21, d23, d25 vmull_vmlal_8 q3, d12, d14, d16, d18, d20, d22, d24, d26 vmull_vmlal_8 q4, d13, d15, d17, d19, d21, d23, d25, d27 shift_store_8 \type, \d_strd, q1, q2, d2, d3, q3, q4, d4, d5 ble 9f subs \h, \h, #2 load_reg \sr2, \src, \s_strd, q1, q2 vmull_vmlal_8 q3, d14, d16, d18, d20, d22, d24, d26, d2 vmull_vmlal_8 q4, d15, d17, d19, d21, d23, d25, d27, d3 vmull_vmlal_8 q5, d16, d18, d20, d22, d24, d26, d2, d4 vmull_vmlal_8 q6, d17, d19, d21, d23, d25, d27, d3, d5 shift_store_8 \type, \d_strd, q3, q4, d6, d7, q5, q6, d8, d9 ble 9f vmov q5, q9 vmov q6, q10 vmov q7, q11 vmov q8, q12 vmov q9, q13 vmov q10, q1 vmov q11, q2 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: vpop {q4-q7} pop {r4-r11,pc} 160: bgt 1680b // 16x2, 16x4 v vpush {q6-q7} add \my, \my, #2 vld1.32 {d0[]}, [\my] sub \src, \src, \s_strd vmovl.s8 q0, d0 load_16s16 \src, \src, \s_strd, q6, q7, q8, q9, q10, q11 16: load_16s16 \src, \src, \s_strd, q12, q13 subs \h, \h, #1 vmull_vmlal_4 q1, d12, d16, d20, d24 vmull_vmlal_4 q2, d13, d17, d21, d25 vmull_vmlal_4 q3, d14, d18, d22, d26 vmull_vmlal_4 q6, d15, d19, d23, d27 shift_store_16 \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5 ble 0f vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 vmov q10, q12 vmov q11, q13 b 16b 0: vpop {q6-q7} pop {r4-r11,pc} L(\type\()_8tap_hv): cmp \h, #4 ubfx r10, \my, #7, #7 and \my, \my, #0x7f it gt movgt \my, r10 4: add \my, r11, \my, lsl #3 adr r10, L(\type\()_8tap_hv_tbl) neg r12, r12 // -(6-intermediate_bits) ldr r9, [r10, r9, lsl #2] vdup.32 q14, r12 // -(6-intermediate_bits) .ifc \type, put neg r8, lr // -(6+intermeidate_bits) .else vmov.i16 q13, #PREP_BIAS .endif add r10, r10, r9 
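// Combined horizontal+vertical filtering: the horizontal pass keeps extra
// precision by shifting right only by (6 - intermediate_bits) (the negated
// amount lives in q14 for vrshl), and the vertical pass below compensates
// with a shift by (6 + intermediate_bits) for put (clamped against
// bitdepth_max in q15) or a plain shift by 6 plus a PREP_BIAS subtraction
// for prep.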
.ifc \type, put vdup.32 q13, r8 // -(6+intermediate_bits) .endif bx r10 .align 2 L(\type\()_8tap_hv_tbl): .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB 20: .ifc \type, put add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 280f add \my, \my, #2 vld1.32 {d2[]}, [\my] // 2x2, 2x4 hv sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vmull.s16 q11, d22, d0 vmull.s16 q12, d24, d0 vpadd.s32 d22, d22, d23 vpadd.s32 d23, d24, d25 vpadd.s32 d22, d22, d23 vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) vmovn.i32 d16, q8 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vext.8 d16, d16, d24, #4 vmov d17, d24 2: bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d24, #4 vmull.s16 q2, d16, d2[0] vmlal.s16 q2, d17, d2[1] vmlal.s16 q2, d18, d2[2] vmlal.s16 q2, d24, d2[3] vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vmin.u16 d4, d4, d30 subs \h, \h, #2 vst1.32 {d4[0]}, [\dst, :32], \d_strd vst1.32 {d4[1]}, [\ds2, :32], \d_strd ble 0f vmov d16, d18 vmov d17, d24 b 2b 280: // 2x8, 2x16, 2x32 hv vld1.8 {d2}, [\my, :64] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vmull.s16 q11, d22, d0 vmull.s16 q12, d24, d0 vpadd.s32 d22, d22, d23 vpadd.s32 d23, d24, d25 vpadd.s32 d22, d22, d23 vrshl.s32 d16, d22, d28 // -(6-intermediate_bits) vmovn.i32 d16, q8 bl L(\type\()_8tap_filter_2) vext.8 d16, d16, d16, #4 vext.8 d16, d16, d24, #4 vmov d17, d24 bl L(\type\()_8tap_filter_2) vext.8 d18, d17, d24, #4 vmov d19, d24 bl L(\type\()_8tap_filter_2) vext.8 d20, d19, d24, #4 vmov d21, d24 28: bl L(\type\()_8tap_filter_2) vext.8 d22, d21, d24, #4 vmull.s16 q3, d16, d2[0] vmlal.s16 q3, d17, d2[1] vmlal.s16 q3, d18, d2[2] vmlal.s16 q3, d19, d2[3] vmlal.s16 q3, d20, d3[0] vmlal.s16 q3, d21, d3[1] vmlal.s16 q3, d22, d3[2] vmlal.s16 q3, d24, d3[3] vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d6, q3 vmin.u16 d6, d6, d30 subs \h, \h, #2 vst1.32 {d6[0]}, [\dst, :32], \d_strd vst1.32 {d6[1]}, [\ds2, :32], \d_strd ble 0f vmov q8, q9 vmov q9, q10 vmov d20, d22 vmov d21, d24 b 28b 0: pop {r4-r11,pc} L(\type\()_8tap_filter_2): vld1.16 {q11}, [\sr2], \s_strd vld1.16 {q12}, [\src], \s_strd vext.8 d23, d22, d23, #2 vext.8 d25, d24, d25, #2 vtrn.32 q11, q12 vmull.s16 q3, d22, d0[0] vmlal.s16 q3, d23, d0[1] vmlal.s16 q3, d24, d0[2] vmlal.s16 q3, d25, d0[3] vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d24, q3 bx lr .endif 40: add \mx, \mx, #2 vld1.32 {d0[]}, [\mx] bgt 480f add \my, \my, #2 vld1.32 {d2[]}, [\my] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 // 4x2, 4x4 hv vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d17, q10 bl 
L(\type\()_8tap_filter_4) vmov q9, q12 4: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d17, d2[0] vmlal.s16 q2, d18, d2[1] vmlal.s16 q2, d19, d2[2] vmlal.s16 q2, d24, d2[3] vmull.s16 q3, d18, d2[0] vmlal.s16 q3, d19, d2[1] vmlal.s16 q3, d24, d2[2] vmlal.s16 q3, d25, d2[3] .ifc \type, put vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q15 .else vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vsub.i16 q2, q2, q13 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd ble 0f vmov d17, d19 vmov q9, q12 b 4b 0: pop {r4-r11,pc} 480: // 4x8, 4x16, 4x32 hv vpush {d13-d15} vld1.8 {d2}, [\my, :64] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 vld1.16 {q11}, [\src], \s_strd vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d13, q10 bl L(\type\()_8tap_filter_4) vmov q7, q12 bl L(\type\()_8tap_filter_4) vmov q8, q12 bl L(\type\()_8tap_filter_4) vmov q9, q12 48: bl L(\type\()_8tap_filter_4) vmull.s16 q2, d13, d2[0] vmlal.s16 q2, d14, d2[1] vmlal.s16 q2, d15, d2[2] vmlal.s16 q2, d16, d2[3] vmlal.s16 q2, d17, d3[0] vmlal.s16 q2, d18, d3[1] vmlal.s16 q2, d19, d3[2] vmlal.s16 q2, d24, d3[3] vmull.s16 q3, d14, d2[0] vmlal.s16 q3, d15, d2[1] vmlal.s16 q3, d16, d2[2] vmlal.s16 q3, d17, d2[3] vmlal.s16 q3, d18, d3[0] vmlal.s16 q3, d19, d3[1] vmlal.s16 q3, d24, d3[2] vmlal.s16 q3, d25, d3[3] .ifc \type, put vrshl.s32 q2, q2, q13 // -(6+intermediate_bits) vrshl.s32 q3, q3, q13 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vmin.u16 q2, q2, q15 .else vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vsub.i16 q2, q2, q13 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {d4}, [\dst, :64], \d_strd vst1.16 {d5}, [\ds2, :64], \d_strd ble 0f vmov d13, d15 vmov q7, q8 vmov q8, q9 vmov q9, q12 b 48b 0: vpop {d13-d15} pop {r4-r11,pc} L(\type\()_8tap_filter_4): vld1.16 {q10}, [\sr2], \s_strd vld1.16 {q11}, [\src], \s_strd vext.8 d24, d20, d21, #2 vext.8 d25, d20, d21, #4 vext.8 d21, d20, d21, #6 vmull.s16 q3, d20, d0[0] vmlal.s16 q3, d24, d0[1] vmlal.s16 q3, d25, d0[2] vmlal.s16 q3, d21, d0[3] vext.8 d24, d22, d23, #2 vext.8 d25, d22, d23, #4 vext.8 d23, d22, d23, #6 vmull.s16 q10, d22, d0[0] vmlal.s16 q10, d24, d0[1] vmlal.s16 q10, d25, d0[2] vmlal.s16 q10, d23, d0[3] vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vrshl.s32 q10, q10, q14 // -(6-intermediate_bits) vmovn.i32 d24, q3 vmovn.i32 d25, q10 bx lr 80: 160: 320: bgt 880f add \my, \my, #2 vld1.8 {d0}, [\mx, :64] vld1.32 {d2[]}, [\my] sub \src, \src, #6 sub \src, \src, \s_strd vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.16 {q11, q12}, [\src], \s_strd vmull.s16 q2, d22, d0[0] vmull.s16 q3, d23, d0[0] vdup.32 q14, r12 // -(6-intermediate_bits) .irpc i, 1234567 vext.8 q10, q11, q12, #(2*\i) .if \i < 4 vmlal.s16 q2, d20, d0[\i] vmlal.s16 q3, d21, d0[\i] .else vmlal.s16 q2, d20, d1[\i - 4] vmlal.s16 q3, d21, d1[\i - 4] .endif .endr vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d16, q2 vmovn.i32 d17, 
q3 bl L(\type\()_8tap_filter_8) vmov q9, q11 vmov q10, q12 8: bl L(\type\()_8tap_filter_8) vmull.s16 q2, d16, d2[0] vmull.s16 q3, d17, d2[0] vmull.s16 q13, d18, d2[0] vmull.s16 q14, d19, d2[0] .ifc \type, put vdup.32 q8, r8 // -(6+intermediate_bits) .endif vmlal.s16 q2, d18, d2[1] vmlal.s16 q3, d19, d2[1] vmlal.s16 q13, d20, d2[1] vmlal.s16 q14, d21, d2[1] vmlal.s16 q2, d20, d2[2] vmlal.s16 q3, d21, d2[2] vmlal.s16 q13, d22, d2[2] vmlal.s16 q14, d23, d2[2] vmlal.s16 q2, d22, d2[3] vmlal.s16 q3, d23, d2[3] vmlal.s16 q13, d24, d2[3] vmlal.s16 q14, d25, d2[3] .ifc \type, put vdup.16 q9, \bdmax // bitdepth_max vrshl.s32 q2, q2, q8 // -(6+intermediate_bits) vrshl.s32 q3, q3, q8 // -(6+intermediate_bits) vrshl.s32 q13, q13, q8 // -(6+intermediate_bits) vrshl.s32 q14, q14, q8 // -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vqmovun.s32 d6, q13 vqmovun.s32 d7, q14 vmin.u16 q2, q2, q15 vmin.u16 q3, q3, q15 .else vmov.i16 q9, #PREP_BIAS vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q13, #6 vrshrn.i32 d7, q14, #6 vsub.i16 q2, q2, q9 // PREP_BIAS vsub.i16 q3, q3, q9 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd ble 9f vmov q8, q10 vmov q9, q11 vmov q10, q12 b 8b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 164b 0: pop {r4-r11,pc} 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: vpush {q4-q7} vld1.8 {d0}, [\mx, :64] vld1.8 {d2}, [\my, :64] sub \src, \src, #6 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 vmovl.s8 q0, d0 vmovl.s8 q1, d2 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 vld1.16 {q11, q12}, [\src], \s_strd vmull.s16 q2, d22, d0[0] vmull.s16 q3, d23, d0[0] vdup.32 q14, r12 // -(6-intermediate_bits) .irpc i, 1234567 vext.8 q10, q11, q12, #(2*\i) .if \i < 4 vmlal.s16 q2, d20, d0[\i] vmlal.s16 q3, d21, d0[\i] .else vmlal.s16 q2, d20, d1[\i - 4] vmlal.s16 q3, d21, d1[\i - 4] .endif .endr vrshl.s32 q2, q2, q14 // -(6-intermediate_bits) vrshl.s32 q3, q3, q14 // -(6-intermediate_bits) vmovn.i32 d8, q2 vmovn.i32 d9, q3 bl L(\type\()_8tap_filter_8) vmov q5, q11 vmov q6, q12 bl L(\type\()_8tap_filter_8) vmov q7, q11 vmov q8, q12 bl L(\type\()_8tap_filter_8) vmov q9, q11 vmov q10, q12 88: bl L(\type\()_8tap_filter_8) vmull.s16 q2, d8, d2[0] vmull.s16 q3, d9, d2[0] vmull.s16 q13, d10, d2[0] vmull.s16 q14, d11, d2[0] .ifc \type, put vdup.32 q4, r8 // -(6+intermediate_bits) .endif vmlal.s16 q2, d10, d2[1] vmlal.s16 q3, d11, d2[1] vmlal.s16 q13, d12, d2[1] vmlal.s16 q14, d13, d2[1] vmlal.s16 q2, d12, d2[2] vmlal.s16 q3, d13, d2[2] vmlal.s16 q13, d14, d2[2] vmlal.s16 q14, d15, d2[2] vmlal.s16 q2, d14, d2[3] vmlal.s16 q3, d15, d2[3] vmlal.s16 q13, d16, d2[3] vmlal.s16 q14, d17, d2[3] vmlal.s16 q2, d16, d3[0] vmlal.s16 q3, d17, d3[0] vmlal.s16 q13, d18, d3[0] vmlal.s16 q14, d19, d3[0] vmlal.s16 q2, d18, d3[1] vmlal.s16 q3, d19, d3[1] vmlal.s16 q13, d20, d3[1] vmlal.s16 q14, d21, d3[1] vmlal.s16 q2, d20, d3[2] vmlal.s16 q3, d21, d3[2] vmlal.s16 q13, d22, d3[2] vmlal.s16 q14, d23, d3[2] vmlal.s16 q2, d22, d3[3] vmlal.s16 q3, d23, d3[3] vmlal.s16 q13, d24, d3[3] vmlal.s16 q14, d25, d3[3] .ifc \type, put vrshl.s32 q2, q2, q4 // -(6+intermediate_bits) vrshl.s32 q3, q3, q4 // -(6+intermediate_bits) vrshl.s32 q13, q13, q4 // -(6+intermediate_bits) vrshl.s32 q14, q14, q4 
// -(6+intermediate_bits) vqmovun.s32 d4, q2 vqmovun.s32 d5, q3 vqmovun.s32 d6, q13 vqmovun.s32 d7, q14 vmin.u16 q2, q2, q15 vmin.u16 q3, q3, q15 .else vmov.i16 q5, #PREP_BIAS vrshrn.i32 d4, q2, #6 vrshrn.i32 d5, q3, #6 vrshrn.i32 d6, q13, #6 vrshrn.i32 d7, q14, #6 vsub.i16 q2, q2, q5 // PREP_BIAS vsub.i16 q3, q3, q5 // PREP_BIAS .endif subs \h, \h, #2 vst1.16 {q2}, [\dst, :128], \d_strd vst1.16 {q3}, [\ds2, :128], \d_strd ble 9f vmov q4, q6 vmov q5, q7 vmov q6, q8 vmov q7, q9 vmov q8, q10 vmov q9, q11 vmov q10, q12 b 88b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: vpop {q4-q7} pop {r4-r11,pc} L(\type\()_8tap_filter_8): vld1.16 {q13, q14}, [\sr2], \s_strd vmull.s16 q2, d26, d0[0] vmull.s16 q3, d27, d0[0] .irpc i, 1234567 vext.8 q12, q13, q14, #(2*\i) .if \i < 4 vmlal.s16 q2, d24, d0[\i] vmlal.s16 q3, d25, d0[\i] .else vmlal.s16 q2, d24, d1[\i - 4] vmlal.s16 q3, d25, d1[\i - 4] .endif .endr vdup.32 q12, r12 // -(6-intermediate_bits) vld1.16 {q13, q14}, [\src], \s_strd vrshl.s32 q2, q2, q12 // -(6-intermediate_bits) vrshl.s32 q3, q3, q12 // -(6-intermediate_bits) vmovn.i32 d4, q2 vmovn.i32 d5, q3 vmull.s16 q3, d26, d0[0] vmull.s16 q11, d27, d0[0] .irpc i, 1234567 vext.8 q12, q13, q14, #(2*\i) .if \i < 4 vmlal.s16 q3, d24, d0[\i] vmlal.s16 q11, d25, d0[\i] .else vmlal.s16 q3, d24, d1[\i - 4] vmlal.s16 q11, d25, d1[\i - 4] .endif .endr vdup.32 q13, r12 // -(6-intermediate_bits) vrshl.s32 q3, q3, q13 // -(6-intermediate_bits) vrshl.s32 q11, q11, q13 // -(6-intermediate_bits) vmovn.i32 d24, q3 vmovn.i32 d25, q11 vmov q11, q2 bx lr endfunc function \type\()_bilin_16bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] .ifc \bdmax, r8 ldr r8, [sp, #52] .endif vdup.16 q1, \mx vdup.16 q3, \my rsb r9, \mx, #16 rsb r10, \my, #16 vdup.16 q0, r9 vdup.16 q2, r10 .ifc \type, prep lsl \d_strd, \w, #1 .endif clz \bdmax, \bdmax // bitdepth_max clz r9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 cmp \mx, #0 sub r9, r9, #24 rsb r11, \bdmax, #4 // 4 - intermediate_bits add r12, \bdmax, #4 // 4 + intermediate_bits bne L(\type\()_bilin_h) cmp \my, #0 bne L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cmp \my, #0 bne L(\type\()_bilin_hv) adr r10, L(\type\()_bilin_h_tbl) vdup.16 q15, r11 // 4 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s16 q15, q15 // -(4-intermediate_bits) .ifc \type, put vdup.16 q14, \bdmax // intermediate_bits .else vmov.i16 q14, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s16 q14, q14 // -intermediate_bits .endif bx r10 .align 2 L(\type\()_bilin_h_tbl): .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB 20: // 2xN h .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: vld1.16 {d16}, [\src], \s_strd vld1.16 {d18}, [\sr2], \s_strd vext.8 d17, d16, d16, #2 vext.8 d19, d18, d18, #2 vtrn.32 d16, d18 vtrn.32 d17, d19 subs \h, \h, #2 vmul.i16 d16, d16, d0 vmla.i16 d16, d17, d2 vrshl.u16 d16, d16, d30 vrshl.u16 d16, d16, d28 vst1.32 {d16[0]}, [\dst, :32], \d_strd 
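// d16 holds both rows of the pair (they were interleaved with vtrn.32),
// so the store above went to the dst row and the next one goes to the
// second-row pointer. The bilinear tap itself is (16 - mx)*p0 + mx*p1,
// with the two weights in d0/d2, reduced with rounding shifts by
// (4 - intermediate_bits) and then by intermediate_bits.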
vst1.32 {d16[1]}, [\ds2, :32], \d_strd bgt 2b pop {r4-r11,pc} .endif 40: // 4xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: vld1.16 {q8}, [\src], \s_strd vld1.16 {q10}, [\sr2], \s_strd vext.8 q9, q8, q8, #2 vext.8 q11, q10, q10, #2 vmov d17, d20 vmov d19, d22 subs \h, \h, #2 vmul.i16 q8, q8, q0 vmla.i16 q8, q9, q1 vrshl.u16 q8, q8, q15 .ifc \type, put vrshl.u16 q8, q8, q14 .else vsub.i16 q8, q8, q14 .endif vst1.16 {d16}, [\dst, :64], \d_strd vst1.16 {d17}, [\ds2, :64], \d_strd bgt 4b pop {r4-r11,pc} 80: // 8xN h add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: vld1.16 {d16, d17, d18}, [\src], \s_strd vld1.16 {d20, d21, d22}, [\sr2], \s_strd vext.8 q9, q8, q9, #2 vext.8 q11, q10, q11, #2 subs \h, \h, #2 vmul.i16 q8, q8, q0 vmla.i16 q8, q9, q1 vmul.i16 q10, q10, q0 vmla.i16 q10, q11, q1 vrshl.u16 q8, q8, q15 vrshl.u16 q10, q10, q15 .ifc \type, put vrshl.u16 q8, q8, q14 vrshl.u16 q10, q10, q14 .else vsub.i16 q8, q8, q14 vsub.i16 q10, q10, q14 .endif vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q10}, [\ds2, :128], \d_strd bgt 8b pop {r4-r11,pc} 160: 320: 640: 1280: // 16xN, 32xN, ... h vpush {q4-q7} add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, lsl #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, lsl #1 .endif 161: vld1.16 {q4}, [\src]! vld1.16 {q9}, [\sr2]! mov \mx, \w 16: vld1.16 {q5, q6}, [\src]! vld1.16 {q10, q11}, [\sr2]! vext.8 q7, q4, q5, #2 vext.8 q8, q5, q6, #2 vext.8 q12, q9, q10, #2 vext.8 q13, q10, q11, #2 vmul.i16 q4, q4, q0 vmla.i16 q4, q7, q1 vmul.i16 q5, q5, q0 vmla.i16 q5, q8, q1 vmul.i16 q9, q9, q0 vmla.i16 q9, q12, q1 vmul.i16 q10, q10, q0 vmla.i16 q10, q13, q1 vrshl.u16 q4, q4, q15 vrshl.u16 q5, q5, q15 vrshl.u16 q9, q9, q15 vrshl.u16 q10, q10, q15 subs \mx, \mx, #16 .ifc \type, put vrshl.u16 q4, q4, q14 vrshl.u16 q5, q5, q14 vrshl.u16 q9, q9, q14 vrshl.u16 q10, q10, q14 .else vsub.i16 q4, q4, q14 vsub.i16 q5, q5, q14 vsub.i16 q9, q9, q14 vsub.i16 q10, q10, q14 .endif vst1.16 {q4, q5}, [\dst, :128]! vst1.16 {q9, q10}, [\ds2, :128]! ble 9f vmov q4, q6 vmov q9, q11 b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 bgt 161b vpop {q4-q7} pop {r4-r11,pc} L(\type\()_bilin_v): cmp \h, #4 adr r10, L(\type\()_bilin_v_tbl) .ifc \type, prep vdup.16 q15, r11 // 4 - intermediate_bits .endif ldr r9, [r10, r9, lsl #2] .ifc \type, prep vmov.i16 q14, #PREP_BIAS vneg.s16 q15, q15 // -(4-intermediate_bits) .endif add r10, r10, r9 bx r10 .align 2 L(\type\()_bilin_v_tbl): .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB 20: // 2xN v .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v vld1.32 {d16[]}, [\src], \s_strd bgt 24f 22: vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vmul.i16 d16, d16, d4 vmla.i16 d16, d17, d6 vrshr.u16 d16, d16, #4 vst1.32 {d16[0]}, [\dst, :32] vst1.32 {d16[1]}, [\ds2, :32] pop {r4-r11,pc} 24: // 2x4, 2x6, 2x8, ... 
v vld1.32 {d17[]}, [\sr2], \s_strd vld1.32 {d18[]}, [\src], \s_strd vld1.32 {d19[]}, [\sr2], \s_strd vld1.32 {d20[]}, [\src], \s_strd subs \h, \h, #4 vext.8 d16, d16, d17, #4 vext.8 d17, d17, d18, #4 vext.8 d18, d18, d19, #4 vext.8 d19, d19, d20, #4 vswp d17, d18 vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 cmp \h, #2 vrshr.u16 q8, q8, #4 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd vst1.32 {d17[0]}, [\dst, :32], \d_strd vst1.32 {d17[1]}, [\ds2, :32], \d_strd blt 0f vmov d16, d20 beq 22b b 24b 0: pop {r4-r11,pc} .endif 40: // 4xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {d16}, [\src], \s_strd 4: vld1.16 {d17}, [\sr2], \s_strd vld1.16 {d19}, [\src], \s_strd vmov d18, d17 vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 .else vrshl.u16 q8, q8, q15 vsub.i16 q8, q8, q14 .endif vst1.16 {d16}, [\dst, :64], \d_strd vst1.16 {d17}, [\ds2, :64], \d_strd ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN v add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q8}, [\src], \s_strd 8: vld1.16 {q9}, [\sr2], \s_strd vld1.16 {q10}, [\src], \s_strd vmul.i16 q8, q8, q2 vmla.i16 q8, q9, q3 vmul.i16 q9, q9, q2 vmla.i16 q9, q10, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 .else vrshl.u16 q8, q8, q15 vrshl.u16 q9, q9, q15 vsub.i16 q8, q8, q14 vsub.i16 q9, q9, q14 .endif vst1.16 {q8}, [\dst, :128], \d_strd vst1.16 {q9}, [\ds2, :128], \d_strd ble 0f vmov q8, q10 b 8b 0: pop {r4-r11,pc} 160: // 16xN, 32xN, ... 320: 640: 1280: mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q8, q9}, [\src], \s_strd 2: vld1.16 {q10, q11}, [\sr2], \s_strd vld1.16 {q12, q13}, [\src], \s_strd vmul.i16 q8, q8, q2 vmla.i16 q8, q10, q3 vmul.i16 q9, q9, q2 vmla.i16 q9, q11, q3 vmul.i16 q10, q10, q2 vmla.i16 q10, q12, q3 vmul.i16 q11, q11, q2 vmla.i16 q11, q13, q3 subs \h, \h, #2 .ifc \type, put vrshr.u16 q8, q8, #4 vrshr.u16 q9, q9, #4 vrshr.u16 q10, q10, #4 vrshr.u16 q11, q11, #4 .else vrshl.u16 q8, q8, q15 vrshl.u16 q9, q9, q15 vrshl.u16 q10, q10, q15 vrshl.u16 q11, q11, q15 vsub.i16 q8, q8, q14 vsub.i16 q9, q9, q14 vsub.i16 q10, q10, q14 vsub.i16 q11, q11, q14 .endif vst1.16 {q8, q9}, [\dst, :128], \d_strd vst1.16 {q10, q11}, [\ds2, :128], \d_strd ble 9f vmov q8, q12 vmov q9, q13 b 2b 9: subs \w, \w, #16 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #32 add \dst, \dst, #32 b 1b 0: pop {r4-r11,pc} L(\type\()_bilin_hv): adr r10, L(\type\()_bilin_hv_tbl) vdup.16 q15, r11 // 4 - intermediate_bits ldr r9, [r10, r9, lsl #2] vneg.s16 q15, q15 // -(4-intermediate_bits) .ifc \type, put vdup.32 q14, r12 // 4 + intermediate_bits .else vmov.i16 q14, #PREP_BIAS .endif add r10, r10, r9 .ifc \type, put vneg.s32 q14, q14 // -(4+intermediate_bits) .endif bx r10 .align 2 L(\type\()_bilin_hv_tbl): .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB .word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB 20: // 2xN hv .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, 
\d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {d20}, [\src], \s_strd vext.8 d21, d20, d20, #2 vmul.i16 d16, d20, d0 vmla.i16 d16, d21, d2 vrshl.u16 d16, d16, d30 vext.8 d16, d16, d16, #4 2: vld1.16 {d20}, [\sr2], \s_strd vld1.16 {d22}, [\src], \s_strd vext.8 d21, d20, d20, #2 vext.8 d23, d22, d22, #2 vtrn.32 d20, d22 vtrn.32 d21, d23 vmul.i16 d18, d20, d0 vmla.i16 d18, d21, d2 vrshl.u16 d18, d18, d30 vext.8 d16, d16, d18, #4 vmull.u16 q8, d16, d4 vmlal.u16 q8, d18, d6 vrshl.u32 q8, q8, q14 vmovn.i32 d16, q8 subs \h, \h, #2 vst1.32 {d16[0]}, [\dst, :32], \d_strd vst1.32 {d16[1]}, [\ds2, :32], \d_strd ble 0f vmov d16, d18 b 2b 0: pop {r4-r11,pc} .endif 40: // 4xN hv add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {q10}, [\src], \s_strd vext.8 d21, d20, d21, #2 vmul.i16 d16, d20, d0 vmla.i16 d16, d21, d2 vrshl.u16 d16, d16, d30 4: vld1.16 {q10}, [\sr2], \s_strd vld1.16 {q11}, [\src], \s_strd vext.8 d21, d20, d21, #2 vext.8 d23, d22, d23, #2 vswp d21, d22 vmul.i16 q9, q10, q0 vmla.i16 q9, q11, q1 vrshl.u16 q9, q9, q15 vmull.u16 q10, d16, d4 vmlal.u16 q10, d18, d6 vmull.u16 q11, d18, d4 vmlal.u16 q11, d19, d6 .ifc \type, put vrshl.u32 q10, q10, q14 vrshl.u32 q11, q11, q14 vmovn.i32 d20, q10 vmovn.i32 d21, q11 .else vrshrn.i32 d20, q10, #4 vrshrn.i32 d21, q11, #4 vsub.i16 q10, q10, q14 .endif subs \h, \h, #2 vst1.16 {d20}, [\dst, :64], \d_strd vst1.16 {d21}, [\ds2, :64], \d_strd ble 0f vmov d16, d19 b 4b 0: pop {r4-r11,pc} 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 vld1.16 {d20, d21, d22}, [\src], \s_strd vext.8 q11, q10, q11, #2 vmul.i16 q8, q10, q0 vmla.i16 q8, q11, q1 vrshl.u16 q8, q8, q15 2: vld1.16 {d20, d21, d22}, [\sr2], \s_strd vld1.16 {d24, d25, d26}, [\src], \s_strd vext.8 q11, q10, q11, #2 vext.8 q13, q12, q13, #2 vmul.i16 q9, q10, q0 vmla.i16 q9, q11, q1 vmul.i16 q10, q12, q0 vmla.i16 q10, q13, q1 vrshl.u16 q9, q9, q15 vrshl.u16 q10, q10, q15 vmull.u16 q11, d16, d4 vmlal.u16 q11, d18, d6 vmull.u16 q12, d17, d4 vmlal.u16 q12, d19, d6 vmull.u16 q8, d18, d4 vmlal.u16 q8, d20, d6 vmull.u16 q9, d19, d4 vmlal.u16 q9, d21, d6 .ifc \type, put vrshl.u32 q11, q11, q14 vrshl.u32 q12, q12, q14 vrshl.u32 q8, q8, q14 vrshl.u32 q9, q9, q14 vmovn.i32 d22, q11 vmovn.i32 d23, q12 vmovn.i32 d16, q8 vmovn.i32 d17, q9 .else vrshrn.i32 d22, q11, #4 vrshrn.i32 d23, q12, #4 vrshrn.i32 d16, q8, #4 vrshrn.i32 d17, q9, #4 vsub.i16 q11, q11, q14 vsub.i16 q8, q8, q14 .endif subs \h, \h, #2 vst1.16 {q11}, [\dst, :128], \d_strd vst1.16 {q8}, [\ds2, :128], \d_strd ble 9f vmov q8, q10 b 2b 9: subs \w, \w, #8 ble 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 mls \src, \s_strd, \my, \src mls \dst, \d_strd, \my, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 1b 0: pop {r4-r11,pc} endfunc .endm filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 filter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10 .macro load_filter_ptr src asr r12, \src, #10 add r12, r11, r12, lsl #3 .endm .macro load_filter_coef dst, src, inc add \src, \src, \inc vld1.8 {\dst}, [r12, :64] .endm .macro load_filter_row dst, src, inc load_filter_ptr \src load_filter_coef \dst, \src, \inc .endm function warp_filter_horz_neon load_filter_ptr r5 // filter 0 vld1.16 {q6,q7}, [r2], r3 load_filter_coef d0, r5, r7 // filter 0 load_filter_row d2, r5, r7 // filter 1 vmovl.s8 q0, d0 // filter 0 vext.8 q3, q6, 
q7, #2*1 // filter 1 pixels vmovl.s8 q1, d2 // filter 1 vmull.s16 q4, d12, d0 // filter 0 output (0-3) vmull.s16 q5, d13, d1 // filter 0 output (4-7) load_filter_ptr r5 // filter 2 vmull.s16 q2, d6, d2 // filter 1 output (0-3) vmull.s16 q3, d7, d3 // filter 1 output (4-7) load_filter_coef d0, r5, r7 // filter 2 vpadd.i32 d8, d8, d9 // half pixel 0 (2x32) vpadd.i32 d9, d10, d11 // half pixel 0 (2x32) load_filter_ptr r5 // filter 3 vpadd.i32 d4, d4, d5 // half pixel 1 (2x32) vpadd.i32 d5, d6, d7 // half pixel 1 (2x32) vmovl.s8 q0, d0 // filter 2 vext.8 q3, q6, q7, #2*2 // filter 2 pixels vpadd.i32 d8, d8, d9 // pixel 0 (2x32) vpadd.i32 d9, d4, d5 // pixel 1 (2x32) load_filter_coef d2, r5, r7 // filter 3 vmull.s16 q2, d6, d0 // filter 2 output (0-3) vmull.s16 q3, d7, d1 // filter 2 output (4-7) load_filter_ptr r5 // filter 4 vpadd.i32 d8, d8, d9 // pixel 0,1 vpadd.i32 d9, d4, d5 // half pixel 2 (2x32) vpadd.i32 d10, d6, d7 // half pixel 2 (2x32) vmovl.s8 q1, d2 // filter 3 vext.8 q3, q6, q7, #2*3 // filter 3 pixels load_filter_coef d0, r5, r7 // filter 4 vpadd.i32 d9, d9, d10 // pixel 2 (2x32) vmull.s16 q2, d6, d2 // filter 3 output (0-3) vmull.s16 q3, d7, d3 // filter 3 output (4-7) vmovl.s8 q0, d0 // filter 4 load_filter_ptr r5 // filter 5 vpadd.i32 d10, d4, d5 // half pixel 3 (2x32) vpadd.i32 d11, d6, d7 // half pixel 3 (2x32) vext.8 q3, q6, q7, #2*4 // filter 4 pixels load_filter_coef d2, r5, r7 // filter 5 vpadd.i32 d10, d10, d11 // pixel 3 (2x32) vpadd.i32 d9, d9, d10 // pixel 2,3 vmull.s16 q2, d6, d0 // filter 4 output (0-3) vmull.s16 q3, d7, d1 // filter 4 output (4-7) vmovl.s8 q1, d2 // filter 5 load_filter_ptr r5 // filter 6 vpadd.i32 d10, d4, d5 // half pixel 4 (2x32) vpadd.i32 d11, d6, d7 // half pixel 4 (2x32) vext.8 q3, q6, q7, #2*5 // filter 5 pixels load_filter_coef d0, r5, r7 // filter 6 vpadd.i32 d10, d10, d11 // pixel 4 (2x32) vmull.s16 q2, d6, d2 // filter 5 output (0-3) vmull.s16 q3, d7, d3 // filter 5 output (4-7) vmovl.s8 q0, d0 // filter 6 load_filter_ptr r5 // filter 7 vpadd.i32 d4, d4, d5 // half pixel 5 (2x32) vpadd.i32 d5, d6, d7 // half pixel 5 (2x32) vext.8 q3, q6, q7, #2*6 // filter 6 pixels load_filter_coef d2, r5, r7 // filter 7 vpadd.i32 d11, d4, d5 // pixel 5 (2x32) vmull.s16 q2, d6, d0 // filter 6 output (0-3) vmull.s16 q3, d7, d1 // filter 6 output (4-7) vmovl.s8 q1, d2 // filter 7 vpadd.i32 d10, d10, d11 // pixel 4,5 vpadd.i32 d4, d4, d5 // half pixel 6 (2x32) vpadd.i32 d5, d6, d7 // half pixel 6 (2x32) vext.8 q3, q6, q7, #2*7 // filter 7 pixels vpadd.i32 d11, d4, d5 // pixel 6 (2x32) vmull.s16 q2, d6, d2 // filter 7 output (0-3) vmull.s16 q3, d7, d3 // filter 7 output (4-7) vld1.32 {d14[],d15[]}, [sp] // -(7 - intermediate_bits) vpadd.i32 d4, d4, d5 // half pixel 7 (2x32) vpadd.i32 d5, d6, d7 // half pixel 7 (2x32) sub r5, r5, r7, lsl #3 vpadd.i32 d4, d4, d5 // pixel 7 (2x32) add r5, r5, r8 vpadd.i32 d11, d11, d4 // pixel 6,7 vrshl.s32 q4, q4, q7 // -(7 - intermediate_bits) vrshl.s32 q5, q5, q7 // -(7 - intermediate_bits) bx lr endfunc // void dav1d_warp_affine_8x8_16bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my, // const int bitdepth_max) .macro warp t function warp_affine_8x8\t\()_16bpc_neon, export=1 push {r4-r11,lr} vpush {q4-q7} ldrd r4, r5, [sp, #100] ldrd r6, r7, [sp, #108] sub sp, sp, #8 clz r7, r7 // intermediate_bits = clz(bitdepth_max) - 18 .ifb \t sub r8, r7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 .endif sub r7, r7, #25 // 
-(7 - intermediate_bits) .ifb \t neg r8, r8 // -(7 + intermediate_bits) .endif str r7, [sp] // spill -(7 - intermediate_bits) on stack .ifb \t str r8, [sp, #4] // spill -(7 + intermediate_bits) on stack .endif ldrd r8, r9, [r4] sxth r7, r8 asr r8, r8, #16 asr r4, r9, #16 sxth r9, r9 mov r10, #8 sub r2, r2, r3, lsl #1 sub r2, r2, r3 sub r2, r2, #6 movrel r11, X(mc_warp_filter), 64*8 .ifnb \t lsl r1, r1, #1 .endif add r5, r5, #512 add r6, r6, #512 bl warp_filter_horz_neon vmovn.i32 d16, q4 vmovn.i32 d17, q5 bl warp_filter_horz_neon vmovn.i32 d18, q4 vmovn.i32 d19, q5 bl warp_filter_horz_neon vmovn.i32 d20, q4 vmovn.i32 d21, q5 bl warp_filter_horz_neon vmovn.i32 d22, q4 vmovn.i32 d23, q5 bl warp_filter_horz_neon vmovn.i32 d24, q4 vmovn.i32 d25, q5 bl warp_filter_horz_neon vmovn.i32 d26, q4 vmovn.i32 d27, q5 bl warp_filter_horz_neon vmovn.i32 d28, q4 vmovn.i32 d29, q5 1: bl warp_filter_horz_neon vmovn.i32 d30, q4 vmovn.i32 d31, q5 load_filter_row d8, r6, r9 load_filter_row d9, r6, r9 load_filter_row d10, r6, r9 load_filter_row d11, r6, r9 load_filter_row d12, r6, r9 load_filter_row d13, r6, r9 load_filter_row d14, r6, r9 load_filter_row d15, r6, r9 transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15 vmovl.s8 q1, d8 vmovl.s8 q2, d9 vmovl.s8 q3, d10 vmovl.s8 q4, d11 vmovl.s8 q5, d12 vmovl.s8 q6, d13 sub r6, r6, r9, lsl #3 // This ordering of vmull/vmlal is highly beneficial for // Cortex A8/A9/A53 here, but harmful for Cortex A7. vmull.s16 q0, d16, d2 vmlal.s16 q0, d18, d4 vmlal.s16 q0, d20, d6 vmlal.s16 q0, d22, d8 vmlal.s16 q0, d24, d10 vmlal.s16 q0, d26, d12 vmull.s16 q1, d17, d3 vmlal.s16 q1, d19, d5 vmlal.s16 q1, d21, d7 vmlal.s16 q1, d23, d9 vmlal.s16 q1, d25, d11 vmlal.s16 q1, d27, d13 vmovl.s8 q2, d14 vmovl.s8 q3, d15 vmlal.s16 q0, d28, d4 vmlal.s16 q0, d30, d6 vmlal.s16 q1, d29, d5 vmlal.s16 q1, d31, d7 .ifb \t ldr lr, [sp, #4] // -(7 + intermediate_bits) ldr r12, [sp, #120] // bitdepth_max vdup.32 q2, lr // -(7 + intermediate_bits) vdup.16 q3, r12 // bitdepth_max .endif vmov q8, q9 vmov q9, q10 .ifb \t vrshl.s32 q0, q0, q2 // -(7 + intermediate_bits) vrshl.s32 q1, q1, q2 // -(7 + intermediate_bits) .else vrshrn.s32 d0, q0, #7 vrshrn.s32 d1, q1, #7 vmov.i16 q3, #PREP_BIAS .endif vmov q10, q11 .ifb \t vqmovun.s32 d0, q0 vqmovun.s32 d1, q1 .else vsub.i16 q0, q0, q3 // PREP_BIAS .endif vmov q11, q12 vmov q12, q13 .ifb \t vmin.u16 q0, q0, q3 // bitdepth_max .endif vmov q13, q14 vmov q14, q15 subs r10, r10, #1 vst1.16 {q0}, [r0, :128], r1 add r6, r6, r4 bgt 1b add sp, sp, #8 vpop {q4-q7} pop {r4-r11,pc} endfunc .endm warp warp t // void dav1d_emu_edge_16bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_16bpc_neon, export=1 push {r4-r11,lr} ldrd r4, r5, [sp, #36] ldrd r6, r7, [sp, #44] ldrd r8, r9, [sp, #52] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub r12, r3, #1 // ih - 1 cmp r5, r3 sub lr, r2, #1 // iw - 1 it lt movlt r12, r5 // min(y, ih - 1) cmp r4, r2 bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) it lt movlt lr, r4 // min(x, iw - 1) bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) mla r8, r12, r9, r8 // ref += iclip() * stride add r8, r8, lr, lsl #1 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add r10, r5, r1 // y + bh neg r5, r5 // -y sub r10, r10, r3 // y + bh - ih sub r12, r1, #1 
// bh - 1 cmp r10, r1 bic r5, r5, r5, asr #31 // max(-y, 0) it ge movge r10, r12 // min(y + bh - ih, bh-1) cmp r5, r1 bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) it ge movge r5, r12 // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add r11, r4, r0 // x + bw neg r4, r4 // -x sub r11, r11, r2 // x + bw - iw sub lr, r0, #1 // bw - 1 cmp r11, r0 bic r4, r4, r4, asr #31 // max(-x, 0) it ge movge r11, lr // min(x + bw - iw, bw-1) cmp r4, r0 bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) it ge movge r4, lr // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub r1, r1, r5 // bh - top_ext mla r6, r5, r7, r6 sub r2, r0, r4 // bw - left_ext sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext sub r2, r2, r11 // center_w = bw - left_ext - right_ext mov r0, r6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left vld1.16 {d0[], d1[]}, [r8] mov r12, r6 // out = dst mov r3, r4 vmov q1, q0 1: subs r3, r3, #16 vst1.16 {q0, q1}, [r12, :128]! bgt 1b .endif mov lr, r8 add r12, r6, r4, lsl #1 // out = dst + left_ext mov r3, r2 1: vld1.16 {q0, q1}, [lr]! subs r3, r3, #32 vld1.16 {q2, q3}, [lr]! .if \need_left vst1.16 {q0, q1}, [r12]! vst1.16 {q2, q3}, [r12]! .else vst1.16 {q0, q1}, [r12, :128]! vst1.16 {q2, q3}, [r12, :128]! .endif bgt 1b .if \need_right add r3, r8, r2, lsl #1 // in + center_w sub r3, r3, #2 // in + center_w - 1 add r12, r6, r4, lsl #1 // dst + left_ext vld1.16 {d0[], d1[]}, [r3] add r12, r12, r2, lsl #1 // out = dst + left_ext + center_w mov r3, r11 vmov q1, q0 1: subs r3, r3, #16 vst1.16 {q0, q1}, [r12]! bgt 1b .endif subs r1, r1, #1 // center_h-- add r6, r6, r7 add r8, r8, r9 bgt 0b .endm cmp r4, #0 beq 2f // need_left cmp r11, #0 beq 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cmp r11, #0 beq 4f // !need_left + need_right v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cmp r10, #0 // Storing the original dst in r0 overwrote bw, recalculate it here add r2, r2, r4 // center_w + left_ext add r2, r2, r11 // bw = center_w + left_ext + right_ext beq 3f // need_bottom sub r8, r6, r7 // ref = dst - stride mov r4, r2 sub r12, r7, #32 1: vld1.16 {q0, q1}, [r8, :128]! mov r3, r10 vld1.16 {q2, q3}, [r8, :128]! 2: vst1.16 {q0, q1}, [r6, :128]! subs r3, r3, #1 vst1.16 {q2, q3}, [r6, :128], r12 bgt 2b mls r6, r7, r10, r6 // dst -= bottom_ext * stride subs r4, r4, #32 // bw -= 32 add r6, r6, #64 // dst += 32 bgt 1b 3: cmp r5, #0 beq 3f // need_top mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride sub r12, r7, #32 1: vld1.16 {q0, q1}, [r0, :128]! mov r3, r5 vld1.16 {q2, q3}, [r0, :128]! 2: vst1.16 {q0, q1}, [r6, :128]! subs r3, r3, #1 vst1.16 {q2, q3}, [r6, :128], r12 bgt 2b mls r6, r7, r5, r6 // dst -= top_ext * stride subs r2, r2, #32 // bw -= 32 add r6, r6, #64 // dst += 32 bgt 1b 3: pop {r4-r11,pc} endfunc rav1e-0.7.1/src/arm/32/msac.S000064400000000000000000000532611046102023000135170ustar 00000000000000/* * Copyright © 2019, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #define BUF_POS 0 #define BUF_END 4 #define DIF 8 #define RNG 12 #define CNT 16 #define ALLOW_UPDATE_CDF 20 const coeffs .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 endconst const bits, align=4 .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 endconst .macro vld1_align_n d0, q0, q1, src, n .if \n == 4 vld1.16 {\d0}, [\src, :64] .elseif \n == 8 vld1.16 {\q0}, [\src, :128] .else vld1.16 {\q0, \q1}, [\src, :128] .endif .endm .macro vld1_n d0, q0, q1, src, n .if \n == 4 vld1.16 {\d0}, [\src] .elseif \n == 8 vld1.16 {\q0}, [\src] .else vld1.16 {\q0, \q1}, [\src] .endif .endm .macro vst1_align_n d0, q0, q1, src, n .if \n == 4 vst1.16 {\d0}, [\src, :64] .elseif \n == 8 vst1.16 {\q0}, [\src, :128] .else vst1.16 {\q0, \q1}, [\src, :128] .endif .endm .macro vst1_n d0, q0, q1, src, n .if \n == 4 vst1.16 {\d0}, [\src] .elseif \n == 8 vst1.16 {\q0}, [\src] .else vst1.16 {\q0, \q1}, [\src] .endif .endm .macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vshr.u16 \d0, \s0, \s3 .else vshr.u16 \d1, \s1, \s4 .if \n == 16 vshr.u16 \d2, \s2, \s5 .endif .endif .endm .macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vadd.i16 \d0, \s0, \s3 .else vadd.i16 \d1, \s1, \s4 .if \n == 16 vadd.i16 \d2, \s2, \s5 .endif .endif .endm .macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vsub.i16 \d0, \s0, \s3 .else vsub.i16 \d1, \s1, \s4 .if \n == 16 vsub.i16 \d2, \s2, \s5 .endif .endif .endm .macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vand \d0, \s0, \s3 .else vand \d1, \s1, \s4 .if \n == 16 vand \d2, \s2, \s5 .endif .endif .endm .macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vcge.u16 \d0, \s0, \s3 .else vcge.u16 \d1, \s1, \s4 .if \n == 16 vcge.u16 \d2, \s2, \s5 .endif .endif .endm .macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vrhadd.u16 \d0, \s0, \s3 .else vrhadd.u16 \d1, \s1, \s4 .if \n == 16 vrhadd.u16 \d2, \s2, \s5 .endif .endif .endm .macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vshl.s16 \d0, \s0, \s3 .else vshl.s16 \d1, \s1, \s4 .if \n == 16 vshl.s16 \d2, \s2, \s5 .endif .endif .endm .macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n .if \n == 4 vqdmulh.s16 \d0, \s0, \s3 .else vqdmulh.s16 \d1, \s1, \s4 .if \n == 16 vqdmulh.s16 \d2, \s2, \s5 .endif .endif .endm // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, // size_t 
n_symbols); function msac_decode_symbol_adapt4_neon, export=1 .macro decode_update n push {r4-r10,lr} sub sp, sp, #48 add r8, r0, #RNG vld1_align_n d0, q0, q1, r1, \n // cdf vld1.16 {d16[]}, [r8, :16] // rng movrel_local r9, coeffs, 30 vmov.i16 d30, #0x7f00 // 0x7f00 sub r9, r9, r2, lsl #1 vmvn.i16 q14, #0x3f // 0xffc0 add r8, sp, #14 vand d22, d16, d30 // rng & 0x7f00 vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 .if \n > 4 vmov d23, d22 .endif vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add r8, r0, #DIF + 2 vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) .if \n == 4 vmov.i16 d17, #0 .endif vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) add r9, sp, #16 vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) movrel_local r8, bits vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access vmov d21, d20 vld1_align_n q12, q12, q13, r8, \n .if \n == 16 vmov q11, q10 .endif vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask .if \n == 16 vadd.i16 q10, q10, q11 .endif vadd.i16 d20, d20, d21 // Aggregate mask bits ldr r4, [r0, #ALLOW_UPDATE_CDF] vpadd.i16 d20, d20, d20 lsl r10, r2, #1 vpadd.i16 d20, d20, d20 vmov.u16 r3, d20[0] cmp r4, #0 rbit r3, r3 clz lr, r3 // ret beq L(renorm) // update_cdf ldrh r3, [r1, r10] // count = cdf[n_symbols] vmov.i8 q10, #0xff .if \n == 16 mov r4, #-5 .else mvn r12, r2 mov r4, #-4 cmn r12, #3 // set C if n_symbols <= 2 .endif vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 .if \n == 16 sub r4, r4, r3, lsr #4 // -((count >> 4) + 5) .else lsr r12, r3, #4 // count >> 4 sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) .endif vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) .if \n == 4 vdup.16 d20, r4 // -rate .else vdup.16 q10, r4 // -rate .endif sub r3, r3, r3, lsr #5 // count - (count == 32) vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 
1 : 0) vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate add r3, r3, #1 // count + (count < 32) vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate vst1_align_n d0, q0, q1, r1, \n strh r3, [r1, r10] .endm decode_update 4 L(renorm): add r8, sp, #16 add r8, r8, lr, lsl #1 ldrh r3, [r8] // v ldrh r4, [r8, #-2] // u ldr r6, [r0, #CNT] ldr r7, [r0, #DIF] sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 mvn r7, r7 // ~dif add r7, r7, r3, lsl #16 // ~dif + (v << 16) L(renorm2): lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d lsl r7, r7, r5 // (~dif + (v << 16)) << d str r4, [r0, #RNG] mvn r7, r7 // ~dif bhs 9f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 cmp r5, r4 bgt 2f ldr r3, [r3] // next_bits add r8, r6, #23 // shift_bits = cnt + 23 add r6, r6, #16 // cnt += 16 rev r3, r3 // next_bits = bswap(next_bits) sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 and r8, r8, #24 // shift_bits &= 24 lsr r3, r3, r8 // next_bits >>= shift_bits sub r8, r8, r6 // shift_bits -= 16 + cnt str r5, [r0, #BUF_POS] lsl r3, r3, r8 // next_bits <<= shift_bits rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits eor r7, r7, r3 // dif ^= next_bits b 9f 2: // refill_eob rsb r5, r6, #8 // c = 8 - cnt 3: cmp r3, r4 bge 4f ldrb r8, [r3], #1 lsl r8, r8, r5 eor r7, r7, r8 subs r5, r5, #8 bge 3b 4: // refill_eob_end str r3, [r0, #BUF_POS] rsb r6, r5, #8 // cnt = 8 - c 9: str r6, [r0, #CNT] str r7, [r0, #DIF] mov r0, lr add sp, sp, #48 pop {r4-r10,pc} endfunc function msac_decode_symbol_adapt8_neon, export=1 decode_update 8 b L(renorm) endfunc function msac_decode_symbol_adapt16_neon, export=1 decode_update 16 b L(renorm) endfunc function msac_decode_hi_tok_neon, export=1 push {r4-r10,lr} vld1.16 {d0}, [r1, :64] // cdf add r4, r0, #RNG vmov.i16 d31, #0x7f00 // 0x7f00 movrel_local r5, coeffs, 30-2*3 vmvn.i16 d30, #0x3f // 0xffc0 ldrh r9, [r1, #6] // count = cdf[n_symbols] vld1.16 {d1[]}, [r4, :16] // rng movrel_local r4, bits vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) add r5, r0, #DIF + 2 vld1.16 {q8}, [r4, :128] mov r2, #-24 vand d20, d0, d30 // cdf & 0xffc0 ldr r10, [r0, #ALLOW_UPDATE_CDF] vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) sub sp, sp, #48 ldr r6, [r0, #CNT] ldr r7, [r0, #DIF] vmov d3, d2 1: vand d23, d1, d31 // rng & 0x7f00 vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add r12, sp, #14 vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) vmov.i16 d7, #0 vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng add r12, sp, #16 vcge.u16 q2, q1, q3 // c >= v vst1.16 {q3}, [r12] // store v values to allow indexed access vand q9, q2, q8 // One bit per halfword set in the mask vadd.i16 d18, d18, d19 // Aggregate mask bits vpadd.i16 d18, d18, d18 vpadd.i16 d18, d18, d18 vmov.u16 r3, d18[0] cmp r10, #0 add r2, r2, #5 rbit r3, r3 add r8, sp, #16 clz lr, r3 // ret beq 2f // update_cdf vmov.i8 d22, #0xff mov r4, #-5 vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768 sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) vdup.16 d18, r4 // -rate sub r9, r9, r9, lsr #5 // count - (count == 32) vsub.i16 d0, d0, d4 // cdf + (i >= val ? 
1 : 0) vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate add r9, r9, #1 // count + (count < 32) vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate vst1.16 {d0}, [r1, :64] vand d20, d0, d30 // cdf & 0xffc0 strh r9, [r1, #6] 2: add r8, r8, lr, lsl #1 ldrh r3, [r8] // v ldrh r4, [r8, #-2] // u sub r4, r4, r3 // rng = u - v clz r5, r4 // clz(rng) eor r5, r5, #16 // d = clz(rng) ^ 16 mvn r7, r7 // ~dif add r7, r7, r3, lsl #16 // ~dif + (v << 16) lsl r4, r4, r5 // rng << d subs r6, r6, r5 // cnt -= d lsl r7, r7, r5 // (~dif + (v << 16)) << d str r4, [r0, #RNG] vdup.16 d1, r4 mvn r7, r7 // ~dif bhs 9f // refill ldr r3, [r0, #BUF_POS] // BUF_POS ldr r4, [r0, #BUF_END] // BUF_END add r5, r3, #4 cmp r5, r4 bgt 2f ldr r3, [r3] // next_bits add r8, r6, #23 // shift_bits = cnt + 23 add r6, r6, #16 // cnt += 16 rev r3, r3 // next_bits = bswap(next_bits) sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 and r8, r8, #24 // shift_bits &= 24 lsr r3, r3, r8 // next_bits >>= shift_bits sub r8, r8, r6 // shift_bits -= 16 + cnt str r5, [r0, #BUF_POS] lsl r3, r3, r8 // next_bits <<= shift_bits rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits eor r7, r7, r3 // dif ^= next_bits b 9f 2: // refill_eob rsb r5, r6, #8 // c = 40 - cnt 3: cmp r3, r4 bge 4f ldrb r8, [r3], #1 lsl r8, r8, r5 eor r7, r7, r8 subs r5, r5, #8 bge 3b 4: // refill_eob_end str r3, [r0, #BUF_POS] rsb r6, r5, #8 // cnt = 40 - c 9: lsl lr, lr, #1 sub lr, lr, #5 lsr r12, r7, #16 adds r2, r2, lr // carry = tok_br < 3 || tok == 15 vdup.16 q1, r12 bcc 1b // loop if !carry add r2, r2, #30 str r6, [r0, #CNT] add sp, sp, #48 str r7, [r0, #DIF] lsr r0, r2, #1 pop {r4-r10,pc} endfunc function msac_decode_bool_equi_neon, export=1 push {r4-r10,lr} ldr r5, [r0, #RNG] ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] bic r4, r5, #0xff // r &= 0xff00 add r4, r4, #8 mov r2, #0 subs r8, r7, r4, lsl #15 // dif - vw lsr r4, r4, #1 // v sub r5, r5, r4 // r - v itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) endfunc function msac_decode_bool_neon, export=1 push {r4-r10,lr} ldr r5, [r0, #RNG] ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] lsr r4, r5, #8 // r >> 8 bic r1, r1, #0x3f // f &= ~63 mul r4, r4, r1 mov r2, #0 lsr r4, r4, #7 add r4, r4, #4 // v subs r8, r7, r4, lsl #16 // dif - vw sub r5, r5, r4 // r - v itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; clz r5, r4 // clz(rng) mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 b L(renorm2) endfunc function msac_decode_bool_adapt_neon, export=1 push {r4-r10,lr} ldr r9, [r1] // cdf[0-1] ldr r5, [r0, #RNG] movw lr, #0xffc0 ldr r6, [r0, #CNT] sub sp, sp, #48 ldr r7, [r0, #DIF] lsr r4, r5, #8 // r >> 8 and r2, r9, lr // f &= ~63 mul r4, r4, r2 mov r2, #0 lsr r4, r4, #7 add r4, r4, #4 // v subs r8, r7, r4, lsl #16 // dif - vw sub r5, r5, r4 // r - v ldr r10, [r0, #ALLOW_UPDATE_CDF] itee lo movlo r2, #1 movhs r4, r5 // if (ret) v = r - v; movhs r7, r8 // if (ret) dif = dif - vw; cmp r10, #0 clz r5, r4 // clz(rng) mvn r7, r7 // ~dif eor r5, r5, #16 // d = clz(rng) ^ 16 mov lr, r2 beq L(renorm2) lsr r2, r9, #16 // count = cdf[1] uxth r9, r9 // cdf[0] sub r3, r2, r2, lsr #5 // count - (count >= 32) lsr r2, r2, #4 // count >> 4 add r10, r3, #1 // count + (count < 32) add r2, r2, #4 // rate = (count >> 4) | 4 sub r9, r9, lr // cdf[0] -= bit sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} 
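// The CDF adaptation spelled out by the comments in decode_update and in this
// boolean path follows the scalar rule below (Rust sketch, illustrative only;
// the array layout, probabilities first and the adaptation counter in the last
// element, is an assumption made for the sketch):
//
//   fn update_cdf(cdf: &mut [u16], val: usize) {
//       let n = cdf.len() - 1;                      // probability entries
//       let count = cdf[n];
//       let rate = 4 + (count >> 4) + (n > 2) as u16;
//       for i in 0..n {
//           if i < val {
//               cdf[i] += (32768 - cdf[i]) >> rate; // pull towards 32768
//           } else {
//               cdf[i] -= cdf[i] >> rate;           // pull towards 0
//           }
//       }
//       cdf[n] = count + (count < 32) as u16;       // counter saturates at 32
//   }
//
// The single-binary case handled here uses the same rule with one stored
// entry, i.e. rate = 4 + (count >> 4), which is what the shifts above encode.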
asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate sub r9, r9, r3 // cdf[0] strh r9, [r1] strh r10, [r1, #2] b L(renorm2) endfunc rav1e-0.7.1/src/arm/32/util.S000064400000000000000000000133331046102023000135450ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2015 Martin Storsjo * Copyright © 2015 Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #ifndef DAV1D_SRC_ARM_32_UTIL_S #define DAV1D_SRC_ARM_32_UTIL_S #include "config.h" #include "src/arm/asm.S" .macro movrel_local rd, val, offset=0 #if defined(PIC) ldr \rd, 90001f b 90002f 90001: .word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB) 90002: add \rd, \rd, pc #else movw \rd, #:lower16:\val+\offset movt \rd, #:upper16:\val+\offset #endif .endm .macro movrel rd, val, offset=0 #if defined(PIC) && defined(__APPLE__) ldr \rd, 1f b 2f 1: .word 3f - (2f + 8 - 4 * CONFIG_THUMB) 2: ldr \rd, [pc, \rd] .if \offset < 0 sub \rd, \rd, #-(\offset) .elseif \offset > 0 add \rd, \rd, #\offset .endif .non_lazy_symbol_pointer 3: .indirect_symbol \val .word 0 .text #else movrel_local \rd, \val, \offset #endif .endm // This macro clobbers r7 (and r12 on windows) and stores data at the // bottom of the stack; sp is the start of the space allocated that // the caller can use. .macro sub_sp_align space #if CONFIG_THUMB mov r7, sp and r7, r7, #15 #else and r7, sp, #15 #endif sub sp, sp, r7 // Now the stack is aligned, store the amount of adjustment back // on the stack, as we don't want to waste a register as frame // pointer. str r7, [sp, #-16]! #ifdef _WIN32 .if \space > 8192 // Here, we'd need to touch two (or more) pages while decrementing // the stack pointer. 
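// For 4096 < space <= 8192 the branch below first loads a word at sp - 4096 so
// that the next guard page is committed before sp is moved past it (this is
// the r12 clobber mentioned above); probing more than one intervening page is
// not implemented, hence the .error directive for larger values.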
.error "sub_sp_align doesn't support values over 8K at the moment" .elseif \space > 4096 sub r7, sp, #4096 ldr r12, [r7] sub r7, r7, #(\space - 4096) mov sp, r7 .else sub sp, sp, #\space .endif #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif #endif .endm .macro add_sp_align space .if \space >= 4096 add sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 add sp, sp, #(\space)%4096 .endif ldr r7, [sp], #16 // Add back the original stack adjustment add sp, sp, r7 .endm .macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vtrn.32 \q0, \q2 vtrn.32 \q1, \q3 vtrn.16 \r0, \r2 vtrn.16 \r1, \r3 vtrn.16 \r4, \r6 vtrn.16 \r5, \r7 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 vtrn.8 \r4, \r5 vtrn.8 \r6, \r7 .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 vswp \d0, \d4 vswp \d1, \d5 vswp \d2, \d6 vswp \d3, \d7 vtrn.32 \r0, \r2 vtrn.32 \r1, \r3 vtrn.32 \r4, \r6 vtrn.32 \r5, \r7 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 vtrn.16 \r4, \r5 vtrn.16 \r6, \r7 .endm .macro transpose_4x8b q0, q1, r0, r1, r2, r3 vtrn.16 \q0, \q1 vtrn.8 \r0, \r1 vtrn.8 \r2, \r3 .endm .macro transpose_4x4s q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 vswp \r1, \r4 // vtrn.64 \q0, \q2 vswp \r3, \r6 // vtrn.64 \q1, \q3 vtrn.32 \q0, \q1 vtrn.32 \q2, \q3 .endm .macro transpose_4x4h q0, q1, r0, r1, r2, r3 vtrn.32 \q0, \q1 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 .endm .macro transpose_4x8h r0, r1, r2, r3 vtrn.32 \r0, \r2 vtrn.32 \r1, \r3 vtrn.16 \r0, \r1 vtrn.16 \r2, \r3 .endm #endif /* DAV1D_SRC_ARM_32_UTIL_S */ rav1e-0.7.1/src/arm/64/cdef.S000064400000000000000000000452351046102023000135040ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" .macro pad_top_bottom s1, s2, w, stride, rn, rw, ret tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #2 sub \s2, \s2, #2 tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr s1, [\s1, #\w] ldr \rn\()2, [\s2] ldr s3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str \rw\()0, [x0] str d1, [x0, #2*\w] add x0, x0, #2*\stride str \rw\()2, [x0] str d3, [x0, #2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr h1, [\s1, #\w] ldr \rn\()2, [\s2] ldr h3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str \rw\()0, [x0] str s1, [x0, #2*\w] str s31, [x0, #2*\w+4] add x0, x0, #2*\stride str \rw\()2, [x0] str s3, [x0, #2*\w] str s31, [x0, #2*\w+4] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr h1, [\s1, #\w] ldr \rn\()2, [\s2] ldr h3, [\s2, #\w] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b str s31, [x0] stur \rw\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \rw\()2, [x0, #4] str s3, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \rn\()0, [\s1] ldr \rn\()1, [\s2] uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s31, [x0] stur \rw\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \rw\()1, [x0, #4] str s31, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride .endif 3: .endm .macro load_n_incr dst, src, incr, w .if \w == 4 ld1 {\dst\().s}[0], [\src], \incr .else ld1 {\dst\().8b}, [\src], \incr .endif .endm // void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func w, stride, rn, rw function cdef_padding\w\()_8bpc_neon, export=1 cmp w7, #0xf // fully edged b.eq cdef_padding\w\()_edged_8bpc_neon movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif b 3f 1: // CDEF_HAVE_TOP add x9, x4, x2 pad_top_bottom x4, x9, \w, \stride, \rn, \rw, 0 // Middle section 3: tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b uxtl v2.8h, v2.8b str s0, [x0] stur \rw\()1, [x0, #4] str s2, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 {v0.h}[0], [x3], #2 load_n_incr v1, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s0, [x0] stur \rw\()1, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b b 3f 2: tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr h1, [x1, #\w] load_n_incr v0, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b uxtl v1.8h, v1.8b str s31, [x0] stur \rw\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr v0, x1, x2, \w subs w6, w6, #1 uxtl v0.8h, v0.8b str s31, [x0] stur \rw\()0, [x0, #4] str 
s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b 3: tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif ret 1: // CDEF_HAVE_BOTTOM add x9, x5, x2 pad_top_bottom x5, x9, \w, \stride, \rn, \rw, 1 endfunc .endm padding_func 8, 16, d, q padding_func 4, 8, s, d // void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_edged w, stride, reg function cdef_padding\w\()_edged_8bpc_neon, export=1 sub x4, x4, #2 sub x5, x5, #2 sub x0, x0, #(2*\stride+2) .if \w == 4 ldr d0, [x4] ldr d1, [x4, x2] st1 {v0.8b, v1.8b}, [x0], #16 .else add x9, x4, x2 ldr d0, [x4] ldr s1, [x4, #8] ldr d2, [x9] ldr s3, [x9, #8] str d0, [x0] str s1, [x0, #8] str d2, [x0, #\stride] str s3, [x0, #\stride+8] add x0, x0, #2*\stride .endif 0: ld1 {v0.h}[0], [x3], #2 ldr h2, [x1, #\w] load_n_incr v1, x1, x2, \w subs w6, w6, #1 str h0, [x0] stur \reg\()1, [x0, #2] str h2, [x0, #2+\w] add x0, x0, #\stride b.gt 0b .if \w == 4 ldr d0, [x5] ldr d1, [x5, x2] st1 {v0.8b, v1.8b}, [x0], #16 .else add x9, x5, x2 ldr d0, [x5] ldr s1, [x5, #8] ldr d2, [x9] ldr s3, [x9, #8] str d0, [x0] str s1, [x0, #8] str d2, [x0, #\stride] str s3, [x0, #\stride+8] .endif ret endfunc .endm padding_func_edged 8, 16, d padding_func_edged 4, 8, s tables filter 8, 8 filter 4, 8 find_dir 8 .macro load_px_8 d1, d2, w .if \w == 8 add x6, x2, w9, sxtb // x + off sub x9, x2, w9, sxtb // x - off ld1 {\d1\().d}[0], [x6] // p0 add x6, x6, #16 // += stride ld1 {\d2\().d}[0], [x9] // p1 add x9, x9, #16 // += stride ld1 {\d1\().d}[1], [x6] // p0 ld1 {\d2\().d}[1], [x9] // p0 .else add x6, x2, w9, sxtb // x + off sub x9, x2, w9, sxtb // x - off ld1 {\d1\().s}[0], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[0], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[1], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[1], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[2], [x6] // p0 add x6, x6, #8 // += stride ld1 {\d2\().s}[2], [x9] // p1 add x9, x9, #8 // += stride ld1 {\d1\().s}[3], [x6] // p0 ld1 {\d2\().s}[3], [x9] // p1 .endif .endm .macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min .if \min umin v3.16b, v3.16b, \s1\().16b umax v4.16b, v4.16b, \s1\().16b umin v3.16b, v3.16b, \s2\().16b umax v4.16b, v4.16b, \s2\().16b .endif uabd v16.16b, v0.16b, \s1\().16b // abs(diff) uabd v20.16b, v0.16b, \s2\().16b // abs(diff) ushl v17.16b, v16.16b, \shift // abs(diff) >> shift ushl v21.16b, v20.16b, \shift // abs(diff) >> shift uqsub v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift)) uqsub v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift)) cmhi v18.16b, v0.16b, \s1\().16b // px > p0 cmhi v22.16b, v0.16b, \s2\().16b // px > p1 umin v17.16b, v17.16b, v16.16b // imin(abs(diff), clip) umin v21.16b, v21.16b, v20.16b // imin(abs(diff), clip) dup v19.16b, \tap // taps[k] neg v16.16b, v17.16b // -imin() neg v20.16b, v21.16b // -imin() bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign() bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign() mla v1.16b, v18.16b, v19.16b // sum += taps[k] * constrain() mla v2.16b, v22.16b, v19.16b // sum += taps[k] * constrain() .endm // void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint8_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h); 
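// The handle_pixel_8 macro above encodes the CDEF constrain() step described
// in its comments; as a scalar sketch (Rust, illustrative only, the function
// name is assumed):
//
//   fn constrain(diff: i32, threshold: i32, shift: u32) -> i32 {
//       let clip = (threshold - (diff.abs() >> shift)).max(0); // imax(0, threshold - (abs(diff) >> shift))
//       let v = diff.abs().min(clip);                          // imin(abs(diff), clip)
//       if diff < 0 { -v } else { v }                          // apply_sign()
//   }
//
// Each primary/secondary tap then adds taps[k] * constrain(p - px, strength,
// shift) into the running sum, which is what the mla instructions implement.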
.macro filter_func_8 w, pri, sec, min, suffix function cdef_filter\w\suffix\()_edged_8bpc_neon .if \pri movrel x8, pri_taps and w9, w3, #1 add x8, x8, w9, uxtw #1 .endif movrel x9, directions\w add x5, x9, w5, uxtw #1 movi v30.8b, #7 dup v28.8b, w6 // damping .if \pri dup v25.16b, w3 // threshold .endif .if \sec dup v27.16b, w4 // threshold .endif trn1 v24.8b, v25.8b, v27.8b clz v24.8b, v24.8b // clz(threshold) sub v24.8b, v30.8b, v24.8b // ulog2(threshold) uqsub v24.8b, v28.8b, v24.8b // shift = imax(0, damping - ulog2(threshold)) neg v24.8b, v24.8b // -shift .if \sec dup v26.16b, v24.b[1] .endif .if \pri dup v24.16b, v24.b[0] .endif 1: .if \w == 8 add x12, x2, #16 ld1 {v0.d}[0], [x2] // px ld1 {v0.d}[1], [x12] // px .else add x12, x2, #1*8 add x13, x2, #2*8 add x14, x2, #3*8 ld1 {v0.s}[0], [x2] // px ld1 {v0.s}[1], [x12] // px ld1 {v0.s}[2], [x13] // px ld1 {v0.s}[3], [x14] // px .endif // We need 9-bits or two 8-bit accululators to fit the sum. // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228. // Start sum at -1 instead of 0 to help handle rounding later. movi v1.16b, #255 // sum movi v2.16b, #0 // sum .if \min mov v3.16b, v0.16b // min mov v4.16b, v0.16b // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov w11, #2 // sec_taps[0] 2: .if \pri ldrb w9, [x5] // off1 load_px_8 v5, v6, \w .endif .if \sec add x5, x5, #4 // +2*2 ldrb w9, [x5] // off2 load_px_8 v28, v29, \w .endif .if \pri ldrb w10, [x8] // *pri_taps handle_pixel_8 v5, v6, v25.16b, v24.16b, w10, \min .endif .if \sec add x5, x5, #8 // +2*4 ldrb w9, [x5] // off3 load_px_8 v5, v6, \w handle_pixel_8 v28, v29, v27.16b, v26.16b, w11, \min handle_pixel_8 v5, v6, v27.16b, v26.16b, w11, \min sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; .else add x5, x5, #1 // x5 += 1 .endif subs w11, w11, #1 // sec_tap-- (value) .if \pri add x8, x8, #1 // pri_taps++ (pointer) .endif b.ne 2b // Perform halving adds since the value won't fit otherwise. // To handle the offset for negative values, use both halving w/ and w/o rounding. srhadd v5.16b, v1.16b, v2.16b // sum >> 1 shadd v6.16b, v1.16b, v2.16b // (sum - 1) >> 1 cmlt v1.16b, v5.16b, #0 // sum < 0 bsl v1.16b, v6.16b, v5.16b // (sum - (sum < 0)) >> 1 srshr v1.16b, v1.16b, #3 // (8 + sum - (sum < 0)) >> 4 usqadd v0.16b, v1.16b // px + (8 + sum ...) >> 4 .if \min umin v0.16b, v0.16b, v4.16b umax v0.16b, v0.16b, v3.16b // iclip(px + .., min, max) .endif .if \w == 8 st1 {v0.d}[0], [x0], x1 add x2, x2, #2*16 // tmp += 2*tmp_stride subs w7, w7, #2 // h -= 2 st1 {v0.d}[1], [x0], x1 .else st1 {v0.s}[0], [x0], x1 add x2, x2, #4*8 // tmp += 4*tmp_stride st1 {v0.s}[1], [x0], x1 subs w7, w7, #4 // h -= 4 st1 {v0.s}[2], [x0], x1 st1 {v0.s}[3], [x0], x1 .endif // Reset pri_taps and directions back to the original point sub x5, x5, #2 .if \pri sub x8, x8, #2 .endif b.gt 1b ret endfunc .endm .macro filter_8 w filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec .endm filter_8 8 filter_8 4 rav1e-0.7.1/src/arm/64/cdef16.S000064400000000000000000000170211046102023000136430ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #include "cdef_tmpl.S" .macro pad_top_bot_16 s1, s2, w, stride, reg, ret tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT sub \s1, \s1, #4 sub \s2, \s2, #4 tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr d1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr d3, [\s2, #2*\w] str \reg\()0, [x0] str d1, [x0, #2*\w] add x0, x0, #2*\stride str \reg\()2, [x0] str d3, [x0, #2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr s1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr s3, [\s2, #2*\w] str \reg\()0, [x0] str s1, [x0, #2*\w] str s31, [x0, #2*\w+4] add x0, x0, #2*\stride str \reg\()2, [x0] str s3, [x0, #2*\w] str s31, [x0, #2*\w+4] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 2: // !CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr s1, [\s1, #2*\w] ldr \reg\()2, [\s2] ldr s3, [\s2, #2*\w] str s31, [x0] stur \reg\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \reg\()2, [x0, #4] str s3, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride b 3f .endif 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ldr \reg\()0, [\s1] ldr \reg\()1, [\s2] str s31, [x0] stur \reg\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride str s31, [x0] stur \reg\()1, [x0, #4] str s31, [x0, #4+2*\w] .if \ret ret .else add x0, x0, #2*\stride .endif 3: .endm .macro load_n_incr_16 dst, src, incr, w .if \w == 4 ld1 {\dst\().4h}, [\src], \incr .else ld1 {\dst\().8h}, [\src], \incr .endif .endm // void dav1d_cdef_paddingX_16bpc_neon(uint16_t *tmp, const pixel *src, // ptrdiff_t src_stride, const pixel (*left)[2], // const pixel *const top, // const pixel *const bottom, int h, // enum CdefEdgeFlags edges); .macro padding_func_16 w, stride, reg function cdef_padding\w\()_16bpc_neon, export=1 movi v30.8h, #0x80, lsl #8 mov v31.16b, v30.16b sub x0, x0, #2*(2*\stride+2) tst w7, #4 // CDEF_HAVE_TOP b.ne 1f // !CDEF_HAVE_TOP st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif b 3f 1: // CDEF_HAVE_TOP add x9, x4, x2 pad_top_bot_16 x4, x9, \w, \stride, \reg, 0 // Middle section 3: tst w7, #1 // CDEF_HAVE_LEFT b.eq 2f // CDEF_HAVE_LEFT tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ld1 {v0.s}[0], [x3], #4 ldr s2, [x1, #2*\w] 
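// Missing border cells are filled with 0x8000 (v30/v31 above): that value
// loses against both the unsigned min and the signed max used for the
// clipping range, and its large absolute difference from any real pixel
// drives the saturating subtract in constrain() to clip = 0, so padded taps
// end up contributing nothing. This is a reading of the code, not a comment
// carried over from the original source.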
load_n_incr_16 v1, x1, x2, \w subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s2, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT ld1 {v0.s}[0], [x3], #4 load_n_incr_16 v1, x1, x2, \w subs w6, w6, #1 str s0, [x0] stur \reg\()1, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b b 3f 2: tst w7, #2 // CDEF_HAVE_RIGHT b.eq 1f // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT 0: ldr s1, [x1, #2*\w] load_n_incr_16 v0, x1, x2, \w subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s1, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 0b b 3f 1: // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT load_n_incr_16 v0, x1, x2, \w subs w6, w6, #1 str s31, [x0] stur \reg\()0, [x0, #4] str s31, [x0, #4+2*\w] add x0, x0, #2*\stride b.gt 1b 3: tst w7, #8 // CDEF_HAVE_BOTTOM b.ne 1f // !CDEF_HAVE_BOTTOM st1 {v30.8h, v31.8h}, [x0], #32 .if \w == 8 st1 {v30.8h, v31.8h}, [x0], #32 .endif ret 1: // CDEF_HAVE_BOTTOM add x9, x5, x2 pad_top_bot_16 x5, x9, \w, \stride, \reg, 1 endfunc .endm padding_func_16 8, 16, q padding_func_16 4, 8, d tables filter 8, 16 filter 4, 16 find_dir 16 rav1e-0.7.1/src/arm/64/cdef_dist.S000064400000000000000000000141351046102023000145220ustar 00000000000000/* Copyright (c) 2023, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "src/arm/asm.S" #include "util.S" // v0: tmp register // v1: src input // v2: dst input // v3 = sum(src_{i,j}) // v4 = sum(src_{i,j}^2) // v5 = sum(dst_{i,j}) // v6 = sum(dst_{i,j}^2) // v7 = sum(src_{i,j} * dst_{i,j}) // v16: zero register .macro CDEF_DIST_W8 uabal v3.8h, v1.8b, v16.8b // sum pixel values umull v0.8h, v1.8b, v1.8b // square uabal v4.4s, v0.4h, v16.4h // accumulate uabal2 v4.4s, v0.8h, v16.8h uabal v5.8h, v2.8b, v16.8b // same as above, but for dst umull v0.8h, v2.8b, v2.8b uabal v6.4s, v0.4h, v16.4h uabal2 v6.4s, v0.8h, v16.8h umull v0.8h, v1.8b, v2.8b // src_{i,j} * dst_{i,j} uabal v7.4s, v0.4h, v16.4h uabal2 v7.4s, v0.8h, v16.8h .endm .macro CDEF_DIST_REFINE shift=0 addv h3, v3.8h umull v3.4s, v3.4h, v3.4h urshr v3.4s, v3.4s, #(6-\shift) // s3: sum(src_{i,j})^2 / N addv s4, v4.4s // s4: sum(src_{i,j}^2) addv h5, v5.8h umull v5.4s, v5.4h, v5.4h urshr v5.4s, v5.4s, #(6-\shift) // s5: sum(dst_{i,j})^2 / N addv s6, v6.4s // s6: sum(dst_{i,j}^2) addv s7, v7.4s add v0.4s, v4.4s, v6.4s sub v0.4s, v0.4s, v7.4s sub v0.4s, v0.4s, v7.4s // s0: sse uqsub v4.4s, v4.4s, v3.4s // s4: svar uqsub v6.4s, v6.4s, v5.4s // s6: dvar .if \shift != 0 shl v4.4s, v4.4s, #\shift shl v6.4s, v6.4s, #\shift .endif str s4, [x4] str s6, [x4, #4] str s0, [x4, #8] .endm .macro LOAD_ROW ldr d1, [x0] ldr d2, [x2] add x0, x0, x1 add x2, x2, x3 .endm .macro LOAD_ROWS ldr s1, [x0] ldr s2, [x2] ldr s0, [x0, x1] ldr s17, [x2, x3] add x0, x0, x1, lsl 1 add x2, x2, x3, lsl 1 zip1 v1.2s, v1.2s, v0.2s zip1 v2.2s, v2.2s, v17.2s .endm .macro CDEF_DIST_INIT width, height .irp i, v3.8h, v4.8h, v5.8h, v6.8h, v7.8h, v16.8h movi \i, #0 .endr .if \width == 4 mov w5, #(\height / 2) .else mov w5, #\height .endif .endm // x0: src: *const u8, // x1: src_stride: isize, // x2: dst: *const u8, // x3: dst_stride: isize, // x4: 
ret_ptr: *mut u32, function cdef_dist_kernel_4x4_neon, export=1 CDEF_DIST_INIT 4, 4 L(cdk_4x4): LOAD_ROWS CDEF_DIST_W8 subs w5, w5, #1 bne L(cdk_4x4) CDEF_DIST_REFINE 2 ret endfunc function cdef_dist_kernel_4x8_neon, export=1 CDEF_DIST_INIT 4, 8 L(cdk_4x8): LOAD_ROWS CDEF_DIST_W8 subs w5, w5, #1 bne L(cdk_4x8) CDEF_DIST_REFINE 1 ret endfunc function cdef_dist_kernel_8x4_neon, export=1 CDEF_DIST_INIT 8, 4 L(cdk_8x4): LOAD_ROW CDEF_DIST_W8 subs w5, w5, #1 bne L(cdk_8x4) CDEF_DIST_REFINE 1 ret endfunc function cdef_dist_kernel_8x8_neon, export=1 CDEF_DIST_INIT 8, 8 L(cdk_8x8): LOAD_ROW CDEF_DIST_W8 subs w5, w5, #1 bne L(cdk_8x8) CDEF_DIST_REFINE ret endfunc // v0: tmp register // v1: src input // v2: dst input // v3 = sum(src_{i,j}) // v4 = sum(src_{i,j}^2) // v5 = sum(dst_{i,j}) // v6 = sum(dst_{i,j}^2) // v7 = sum(src_{i,j} * dst_{i,j}) // v16: zero register .macro CDEF_DIST_HBD_W8 uabal v3.4s, v1.4h, v16.4h // sum pixel values uabal2 v3.4s, v1.8h, v16.8h umlal v4.4s, v1.4h, v1.4h // square and accumulate umlal2 v4.4s, v1.8h, v1.8h uabal v5.4s, v2.4h, v16.4h // same as above, but for dst uabal2 v5.4s, v2.8h, v16.8h umlal v6.4s, v2.4h, v2.4h umlal2 v6.4s, v2.8h, v2.8h umlal v7.4s, v1.4h, v2.4h // src_{i,j} * dst_{i,j} umlal2 v7.4s, v1.8h, v2.8h .endm .macro CDEF_DIST_HBD_REFINE shift=0 addv s3, v3.4s umull v3.2d, v3.2s, v3.2s urshr d3, d3, #(6-\shift) // d3: sum(src_{i,j})^2 / N uaddlv d4, v4.4s // d4: sum(src_{i,j}^2) addv s5, v5.4s umull v5.2d, v5.2s, v5.2s urshr d5, d5, #(6-\shift) // d5: sum(dst_{i,j})^2 / N uaddlv d6, v6.4s // d6: sum(dst_{i,j}^2) uaddlv d7, v7.4s add d0, d4, d6 sub d0, d0, d7 sub d0, d0, d7 // d0: sse uqsub d4, d4, d3 // d4: svar uqsub d6, d6, d5 // d6: dvar .if \shift != 0 shl d4, d4, #\shift shl d6, d6, #\shift .endif str s4, [x4] str s6, [x4, #4] str s0, [x4, #8] .endm .macro LOAD_ROW_HBD ldr q1, [x0] ldr q2, [x2] add x0, x0, x1 add x2, x2, x3 .endm .macro LOAD_ROWS_HBD ldr d1, [x0] ldr d2, [x2] ldr d0, [x0, x1] ldr d17, [x2, x3] add x0, x0, x1, lsl 1 add x2, x2, x3, lsl 1 mov v1.d[1], v0.d[0] mov v2.d[1], v17.d[0] .endm // x0: src: *const u16, // x1: src_stride: isize, // x2: dst: *const u16, // x3: dst_stride: isize, // x4: ret_ptr: *mut u32, function cdef_dist_kernel_4x4_hbd_neon, export=1 CDEF_DIST_INIT 4, 4 L(cdk_hbd_4x4): LOAD_ROWS_HBD CDEF_DIST_HBD_W8 subs w5, w5, #1 bne L(cdk_hbd_4x4) CDEF_DIST_HBD_REFINE 2 ret endfunc function cdef_dist_kernel_4x8_hbd_neon, export=1 CDEF_DIST_INIT 4, 8 L(cdk_hbd_4x8): LOAD_ROWS_HBD CDEF_DIST_HBD_W8 subs w5, w5, #1 bne L(cdk_hbd_4x8) CDEF_DIST_HBD_REFINE 1 ret endfunc function cdef_dist_kernel_8x4_hbd_neon, export=1 CDEF_DIST_INIT 8, 4 L(cdk_hbd_8x4): LOAD_ROW_HBD CDEF_DIST_HBD_W8 subs w5, w5, #1 bne L(cdk_hbd_8x4) CDEF_DIST_HBD_REFINE 1 ret endfunc function cdef_dist_kernel_8x8_hbd_neon, export=1 CDEF_DIST_INIT 8, 8 L(cdk_hbd_8x8): LOAD_ROW_HBD CDEF_DIST_HBD_W8 subs w5, w5, #1 bne L(cdk_hbd_8x8) CDEF_DIST_HBD_REFINE ret endfunc rav1e-0.7.1/src/arm/64/cdef_tmpl.S000064400000000000000000000502601046102023000145320ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" .macro dir_table w, stride const directions\w .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 .byte 1 * \stride + 0, 2 * \stride + 0 .byte 1 * \stride + 0, 2 * \stride - 1 // Repeated, to avoid & 7 .byte -1 * \stride + 1, -2 * \stride + 2 .byte 0 * \stride + 1, -1 * \stride + 2 .byte 0 * \stride + 1, 0 * \stride + 2 .byte 0 * \stride + 1, 1 * \stride + 2 .byte 1 * \stride + 1, 2 * \stride + 2 .byte 1 * \stride + 0, 2 * \stride + 1 endconst .endm .macro tables dir_table 8, 16 dir_table 4, 8 const pri_taps .byte 4, 2, 3, 3 endconst .endm .macro load_px d1, d2, w .if \w == 8 add x6, x2, w9, sxtb #1 // x + off sub x9, x2, w9, sxtb #1 // x - off ld1 {\d1\().8h}, [x6] // p0 ld1 {\d2\().8h}, [x9] // p1 .else add x6, x2, w9, sxtb #1 // x + off sub x9, x2, w9, sxtb #1 // x - off ld1 {\d1\().4h}, [x6] // p0 add x6, x6, #2*8 // += stride ld1 {\d2\().4h}, [x9] // p1 add x9, x9, #2*8 // += stride ld1 {\d1\().d}[1], [x6] // p0 ld1 {\d2\().d}[1], [x9] // p1 .endif .endm .macro handle_pixel s1, s2, thresh_vec, shift, tap, min .if \min umin v2.8h, v2.8h, \s1\().8h smax v3.8h, v3.8h, \s1\().8h umin v2.8h, v2.8h, \s2\().8h smax v3.8h, v3.8h, \s2\().8h .endif uabd v16.8h, v0.8h, \s1\().8h // abs(diff) uabd v20.8h, v0.8h, \s2\().8h // abs(diff) ushl v17.8h, v16.8h, \shift // abs(diff) >> shift ushl v21.8h, v20.8h, \shift // abs(diff) >> shift uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift)) uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift)) sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px neg v16.8h, v17.8h // -clip neg v20.8h, v21.8h // -clip smin v18.8h, v18.8h, v17.8h // imin(diff, clip) smin v22.8h, v22.8h, v21.8h // imin(diff, clip) dup v19.8h, \tap // taps[k] smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip) smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip) mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain() mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain() .endm // void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride, // const uint16_t *tmp, int pri_strength, // int sec_strength, int dir, int damping, // int h, size_t edges); .macro filter_func w, bpc, pri, sec, min, suffix function 
cdef_filter\w\suffix\()_\bpc\()bpc_neon .if \bpc == 8 ldr w8, [sp] // edges cmp w8, #0xf b.eq cdef_filter\w\suffix\()_edged_8bpc_neon .endif .if \pri .if \bpc == 16 ldr w9, [sp, #8] // bitdepth_max clz w9, w9 sub w9, w9, #24 // -bitdepth_min_8 neg w9, w9 // bitdepth_min_8 .endif movrel x8, pri_taps .if \bpc == 16 lsr w9, w3, w9 // pri_strength >> bitdepth_min_8 and w9, w9, #1 // (pri_strength >> bitdepth_min_8) & 1 .else and w9, w3, #1 .endif add x8, x8, w9, uxtw #1 .endif movrel x9, directions\w add x5, x9, w5, uxtw #1 movi v30.4h, #15 dup v28.4h, w6 // damping .if \pri dup v25.8h, w3 // threshold .endif .if \sec dup v27.8h, w4 // threshold .endif trn1 v24.4h, v25.4h, v27.4h clz v24.4h, v24.4h // clz(threshold) sub v24.4h, v30.4h, v24.4h // ulog2(threshold) uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold)) neg v24.4h, v24.4h // -shift .if \sec dup v26.8h, v24.h[1] .endif .if \pri dup v24.8h, v24.h[0] .endif 1: .if \w == 8 ld1 {v0.8h}, [x2] // px .else add x12, x2, #2*8 ld1 {v0.4h}, [x2] // px ld1 {v0.d}[1], [x12] // px .endif movi v1.8h, #0 // sum .if \min mov v2.16b, v0.16b // min mov v3.16b, v0.16b // max .endif // Instead of loading sec_taps 2, 1 from memory, just set it // to 2 initially and decrease for the second round. // This is also used as loop counter. mov w11, #2 // sec_taps[0] 2: .if \pri ldrb w9, [x5] // off1 load_px v4, v5, \w .endif .if \sec add x5, x5, #4 // +2*2 ldrb w9, [x5] // off2 load_px v6, v7, \w .endif .if \pri ldrb w10, [x8] // *pri_taps handle_pixel v4, v5, v25.8h, v24.8h, w10, \min .endif .if \sec add x5, x5, #8 // +2*4 ldrb w9, [x5] // off3 load_px v4, v5, \w handle_pixel v6, v7, v27.8h, v26.8h, w11, \min handle_pixel v4, v5, v27.8h, v26.8h, w11, \min sub x5, x5, #11 // x5 -= 2*(2+4); x5 += 1; .else add x5, x5, #1 // x5 += 1 .endif subs w11, w11, #1 // sec_tap-- (value) .if \pri add x8, x8, #1 // pri_taps++ (pointer) .endif b.ne 2b cmlt v4.8h, v1.8h, #0 // -(sum < 0) add v1.8h, v1.8h, v4.8h // sum - (sum < 0) srshr v1.8h, v1.8h, #4 // (8 + sum - (sum < 0)) >> 4 add v0.8h, v0.8h, v1.8h // px + (8 + sum ...) 
>> 4 .if \min smin v0.8h, v0.8h, v3.8h smax v0.8h, v0.8h, v2.8h // iclip(px + .., min, max) .endif .if \bpc == 8 xtn v0.8b, v0.8h .endif .if \w == 8 add x2, x2, #2*16 // tmp += tmp_stride subs w7, w7, #1 // h-- .if \bpc == 8 st1 {v0.8b}, [x0], x1 .else st1 {v0.8h}, [x0], x1 .endif .else .if \bpc == 8 st1 {v0.s}[0], [x0], x1 .else st1 {v0.d}[0], [x0], x1 .endif add x2, x2, #2*16 // tmp += 2*tmp_stride subs w7, w7, #2 // h -= 2 .if \bpc == 8 st1 {v0.s}[1], [x0], x1 .else st1 {v0.d}[1], [x0], x1 .endif .endif // Reset pri_taps and directions back to the original point sub x5, x5, #2 .if \pri sub x8, x8, #2 .endif b.gt 1b ret endfunc .endm .macro filter w, bpc filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec function cdef_filter\w\()_\bpc\()bpc_neon, export=1 cbnz w3, 1f // pri_strength b cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec 1: cbnz w4, 1f // sec_strength b cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri 1: b cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec endfunc .endm const div_table .short 840, 420, 280, 210, 168, 140, 120, 105 endconst const alt_fact .short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0 endconst .macro cost_alt d1, d2, s1, s2, s3, s4 smull v22.4s, \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n] smull2 v23.4s, \s1\().8h, \s1\().8h smull v24.4s, \s2\().4h, \s2\().4h smull v25.4s, \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n] smull2 v26.4s, \s3\().8h, \s3\().8h smull v27.4s, \s4\().4h, \s4\().4h mul v22.4s, v22.4s, v29.4s // sum_alt[n]^2*fact mla v22.4s, v23.4s, v30.4s mla v22.4s, v24.4s, v31.4s mul v25.4s, v25.4s, v29.4s // sum_alt[n]^2*fact mla v25.4s, v26.4s, v30.4s mla v25.4s, v27.4s, v31.4s addv \d1, v22.4s // *cost_ptr addv \d2, v25.4s // *cost_ptr .endm .macro find_best s1, s2, s3 .ifnb \s2 mov w5, \s2\().s[0] .endif cmp w4, w1 // cost[n] > best_cost csel w0, w3, w0, gt // best_dir = n csel w1, w4, w1, gt // best_cost = cost[n] .ifnb \s2 add w3, w3, #1 // n++ cmp w5, w1 // cost[n] > best_cost mov w4, \s3\().s[0] csel w0, w3, w0, gt // best_dir = n csel w1, w5, w1, gt // best_cost = cost[n] add w3, w3, #1 // n++ .endif .endm // Steps for loading and preparing each row .macro dir_load_step1 s1, bpc .if \bpc == 8 ld1 {\s1\().8b}, [x0], x1 .else ld1 {\s1\().8h}, [x0], x1 .endif .endm .macro dir_load_step2 s1, bpc .if \bpc == 8 usubl \s1\().8h, \s1\().8b, v31.8b .else ushl \s1\().8h, \s1\().8h, v8.8h .endif .endm .macro dir_load_step3 s1, bpc // Nothing for \bpc == 8 .if \bpc != 8 sub \s1\().8h, \s1\().8h, v31.8h .endif .endm // int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride, // unsigned *const var) .macro find_dir bpc function cdef_find_dir_\bpc\()bpc_neon, export=1 .if \bpc == 16 str d8, [sp, #-0x10]! 
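// cdef_find_dir accumulates eight directional projections of (pixel - 128),
// scales their sums of squares into cost[0..8), and then runs the selection
// implemented by find_best and the tail of this function. A scalar sketch of
// that final step (Rust, illustrative only, the function name is assumed):
//
//   fn pick_dir(cost: &[u32; 8]) -> (usize, u32) {
//       let (mut best_dir, mut best_cost) = (0, cost[0]);
//       for n in 1..8 {
//           if cost[n] > best_cost { best_dir = n; best_cost = cost[n]; }
//       }
//       let var = (best_cost - cost[best_dir ^ 4]) >> 10; // written to *var
//       (best_dir, var)
//   }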
clz w3, w3 // clz(bitdepth_max) sub w3, w3, #24 // -bitdepth_min_8 dup v8.8h, w3 .endif sub sp, sp, #32 // cost mov w3, #8 .if \bpc == 8 movi v31.16b, #128 .else movi v31.8h, #128 .endif movi v30.16b, #0 movi v1.8h, #0 // v0-v1 sum_diag[0] movi v3.8h, #0 // v2-v3 sum_diag[1] movi v5.8h, #0 // v4-v5 sum_hv[0-1] movi v7.8h, #0 // v6-v7 sum_alt[0] dir_load_step1 v26, \bpc // Setup first row early movi v17.8h, #0 // v16-v17 sum_alt[1] movi v18.8h, #0 // v18-v19 sum_alt[2] dir_load_step2 v26, \bpc movi v19.8h, #0 dir_load_step3 v26, \bpc movi v21.8h, #0 // v20-v21 sum_alt[3] .irpc i, 01234567 addv h25, v26.8h // [y] rev64 v27.8h, v26.8h addp v28.8h, v26.8h, v30.8h // [(x >> 1)] add v5.8h, v5.8h, v26.8h // sum_hv[1] ext v27.16b, v27.16b, v27.16b, #8 // [-x] rev64 v29.4h, v28.4h // [-(x >> 1)] ins v4.h[\i], v25.h[0] // sum_hv[0] .if \i < 6 ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2))) ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2))) add v18.8h, v18.8h, v22.8h // sum_alt[2] add v19.4h, v19.4h, v23.4h // sum_alt[2] .else add v18.8h, v18.8h, v26.8h // sum_alt[2] .endif .if \i == 0 mov v20.16b, v26.16b // sum_alt[3] .elseif \i == 1 add v20.8h, v20.8h, v26.8h // sum_alt[3] .else ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2)) ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2)) add v20.8h, v20.8h, v24.8h // sum_alt[3] add v21.4h, v21.4h, v25.4h // sum_alt[3] .endif .if \i == 0 mov v0.16b, v26.16b // sum_diag[0] dir_load_step1 v26, \bpc mov v2.16b, v27.16b // sum_diag[1] dir_load_step2 v26, \bpc mov v6.16b, v28.16b // sum_alt[0] dir_load_step3 v26, \bpc mov v16.16b, v29.16b // sum_alt[1] .else ext v22.16b, v30.16b, v26.16b, #(16-2*\i) ext v23.16b, v26.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v27.16b, #(16-2*\i) ext v25.16b, v27.16b, v30.16b, #(16-2*\i) .if \i != 7 // Nothing to load for the final row dir_load_step1 v26, \bpc // Start setting up the next row early. 
.endif add v0.8h, v0.8h, v22.8h // sum_diag[0] add v1.8h, v1.8h, v23.8h // sum_diag[0] add v2.8h, v2.8h, v24.8h // sum_diag[1] add v3.8h, v3.8h, v25.8h // sum_diag[1] .if \i != 7 dir_load_step2 v26, \bpc .endif ext v22.16b, v30.16b, v28.16b, #(16-2*\i) ext v23.16b, v28.16b, v30.16b, #(16-2*\i) ext v24.16b, v30.16b, v29.16b, #(16-2*\i) ext v25.16b, v29.16b, v30.16b, #(16-2*\i) .if \i != 7 dir_load_step3 v26, \bpc .endif add v6.8h, v6.8h, v22.8h // sum_alt[0] add v7.4h, v7.4h, v23.4h // sum_alt[0] add v16.8h, v16.8h, v24.8h // sum_alt[1] add v17.4h, v17.4h, v25.4h // sum_alt[1] .endif .endr movi v31.4s, #105 smull v26.4s, v4.4h, v4.4h // sum_hv[0]*sum_hv[0] smlal2 v26.4s, v4.8h, v4.8h smull v27.4s, v5.4h, v5.4h // sum_hv[1]*sum_hv[1] smlal2 v27.4s, v5.8h, v5.8h mul v26.4s, v26.4s, v31.4s // cost[2] *= 105 mul v27.4s, v27.4s, v31.4s // cost[6] *= 105 addv s4, v26.4s // cost[2] addv s5, v27.4s // cost[6] rev64 v1.8h, v1.8h rev64 v3.8h, v3.8h ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n] ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n] str s4, [sp, #2*4] // cost[2] str s5, [sp, #6*4] // cost[6] movrel x4, div_table ld1 {v31.8h}, [x4] smull v22.4s, v0.4h, v0.4h // sum_diag[0]*sum_diag[0] smull2 v23.4s, v0.8h, v0.8h smlal v22.4s, v1.4h, v1.4h smlal2 v23.4s, v1.8h, v1.8h smull v24.4s, v2.4h, v2.4h // sum_diag[1]*sum_diag[1] smull2 v25.4s, v2.8h, v2.8h smlal v24.4s, v3.4h, v3.4h smlal2 v25.4s, v3.8h, v3.8h uxtl v30.4s, v31.4h // div_table uxtl2 v31.4s, v31.8h mul v22.4s, v22.4s, v30.4s // cost[0] mla v22.4s, v23.4s, v31.4s // cost[0] mul v24.4s, v24.4s, v30.4s // cost[4] mla v24.4s, v25.4s, v31.4s // cost[4] addv s0, v22.4s // cost[0] addv s2, v24.4s // cost[4] movrel x5, alt_fact ld1 {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105 str s0, [sp, #0*4] // cost[0] str s2, [sp, #4*4] // cost[4] uxtl v29.4s, v29.4h // div_table[2*m+1] + 105 uxtl v30.4s, v30.4h uxtl v31.4s, v31.4h cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3] cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7] str s6, [sp, #1*4] // cost[1] str s16, [sp, #3*4] // cost[3] mov w0, #0 // best_dir mov w1, v0.s[0] // best_cost mov w3, #1 // n str s18, [sp, #5*4] // cost[5] str s20, [sp, #7*4] // cost[7] mov w4, v6.s[0] find_best v6, v4, v16 find_best v16, v2, v18 find_best v18, v5, v20 find_best v20 eor w3, w0, #4 // best_dir ^4 ldr w4, [sp, w3, uxtw #2] sub w1, w1, w4 // best_cost - cost[best_dir ^ 4] lsr w1, w1, #10 str w1, [x2] // *var add sp, sp, #32 .if \bpc == 16 ldr d8, [sp], 0x10 .endif ret endfunc .endm rav1e-0.7.1/src/arm/64/ipred.S000064400000000000000000006375651046102023000137230ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_128_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] movi v0.16b, #128 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET movi v1.16b, #128 movi v2.16b, #128 movi v3.16b, #128 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_dc_128_tbl): .hword L(ipred_dc_128_tbl) - 640b .hword L(ipred_dc_128_tbl) - 320b .hword L(ipred_dc_128_tbl) - 16b .hword L(ipred_dc_128_tbl) - 8b .hword L(ipred_dc_128_tbl) - 4b endfunc // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_v_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #1 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2] 4: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] 8: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] 16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_v_tbl): .hword 
L(ipred_v_tbl) - 640b .hword L(ipred_v_tbl) - 320b .hword L(ipred_v_tbl) - 160b .hword L(ipred_v_tbl) - 80b .hword L(ipred_v_tbl) - 40b endfunc // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_h_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] sub x2, x2, #4 sub x5, x5, w3, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.s}[0], [x0], x1 st1 {v2.s}[0], [x6], x1 subs w4, w4, #4 st1 {v1.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 st1 {v3.8b}, [x0], x1 st1 {v2.8b}, [x6], x1 subs w4, w4, #4 st1 {v1.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] st1 {v3.16b}, [x0], x1 st1 {v2.16b}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] st1 {v1.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 64b ret L(ipred_h_tbl): .hword L(ipred_h_tbl) - 64b .hword L(ipred_h_tbl) - 32b .hword L(ipred_h_tbl) - 16b .hword L(ipred_h_tbl) - 8b .hword L(ipred_h_tbl) - 4b endfunc // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_top_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #1 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.8b, v0.b[0] 4: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.8b, v0.b[0] 8: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] 16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b add v2.4h, v0.4h, v1.4h rshrn v2.8b, v2.8h, #5 dup v0.16b, v2.b[0] dup v1.16b, v2.b[0] 32: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v4.4h, v0.4h, v1.4h add v5.4h, v2.4h, v3.4h add v4.4h, v4.4h, v5.4h 
rshrn v4.8b, v4.8h, #6 dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] dup v2.16b, v4.b[0] dup v3.16b, v4.b[0] 64: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 64b ret L(ipred_dc_top_tbl): .hword L(ipred_dc_top_tbl) - 640b .hword L(ipred_dc_top_tbl) - 320b .hword L(ipred_dc_top_tbl) - 160b .hword L(ipred_dc_top_tbl) - 80b .hword L(ipred_dc_top_tbl) - 40b endfunc // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w3, w3 clz w7, w4 adr x5, L(ipred_dc_left_tbl) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 ldrh w3, [x5, w3, uxtw #1] ldrh w7, [x5, w7, uxtw #1] sub x3, x5, w3, uxtw sub x5, x5, w7, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_left_h4): AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt L(ipred_dc_left_w4) ret L(ipred_dc_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b rshrn v0.8b, v0.8h, #3 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt L(ipred_dc_left_w8) ret L(ipred_dc_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt L(ipred_dc_left_w16) ret L(ipred_dc_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b add v0.4h, v0.4h, v1.4h rshrn v0.8b, v0.8h, #5 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w32): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h add v0.4h, v0.4h, v2.4h rshrn v0.8b, v0.8h, #6 dup v0.16b, v0.b[0] br x3 L(ipred_dc_left_w64): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 1: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 1b ret L(ipred_dc_left_tbl): .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc // void 
ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w7, w3, w4 // width + height clz w3, w3 clz w6, w4 dup v16.8h, w7 // width + height adr x5, L(ipred_dc_tbl) rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) ldrh w3, [x5, w3, uxtw #1] ldrh w6, [x5, w6, uxtw #1] neg w7, w7 // -ctz(width + height) sub x3, x5, w3, uxtw sub x5, x5, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w7 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w4): AARCH64_VALID_JUMP_TARGET ld1 {v1.s}[0], [x2] ins v1.s[1], wzr add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b cmp w4, #4 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x3334/2) movk w16, #(0x5556/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8b, v0.b[0] 2: st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 subs w4, w4, #4 st1 {v0.s}[0], [x0], x1 st1 {v0.s}[0], [x6], x1 b.gt 2b ret L(ipred_dc_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x3 L(ipred_dc_w8): AARCH64_VALID_JUMP_TARGET ld1 {v1.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.8b cmp w4, #8 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8b, v0.b[0] 2: st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 subs w4, w4, #4 st1 {v0.8b}, [x0], x1 st1 {v0.8b}, [x6], x1 b.gt 2b ret L(ipred_dc_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x3 L(ipred_dc_w16): AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b cmp w4, #16 add v0.4h, v0.4h, v1.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/8/32/64 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.16b, v0.b[0] 2: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x2], #32 uaddlv h0, v0.16b uaddlv h1, v1.16b add x2, x2, #1 add v0.4h, v0.4h, v1.4h br x3 L(ipred_dc_w32): AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b uaddlv h2, v2.16b cmp w4, #32 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v2.4h ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 8/16/64 cmp w4, #8 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v4.4h, v4.4h, v16.4h 1: dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] 2: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 uaddlv h0, v0.16b uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b add v0.4h, v0.4h, v1.4h add v2.4h, v2.4h, v3.4h add x2, x2, #1 add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): 
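        // Note on the magic constants in the w4-w64 cases: for rectangular
        // blocks the DC sum is divided by width+height, which is either a
        // power of two or 3*2^k / 5*2^k. The ushl by -ctz(width+height)
        // handles the power-of-two part, and the sqdmulh with roughly
        // 0x5556/2 (~1/3) or 0x3334/2 (~1/5) appears to take care of the
        // remaining factor of 3 or 5 as a fixed-point multiply.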
AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h1, v1.16b uaddlv h2, v2.16b uaddlv h3, v3.16b uaddlv h4, v4.16b add v1.4h, v1.4h, v2.4h add v3.4h, v3.4h, v4.4h cmp w4, #64 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v3.4h ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 16/32 mov w16, #(0x5556/2) movk w16, #(0x3334/2), lsl #16 lsr w16, w16, w4 dup v16.4h, w16 sqdmulh v4.4h, v4.4h, v16.4h 1: dup v0.16b, v4.b[0] dup v1.16b, v4.b[0] dup v2.16b, v4.b[0] dup v3.16b, v4.b[0] 2: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 subs w4, w4, #4 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 b.gt 2b ret L(ipred_dc_tbl): .hword L(ipred_dc_tbl) - L(ipred_dc_h64) .hword L(ipred_dc_tbl) - L(ipred_dc_h32) .hword L(ipred_dc_tbl) - L(ipred_dc_h16) .hword L(ipred_dc_tbl) - L(ipred_dc_h8) .hword L(ipred_dc_tbl) - L(ipred_dc_h4) .hword L(ipred_dc_tbl) - L(ipred_dc_w64) .hword L(ipred_dc_tbl) - L(ipred_dc_w32) .hword L(ipred_dc_tbl) - L(ipred_dc_w16) .hword L(ipred_dc_tbl) - L(ipred_dc_w8) .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x2] add x8, x2, #1 sub x2, x2, #4 sub x5, x5, w9, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v5.4s}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 zip1 v0.2s, v0.2s, v1.2s zip1 v2.2s, v2.2s, v3.2s uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v2.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h zip1 v0.2d, v0.2d, v2.2d uabd v20.16b, v5.16b, v16.16b // tdiff uabd v22.16b, v4.16b, v16.16b // tldiff uabd v16.16b, v0.16b, v16.16b // ldiff umin v18.16b, v20.16b, v22.16b // min(tdiff, tldiff) cmhs v20.16b, v22.16b, v20.16b // tldiff >= tdiff cmhs v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff bsl v20.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bit v20.16b, v0.16b, v16.16b // ldiff <= min ? left : ... st1 {v20.s}[3], [x0], x1 st1 {v20.s}[2], [x6], x1 subs w4, w4, #4 st1 {v20.s}[1], [x0], x1 st1 {v20.s}[0], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] usubl v6.8h, v5.8b, v4.8b // top - topleft 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 uaddw v16.8h, v6.8h, v0.8b uaddw v17.8h, v6.8h, v1.8b uaddw v18.8h, v6.8h, v2.8b uaddw v19.8h, v6.8h, v3.8b sqxtun v16.8b, v16.8h // base sqxtun2 v16.16b, v17.8h sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h zip1 v2.2d, v2.2d, v3.2d zip1 v0.2d, v0.2d, v1.2d uabd v21.16b, v5.16b, v18.16b // tdiff uabd v20.16b, v5.16b, v16.16b uabd v23.16b, v4.16b, v18.16b // tldiff uabd v22.16b, v4.16b, v16.16b uabd v17.16b, v2.16b, v18.16b // ldiff uabd v16.16b, v0.16b, v16.16b umin v19.16b, v21.16b, v23.16b // min(tdiff, tldiff) umin v18.16b, v20.16b, v22.16b cmhs v21.16b, v23.16b, v21.16b // tldiff >= tdiff cmhs v20.16b, v22.16b, v20.16b cmhs v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff cmhs v16.16b, v18.16b, v16.16b bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.16b}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x7 2: usubl v6.8h, v5.8b, v4.8b // top - topleft usubl2 v7.8h, v5.16b, v4.16b uaddw v24.8h, v6.8h, v0.8b uaddw v25.8h, v7.8h, v0.8b uaddw v26.8h, v6.8h, v1.8b uaddw v27.8h, v7.8h, v1.8b uaddw v28.8h, v6.8h, v2.8b uaddw v29.8h, v7.8h, v2.8b uaddw v30.8h, v6.8h, v3.8b uaddw v31.8h, v7.8h, v3.8b sqxtun v17.8b, v26.8h // base sqxtun2 v17.16b, v27.8h sqxtun v16.8b, v24.8h sqxtun2 v16.16b, v25.8h sqxtun v19.8b, v30.8h sqxtun2 v19.16b, v31.8h sqxtun v18.8b, v28.8h sqxtun2 v18.16b, v29.8h uabd v23.16b, v5.16b, v19.16b // tdiff uabd v22.16b, v5.16b, v18.16b uabd v21.16b, v5.16b, v17.16b uabd v20.16b, v5.16b, v16.16b uabd v27.16b, v4.16b, v19.16b // tldiff uabd v26.16b, v4.16b, v18.16b uabd v25.16b, v4.16b, v17.16b uabd v24.16b, v4.16b, v16.16b uabd v19.16b, v3.16b, v19.16b // ldiff uabd v18.16b, v2.16b, v18.16b uabd v17.16b, v1.16b, v17.16b uabd v16.16b, v0.16b, v16.16b umin v31.16b, v23.16b, v27.16b // min(tdiff, tldiff) umin v30.16b, v22.16b, v26.16b umin v29.16b, v21.16b, v25.16b umin v28.16b, v20.16b, v24.16b cmhs v23.16b, v27.16b, v23.16b // tldiff >= tdiff cmhs v22.16b, v26.16b, v22.16b cmhs v21.16b, v25.16b, v21.16b cmhs v20.16b, v24.16b, v20.16b cmhs v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff cmhs v18.16b, v30.16b, v18.16b cmhs v17.16b, v29.16b, v17.16b cmhs v16.16b, v28.16b, v16.16b bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b subs w3, w3, #16 st1 {v23.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v21.16b}, [x5], #16 st1 {v20.16b}, [x10], #16 b.le 8f ld1 {v5.16b}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.16b}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 640b .hword L(ipred_paeth_tbl) - 320b .hword L(ipred_paeth_tbl) - 160b .hword L(ipred_paeth_tbl) - 80b .hword L(ipred_paeth_tbl) - 40b endfunc // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_8bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_tbl) sub x12, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[3] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h uhadd v20.8h, v20.8h, v22.8h uhadd v21.8h, v21.8h, v23.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #4 mov x7, #-4 dup v5.16b, v6.b[7] // right usubl v6.8h, v6.8b, v4.8b // top-bottom uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn 
v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw sub x2, x2, #2 mov x7, #-2 ld1r {v5.16b}, [x12] // right sub x1, x1, w3, uxtw mov w9, w3 1: ld2r {v0.8b, v1.8b}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v3.16b}, [x8], #16 // top shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v1.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h // (left flipped) mla v22.8h, v0.8h, v6.8h mla v23.8h, v0.8h, v7.8h shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 mla v24.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v3.8h, v16.8h mla v26.8h, v2.8h, v17.8h mla v27.8h, v3.8h, v17.8h uhadd v20.8h, v20.8h, v24.8h uhadd v21.8h, v21.8h, v25.8h uhadd v22.8h, v22.8h, v26.8h uhadd v23.8h, v23.8h, v27.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 640b .hword L(ipred_smooth_tbl) - 320b .hword L(ipred_smooth_tbl) - 160b .hword L(ipred_smooth_tbl) - 80b .hword L(ipred_smooth_tbl) - 40b endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_8bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 adr x5, L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.16b}, [x8] // bottom add x2, x2, #1 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2s}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v22.8h, v4.8b, #8 // bottom*256 shll v23.8h, v4.8b, #8 zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b mla v22.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v23.8h, v6.8h, v18.8h rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x6], x1 subs w4, w4, #4 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8b}, [x2] // top usubl v6.8h, v6.8b, v4.8b // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver shll v24.8h, v4.8b, #8 // bottom*256 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b mla v24.8h, v6.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v25.8h, v6.8h, v17.8h mla v26.8h, v6.8h, v18.8h mla v27.8h, v6.8h, v19.8h rshrn v24.8b, v24.8h, #8 rshrn v25.8b, v25.8h, #8 rshrn 
v26.8b, v26.8h, #8 rshrn v27.8b, v27.8h, #8 st1 {v24.8b}, [x0], x1 st1 {v25.8b}, [x6], x1 subs w4, w4, #4 st1 {v26.8b}, [x0], x1 st1 {v27.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b 2: ld1 {v3.16b}, [x2], #16 // top shll v20.8h, v4.8b, #8 // bottom*256 shll v21.8h, v4.8b, #8 shll v22.8h, v4.8b, #8 shll v23.8h, v4.8b, #8 shll v24.8h, v4.8b, #8 shll v25.8h, v4.8b, #8 shll v26.8h, v4.8b, #8 shll v27.8h, v4.8b, #8 usubl v2.8h, v3.8b, v4.8b // top-bottom usubl2 v3.8h, v3.16b, v4.16b mla v20.8h, v2.8h, v16.8h // bottom*256 + (top-bottom)*weights_ver mla v21.8h, v3.8h, v16.8h mla v22.8h, v2.8h, v17.8h mla v23.8h, v3.8h, v17.8h mla v24.8h, v2.8h, v18.8h mla v25.8h, v3.8h, v18.8h mla v26.8h, v2.8h, v19.8h mla v27.8h, v3.8h, v19.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x8], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 640b .hword L(ipred_smooth_v_tbl) - 320b .hword L(ipred_smooth_v_tbl) - 160b .hword L(ipred_smooth_v_tbl) - 80b .hword L(ipred_smooth_v_tbl) - 40b endfunc // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_8bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v5.16b}, [x12] // right sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 4: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 zip1 v1.2s, v1.2s, v0.2s // left, flipped zip1 v0.2s, v3.2s, v2.2s usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b mla v20.8h, v0.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v1.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x6], x1 subs w4, w4, #4 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #4 mov x7, #-4 uxtl v7.8h, v7.8b // weights_hor 8: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 usubl v3.8h, v3.8b, v5.8b // left-right usubl v2.8h, v2.8b, v5.8b usubl v1.8h, v1.8b, v5.8b usubl v0.8h, v0.8b, v5.8b mla v20.8h, v3.8h, v7.8h // right*256 + (left-right)*weights_hor mla v21.8h, v2.8h, v7.8h // (left flipped) mla v22.8h, v1.8h, v7.8h mla v23.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn v21.8b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn v23.8b, v23.8h, #8 st1 {v20.8b}, [x0], x1 st1 {v21.8b}, [x6], x1 subs 
w4, w4, #4 st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET sub x2, x2, #4 mov x7, #-4 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x2], x7 // left usubl v0.8h, v0.8b, v5.8b // left-right usubl v1.8h, v1.8b, v5.8b usubl v2.8h, v2.8b, v5.8b usubl v3.8h, v3.8b, v5.8b 2: ld1 {v7.16b}, [x8], #16 // weights_hor shll v20.8h, v5.8b, #8 // right*256 shll v21.8h, v5.8b, #8 shll v22.8h, v5.8b, #8 shll v23.8h, v5.8b, #8 shll v24.8h, v5.8b, #8 shll v25.8h, v5.8b, #8 shll v26.8h, v5.8b, #8 shll v27.8h, v5.8b, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b mla v20.8h, v3.8h, v6.8h // right*256 + (left-right)*weights_hor mla v21.8h, v3.8h, v7.8h // (left flipped) mla v22.8h, v2.8h, v6.8h mla v23.8h, v2.8h, v7.8h mla v24.8h, v1.8h, v6.8h mla v25.8h, v1.8h, v7.8h mla v26.8h, v0.8h, v6.8h mla v27.8h, v0.8h, v7.8h rshrn v20.8b, v20.8h, #8 rshrn2 v20.16b, v21.8h, #8 rshrn v22.8b, v22.8h, #8 rshrn2 v22.16b, v23.8h, #8 rshrn v24.8b, v24.8h, #8 rshrn2 v24.16b, v25.8h, #8 rshrn v26.8b, v26.8h, #8 rshrn2 v26.16b, v27.8h, #8 subs w3, w3, #16 st1 {v20.16b}, [x0], #16 st1 {v22.16b}, [x6], #16 st1 {v24.16b}, [x5], #16 st1 {v26.16b}, [x10], #16 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 640b .hword L(ipred_smooth_h_tbl) - 320b .hword L(ipred_smooth_h_tbl) - 160b .hword L(ipred_smooth_h_tbl) - 80b .hword L(ipred_smooth_h_tbl) - 40b endfunc const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 padding_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz, // const pixel *const in, const int end); function ipred_z1_upsample_edge_8bpc_neon, export=1 movrel x4, padding_mask ld1 {v0.16b}, [x2] // in[] add x5, x2, w3, uxtw // in[end] sub x4, x4, w3, uxtw ld1r {v1.16b}, [x5] // padding ld1 {v3.16b}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v1.16b, v3.16b // padded in[] ext v4.16b, v0.16b, v1.16b, #1 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #3 uaddl v16.8h, v4.8b, v5.8b // in[i+1] + in[i+2] uaddl2 v17.8h, v4.16b, v5.16b uaddl v18.8h, v0.8b, v6.8b // in[i+0] + in[i+3] uaddl2 v19.8h, v0.16b, v6.16b mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) mul v17.8h, v17.8h, v31.8h sub v16.8h, v16.8h, v18.8h sub v17.8h, v17.8h, v19.8h sqrshrun v16.8b, v16.8h, #4 sqrshrun2 v16.16b, v17.8h, #4 zip1 v0.16b, v4.16b, v16.16b zip2 v1.16b, v4.16b, v16.16b st1 {v0.16b, v1.16b}, [x0] ret endfunc // void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz, // const pixel *const in); function ipred_z2_upsample_edge_8bpc_neon, export=1 // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. 
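        // Outline, as far as it can be read from the code below: the edge
        // is padded, every gap sample is interpolated with the 4-tap
        // filter (-1, 9, 9, -1), rounded and clamped with >> 4, and the
        // result is interleaved with the original samples, i.e.
        // out[2*i] = in[i], out[2*i+1] = filtered(in[i-1..i+2]).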
movrel x4, padding_mask ld1 {v0.16b}, [x2] // in[] add x5, x2, w1, uxtw // in[sz] sub x4, x4, w1, uxtw ld1r {v2.16b}, [x2] // in[0] for padding ld1r {v1.16b}, [x5] // padding ld1 {v3.16b}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v1.16b, v3.16b // padded in[] ext v4.16b, v2.16b, v0.16b, #15 ext v5.16b, v0.16b, v1.16b, #1 ext v6.16b, v0.16b, v1.16b, #2 uaddl v16.8h, v0.8b, v5.8b // in[i+0] + in[i+1] uaddl v18.8h, v4.8b, v6.8b // in[i-1] + in[i+2] mul v16.8h, v16.8h, v31.8h // 9*(in[i+1] + in[i+2]) sub v16.8h, v16.8h, v18.8h sqrshrun v16.8b, v16.8h, #4 add x5, x0, #16 zip1 v2.16b, v0.16b, v16.16b st1 {v1.b}[0], [x5] // In case sz=8, output one single pixel in out[16]. st1 {v2.16b}, [x0] ret endfunc const edge_filter .byte 0, 4, 8, 0 .byte 0, 5, 6, 0 // Leaving out the coeffs for strength=3 // .byte 2, 4, 4, 0 endconst // void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz, // const pixel *const in, const int end, // const int strength); function ipred_z1_filter_edge_8bpc_neon, export=1 cmp w4, #3 b.eq L(fivetap) // if (strength == 3) goto fivetap movrel x5, edge_filter, -3 add x5, x5, w4, uxtw #2 // edge_filter + (strength - 1)*4 + 1 ld1 {v31.h}[0], [x5] // kernel[1-2] ld1 {v0.16b}, [x2], #16 dup v30.16b, v31.b[0] dup v31.16b, v31.b[1] 1: // in[end], is the last valid pixel. We produce 16 pixels out by // using 18 pixels in - the last pixel used is [17] of the ones // read/buffered. cmp w3, #17 ld1 {v1.16b}, [x2], #16 b.lt 2f ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 umull v4.8h, v0.8b, v30.8b umlal v4.8h, v2.8b, v31.8b umlal v4.8h, v3.8b, v30.8b umull2 v5.8h, v0.16b, v30.16b umlal2 v5.8h, v2.16b, v31.16b umlal2 v5.8h, v3.16b, v30.16b subs w1, w1, #16 mov v0.16b, v1.16b rshrn v4.8b, v4.8h, #4 rshrn2 v4.16b, v5.8h, #4 sub w3, w3, #16 st1 {v4.16b}, [x0], #16 b.gt 1b ret 2: // Right padding // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead) movrel x5, padding_mask sub w6, w3, #32 sub x5, x5, w3, uxtw add x6, x2, w6, sxtw ld1 {v2.16b}, [x5] // padding_mask ld1r {v1.16b}, [x6] bit v0.16b, v1.16b, v2.16b // Pad v0-v1 // Filter one block ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 umull v4.8h, v0.8b, v30.8b umlal v4.8h, v2.8b, v31.8b umlal v4.8h, v3.8b, v30.8b umull2 v5.8h, v0.16b, v30.16b umlal2 v5.8h, v2.16b, v31.16b umlal2 v5.8h, v3.16b, v30.16b subs w1, w1, #16 rshrn v4.8b, v4.8h, #4 rshrn2 v4.16b, v5.8h, #4 st1 {v4.16b}, [x0], #16 b.le 9f 5: // After one block, any remaining output would only be filtering // padding - thus just store the padding. subs w1, w1, #16 st1 {v1.16b}, [x0], #16 b.gt 5b 9: ret L(fivetap): sub x2, x2, #1 // topleft -= 1 movi v29.16b, #2 ld1 {v0.16b}, [x2], #16 movi v30.16b, #4 movi v31.16b, #4 ins v0.b[0], v0.b[1] 1: // in[end+1], is the last valid pixel. We produce 16 pixels out by // using 20 pixels in - the last pixel used is [19] of the ones // read/buffered. 
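        // This strength=3 path uses the symmetric five-tap kernel
        // (2, 4, 4, 4, 2) loaded into v29-v31 above, with the same
        // rounded >> 4 as the three-tap cases; this appears to correspond
        // to the strength=3 row left out of the edge_filter table.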
cmp w3, #18 ld1 {v1.16b}, [x2], #16 b.lt 2f // if (end + 1 < 19) ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v0.16b, v1.16b, #3 ext v5.16b, v0.16b, v1.16b, #4 umull v6.8h, v0.8b, v29.8b umlal v6.8h, v2.8b, v30.8b umlal v6.8h, v3.8b, v31.8b umlal v6.8h, v4.8b, v30.8b umlal v6.8h, v5.8b, v29.8b umull2 v7.8h, v0.16b, v29.16b umlal2 v7.8h, v2.16b, v30.16b umlal2 v7.8h, v3.16b, v31.16b umlal2 v7.8h, v4.16b, v30.16b umlal2 v7.8h, v5.16b, v29.16b subs w1, w1, #16 mov v0.16b, v1.16b rshrn v6.8b, v6.8h, #4 rshrn2 v6.16b, v7.8h, #4 sub w3, w3, #16 st1 {v6.16b}, [x0], #16 b.gt 1b ret 2: // Right padding // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead) movrel x5, padding_mask, -1 sub w6, w3, #31 sub x5, x5, w3, uxtw add x6, x2, w6, sxtw ld1 {v2.16b, v3.16b}, [x5] // padding_mask ld1r {v28.16b}, [x6] bit v0.16b, v28.16b, v2.16b // Pad v0-v1 bit v1.16b, v28.16b, v3.16b 4: // Filter one block ext v2.16b, v0.16b, v1.16b, #1 ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v0.16b, v1.16b, #3 ext v5.16b, v0.16b, v1.16b, #4 umull v6.8h, v0.8b, v29.8b umlal v6.8h, v2.8b, v30.8b umlal v6.8h, v3.8b, v31.8b umlal v6.8h, v4.8b, v30.8b umlal v6.8h, v5.8b, v29.8b umull2 v7.8h, v0.16b, v29.16b umlal2 v7.8h, v2.16b, v30.16b umlal2 v7.8h, v3.16b, v31.16b umlal2 v7.8h, v4.16b, v30.16b umlal2 v7.8h, v5.16b, v29.16b subs w1, w1, #16 mov v0.16b, v1.16b mov v1.16b, v28.16b rshrn v6.8b, v6.8h, #4 rshrn2 v6.16b, v7.8h, #4 sub w3, w3, #16 st1 {v6.16b}, [x0], #16 b.le 9f // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to // filter properly once more - aka (w3 >= 0). cmp w3, #0 b.ge 4b 5: // When w3 <= 0, all remaining pixels in v0-v1 are equal to the // last valid pixel - thus just output that without filtering. subs w1, w1, #16 st1 {v1.16b}, [x0], #16 b.gt 5b 9: ret endfunc // void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, // const int n); function ipred_pixel_set_8bpc_neon, export=1 dup v0.16b, w1 1: subs w2, w2, #16 st1 {v0.16b}, [x0], #16 b.gt 1b ret endfunc // void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const int width, const int height, // const int dx, const int max_base_x); function ipred_z1_fill1_8bpc_neon, export=1 clz w9, w3 adr x8, L(ipred_z1_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw // top[max_base_x] sub x8, x8, w9, uxtw ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f ldr d0, [x2, w8, uxtw] // top[base] ldr d2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 ext v1.8b, v0.8b, v0.8b, #1 // top[base+1] ext v3.8b, v2.8b, v2.8b, #1 usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] usubl v7.8h, v3.8b, v2.8b ushll v16.8h, v0.8b, #6 // top[base]*64 ushll v17.8h, v2.8b, #6 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac mla v17.4h, v7.4h, v5.4h rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.s}[0], [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.s}[0], [x0], x1 b.gt 4b ret 49: st1 {v31.s}[0], [x0], x1 subs w4, w4, #2 st1 {v31.s}[0], [x0], x1 b.gt 49b ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.8b, w9 
// frac dup v5.8b, w11 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8b, w9 // 64 - frac dup v7.8b, w11 ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] ext v3.16b, v2.16b, v2.16b, #1 umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac umull v17.8h, v2.8b, v7.8b umlal v17.8h, v3.8b, v5.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.gt 8b ret 89: st1 {v31.8b}, [x0], x1 subs w4, w4, #2 st1 {v31.8b}, [x0], x1 b.gt 89b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w3 add x13, x0, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 169f add x8, x2, w8, uxtw add x10, x2, w10, uxtw dup v4.16b, w9 // frac dup v5.16b, w11 ld1 {v0.16b, v1.16b}, [x8], #32 // top[base] ld1 {v2.16b, v3.16b}, [x10], #32 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.16b, w9 // 64 - frac dup v7.16b, w11 add w7, w7, w5 // xpos += dx 2: ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w3, w3, #16 umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac umull2 v19.8h, v0.16b, v6.16b umlal2 v19.8h, v16.16b, v4.16b umull v20.8h, v2.8b, v7.8b umlal v20.8h, v17.8b, v5.8b umull2 v21.8h, v2.16b, v7.16b umlal2 v21.8h, v17.16b, v5.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 rshrn2 v17.16b, v21.8h, #6 st1 {v16.16b}, [x0], #16 st1 {v17.16b}, [x13], #16 b.le 3f mov v0.16b, v1.16b ld1 {v1.16b}, [x8], #16 // top[base] mov v2.16b, v3.16b ld1 {v3.16b}, [x10], #16 b 2b 3: subs w4, w4, #2 b.le 9f add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 1b 9: ret 169: st1 {v31.16b}, [x0], #16 subs w3, w3, #16 st1 {v31.16b}, [x13], #16 b.gt 169b subs w4, w4, #2 b.le 9b add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 169b L(ipred_z1_fill1_tbl): .hword L(ipred_z1_fill1_tbl) - 640b .hword L(ipred_z1_fill1_tbl) - 320b .hword L(ipred_z1_fill1_tbl) - 160b .hword L(ipred_z1_fill1_tbl) - 80b .hword L(ipred_z1_fill1_tbl) - 40b endfunc function ipred_z1_fill2_8bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 b.eq 8f 4: // w == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f ldr d0, [x2, w8, uxtw] // top[base] ldr d2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8b, v0.8b, v0.8b // top[base+1] uzp1 v0.8b, v0.8b, v0.8b // top[base] uzp2 v3.8b, v2.8b, v2.8b uzp1 v2.8b, v2.8b, v2.8b usubl v6.8h, v1.8b, v0.8b // top[base+1]-top[base] usubl v7.8h, v3.8b, v2.8b ushll v16.8h, v0.8b, #6 // top[base]*64 ushll v17.8h, v2.8b, #6 mla v16.4h, v6.4h, v4.4h // + top[base+1]*frac mla v17.4h, v7.4h, v5.4h rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.s}[0], [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.s}[0], [x0], x1 b.gt 4b ret 49: st1 {v31.s}[0], [x0], x1 subs w4, w4, #2 st1 {v31.s}[0], [x0], x1 b.gt 49b ret 8: // w == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.8b, w9 // frac dup v5.8b, w11 sub w9, w15, w9 // 64 
- frac sub w11, w15, w11 dup v6.8b, w9 // 64 - frac dup v7.8b, w11 uzp2 v1.16b, v0.16b, v0.16b // top[base+1] uzp1 v0.16b, v0.16b, v0.16b // top[base] uzp2 v3.16b, v2.16b, v2.16b uzp1 v2.16b, v2.16b, v2.16b umull v16.8h, v1.8b, v4.8b // top[base+1]*frac umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) umull v17.8h, v3.8b, v5.8b umlal v17.8h, v2.8b, v7.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.gt 8b ret 89: st1 {v31.8b}, [x0], x1 subs w4, w4, #2 st1 {v31.8b}, [x0], x1 b.gt 89b ret endfunc // void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src, // const int n); function ipred_reverse_8bpc_neon, export=1 sub x1, x1, #16 add x3, x0, #8 mov x4, #16 1: ld1 {v0.16b}, [x1] subs w2, w2, #16 rev64 v0.16b, v0.16b sub x1, x1, #16 st1 {v0.d}[1], [x0], x4 st1 {v0.d}[0], [x3], x4 b.gt 1b ret endfunc const increments .short 0, 1, 2, 3, 4, 5, 6, 7 .short 8, 9, 10, 11, 12, 13, 14, 15 endconst // void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const pixel *const left, // const int width, const int height, // const int dx, const int dy); function ipred_z2_fill1_8bpc_neon, export=1 clz w10, w4 adr x9, L(ipred_z2_fill1_tbl) sub w10, w10, #25 ldrh w10, [x9, w10, uxtw #1] mov w8, #(1 << 6) // xpos = 1 << 6 sub x9, x9, w10, uxtw sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy br x9 40: AARCH64_VALID_JUMP_TARGET dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // Worst case height for w=4 is 16, but we need at least h+1 elements ld1 {v0.16b, v1.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v29.8b, #2 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos // Cut corners here; only doing tbl over v0 here; we only // seem to need the last pixel, from v1, after skipping to the // left-only codepath below. 
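        // For each output pixel the code below effectively computes two
        // candidates - top[base_x]/top[base_x+1] blended with frac_x and
        // left[base_y]/left[base_y+1] blended with frac_y, both rounded
        // at >> 6 - and then picks between them with cmge/bit based on
        // whether that pixel's base_x is still >= 0.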
tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] ext v5.8b, v4.8b, v4.8b, #1 and v6.8b, v6.8b, v25.8b // frac_x trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] trn1 v2.2s, v2.2s, v4.2s // top[base_x] trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 4b 49: tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2] trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 49b 9: ret 80: AARCH64_VALID_JUMP_TARGET dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // Worst case height for w=8 is 32, but we need at least h+1 elements ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 // Cut corners here; for the first row we don't expect to need to // read outside of v0. tbl v18.8b, {v0.16b}, v29.8b // left[base_y] add v30.8b, v29.8b, v19.8b // base_y + 2 add v29.8b, v29.8b, v17.8b // base_y + 1 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} movi v24.8b, #2 // 2 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] // Cut corners here; only doing tbl over v0-v1 here; we only // seem to need the last pixel, from v2, after skipping to the // left-only codepath below. 
tbl v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h tbl v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2] ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 8b 89: tbl v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1] tbl v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 89b 9: ret 160: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] add x11, x11, #16 // increments dup v18.8h, w7 // -dy movi v17.16b, #1 add x3, x3, #1 // Skip past left[0] ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy mul v19.8h, v14.8h, v18.8h // {8,9,10,11,12,13,14,15}* -dy movi v25.16b, #0x3e add v16.8h, v16.8h, v18.8h // -= dy add v18.8h, v19.8h, v18.8h xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} // Worst case height is 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] ld1r {v15.16b}, [x2] // left[0] == top[0] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v16.8h // (uint8_t)ypos xtn2 v27.16b, v18.8h shrn v29.8b, v16.8h, #6 // ypos >> 6 shrn2 v29.16b, v18.8h, #6 mov v18.16b, v15.16b // left[0] and v27.16b, v27.16b, v25.16b // frac_y // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
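        // tbx, unlike tbl, leaves lanes with an out-of-range index
        // unchanged, so the destination is preloaded with v15 (left[0],
        // which equals top[0], see the ld1r above); any base_y index
        // outside the loaded left[] vectors thus falls back to the
        // top-left pixel.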
tbx v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 add v29.16b, v29.16b, v17.16b // base_y + 1 sub v28.16b, v26.16b, v27.16b // 64 - frac_y movi v24.16b, #2 // 2 16: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 169f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw add x11, x2, w11, sxtw ld1 {v4.16b, v5.16b}, [x9] // top[base_x] mov v19.16b, v15.16b // left[0] ld1 {v6.16b, v7.16b}, [x11] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] mov v20.16b, v15.16b // left[0] shrn v21.8b, v16.8h, #6 // first base_x shrn v22.8b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn v17.8b, v17.8h tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] trn1 v21.2d, v21.2d, v21.2d // first base_x trn1 v22.2d, v22.2d, v22.2d trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos trn1 v17.2d, v17.2d, v17.2d ext v5.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y sub v8.16b, v26.16b, v16.16b // 64 - frac_x sub v9.16b, v26.16b, v17.16b umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b add v21.16b, v21.16b, v31.16b // actual base_x add v22.16b, v22.16b, v31.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v5.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v7.8b, v17.8b umull2 v18.8h, v6.16b, v9.16b umlal2 v18.8h, v7.16b, v17.16b cmge v21.16b, v21.16b, #0 cmge v22.16b, v22.16b, #0 rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v18.8h, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.16b}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.16b}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 add v30.16b, v30.16b, v24.16b // base_y += 2 b 16b 169: mov v19.16b, v15.16b mov v20.16b, v15.16b tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] umull v4.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v4.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v5.8h, v18.16b, v28.16b umlal2 v5.8h, v19.16b, v27.16b umull v6.8h, v19.8b, v28.8b umlal v6.8h, v20.8b, v27.8b umull2 v7.8h, v19.16b, v28.16b umlal2 v7.8h, v20.16b, v27.16b rshrn v4.8b, v4.8h, #6 rshrn2 v4.16b, v5.8h, #6 rshrn v5.8b, v6.8h, #6 rshrn2 v5.16b, v7.8h, #6 st1 {v4.16b}, [x0], x1 subs w5, w5, #2 st1 {v5.16b}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 add v30.16b, v30.16b, v24.16b // base_y += 2 b 169b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret 320: 640: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] add x11, x11, #16 // increments dup v25.8h, w7 // -dy add x3, x3, #1 // Skip past left[0] ld1 {v14.8h}, [x11] // {8,9,10,11,12,13,14,15} add x13, x0, x1 // alternating row lsl x1, x1, #1 // stride *= 2 sub x1, x1, w4, uxtw // stride -= width movi v11.8h, #8 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy add v26.8h, v26.8h, v25.8h // -= dy mul v25.8h, v25.8h, v11.8h // -8*dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} xtn2 v31.16b, v14.8h // {8,9,10,11,12,13,14,15} // Worst case height is 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[] ld1r {v15.16b}, [x2] // left[0] == top[0] mov w12, w4 // orig w neg w14, w4 // -w 1: mov v23.16b, v26.16b // reset ypos asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, w14 // base_x <= -w asr w11, w8, #6 // base_x b.le 329f dup v17.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx add x9, x2, w9, sxtw add x11, x2, w11, sxtw sqshrn v21.8b, v16.8h, #6 // first base_x sqshrn v22.8b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn v17.8b, v17.8h ld1 {v4.16b}, [x9], #16 // top[base_x] ld1 {v6.16b}, [x11], #16 trn1 v21.2d, v21.2d, v21.2d // first base_x trn1 v22.2d, v22.2d, v22.2d trn1 v16.2d, v16.2d, v16.2d // (uint8_t)xpos trn1 v17.2d, v17.2d, v17.2d movi v10.16b, #0x3e movi v11.16b, #64 and v16.16b, v16.16b, v10.16b // frac_x and v17.16b, v17.16b, v10.16b sub v8.16b, v11.16b, v16.16b // 64 - frac_x sub v9.16b, v11.16b, v17.16b add v21.16b, v21.16b, v31.16b // actual base_x add v22.16b, v22.16b, v31.16b 2: add v13.8h, v23.8h, v25.8h // ypos -= 8*dy movi v12.16b, #64 movi v20.16b, #2 movi v10.16b, #0x3e smov w10, v22.b[0] xtn v27.8b, v23.8h // (uint8_t)ypos xtn2 v27.16b, v13.8h shrn v29.8b, v23.8h, #6 // ypos >> 6 shrn2 v29.16b, v13.8h, #6 cmp w10, #0 // base_x (bottom left) >= 0 and v27.16b, v27.16b, v10.16b // frac_y mov v18.16b, v15.16b // left[0] b.ge 4f add v23.8h, v13.8h, v25.8h // ypos -= 8*dy movi v13.16b, #1 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v13.16b // base_y + 1 mov v19.16b, v15.16b // left[0] sub v28.16b, v12.16b, v27.16b // 64 - frac_y ld1 {v5.16b}, [x9], #16 // top[base_x] ld1 {v7.16b}, [x11], #16 tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v13.16b // base_y + 2 mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #1 rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v18.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v19.8b, v17.8b umull2 v20.8h, v6.16b, v9.16b umlal2 v20.8h, v19.16b, v17.16b cmge v18.16b, v21.16b, #0 cmge v19.16b, v22.16b, #0 rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v20.8h, #6 bit v10.16b, v12.16b, v18.16b bit v11.16b, v13.16b, v19.16b st1 {v10.16b}, [x0], #16 subs w4, w4, #16 st1 {v11.16b}, [x13], #16 b.le 3f movi 
v10.16b, #16 mov v4.16b, v5.16b mov v6.16b, v7.16b add v21.16b, v21.16b, v10.16b // base_x += 16 add v22.16b, v22.16b, v10.16b b 2b 3: subs w5, w5, #2 b.le 9f movi v10.8h, #128 add x0, x0, x1 add x13, x13, x1 mov w4, w12 // reset w add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) b 1b 4: // The rest of the row only predicted from top[] ld1 {v5.16b}, [x9], #16 // top[base_x] ld1 {v7.16b}, [x11], #16 ext v18.16b, v4.16b, v5.16b, #1 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #1 umull v12.8h, v4.8b, v8.8b // top[base_x]-*(64-frac_x) umlal v12.8h, v18.8b, v16.8b // + top[base_x+1]*frac_x umull2 v13.8h, v4.16b, v8.16b umlal2 v13.8h, v18.16b, v16.16b umull v14.8h, v6.8b, v9.8b umlal v14.8h, v19.8b, v17.8b umull2 v20.8h, v6.16b, v9.16b umlal2 v20.8h, v19.16b, v17.16b rshrn v12.8b, v12.8h, #6 rshrn2 v12.16b, v13.8h, #6 rshrn v13.8b, v14.8h, #6 rshrn2 v13.16b, v20.8h, #6 st1 {v12.16b}, [x0], #16 subs w4, w4, #16 st1 {v13.16b}, [x13], #16 b.le 3b mov v4.16b, v5.16b mov v6.16b, v7.16b b 4b 329: // The rest of the block only predicted from left[] add x1, x1, w4, uxtw // restore stride mov w12, w5 // orig remaining h 1: add v13.8h, v23.8h, v25.8h // ypos -= 8*dy movi v12.16b, #64 movi v10.16b, #0x3e xtn v27.8b, v23.8h // (uint8_t)ypos xtn2 v27.16b, v13.8h shrn v29.8b, v23.8h, #6 // ypos >> 6 shrn2 v29.16b, v13.8h, #6 and v27.16b, v27.16b, v10.16b // frac_y mov v18.16b, v15.16b // left[0] add v23.8h, v13.8h, v25.8h // ypos -= 8*dy movi v21.16b, #1 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v21.16b // base_y + 1 sub v28.16b, v12.16b, v27.16b // 64 - frac_y 2: mov v19.16b, v15.16b // left[0] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v21.16b // base_y + 2 mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] add v29.16b, v29.16b, v21.16b // next base_y umull v10.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v10.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v11.8h, v18.16b, v28.16b umlal2 v11.8h, v19.16b, v27.16b umull v12.8h, v19.8b, v28.8b umlal v12.8h, v20.8b, v27.8b umull2 v13.8h, v19.16b, v28.16b umlal2 v13.8h, v20.16b, v27.16b rshrn v10.8b, v10.8h, #6 rshrn2 v10.16b, v11.8h, #6 rshrn v11.8b, v12.8h, #6 rshrn2 v11.16b, v13.8h, #6 st1 {v10.16b}, [x0], x1 subs w5, w5, #2 st1 {v11.16b}, [x13], x1 b.le 3f mov v18.16b, v20.16b b 2b 3: subs w4, w4, #16 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w5, w12 // reset h b 1b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret L(ipred_z2_fill1_tbl): .hword L(ipred_z2_fill1_tbl) - 640b .hword L(ipred_z2_fill1_tbl) - 320b .hword L(ipred_z2_fill1_tbl) - 160b .hword L(ipred_z2_fill1_tbl) - 80b .hword L(ipred_z2_fill1_tbl) - 40b endfunc function ipred_z2_fill2_8bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. 
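// Illustrative sketch of the per-pixel z2 selection computed in the loop
// below (rough pseudocode, not dav1d's C reference; names are ours):
//   top_px  = (top[base_x]  * (64 - frac_x) + top[base_x + 1]  * frac_x + 32) >> 6
//   left_px = (left[base_y] * (64 - frac_y) + left[base_y + 1] * frac_y + 32) >> 6
//   dst[x]  = (base_x >= 0) ? top_px : left_px
// The cmge #0 / bit pair implements the final per-lane selection.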
ld1 {v0.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 tbl v16.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2s, v30.2s, v28.2s // base_y + 1, base_y + 2 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v29.8b, #2 add v31.8b, v31.8b, v31.8b // {0,2,4,6,0,2,4,6} 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos uzp2 v3.8b, v2.8b, v4.8b // top[base_x+1] uzp1 v2.8b, v2.8b, v4.8b // top[base_x] and v6.8b, v6.8b, v25.8b // frac_x trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 4b 49: tbl v17.8b, {v0.16b}, v30.8b // left[base_y+1], left[base_y+2] trn1 v16.2s, v16.2s, v17.2s // left[base_y], left[base_y+1] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f ext v16.8b, v17.8b, v17.8b, #4 add v30.8b, v30.8b, v29.8b // base_y += 2 b 49b 9: ret 80: dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. 
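// Sketch of the upsampled-top indexing in the w == 8 loop below
// (illustrative pseudocode; the 2*x offset comes from the doubled
// {0,2,4,...} increments, and frac_x is constant within a row):
//   for (x = 0; x < 8; x++)
//       dst[x] = (base_x + 2*x >= 0)
//              ? (top[base_x + 2*x] * (64 - frac_x)
//                 + top[base_x + 2*x + 1] * frac_x + 32) >> 6
//              : <left[] blend for this row>
// uzp1/uzp2 split one contiguous load into the two interleaved taps.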
ld1 {v0.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 tbl v18.8b, {v0.16b}, v29.8b // left[base_y] add v30.8b, v29.8b, v19.8b // base_y + 2 add v29.8b, v29.8b, v17.8b // base_y + 1 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} movi v24.8b, #2 // 2 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] uzp2 v5.16b, v4.16b, v6.16b // top[base_x+1] uzp1 v4.16b, v4.16b, v6.16b // top[base_x] and v16.16b, v16.16b, v25.16b // frac_x sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 8b 89: tbl v19.8b, {v0.16b}, v29.8b // left[base_y+1] tbl v20.8b, {v0.16b}, v30.8b // left[base_y+2] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull v17.8h, v19.8b, v28.8b umlal v17.8h, v20.8b, v27.8b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f mov v18.8b, v20.8b add v29.8b, v29.8b, v24.8b // base_y += 2 add v30.8b, v30.8b, v24.8b // base_y += 2 b 89b 9: ret endfunc function ipred_z2_fill3_8bpc_neon, export=1 cmp w4, #8 mov w8, #(1 << 6) // xpos = 1 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.16b, #0x3e add v30.4h, v16.4h, v30.4h // -= dy xtn v31.8b, v31.8h // {0,1,2,3} // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
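// Upsampled-left sketch (rough pseudocode, not the exact reference code):
// base_y advances by 2 per output row, so rows y and y+1 use the pairs
// (base_y+0, base_y+1) and (base_y+2, base_y+3). The tbl lookups below
// gather the +0/+2 and +1/+3 indices together, then step base_y += 4 for
// the next two rows.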
ld1 {v0.16b, v1.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v30.8b, v29.8b, v17.8b // base_y + 1 add v28.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2s, v31.2s, v31.2s // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 trn1 v29.2s, v29.2s, v28.2s // base_y + 0, base_y + 2 trn1 v30.2s, v30.2s, v24.2s // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y trn1 v27.2s, v27.2s, v27.2s // frac_y trn1 v28.2s, v28.2s, v28.2s // 64 - frac_y movi v24.8b, #4 4: asr w9, w8, #6 // base_x dup v6.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f dup v7.4h, w8 // xpos ldr d2, [x2, w9, sxtw] // top[base_x] ldr d4, [x2, w11, sxtw] trn1 v6.2d, v6.2d, v7.2d // xpos tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] shrn v20.8b, v6.8h, #6 // first base_x for each row xtn v6.8b, v6.8h // (uint8_t)xpos ext v3.8b, v2.8b, v2.8b, #1 // top[base_x+1] ext v5.8b, v4.8b, v4.8b, #1 and v6.8b, v6.8b, v25.8b // frac_x trn1 v2.2s, v2.2s, v4.2s // top[base_x] trn1 v3.2s, v3.2s, v5.2s // top[base_x+1] sub v7.8b, v26.8b, v6.8b // 64 - frac_x add v20.8b, v20.8b, v31.8b // actual base_x umull v16.8h, v16.8b, v28.8b // left[base_y]*(64-frac_y) umlal v16.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y umull v22.8h, v2.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v3.8b, v6.8b // + top[base_x+1]*frac_x cmge v20.8b, v20.8b, #0 rshrn v16.8b, v16.8h, #6 rshrn v22.8b, v22.8h, #6 bit v16.8b, v22.8b, v20.8b st1 {v16.s}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v16.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 4b 49: tbl v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2] tbl v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3] umull v18.8h, v16.8b, v28.8b // left[base_y]*(64-frac_t) umlal v18.8h, v17.8b, v27.8b // + left[base_y+1]*frac_y rshrn v18.8b, v18.8h, #6 st1 {v18.s}[0], [x0], x1 subs w5, w5, #2 st1 {v18.s}[1], [x0], x1 b.le 9f add v29.8b, v29.8b, v24.8b // base_y += 4 add v30.8b, v30.8b, v24.8b // base_y += 4 b 49b 9: ret 80: dup v30.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v30.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.16b, #0x3e add v30.8h, v16.8h, v30.8h // -= dy xtn v31.8b, v31.8h // {0,1,2,3,4,5,6,7} // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
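// Position decomposition used by these loops (illustrative; the upsampled
// paths add a small constant to the base, as noted in the comments below):
//   base = pos >> 6           // integer sample index
//   frac = pos & 0x3e         // 6-bit fraction with bit 0 cleared
//   out  = (a*(64 - frac) + b*frac + 32) >> 6
// which the umull/umlal + rshrn #6 sequences compute per lane.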
ld1 {v0.16b, v1.16b, v2.16b}, [x3] // left[] movi v26.16b, #64 movi v19.16b, #2 xtn v27.8b, v30.8h // (uint8_t)ypos shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v27.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 add v28.8b, v29.8b, v17.8b // base_y + 1 add v30.8b, v29.8b, v19.8b // base_y + 2 trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7} add v24.8b, v28.8b, v19.8b // base_y + 3 trn1 v29.2d, v29.2d, v30.2d // base_y + 0, base_y + 2 trn1 v30.2d, v28.2d, v24.2d // base_y + 1, base_y + 3 sub v28.8b, v26.8b, v27.8b // 64 - frac_y movi v24.16b, #4 trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] shrn v21.8b, v16.8h, #6 // first base_x shrn2 v21.16b, v17.8h, #6 xtn v16.8b, v16.8h // (uint8_t)xpos xtn2 v16.16b, v17.8h ext v5.16b, v4.16b, v4.16b, #1 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #1 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v7.16b, v26.16b, v16.16b // 64 - frac_x add v21.16b, v21.16b, v31.16b // actual base_x umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b umull v22.8h, v4.8b, v7.8b // top[base_x]-*(64-frac_x) umlal v22.8h, v5.8b, v16.8b // + top[base_x+1]*frac_x umull2 v23.8h, v4.16b, v7.16b umlal2 v23.8h, v5.16b, v16.16b cmge v21.16b, v21.16b, #0 rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 rshrn v22.8b, v22.8h, #6 rshrn2 v22.16b, v23.8h, #6 bit v6.16b, v22.16b, v21.16b st1 {v6.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v6.8h, v18.8b, v28.8b // left[base_y]*(64-frac_y) umlal v6.8h, v19.8b, v27.8b // + left[base_y+1]*frac_y umull2 v17.8h, v18.16b, v28.16b umlal2 v17.8h, v19.16b, v27.16b rshrn v6.8b, v6.8h, #6 rshrn2 v6.16b, v17.8h, #6 st1 {v6.d}[0], [x0], x1 subs w5, w5, #2 st1 {v6.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 add v30.16b, v30.16b, v24.16b // base_y += 4 b 89b 9: ret endfunc // void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_8bpc_neon, export=1 cmp w6, #64 clz w9, w3 adr x8, L(ipred_z3_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw // left[max_base_y] sub x8, x8, w9, uxtw movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments mov w7, w5 b.gt L(ipred_z3_fill1_large_h16) br x8 40: AARCH64_VALID_JUMP_TARGET dup v29.4h, w5 // dy mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos movi v22.16b, #64 
movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base] trn1 v27.2s, v27.2s, v28.2s // base + 1, base + 2 trn1 v24.2s, v24.2s, v24.2s // frac trn1 v25.2s, v25.2s, v25.2s // 64 - frac 1: mov v5.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2] trn1 v4.2s, v4.2s, v5.2s // left[base], left[base+1] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac rshrn v16.8b, v16.8h, #6 st1 {v16.s}[0], [x0], x1 subs w4, w4, #2 st1 {v16.s}[1], [x0], x1 b.le 9f ext v4.8b, v5.8b, v5.8b, #4 uqadd v27.8b, v27.8b, v21.8b // base += 2 b 1b 9: ret 80: AARCH64_VALID_JUMP_TARGET dup v29.8h, w5 // dy mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac mov v4.8b, v31.8b uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 1: mov v5.8b, v31.8b mov v6.8b, v31.8b tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull v17.8h, v5.8b, v25.8b umlal v17.8h, v6.8b, v24.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.le 9f mov v4.8b, v6.8b uqadd v27.8b, v27.8b, v21.8b // base += 2 uqadd v28.8b, v28.8b, v21.8b // base += 2 b 1b 9: ret 160: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // This is only executed if we've checked that max_base_y <= 64. 
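// The tbx lookups below index at most left[0..63] (four 16-byte table
// registers); lanes with larger indices keep the preloaded padding value
// left[max_base_y]. That is why this path requires max_base_y <= 64 and
// larger values use L(ipred_z3_fill1_large_h16) instead.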
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] add v28.8h, v28.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 add v29.8h, v28.8h, v29.8h // ypos + 8*dy xtn v24.8b, v28.8h // (uint8_t)ypos xtn2 v24.16b, v29.8h uqshrn v26.8b, v28.8h, #6 // base uqshrn2 v26.16b, v29.8h, #6 and v24.16b, v24.16b, v23.16b // frac mov v4.16b, v31.16b uqadd v27.16b, v26.16b, v20.16b // base + 1 uqadd v28.16b, v26.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base] 1: mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], x1 subs w4, w4, #2 st1 {v17.16b}, [x0], x1 b.le 9f mov v4.16b, v6.16b uqadd v27.16b, v27.16b, v21.16b // base += 2 uqadd v28.16b, v28.16b, v21.16b // base += 2 b 1b 9: ret 320: 640: AARCH64_VALID_JUMP_TARGET dup v28.8h, w5 // dy mov w12, w3 add x13, x0, x1 shl v29.8h, v28.8h, #3 // 8*dy mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e lsl x1, x1, #1 sub x1, x1, w3, uxtw add v30.8h, v28.8h, v30.8h // ypos // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 1: mov v26.16b, v30.16b // reset ypos 2: add v27.8h, v26.8h, v29.8h // ypos + 8*dy uqshrn v16.8b, v26.8h, #6 // base uqshrn2 v16.16b, v27.8h, #6 xtn v24.8b, v26.8h // (uint8_t)ypos xtn2 v24.16b, v27.8h umov w14, v16.b[0] and v24.16b, v24.16b, v23.16b // frac uqadd v17.16b, v16.16b, v20.16b // base + 1 cmp w14, w6 // base >= max_base_y uqadd v18.16b, v16.16b, v21.16b // base + 2 sub v25.16b, v22.16b, v24.16b // 64 - frac b.ge 4f mov v4.16b, v31.16b mov v5.16b, v31.16b mov v6.16b, v31.16b tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base] tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1] tbx v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2] subs w3, w3, #16 umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b umull v18.8h, v5.8b, v25.8b umlal v18.8h, v6.8b, v24.8b umull2 v19.8h, v5.16b, v25.16b umlal2 v19.8h, v6.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn2 v16.16b, v17.8h, #6 rshrn v17.8b, v18.8h, #6 rshrn2 v17.16b, v19.8h, #6 st1 {v16.16b}, [x0], #16 st1 {v17.16b}, [x13], #16 b.le 3f add v26.8h, v27.8h, v29.8h // ypos += 16*dy b 2b 3: subs w4, w4, #2 b.le 9f movi v16.8h, #128 add x0, x0, x1 add x13, x13, x1 add v30.8h, v30.8h, v16.8h // ypos = dy + y*(1<<6)*2 mov w3, w12 b 1b 4: subs w3, w3, #16 st1 {v31.16b}, [x0], #16 st1 {v31.16b}, [x13], #16 b.gt 4b b 3b 9: ret L(ipred_z3_fill1_large_h16): // Fallback case for max_base_y > 64; similar to the z1 // implementation. This does the filtering vertically, filling out // a 2x pixel column at a time. 
mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 mov w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw add x10, x2, w10, uxtw dup v4.16b, w9 // frac dup v5.16b, w11 ld1 {v0.16b, v1.16b}, [x8], #32 // left[base] ld1 {v2.16b, v3.16b}, [x10], #32 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.16b, w9 // 64 - frac dup v7.16b, w11 add w7, w7, w5 // ypos += dy 2: ext v16.16b, v0.16b, v1.16b, #1 // left[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w4, w4, #16 umull v18.8h, v16.8b, v4.8b // left[base+1]*frac umlal v18.8h, v0.8b, v6.8b // + left[base]*(64-frac) umull2 v19.8h, v16.16b, v4.16b umlal2 v19.8h, v0.16b, v6.16b umull v20.8h, v17.8b, v5.8b umlal v20.8h, v2.8b, v7.8b umull2 v21.8h, v17.16b, v5.16b umlal2 v21.8h, v2.16b, v7.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 rshrn2 v17.16b, v21.8h, #6 zip1 v18.16b, v16.16b, v17.16b zip2 v19.16b, v16.16b, v17.16b st1 {v18.h}[0], [x0], x1 st1 {v18.h}[1], [x13], x1 st1 {v18.h}[2], [x0], x1 st1 {v18.h}[3], [x13], x1 st1 {v18.h}[4], [x0], x1 st1 {v18.h}[5], [x13], x1 st1 {v18.h}[6], [x0], x1 st1 {v18.h}[7], [x13], x1 st1 {v19.h}[0], [x0], x1 st1 {v19.h}[1], [x13], x1 st1 {v19.h}[2], [x0], x1 st1 {v19.h}[3], [x13], x1 st1 {v19.h}[4], [x0], x1 st1 {v19.h}[5], [x13], x1 st1 {v19.h}[6], [x0], x1 st1 {v19.h}[7], [x13], x1 b.le 3f mov v0.16b, v1.16b ld1 {v1.16b}, [x8], #16 // left[base] mov v2.16b, v3.16b ld1 {v3.16b}, [x10], #16 b 2b 3: subs w3, w3, #2 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 9: ret L(ipred_z3_fill1_tbl): .hword L(ipred_z3_fill1_tbl) - 640b .hword L(ipred_z3_fill1_tbl) - 320b .hword L(ipred_z3_fill1_tbl) - 160b .hword L(ipred_z3_fill1_tbl) - 80b .hword L(ipred_z3_fill1_tbl) - 40b endfunc function ipred_z3_fill_padding_neon, export=0 cmp w3, #16 adr x8, L(ipred_z3_fill_padding_tbl) b.gt L(ipred_z3_fill_padding_wide) // w3 = remaining width, w4 = constant height mov w12, w4 1: // Fill a WxH rectangle with padding. W can be any number; // this fills the exact width by filling in the largest // power of two in the remaining width, and repeating. 
clz w9, w3 sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] sub x9, x8, w9, uxtw br x9 2: st1 {v31.h}[0], [x0], x1 subs w4, w4, #4 st1 {v31.h}[0], [x13], x1 st1 {v31.h}[0], [x0], x1 st1 {v31.h}[0], [x13], x1 b.gt 2b subs w3, w3, #2 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #2 add x13, x13, #2 mov w4, w12 b 1b 4: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 st1 {v31.s}[0], [x0], x1 st1 {v31.s}[0], [x13], x1 b.gt 4b subs w3, w3, #4 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 8: st1 {v31.8b}, [x0], x1 subs w4, w4, #4 st1 {v31.8b}, [x13], x1 st1 {v31.8b}, [x0], x1 st1 {v31.8b}, [x13], x1 b.gt 4b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #8 add x13, x13, #8 mov w4, w12 b 1b 16: 32: 64: st1 {v31.16b}, [x0], x1 subs w4, w4, #4 st1 {v31.16b}, [x13], x1 st1 {v31.16b}, [x0], x1 st1 {v31.16b}, [x13], x1 b.gt 4b subs w3, w3, #16 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w4, w12 b 1b 9: ret L(ipred_z3_fill_padding_tbl): .hword L(ipred_z3_fill_padding_tbl) - 64b .hword L(ipred_z3_fill_padding_tbl) - 32b .hword L(ipred_z3_fill_padding_tbl) - 16b .hword L(ipred_z3_fill_padding_tbl) - 8b .hword L(ipred_z3_fill_padding_tbl) - 4b .hword L(ipred_z3_fill_padding_tbl) - 2b L(ipred_z3_fill_padding_wide): // Fill a WxH rectangle with padding, with W > 16. lsr x1, x1, #1 mov w12, w3 sub x1, x1, w3, uxtw 1: ands w5, w3, #15 b.eq 2f // If the width isn't aligned to 16, first do one 16 byte write // and align the start pointer. sub w3, w3, w5 st1 {v31.16b}, [x0] add x0, x0, w5, uxtw 2: // Fill the rest of the line with aligned 16 byte writes. subs w3, w3, #16 st1 {v31.16b}, [x0], #16 b.gt 2b subs w4, w4, #1 add x0, x0, x1 b.le 9f mov w3, w12 b 1b 9: ret endfunc function ipred_z3_fill2_8bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // left[max_base_y] movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments b.eq 80f 40: // w == 4 dup v29.4h, w5 // dy mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, // so max_base_y <= 32. 
ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac uqadd v29.8b, v27.8b, v21.8b // base + 3 trn1 v24.2s, v24.2s, v24.2s // frac trn1 v26.2s, v26.2s, v28.2s // base + 0, base + 2 trn1 v27.2s, v27.2s, v29.2s // base + 1, base + 3 trn1 v25.2s, v25.2s, v25.2s // 64 - frac movi v21.16b, #4 1: mov v4.8b, v31.8b mov v5.8b, v31.8b tbx v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2] tbx v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac rshrn v16.8b, v16.8h, #6 st1 {v16.s}[0], [x0], x1 subs w4, w4, #2 st1 {v16.s}[1], [x0], x1 b.le 9f uqadd v26.8b, v26.8b, v21.8b // base += 4 uqadd v27.8b, v27.8b, v21.8b // base += 4 b 1b 9: ret 80: // w == 8 dup v29.8h, w5 // dy mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, // so max_base_y <= 32. ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 movi v20.16b, #1 movi v21.16b, #2 xtn v24.8b, v30.8h // (uint8_t)ypos uqshrn v26.8b, v30.8h, #6 // base and v24.8b, v24.8b, v23.8b // frac uqadd v27.8b, v26.8b, v20.8b // base + 1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac uqadd v29.8b, v27.8b, v21.8b // base + 3 trn1 v24.2d, v24.2d, v24.2d // frac trn1 v26.2d, v26.2d, v28.2d // base + 0, base + 2 trn1 v27.2d, v27.2d, v29.2d // base + 1, base + 3 trn1 v25.2d, v25.2d, v25.2d // 64 - frac movi v21.16b, #4 1: mov v4.16b, v31.16b mov v5.16b, v31.16b tbx v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2] tbx v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac umull2 v17.8h, v4.16b, v25.16b umlal2 v17.8h, v5.16b, v24.16b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 subs w4, w4, #2 st1 {v17.8b}, [x0], x1 b.le 9f uqadd v26.16b, v26.16b, v21.16b // base += 4 uqadd v27.16b, v27.16b, v21.16b // base += 4 b 1b 9: ret endfunc // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height); function ipred_filter_8bpc_neon, export=1 and w5, w5, #511 movrel x6, X(filter_intra_taps) lsl w5, w5, #6 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 adr x5, L(ipred_filter_tbl) ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 ldrh w9, [x5, w9, uxtw #1] sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b sub x5, x5, w9, uxtw sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 lsl x1, x1, #1 sxtl v20.8h, v20.8b sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b br x5 40: AARCH64_VALID_JUMP_TARGET ldur s0, [x2, #1] // top (0-3) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-3) 4: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // 
p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.8b, v2.8h, #4 subs w4, w4, #2 st1 {v2.s}[0], [x0], x1 uxtl v0.8h, v2.8b st1 {v2.s}[1], [x6], x1 ext v0.16b, v0.16b, v0.16b, #8 // move top from [4-7] to [0-3] b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #1] // top (0-7) sub x2, x2, #2 mov x7, #-2 uxtl v0.8h, v0.8b // top (0-7) 8: ld1 {v1.s}[0], [x2], x7 // left (0-1) + topleft (2) mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) uxtl v1.8h, v1.8b // left (0-1) + topleft (2) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.8b, v2.8h, #4 uxtl v1.8h, v2.8b // first block, in 16 bit mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v1.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v1.h[7] // p6(left[1]) * filter(6) sqrshrun v3.8b, v3.8h, #4 subs w4, w4, #2 st2 {v2.s, v3.s}[0], [x0], x1 zip2 v0.2s, v2.2s, v3.2s st2 {v2.s, v3.s}[1], [x6], x1 uxtl v0.8h, v0.8b b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #1 sub x2, x2, #2 mov x7, #-2 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v0.s}[0], [x2], x7 // left (0-1) + topleft (2) uxtl v0.8h, v0.8b // left (0-1) + topleft (2) 2: ld1 {v2.16b}, [x8], #16 // top(0-15) mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) uxtl v1.8h, v2.8b // top(0-7) uxtl2 v2.8h, v2.16b // top(8-15) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.8b, v3.8h, #4 uxtl v0.8h, v3.8b // first block, in 16 bit mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.8b, v4.8h, #4 uxtl v0.8h, v4.8b // second block, in 16 bit mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v0.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.8b, v5.8h, #4 uxtl v0.8h, v5.8b // third block, in 16 bit mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v0.h[3] // p5(left[0]) * filter(5) mla v6.8h, v22.8h, 
v0.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.8b, v6.8h, #4 st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 b.le 8f ins v0.h[2], v2.h[7] ins v0.b[0], v6.b[7] ins v0.b[2], v6.b[3] b 2b 8: subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_filter_tbl): .hword L(ipred_filter_tbl) - 320b .hword L(ipred_filter_tbl) - 160b .hword L(ipred_filter_tbl) - 80b .hword L(ipred_filter_tbl) - 40b endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_8bpc_neon, export=1 ld1 {v0.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 ldrh w9, [x6, w9, uxtw #1] xtn v0.8b, v0.8h sub x6, x6, w9, uxtw add x2, x0, x1 lsl x1, x1, #1 br x6 4: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x2], x1 st1 {v1.s}[2], [x0], x1 st1 {v1.s}[3], [x2], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x3], #32 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b st1 {v1.d}[0], [x0], x1 tbl v2.16b, {v0.16b}, v2.16b st1 {v1.d}[1], [x2], x1 st1 {v2.d}[0], [x0], x1 st1 {v2.d}[1], [x2], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 subs w5, w5, #4 tbl v1.16b, {v0.16b}, v1.16b tbl v2.16b, {v0.16b}, v2.16b st1 {v1.16b}, [x0], x1 tbl v3.16b, {v0.16b}, v3.16b st1 {v2.16b}, [x2], x1 tbl v4.16b, {v0.16b}, v4.16b st1 {v3.16b}, [x0], x1 st1 {v4.16b}, [x2], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #4 tbl v16.16b, {v0.16b}, v16.16b tbl v17.16b, {v0.16b}, v17.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b tbl v20.16b, {v0.16b}, v20.16b st1 {v16.16b, v17.16b}, [x0], x1 tbl v21.16b, {v0.16b}, v21.16b st1 {v18.16b, v19.16b}, [x2], x1 tbl v22.16b, {v0.16b}, v22.16b st1 {v20.16b, v21.16b}, [x0], x1 tbl v23.16b, {v0.16b}, v23.16b st1 {v22.16b, v23.16b}, [x2], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 subs w5, w5, #2 tbl v16.16b, {v0.16b}, v16.16b tbl v17.16b, {v0.16b}, v17.16b tbl v18.16b, {v0.16b}, v18.16b tbl v19.16b, {v0.16b}, v19.16b st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 tbl v20.16b, {v0.16b}, v20.16b tbl v21.16b, {v0.16b}, v21.16b tbl v22.16b, {v0.16b}, v22.16b tbl v23.16b, {v0.16b}, v23.16b st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 b.gt 64b ret L(pal_pred_tbl): .hword L(pal_pred_tbl) - 64b .hword L(pal_pred_tbl) - 32b .hword L(pal_pred_tbl) - 16b .hword L(pal_pred_tbl) - 8b .hword L(pal_pred_tbl) - 4b endfunc // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_128_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] movi v0.8h, #128 // dc dup v1.8h, w6 // alpha sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x5], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h cmlt v4.8h, v2.8h, #0 // sign cmlt v5.8h, v3.8h, #0 add v2.8h, v2.8h, v4.8h // diff + sign add v3.8h, v3.8h, v5.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 
32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h st1 {v2.s}[0], [x0], x1 st1 {v2.s}[1], [x6], x1 subs w4, w4, #4 st1 {v3.s}[0], [x0], x1 st1 {v3.s}[1], [x6], x1 b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x6], x1 subs w4, w4, #4 st1 {v4.8b}, [x0], x1 st1 {v5.8b}, [x6], x1 b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw mov w9, w3 1: ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 mul v2.8h, v2.8h, v1.8h // diff = ac * alpha mul v3.8h, v3.8h, v1.8h mul v4.8h, v4.8h, v1.8h mul v5.8h, v5.8h, v1.8h cmlt v16.8h, v2.8h, #0 // sign cmlt v17.8h, v3.8h, #0 cmlt v18.8h, v4.8h, #0 cmlt v19.8h, v5.8h, #0 add v2.8h, v2.8h, v16.8h // diff + sign add v3.8h, v3.8h, v17.8h add v4.8h, v4.8h, v18.8h add v5.8h, v5.8h, v19.8h srshr v2.8h, v2.8h, #6 // (diff + sign + 32) >> 6 = apply_sign() srshr v3.8h, v3.8h, #6 srshr v4.8h, v4.8h, #6 srshr v5.8h, v5.8h, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h add v4.8h, v4.8h, v0.8h add v5.8h, v5.8h, v0.8h sqxtun v2.8b, v2.8h // iclip_pixel(dc + apply_sign()) sqxtun v3.8b, v3.8h sqxtun v4.8b, v4.8h sqxtun v5.8b, v5.8h subs w3, w3, #16 st1 {v2.8b, v3.8b}, [x0], #16 st1 {v4.8b, v5.8b}, [x6], #16 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_top_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] dup v1.8h, w6 // alpha add x2, x2, #1 sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 4: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] b 
L(ipred_cfl_splat_w16) L(ipred_cfl_top_tbl): .hword L(ipred_cfl_top_tbl) - 32b .hword L(ipred_cfl_top_tbl) - 16b .hword L(ipred_cfl_top_tbl) - 8b .hword L(ipred_cfl_top_tbl) - 4b endfunc // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w9, w3 clz w8, w4 adr x10, L(ipred_cfl_splat_tbl) adr x7, L(ipred_cfl_left_tbl) sub w9, w9, #26 sub w8, w8, #26 ldrh w9, [x10, w9, uxtw #1] ldrh w8, [x7, w8, uxtw #1] dup v1.8h, w6 // alpha sub x9, x10, w9, uxtw sub x7, x7, w8, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_left_h4): AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2] uaddlv h0, v0.8b urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2] uaddlv h0, v0.16b urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] br x9 L(ipred_cfl_left_tbl): .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha); function ipred_cfl_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w8, w3, w4 // width + height dup v1.8h, w6 // alpha clz w9, w3 clz w6, w4 dup v16.8h, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] ldrh w6, [x7, w6, uxtw #1] neg w8, w8 // -ctz(width + height) sub x9, x7, w9, uxtw sub x7, x7, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w8 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x7 L(ipred_cfl_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], #4 ins v0.s[1], wzr add x2, x2, #1 uaddlv h0, v0.8b br x9 L(ipred_cfl_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.s}[0], [x2] ins v2.s[1], wzr add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b cmp w4, #4 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x3334/2) movk w16, #(0x5556/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], #8 uaddlv h0, v0.8b add x2, x2, #1 br x9 L(ipred_cfl_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.8b cmp w4, #8 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x2], #16 uaddlv h0, v0.16b add x2, x2, #1 br x9 L(ipred_cfl_w16): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b cmp w4, #16 add v0.4h, v0.4h, v2.4h ushl v0.4h, v0.4h, v17.4h 
b.eq 1f // h = 4/8/32 cmp w4, #4 mov w16, #(0x3334/2) mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2], #32 uaddlv h2, v2.16b uaddlv h3, v3.16b add x2, x2, #1 add v0.4h, v2.4h, v3.4h br x9 L(ipred_cfl_w32): AARCH64_VALID_JUMP_TARGET ld1 {v2.16b, v3.16b}, [x2] add v0.4h, v0.4h, v16.4h uaddlv h2, v2.16b uaddlv h3, v3.16b cmp w4, #32 add v0.4h, v0.4h, v2.4h add v0.4h, v0.4h, v3.4h ushl v0.4h, v0.4h, v17.4h b.eq 1f // h = 8/16 mov w16, #(0x5556/2) movk w16, #(0x3334/2), lsl #16 add w17, w4, w4 // w17 = 2*h = 16 or 32 lsr w16, w16, w17 dup v16.4h, w16 sqdmulh v0.4h, v0.4h, v16.4h 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_tbl): .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v0.d}[1], [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 add v16.8h, v16.8h, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, v0.2d, v0.2d L(ipred_cfl_ac_420_w4_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h b.gt 2b 3: // Aggregate the sums add v0.8h, v16.8h, v17.8h uaddlv s0, v0.8h // sum sub x0, x0, w6, uxtw #3 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b add v0.8h, v0.8h, v1.8h add v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v0.d}[1], [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, 
v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 add v16.4h, v16.4h, v0.4h add v17.4h, v17.4h, v1.4h add v18.4h, v18.4h, v2.4h add v19.4h, v19.4h, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 add v18.8h, v18.8h, v0.8h add v19.8h, v19.8h, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w8_calc_subtract_dc): // Aggregate the sums add v0.8h, v16.8h, v17.8h add v2.8h, v18.8h, v19.8h uaddlp v0.4s, v0.8h uaddlp v2.4s, v2.8h add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] L(ipred_cfl_ac_420_w8_subtract_dc): 6: // Subtract dc from ac ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h sub v2.8h, v2.8h, v4.8h sub v3.8h, v3.8h, v4.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 6b ret L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b ld1 {v4.16b, v5.16b}, [x1], x2 uaddlp v1.8h, v1.16b ld1 {v6.16b, v7.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b uaddlp v4.8h, v4.16b uaddlp v5.8h, v5.16b uaddlp v6.8h, v6.16b uaddlp v7.8h, v7.16b add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h add v4.8h, v4.8h, v6.8h add v5.8h, v5.8h, v7.8h shl v0.8h, v0.8h, #1 shl v1.8h, v1.8h, #1 shl v2.8h, v4.8h, #1 shl v3.8h, v5.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b ldr d5, [x1, #16] uaddlp v0.8h, v0.16b ld1 {v4.16b}, [x1], x2 uaddlp v3.4h, v3.8b ldr d7, [x10, #16] uaddlp v2.8h, v2.16b ld1 {v6.16b}, [x10], x2 uaddlp v5.4h, v5.8b uaddlp v4.8h, v4.16b uaddlp v7.4h, v7.8b uaddlp v6.8h, v6.16b add v1.4h, v1.4h, v3.4h add v0.8h, v0.8h, v2.8h add v5.4h, v5.4h, v7.4h add v4.8h, v4.8h, v6.8h shl v1.4h, v1.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v5.4h, #1 shl v2.8h, v4.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v6.16b}, [x10], x2 uaddlp v2.8h, v2.16b uaddlp v4.8h, v4.16b uaddlp v6.8h, v6.16b add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add 
v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 uaddlp v0.4h, v0.8b ld1 {v6.8b}, [x10], x2 uaddlp v2.4h, v2.8b uaddlp v4.4h, v4.8b uaddlp v6.4h, v6.8b add v0.4h, v0.4h, v2.4h add v4.4h, v4.4h, v6.4h shl v0.4h, v0.4h, #1 shl v2.4h, v4.4h, #1 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 2b 3: // Double the height and reuse the w8 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) .hword 0 L(ipred_cfl_ac_420_w16_tbl): .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.8b}, [x1], x2 ld1 {v1.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x10], x2 ld1 {v2.16b}, [x1], x2 uaddlp v0.8h, v0.16b ld1 {v3.16b}, [x10], x2 uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and 
subsample input, padding 4 ld1 {v0.8b}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v2.8b}, [x1], x2 ld1 {v2.d}[1], [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.16b, v1.16b}, [x1], x2 ld1 {v2.16b, v3.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v1.8h, v1.16b uaddlp v2.8h, v2.16b uaddlp v3.8h, v3.16b shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 shl v2.8h, v2.8h, #2 shl v3.8h, v3.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr d1, [x1, #16] ld1 {v0.16b}, [x1], x2 ldr d3, [x10, #16] ld1 {v2.16b}, [x10], x2 uaddlp v1.4h, v1.8b uaddlp v0.8h, v0.16b uaddlp v3.4h, v3.8b uaddlp v2.8h, v2.16b shl v1.4h, v1.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v3.4h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 uaddlp v0.8h, v0.16b uaddlp v2.8h, v2.16b shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 uaddlp v0.4h, v0.8b uaddlp v2.4h, v2.8b shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_tbl): .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) .hword 0 L(ipred_cfl_ac_422_w16_tbl): .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - 
L(ipred_cfl_ac_422_w16_wpad3) endfunc // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_444_tbl) sub w8, w8, #26 ldrh w8, [x7, w8, uxtw #1] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.s}[0], [x1], x2 ld1 {v0.s}[1], [x10], x2 ld1 {v1.s}[0], [x1], x2 ld1 {v1.s}[1], [x10], x2 ushll v0.8h, v0.8b, #3 ushll v1.8h, v1.8b, #3 subs w8, w8, #4 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x10], x2 ld1 {v2.8b}, [x1], x2 ushll v0.8h, v0.8b, #3 ld1 {v3.8b}, [x10], x2 ushll v1.8h, v1.8b, #3 ushll v2.8h, v2.8b, #3 ushll v3.8h, v3.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.16b}, [x1], x2 ld1 {v2.16b}, [x10], x2 ld1 {v4.16b}, [x1], x2 ushll2 v1.8h, v0.16b, #3 ushll v0.8h, v0.8b, #3 ld1 {v6.16b}, [x10], x2 ushll2 v3.8h, v2.16b, #3 ushll v2.8h, v2.8b, #3 ushll2 v5.8h, v4.16b, #3 ushll v4.8h, v4.8b, #3 ushll2 v7.8h, v6.16b, #3 ushll v6.8h, v6.8b, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8b}, [x1], x2 ld1 {v2.8b}, [x10], x2 ld1 {v4.8b}, [x1], x2 ld1 {v6.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v2.8h, v2.8b, #3 ushll v4.8h, v4.8b, #3 ushll v6.8h, v6.8b, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] dup v5.8h, v4.h[7] dup v7.8h, v6.h[7] subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b mov v0.16b, v6.16b mov v1.16b, v7.16b mov v2.16b, v6.16b mov v3.16b, v7.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v2.16b, v3.16b}, [x1], x2 ld1 {v6.16b, v7.16b}, [x10], x2 ushll v0.8h, v2.8b, #3 ushll2 v1.8h, 
v2.16b, #3 ushll v2.8h, v3.8b, #3 ushll2 v3.8h, v3.16b, #3 ushll v4.8h, v6.8b, #3 ushll2 v5.8h, v6.16b, #3 ushll v6.8h, v7.8b, #3 ushll2 v7.8h, v7.16b, #3 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ldr d2, [x1, #16] ld1 {v1.16b}, [x1], x2 ldr d6, [x10, #16] ld1 {v5.16b}, [x10], x2 ushll v2.8h, v2.8b, #3 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v6.8h, v6.8b, #3 ushll v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v3.8h, v2.h[7] dup v7.8h, v6.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v1.16b}, [x1], x2 ld1 {v5.16b}, [x10], x2 ushll v0.8h, v1.8b, #3 ushll2 v1.8h, v1.16b, #3 ushll v4.8h, v5.8b, #3 ushll2 v5.8h, v5.16b, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] dup v6.8h, v5.h[7] dup v7.8h, v5.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8b}, [x1], x2 ld1 {v4.8b}, [x10], x2 ushll v0.8h, v0.8b, #3 ushll v4.8h, v4.8b, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] dup v5.8h, v4.h[7] dup v6.8h, v4.h[7] dup v7.8h, v4.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v16.8h, v16.8h, v0.8h add v17.8h, v17.8h, v1.8h add v18.8h, v18.8h, v2.8h add v19.8h, v19.8h, v3.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 add v16.8h, v16.8h, v4.8h add v17.8h, v17.8h, v5.8h add v18.8h, v18.8h, v6.8h add v19.8h, v19.8h, v7.8h b.gt 2b 3: // Quadruple the height and reuse the w8 subtracting lsl w6, w6, #2 // Aggregate the sums, with wider intermediates earlier than in // ipred_cfl_ac_420_w8_calc_subtract_dc. 
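// In C terms, the tail below is roughly:
//   sum = horizontal_sum(v16..v19);              // widened to 32 bit first
//   dc  = (sum + (1 << (log2sz - 1))) >> log2sz; // rounded average
//   for each stored coefficient: ac[i] -= dc;    // shared subtract_dc loop
// (v31 holds -log2sz, so the urshl performs the rounded right shift.)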
uaddlp v0.4s, v16.8h uaddlp v1.4s, v17.8h uaddlp v2.4s, v18.8h uaddlp v3.4s, v19.8h add v0.4s, v0.4s, v1.4s add v2.4s, v2.4s, v3.4s add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #4 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] b L(ipred_cfl_ac_420_w8_subtract_dc) L(ipred_cfl_ac_444_tbl): .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) L(ipred_cfl_ac_444_w32_tbl): .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc rav1e-0.7.1/src/arm/64/ipred16.S000064400000000000000000007170621046102023000140610ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height, // const int bitdepth_max); function ipred_dc_128_16bpc_neon, export=1 ldr w8, [sp] clz w3, w3 adr x5, L(ipred_dc_128_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] dup v0.8h, w8 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 urshr v0.8h, v0.8h, #1 br x5 4: AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret L(ipred_dc_128_tbl): .hword L(ipred_dc_128_tbl) - 640b .hword L(ipred_dc_128_tbl) - 320b .hword L(ipred_dc_128_tbl) - 160b .hword L(ipred_dc_128_tbl) - 8b .hword L(ipred_dc_128_tbl) - 4b endfunc // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_v_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_v_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #2 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 sub x1, x1, #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x6], #64 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 b.gt 64b ret L(ipred_v_tbl): .hword L(ipred_v_tbl) - 640b .hword L(ipred_v_tbl) - 320b .hword L(ipred_v_tbl) - 160b .hword L(ipred_v_tbl) - 80b .hword L(ipred_v_tbl) - 40b endfunc // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_h_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_h_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] sub x2, x2, #8 sub x5, x5, w3, uxtw mov x7, #-8 add x6, x0, x1 lsl x1, x1, #1 br x5 4: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.4h}, [x0], x1 st1 {v2.4h}, [x6], x1 subs w4, w4, #4 st1 {v1.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 str q3, [x0, #16] str q2, [x6, #16] stp q3, q3, [x0, #32] stp q2, q2, [x6, #32] stp q3, q3, [x0, #64] stp q2, q2, [x6, #64] stp q3, q3, [x0, #96] stp q2, q2, [x6, #96] st1 {v3.8h}, [x0], x1 st1 {v2.8h}, [x6], x1 subs w4, w4, #4 str q1, [x0, #16] str q0, [x6, #16] stp q1, q1, [x0, #32] stp q0, q0, [x6, #32] stp q1, q1, [x0, #64] stp q0, q0, [x6, #64] stp q1, q1, [x0, #96] stp q0, q0, [x6, #96] st1 {v1.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 64b ret L(ipred_h_tbl): .hword L(ipred_h_tbl) - 64b .hword L(ipred_h_tbl) - 32b .hword L(ipred_h_tbl) - 16b .hword L(ipred_h_tbl) - 8b .hword L(ipred_h_tbl) - 4b endfunc // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_top_16bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_top_tbl) sub w3, w3, #25 ldrh w3, [x5, w3, uxtw #1] add x2, x2, #2 sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.4h, v0.h[0] 4: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] 8: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] 16: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, 
v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 32: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 64: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 64b ret L(ipred_dc_top_tbl): .hword L(ipred_dc_top_tbl) - 640b .hword L(ipred_dc_top_tbl) - 320b .hword L(ipred_dc_top_tbl) - 160b .hword L(ipred_dc_top_tbl) - 80b .hword L(ipred_dc_top_tbl) - 40b endfunc // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_left_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 clz w3, w3 clz w7, w4 adr x5, L(ipred_dc_left_tbl) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 ldrh w3, [x5, w3, uxtw #1] ldrh w7, [x5, w7, uxtw #1] sub x3, x5, w3, uxtw sub x5, x5, w7, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w4): AARCH64_VALID_JUMP_TARGET st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt L(ipred_dc_left_w4) ret L(ipred_dc_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x3 L(ipred_dc_left_w8): AARCH64_VALID_JUMP_TARGET st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt L(ipred_dc_left_w8) ret L(ipred_dc_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2] addp v0.8h, v0.8h, v1.8h addv h0, v0.8h urshr v2.4h, v0.4h, #4 dup v0.8h, v2.h[0] dup v1.8h, v2.h[0] br x3 L(ipred_dc_left_w16): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b 1: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h uaddlp v0.4s, v0.8h addv s0, v0.4s rshrn v4.4h, v0.4s, #5 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w32): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_h64): AARCH64_VALID_JUMP_TARGET ld1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h uaddlv s0, v0.8h rshrn v4.4h, v0.4s, #6 dup v0.8h, v4.h[0] br x3 L(ipred_dc_left_w64): AARCH64_VALID_JUMP_TARGET mov v1.16b, v0.16b mov v2.16b, v0.16b mov v3.16b, v0.16b sub x1, x1, #64 1: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 1b ret L(ipred_dc_left_tbl): .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_dc_16bpc_neon, export=1 sub x2, x2, w4, uxtw #1 add w7, w3, w4 // width + height clz w3, w3 clz w6, w4 dup v16.4s, w7 // width + height adr x5, L(ipred_dc_tbl) rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) ldrh w3, [x5, w3, uxtw #1] ldrh w6, [x5, w6, uxtw #1] neg w7, w7 // -ctz(width + height) sub x3, x5, w3, uxtw sub x5, x5, w6, uxtw ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w7 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 br x5 L(ipred_dc_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x3 L(ipred_dc_w4): AARCH64_VALID_JUMP_TARGET ld1 {v1.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.4h cmp w4, #4 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.4h, v0.h[0] 2: st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 subs w4, w4, #4 st1 {v0.4h}, [x0], x1 st1 {v0.4h}, [x6], x1 b.gt 2b ret L(ipred_dc_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x3 L(ipred_dc_w8): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s1, v1.8h cmp w4, #8 add v0.2s, v0.2s, v1.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] 2: st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h}, [x0], x1 st1 {v0.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h16): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h}, [x2], #32 addp v0.8h, v0.8h, v1.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w16): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h uaddlv s1, v1.8h cmp w4, #16 add v0.2s, v0.2s, v1.2s ushl 
v4.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32/64 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] 2: st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], x1 st1 {v0.8h, v1.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h32): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v2.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w32): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h addp v3.8h, v3.8h, v4.8h addp v1.8h, v1.8h, v3.8h uaddlv s1, v1.8h cmp w4, #32 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 8/16/64 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_h64): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 addp v0.8h, v0.8h, v1.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x3 L(ipred_dc_w64): AARCH64_VALID_JUMP_TARGET ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 add v0.2s, v0.2s, v16.2s addp v1.8h, v1.8h, v2.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] addp v3.8h, v3.8h, v4.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h addp v1.8h, v1.8h, v3.8h addp v20.8h, v20.8h, v22.8h addp v1.8h, v1.8h, v20.8h uaddlv s1, v1.8h cmp w4, #64 add v0.2s, v0.2s, v1.2s ushl v4.2s, v0.2s, v17.2s b.eq 1f // h = 16/32 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v4.2s, v4.2s, v16.2s ushr v4.2s, v4.2s, #17 1: sub x1, x1, #64 dup v0.8h, v4.h[0] dup v1.8h, v4.h[0] dup v2.8h, v4.h[0] dup v3.8h, v4.h[0] 2: st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 b.gt 2b ret L(ipred_dc_tbl): .hword L(ipred_dc_tbl) - L(ipred_dc_h64) .hword L(ipred_dc_tbl) - L(ipred_dc_h32) .hword L(ipred_dc_tbl) - L(ipred_dc_h16) .hword L(ipred_dc_tbl) - L(ipred_dc_h8) .hword L(ipred_dc_tbl) - L(ipred_dc_h4) .hword L(ipred_dc_tbl) - L(ipred_dc_w64) .hword L(ipred_dc_tbl) - L(ipred_dc_w32) .hword L(ipred_dc_tbl) - L(ipred_dc_w16) .hword L(ipred_dc_tbl) - L(ipred_dc_w8) .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_paeth_16bpc_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x2] add x8, x2, #2 sub x2, x2, #8 sub x5, x5, w9, uxtw mov x7, #-8 add x6, x0, x1 
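// Reference for the compare/select chains below, in C terms (roughly):
//   base = left + top - topleft;
//   pred = whichever of (left, top, topleft) is closest to base,
//          with ties resolved in that order.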
lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v5.2d}, [x8] sub v6.8h, v5.8h, v4.8h // top - topleft 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 zip1 v0.2d, v0.2d, v1.2d zip1 v2.2d, v2.2d, v3.2d add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v2.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v4.8h, v16.8h // tldiff sabd v23.8h, v4.8h, v17.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v2.8h, v17.8h umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) umin v19.8h, v21.8h, v23.8h cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v23.8h, v21.8h cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v19.8h, v17.8h bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v20.16b, v5.16b, v4.16b bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... bit v20.16b, v0.16b, v16.16b st1 {v21.d}[1], [x0], x1 st1 {v21.d}[0], [x6], x1 subs w4, w4, #4 st1 {v20.d}[1], [x0], x1 st1 {v20.d}[0], [x6], x1 b.gt 4b ret 80: 160: 320: 640: AARCH64_VALID_JUMP_TARGET ld1 {v5.8h}, [x8], #16 mov w9, w3 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 2: sub v6.8h, v5.8h, v4.8h // top - topleft add v16.8h, v6.8h, v0.8h // base add v17.8h, v6.8h, v1.8h add v18.8h, v6.8h, v2.8h add v19.8h, v6.8h, v3.8h sabd v20.8h, v5.8h, v16.8h // tdiff sabd v21.8h, v5.8h, v17.8h sabd v22.8h, v5.8h, v18.8h sabd v23.8h, v5.8h, v19.8h sabd v24.8h, v4.8h, v16.8h // tldiff sabd v25.8h, v4.8h, v17.8h sabd v26.8h, v4.8h, v18.8h sabd v27.8h, v4.8h, v19.8h sabd v16.8h, v0.8h, v16.8h // ldiff sabd v17.8h, v1.8h, v17.8h sabd v18.8h, v2.8h, v18.8h sabd v19.8h, v3.8h, v19.8h umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) umin v29.8h, v21.8h, v25.8h umin v30.8h, v22.8h, v26.8h umin v31.8h, v23.8h, v27.8h cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff cmge v21.8h, v25.8h, v21.8h cmge v22.8h, v26.8h, v22.8h cmge v23.8h, v27.8h, v23.8h cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff cmge v17.8h, v29.8h, v17.8h cmge v18.8h, v30.8h, v18.8h cmge v19.8h, v31.8h, v19.8h bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft bsl v22.16b, v5.16b, v4.16b bsl v21.16b, v5.16b, v4.16b bsl v20.16b, v5.16b, v4.16b bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
bit v22.16b, v2.16b, v18.16b bit v21.16b, v1.16b, v17.16b bit v20.16b, v0.16b, v16.16b st1 {v23.8h}, [x0], #16 st1 {v22.8h}, [x6], #16 subs w3, w3, #8 st1 {v21.8h}, [x5], #16 st1 {v20.8h}, [x10], #16 b.le 8f ld1 {v5.8h}, [x8], #16 b 2b 8: subs w4, w4, #4 b.le 9f // End of horizontal loop, move pointers to next four rows sub x8, x8, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 // Load the top row as early as possible ld1 {v5.8h}, [x8], #16 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 640b .hword L(ipred_paeth_tbl) - 320b .hword L(ipred_paeth_tbl) - 160b .hword L(ipred_paeth_tbl) - 80b .hword L(ipred_paeth_tbl) - 40b endfunc // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_16bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_tbl) sub x12, x2, w4, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x12] // bottom add x8, x2, #2 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x8] // top ld1r {v7.2s}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[3] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 zip1 v1.2d, v1.2d, v0.2d // left, flipped zip1 v0.2d, v3.2d, v2.2d zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v18.8h, v18.8b smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor smlal2 v21.4s, v0.8h, v7.8h smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v18.4h smlal2 v23.4s, v6.8h, v18.8h rshrn v20.4h, v20.4s, #9 rshrn v21.4h, v21.4s, #9 rshrn v22.4h, v22.4s, #9 rshrn v23.4h, v23.4s, #9 st1 {v20.4h}, [x0], x1 st1 {v21.4h}, [x6], x1 subs w4, w4, #4 st1 {v22.4h}, [x0], x1 st1 {v23.4h}, [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x8] // top ld1 {v7.8b}, [x10] // weights_hor sub x2, x2, #8 mov x7, #-8 dup v5.8h, v6.h[7] // right sub v6.8h, v6.8h, v4.8h // top-bottom uxtl v7.8h, v7.8b // weights_hor add v31.4h, v4.4h, v5.4h // bottom+right 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v19.8h, v19.8b smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor smlal2 v21.4s, v3.8h, v7.8h // (left flipped) smlal v22.4s, v2.4h, v7.4h smlal2 v23.4s, v2.8h, v7.8h smlal v24.4s, v1.4h, v7.4h smlal2 v25.4s, v1.8h, v7.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, 
v6.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v6.8h, v16.8h smlal v22.4s, v6.4h, v17.4h smlal2 v23.4s, v6.8h, v17.8h smlal v24.4s, v6.4h, v18.4h smlal2 v25.4s, v6.8h, v18.8h smlal v26.4s, v6.4h, v19.4h smlal2 v27.4s, v6.8h, v19.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, v27.4s, #9 st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET add x12, x2, w3, uxtw #1 sub x1, x1, w3, uxtw #1 ld1r {v5.8h}, [x12] // right sub x2, x2, #4 mov x7, #-4 mov w9, w3 add v31.4h, v4.4h, v5.4h // bottom+right 1: ld2r {v0.8h, v1.8h}, [x2], x7 // left ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h uxtl v16.8h, v16.8b // weights_ver uxtl v17.8h, v17.8b 2: ld1 {v7.16b}, [x10], #16 // weights_hor ld1 {v2.8h, v3.8h}, [x8], #32 // top ushll v20.4s, v31.4h, #8 // (bottom+right)*256 ushll v21.4s, v31.4h, #8 ushll v22.4s, v31.4h, #8 ushll v23.4s, v31.4h, #8 ushll v24.4s, v31.4h, #8 ushll v25.4s, v31.4h, #8 ushll v26.4s, v31.4h, #8 ushll v27.4s, v31.4h, #8 uxtl v6.8h, v7.8b // weights_hor uxtl2 v7.8h, v7.16b sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor smlal2 v21.4s, v1.8h, v6.8h // (left flipped) smlal v22.4s, v1.4h, v7.4h smlal2 v23.4s, v1.8h, v7.8h smlal v24.4s, v0.4h, v6.4h smlal2 v25.4s, v0.8h, v6.8h smlal v26.4s, v0.4h, v7.4h smlal2 v27.4s, v0.8h, v7.8h smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver smlal2 v21.4s, v2.8h, v16.8h smlal v22.4s, v3.4h, v16.4h smlal2 v23.4s, v3.8h, v16.8h smlal v24.4s, v2.4h, v17.4h smlal2 v25.4s, v2.8h, v17.8h smlal v26.4s, v3.4h, v17.4h smlal2 v27.4s, v3.8h, v17.8h rshrn v20.4h, v20.4s, #9 rshrn2 v20.8h, v21.4s, #9 rshrn v21.4h, v22.4s, #9 rshrn2 v21.8h, v23.4s, #9 rshrn v22.4h, v24.4s, #9 rshrn2 v22.8h, v25.4s, #9 rshrn v23.4h, v26.4s, #9 rshrn2 v23.8h, v27.4s, #9 subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x8, w9, uxtw #1 sub x10, x10, w9, uxtw add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 640b .hword L(ipred_smooth_tbl) - 320b .hword L(ipred_smooth_tbl) - 160b .hword L(ipred_smooth_tbl) - 80b .hword L(ipred_smooth_tbl) - 40b endfunc // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_v_16bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 adr x5, L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v4.8h}, [x8] // bottom add x2, x2, #2 sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v6.2d}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 4: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver zip1 v16.2s, v16.2s, v17.2s // weights_ver zip1 v18.2s, v18.2s, v19.2s ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v18.8h, v18.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v18.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 
st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v6.8h}, [x2] // top sub v6.8h, v6.8h, v4.8h // top-bottom 8: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v6.8h, v17.8h sqrdmulh v22.8h, v6.8h, v18.8h sqrdmulh v23.8h, v6.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET // Set up pointers for four rows in parallel; x0, x6, x5, x8 add x5, x0, x1 add x8, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver ushll v16.8h, v16.8b, #7 // weights_ver << 7 ushll v17.8h, v17.8b, #7 ushll v18.8h, v18.8b, #7 ushll v19.8h, v19.8b, #7 2: ld1 {v2.8h, v3.8h}, [x2], #32 // top sub v2.8h, v2.8h, v4.8h // top-bottom sub v3.8h, v3.8h, v4.8h sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 sqrdmulh v21.8h, v3.8h, v16.8h sqrdmulh v22.8h, v2.8h, v17.8h sqrdmulh v23.8h, v3.8h, v17.8h sqrdmulh v24.8h, v2.8h, v18.8h sqrdmulh v25.8h, v3.8h, v18.8h sqrdmulh v26.8h, v2.8h, v19.8h sqrdmulh v27.8h, v3.8h, v19.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v4.8h add v22.8h, v22.8h, v4.8h add v23.8h, v23.8h, v4.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v4.8h add v26.8h, v26.8h, v4.8h add v27.8h, v27.8h, v4.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x8], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x2, x2, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x8, x8, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 640b .hword L(ipred_smooth_v_tbl) - 320b .hword L(ipred_smooth_v_tbl) - 160b .hword L(ipred_smooth_v_tbl) - 80b .hword L(ipred_smooth_v_tbl) - 40b endfunc // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int a, // const int max_width, const int max_height); function ipred_smooth_h_16bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 adr x5, L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw #1 sub w9, w9, #25 ldrh w9, [x5, w9, uxtw #1] ld1r {v5.8h}, [x12] // right sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 40: AARCH64_VALID_JUMP_TARGET ld1r {v7.2s}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 4: ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left zip1 v1.2d, v1.2d, v0.2d // left, flipped zip1 v0.2d, v3.2d, v2.2d sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v1.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h st1 {v20.d}[0], [x0], x1 st1 {v20.d}[1], [x6], x1 subs w4, w4, #4 st1 {v21.d}[0], [x0], x1 st1 {v21.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 {v7.8b}, [x8] // weights_hor sub x2, x2, #8 mov x7, #-8 ushll v7.8h, v7.8b, #7 // weights_hor << 7 8: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v3.8h, v3.8h, v5.8h // left-right sub v2.8h, v2.8h, v5.8h sub v1.8h, v1.8h, v5.8h sub 
v0.8h, v0.8h, v5.8h sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v1.8h, v7.8h sqrdmulh v23.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h st1 {v20.8h}, [x0], x1 st1 {v21.8h}, [x6], x1 subs w4, w4, #4 st1 {v22.8h}, [x0], x1 st1 {v23.8h}, [x6], x1 b.gt 8b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET sub x2, x2, #8 mov x7, #-8 // Set up pointers for four rows in parallel; x0, x6, x5, x10 add x5, x0, x1 add x10, x6, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left sub v0.8h, v0.8h, v5.8h // left-right sub v1.8h, v1.8h, v5.8h sub v2.8h, v2.8h, v5.8h sub v3.8h, v3.8h, v5.8h 2: ld1 {v7.16b}, [x8], #16 // weights_hor ushll v6.8h, v7.8b, #7 // weights_hor << 7 ushll2 v7.8h, v7.16b, #7 sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) sqrdmulh v22.8h, v2.8h, v6.8h sqrdmulh v23.8h, v2.8h, v7.8h sqrdmulh v24.8h, v1.8h, v6.8h sqrdmulh v25.8h, v1.8h, v7.8h sqrdmulh v26.8h, v0.8h, v6.8h sqrdmulh v27.8h, v0.8h, v7.8h add v20.8h, v20.8h, v5.8h add v21.8h, v21.8h, v5.8h add v22.8h, v22.8h, v5.8h add v23.8h, v23.8h, v5.8h add v24.8h, v24.8h, v5.8h add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v5.8h add v27.8h, v27.8h, v5.8h subs w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 st1 {v22.8h, v23.8h}, [x6], #32 st1 {v24.8h, v25.8h}, [x5], #32 st1 {v26.8h, v27.8h}, [x10], #32 b.gt 2b subs w4, w4, #4 b.le 9f sub x8, x8, w9, uxtw add x0, x0, x1 add x6, x6, x1 add x5, x5, x1 add x10, x10, x1 mov w3, w9 b 1b 9: ret L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 640b .hword L(ipred_smooth_h_tbl) - 320b .hword L(ipred_smooth_h_tbl) - 160b .hword L(ipred_smooth_h_tbl) - 80b .hword L(ipred_smooth_h_tbl) - 40b endfunc const padding_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 padding_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, // const pixel *const in, const int end, // const int bitdepth_max); function ipred_z1_upsample_edge_16bpc_neon, export=1 dup v30.8h, w4 // bitdepth_max movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w3, uxtw #1 // in[end] sub x4, x4, w3, uxtw #1 ld1r {v2.8h}, [x5] // padding ld1 {v3.8h, v4.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v3.16b // padded in[] bit v1.16b, v2.16b, v4.16b ext v4.16b, v0.16b, v1.16b, #2 ext v5.16b, v1.16b, v2.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 ext v7.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] add v19.8h, v5.8h, v7.8h add v20.8h, v0.8h, v16.8h add v21.8h, v1.8h, v17.8h umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v23.4s, v18.8h, v31.8h umull v24.4s, v19.4h, v31.4h umull2 v25.4s, v19.8h, v31.8h usubw v22.4s, 
v22.4s, v20.4h usubw2 v23.4s, v23.4s, v20.8h usubw v24.4s, v24.4s, v21.4h usubw2 v25.4s, v25.4s, v21.8h sqrshrun v16.4h, v22.4s, #4 sqrshrun2 v16.8h, v23.4s, #4 sqrshrun v17.4h, v24.4s, #4 sqrshrun2 v17.8h, v25.4s, #4 smin v16.8h, v16.8h, v30.8h smin v17.8h, v17.8h, v30.8h zip1 v0.8h, v4.8h, v16.8h zip2 v1.8h, v4.8h, v16.8h zip1 v2.8h, v5.8h, v17.8h zip2 v3.8h, v5.8h, v17.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] ret endfunc // void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, // const int bitdepth_max); function ipred_z2_upsample_edge_16bpc_neon, export=1 dup v30.8h, w3 // bitdepth_max // Here, sz is 4 or 8, and we produce 2*sz+1 output elements. movrel x4, padding_mask ld1 {v0.8h, v1.8h}, [x2] // in[] add x5, x2, w1, uxtw #1 // in[sz] sub x4, x4, w1, uxtw #1 ld1r {v3.8h}, [x2] // in[0] for padding ld1r {v2.8h}, [x5] // padding ld1 {v4.8h, v5.8h}, [x4] // padding_mask movi v31.8h, #9 bit v0.16b, v2.16b, v4.16b // padded in[] bit v1.16b, v2.16b, v5.16b ext v4.16b, v3.16b, v0.16b, #14 ext v5.16b, v0.16b, v1.16b, #2 ext v6.16b, v0.16b, v1.16b, #4 add v16.8h, v0.8h, v5.8h // in[i+0] + in[i+1] add v17.8h, v4.8h, v6.8h // in[i-1] + in[i+2] umull v18.4s, v16.4h, v31.4h // 9*(in[i+1] + in[i+2]) umull2 v19.4s, v16.8h, v31.8h usubw v18.4s, v18.4s, v17.4h usubw2 v19.4s, v19.4s, v17.8h sqrshrun v16.4h, v18.4s, #4 sqrshrun2 v16.8h, v19.4s, #4 add x5, x0, #2*16 smin v16.8h, v16.8h, v30.8h zip1 v4.8h, v0.8h, v16.8h zip2 v5.8h, v0.8h, v16.8h st1 {v2.h}[0], [x5] // In case sz=8, output one single pixel in out[16]. st1 {v4.8h, v5.8h}, [x0] ret endfunc const edge_filter .short 0, 4, 8, 0 .short 0, 5, 6, 0 // Leaving out the coeffs for strength=3 // .byte 2, 4, 4, 0 endconst // void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, // const pixel *const in, const int end, // const int strength); function ipred_z1_filter_edge_16bpc_neon, export=1 cmp w4, #3 b.eq L(fivetap) // if (strength == 3) goto fivetap movrel x5, edge_filter, -6 add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) ld1 {v31.s}[0], [x5] // kernel[1-2] ld1 {v0.8h}, [x2], #16 dup v30.8h, v31.h[0] dup v31.8h, v31.h[1] 1: // in[end], is the last valid pixel. We produce 16 pixels out by // using 18 pixels in - the last pixel used is [17] of the ones // read/buffered. 
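// In C terms, each output pixel of this 3-tap path is roughly
//   out[i] = (k0*p0 + k1*p1 + k0*p2 + 8) >> 4
// over three consecutive input pixels, with (k0, k1) loaded above from
// edge_filter: (4, 8) for strength=1 and (5, 6) for strength=2.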
cmp w3, #17 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 sub w3, w3, #16 st1 {v16.8h, v17.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask sub w6, w3, #24 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h}, [x5] // padding_mask ld1r {v2.8h}, [x6] bit v0.16b, v2.16b, v3.16b // Pad v0-v1 bit v1.16b, v2.16b, v4.16b // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 mul v16.8h, v0.8h, v30.8h mla v16.8h, v3.8h, v31.8h mla v16.8h, v5.8h, v30.8h mul v17.8h, v1.8h, v30.8h mla v17.8h, v4.8h, v31.8h mla v17.8h, v6.8h, v30.8h subs w1, w1, #16 urshr v16.8h, v16.8h, #4 urshr v17.8h, v17.8h, #4 st1 {v16.8h, v17.8h}, [x0], #32 b.le 9f 5: // After one block, any remaining output would only be filtering // padding - thus just store the padding. subs w1, w1, #16 st1 {v2.16b}, [x0], #16 b.gt 5b 9: ret L(fivetap): sub x2, x2, #2 // topleft -= 1 pixel movi v29.8h, #2 ld1 {v0.8h}, [x2], #16 movi v30.8h, #4 movi v31.8h, #4 ins v0.h[0], v0.h[1] 1: // in[end+1], is the last valid pixel. We produce 16 pixels out by // using 20 pixels in - the last pixel used is [19] of the ones // read/buffered. cmp w3, #18 ld1 {v1.8h, v2.8h}, [x2], #32 b.lt 2f // if (end + 1 < 19) ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, v29.8h subs w1, w1, #16 mov v0.16b, v2.16b urshr v20.8h, v20.8h, #4 urshr v21.8h, v21.8h, #4 sub w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 b.gt 1b ret 2: // Right padding // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) movrel x5, padding_mask, -2 sub w6, w3, #23 sub x5, x5, w3, uxtw #1 add x6, x2, w6, sxtw #1 ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask ld1r {v28.8h}, [x6] bit v0.16b, v28.16b, v3.16b // Pad v0-v2 bit v1.16b, v28.16b, v4.16b bit v2.16b, v28.16b, v5.16b 4: // Filter one block ext v3.16b, v0.16b, v1.16b, #2 ext v4.16b, v1.16b, v2.16b, #2 ext v5.16b, v0.16b, v1.16b, #4 ext v6.16b, v1.16b, v2.16b, #4 ext v16.16b, v0.16b, v1.16b, #6 ext v17.16b, v1.16b, v2.16b, #6 ext v18.16b, v0.16b, v1.16b, #8 ext v19.16b, v1.16b, v2.16b, #8 mul v20.8h, v0.8h, v29.8h mla v20.8h, v3.8h, v30.8h mla v20.8h, v5.8h, v31.8h mla v20.8h, v16.8h, v30.8h mla v20.8h, v18.8h, v29.8h mul v21.8h, v1.8h, v29.8h mla v21.8h, v4.8h, v30.8h mla v21.8h, v6.8h, v31.8h mla v21.8h, v17.8h, v30.8h mla v21.8h, v19.8h, v29.8h subs w1, w1, #16 mov v0.16b, v2.16b mov v1.16b, v28.16b mov v2.16b, v28.16b urshr v20.8h, v20.8h, #4 urshr v21.8h, v21.8h, #4 sub w3, w3, #16 st1 {v20.8h, v21.8h}, [x0], #32 b.le 9f // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to // filter properly once more - aka (w3 >= 0). 
cmp w3, #0 b.ge 4b 5: // When w3 <= 0, all remaining pixels in v0-v1 are equal to the // last valid pixel - thus just output that without filtering. subs w1, w1, #8 st1 {v28.8h}, [x0], #16 b.gt 5b 9: ret endfunc // void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, // const int n); function ipred_pixel_set_16bpc_neon, export=1 dup v0.8h, w1 1: subs w2, w2, #8 st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc // void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const int width, const int height, // const int dx, const int max_base_x); function ipred_z1_fill1_16bpc_neon, export=1 clz w9, w3 adr x8, L(ipred_z1_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw #1 // top[max_base_x] sub x8, x8, w9, uxtw ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] ext v3.16b, v2.16b, v2.16b, #2 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 st1 {v16.4h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.4h}, [x0], x1 b.gt 4b ret 49: st1 {v31.4h}, [x0], x1 subs w4, w4, #2 st1 {v31.4h}, [x0], x1 b.gt 49b ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h}, [x8] // top[base] ld1 {v2.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 ldr h1, [x8, #16] ldr h3, [x10, #16] dup v6.8h, w9 // 64 - frac dup v7.8h, w11 ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] ext v3.16b, v2.16b, v3.16b, #2 umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v1.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v3.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v3.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 st1 {v16.8h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8h}, [x0], x1 b.gt 8b ret 89: st1 {v31.8h}, [x0], x1 subs w4, w4, #2 st1 {v31.8h}, [x0], x1 b.gt 89b ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w3 add x13, x0, x1 lsl x1, x1, #1 sub x1, x1, w3, uxtw #1 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 169f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v6.8h, w9 // frac dup v7.8h, w11 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v16.8h, w9 // 64 - frac dup v17.8h, w11 add w7, w7, w5 // xpos += dx 2: ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] ext v19.16b, v1.16b, v2.16b, #2 ext v20.16b, v3.16b, v4.16b, #2 ext v21.16b, v4.16b, v5.16b, #2 subs w3, w3, #16 umull v22.4s, v0.4h, 
v16.4h // top[base]*(64-frac) umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac umull2 v23.4s, v0.8h, v16.8h umlal2 v23.4s, v18.8h, v6.8h umull v24.4s, v1.4h, v16.4h umlal v24.4s, v19.4h, v6.4h umull2 v25.4s, v1.8h, v16.8h umlal2 v25.4s, v19.8h, v6.8h umull v26.4s, v3.4h, v17.4h umlal v26.4s, v20.4h, v7.4h umull2 v27.4s, v3.8h, v17.8h umlal2 v27.4s, v20.8h, v7.8h umull v28.4s, v4.4h, v17.4h umlal v28.4s, v21.4h, v7.4h umull2 v29.4s, v4.8h, v17.8h umlal2 v29.4s, v21.8h, v7.8h rshrn v22.4h, v22.4s, #6 rshrn2 v22.8h, v23.4s, #6 rshrn v23.4h, v24.4s, #6 rshrn2 v23.8h, v25.4s, #6 rshrn v24.4h, v26.4s, #6 rshrn2 v24.8h, v27.4s, #6 rshrn v25.4h, v28.4s, #6 rshrn2 v25.8h, v29.4s, #6 st1 {v22.8h, v23.8h}, [x0], #32 st1 {v24.8h, v25.8h}, [x13], #32 b.le 3f mov v0.16b, v2.16b ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] mov v3.16b, v5.16b ld1 {v4.8h, v5.8h}, [x10], #32 b 2b 3: subs w4, w4, #2 b.le 9f add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 1b 9: ret 169: st1 {v31.8h}, [x0], #16 subs w3, w3, #8 st1 {v31.8h}, [x13], #16 b.gt 169b subs w4, w4, #2 b.le 9b add x0, x0, x1 add x13, x13, x1 mov w3, w12 b 169b L(ipred_z1_fill1_tbl): .hword L(ipred_z1_fill1_tbl) - 640b .hword L(ipred_z1_fill1_tbl) - 320b .hword L(ipred_z1_fill1_tbl) - 160b .hword L(ipred_z1_fill1_tbl) - 80b .hword L(ipred_z1_fill1_tbl) - 40b endfunc function ipred_z1_fill2_16bpc_neon, export=1 cmp w3, #8 add x10, x2, w6, uxtw // top[max_base_x] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 b.eq 8f 4: // w == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 49f lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] uzp1 v0.8h, v0.8h, v0.8h // top[base] uzp2 v3.8h, v2.8h, v2.8h uzp1 v2.8h, v2.8h, v2.8h sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 st1 {v16.4h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.4h}, [x0], x1 b.gt 4b ret 49: st1 {v31.4h}, [x0], x1 subs w4, w4, #2 st1 {v31.4h}, [x0], x1 b.gt 49b ret 8: // w == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge 89f add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h, v1.8h}, [x8] // top[base] ld1 {v2.8h, v3.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8h, w9 // 64 - frac dup v7.8h, w11 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] uzp1 v0.8h, v0.8h, v1.8h // top[base] uzp2 v21.8h, v2.8h, v3.8h uzp1 v2.8h, v2.8h, v3.8h umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v20.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v21.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v21.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 st1 {v16.8h}, [x0], x1 add w7, w7, w5 // xpos += dx subs w4, w4, #2 st1 {v17.8h}, [x0], x1 b.gt 8b ret 89: st1 {v31.8h}, [x0], x1 subs w4, w4, #2 st1 {v31.8h}, [x0], x1 b.gt 89b ret endfunc // void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, 
// const int n); function ipred_reverse_16bpc_neon, export=1 sub x1, x1, #16 add x3, x0, #8 mov x4, #16 1: ld1 {v0.8h}, [x1] subs w2, w2, #8 rev64 v0.8h, v0.8h sub x1, x1, #16 st1 {v0.d}[1], [x0], x4 st1 {v0.d}[0], [x3], x4 b.gt 1b ret endfunc const increments .short 0, 1, 2, 3, 4, 5, 6, 7 endconst // void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const pixel *const left, // const int width, const int height, // const int dx, const int dy); function ipred_z2_fill1_16bpc_neon, export=1 clz w10, w4 adr x9, L(ipred_z2_fill1_tbl) sub w10, w10, #25 ldrh w10, [x9, w10, uxtw #1] mov w8, #(1 << 6) // xpos = 1 << 6 sub x9, x9, w10, uxtw sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy br x9 40: AARCH64_VALID_JUMP_TARGET dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // Worst case height for w=4 is 16, but we need at least h+1 elements ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) tbl v18.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v29.16b, #4 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos // Cut corners here; only doing tbl over v0-v1 here; we only // seem to need the last pixel, from v2, after skipping to the // left-only codepath below. 
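// In C terms, each pixel produced below is roughly:
//   if (base_x >= 0)
//       out = (top[base_x]*(64 - frac_x)  + top[base_x + 1]*frac_x  + 32) >> 6;
//   else
//       out = (left[base_y]*(64 - frac_y) + left[base_y + 1]*frac_y + 32) >> 6;
// both interpolations are computed and the cmge/bit pair keeps the
// top-based result wherever base_x is non-negative.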
tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 4b 49: tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2] trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 49b 9: ret 80: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // Worst case height for w=8 is 32. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements movi v17.16b, #2 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
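        // tbx leaves destination lanes whose index is out of range unchanged,
        // so v18/v19/v20 are pre-seeded with v15 (left[0] == top[0]); lanes
        // whose base_y index falls outside the loaded left[] table therefore
        // fall back to that pixel without any explicit clamping.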
tbx v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] mov v19.16b, v15.16b // left[0] ld1 {v6.8h, v7.8h}, [x11] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] mov v20.16b, v15.16b // left[0] sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v21.8h, v21.8h, #0 cmge v22.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 8b 89: mov v19.16b, v15.16b mov v20.16b, v15.16b tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v19.4h, v28.4h umlal v6.4s, v20.4h, v27.4h umull2 v7.4s, v19.8h, v28.8h umlal2 v7.4s, v20.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET stp d8, d9, [sp, #-0x40]! 
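        // d8-d15 are callee-saved under AAPCS64; the wide path uses v8-v15 as
        // scratch, so they are spilled here and restored before returning.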
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v25.8h, w7 // -dy add x3, x3, #2 // Skip past left[0] add x13, x0, x1 // alternating row lsl x1, x1, #1 // stride *= 2 sub x1, x1, w4, uxtw #1 // stride -= width movi v11.8h, #8 mul v26.8h, v31.8h, v25.8h // {0,1,2,3,4,5,6,7}* -dy add v26.8h, v26.8h, v25.8h // -= dy mul v25.8h, v25.8h, v11.8h // -8*dy // Worst case height is 64, but we can only fit 32 pixels into // v0-v3 usable within one tbx instruction. As long as base_y is // up to 32, we use tbx. ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[] ld1r {v15.8h}, [x2] // left[0] == top[0] mov w12, w4 // orig w neg w14, w4 // -w 1: mov v23.16b, v26.16b // reset ypos asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, w14 // base_x <= -2*w asr w11, w8, #6 // base_x b.le 169f dup v17.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 ld1 {v4.8h}, [x9], #16 // top[base_x] ld1 {v6.8h}, [x11], #16 movi v10.8h, #0x3e movi v11.8h, #64 and v16.16b, v16.16b, v10.16b // frac_x and v17.16b, v17.16b, v10.16b sub v8.8h, v11.8h, v16.8h // 64 - frac_x sub v9.8h, v11.8h, v17.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h 2: smov w10, v22.h[0] shrn v29.8b, v23.8h, #6 // ypos >> 6 movi v12.8h, #64 cmp w10, #0 // base_x (bottom left) >= 0 smov w10, v29.b[0] // base_y[0] movi v10.8h, #0x3e b.ge 4f and v27.16b, v23.16b, v10.16b // frac_y cmp w10, #(32-3) mov v18.16b, v15.16b // left[0] sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 22f 21: // base_y < 32, using tbx shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... movi v13.16b, #2 tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v13.16b // base_y + 1 (*2) mov v19.16b, v15.16b // left[0] tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v13.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] b 23f 22: // base_y >= 32, using separate loads. 
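        // A single tbx can only index 64 table bytes (v0-v3 = 32 pixels), so
        // once base_y reaches 32 the lanes are gathered one at a time: each
        // byte of v29 is moved out with smov, scaled by 2 (16-bit pixels) into
        // an address off x3, and ld3 loads left[base_y..base_y+2] for that lane.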
smov w15, v29.b[1] smov w16, v29.b[2] add x10, x3, w10, sxtw #1 smov w17, v29.b[3] add x15, x3, w15, sxtw #1 ld3 {v18.h, v19.h, v20.h}[0], [x10] smov w10, v29.b[4] add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[1], [x15] smov w15, v29.b[5] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[2], [x16] smov w16, v29.b[6] add x10, x3, w10, sxtw #1 ld3 {v18.h, v19.h, v20.h}[3], [x17] smov w17, v29.b[7] add x15, x3, w15, sxtw #1 add x16, x3, w16, sxtw #1 ld3 {v18.h, v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw #1 ld3 {v18.h, v19.h, v20.h}[5], [x15] ld3 {v18.h, v19.h, v20.h}[6], [x16] ld3 {v18.h, v19.h, v20.h}[7], [x17] 23: ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 add v23.8h, v23.8h, v25.8h // ypos -= 8*dy umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h cmge v18.8h, v21.8h, #0 cmge v19.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 bit v10.16b, v12.16b, v18.16b bit v11.16b, v13.16b, v19.16b st1 {v10.8h}, [x0], #16 subs w4, w4, #8 st1 {v11.8h}, [x13], #16 b.le 3f movi v10.8h, #8 mov v4.16b, v5.16b mov v6.16b, v7.16b add v21.8h, v21.8h, v10.8h // base_x += 8 add v22.8h, v22.8h, v10.8h b 2b 3: subs w5, w5, #2 b.le 9f movi v10.8h, #128 add x0, x0, x1 add x13, x13, x1 mov w4, w12 // reset w add v26.8h, v26.8h, v10.8h // ypos += 2*(1<<6) b 1b 4: // The rest of the row only predicted from top[] ld1 {v5.8h}, [x9], #16 // top[base_x] ld1 {v7.8h}, [x11], #16 ext v18.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v19.16b, v6.16b, v7.16b, #2 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v18.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v18.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v19.4h, v17.4h umull2 v20.4s, v6.8h, v9.8h umlal2 v20.4s, v19.8h, v17.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v20.4s, #6 st1 {v12.8h}, [x0], #16 subs w4, w4, #8 st1 {v13.8h}, [x13], #16 b.le 3b mov v4.16b, v5.16b mov v6.16b, v7.16b b 4b 169: // The rest of the block only predicted from left[] add x1, x1, w4, uxtw #1 // restore stride mov w12, w5 // orig remaining h 1: movi v12.8h, #64 movi v10.8h, #0x3e shrn v29.8b, v23.8h, #6 // ypos >> 6 and v27.16b, v23.16b, v10.16b // frac_y smov w10, v29.b[0] // base_y[0] shl v29.8b, v29.8b, #1 // 2*base_y movi v11.8h, #1, lsl #8 zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v23.8h, v23.8h, v25.8h // ypos -= 8*dy add v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ... cmp w10, #(32-1) mov v18.16b, v15.16b // left[0] movi v21.16b, #2 sub v28.8h, v12.8h, v27.8h // 64 - frac_y b.gt 31f tbx v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y] add v29.16b, v29.16b, v21.16b // base_y + 1 (*2) 2: // base_y < 32, using tbx. 
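        // The index vector v29 used by these tbx lookups holds the byte pair
        // {2*base_y, 2*base_y+1} per pixel (tbl/tbx index bytes, pixels are
        // 16-bit): shl #1 doubles the index, zip1 duplicates it, and adding
        // 0x0100 per halfword contributes the {0,1} byte offsets.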
smov w10, v29.b[0] // base_y[0] mov v19.16b, v15.16b // left[0] cmp w10, #(64-4) b.gt 32f tbx v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1] add v29.16b, v29.16b, v21.16b // base_y + 2 (*2) mov v20.16b, v15.16b // left[0] tbx v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2] add v29.16b, v29.16b, v21.16b // next base_y umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 subs w5, w5, #2 st1 {v11.8h}, [x13], x1 b.le 4f mov v18.16b, v20.16b b 2b 31: // base_y >= 32, using separate loads, loading v18 if we had to bail // in the prologue. smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #2 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld1 {v18.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld1 {v18.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld1 {v18.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld1 {v18.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld1 {v18.h}[4], [x10] add x17, x3, w17, sxtw ld1 {v18.h}[5], [x15] add v29.16b, v29.16b, v21.16b // next base_y ld1 {v18.h}[6], [x16] ld1 {v18.h}[7], [x17] 32: // base_y >= 32, using separate loads. cmp w5, #4 b.lt 34f 33: // h >= 4, preserving v18 from the previous round, loading v19-v22. smov w10, v29.b[0] subs w5, w5, #4 smov w15, v29.b[2] movi v10.16b, #8 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[4], [x10] add x17, x3, w17, sxtw ld4 {v19.h, v20.h, v21.h, v22.h}[5], [x15] ld4 {v19.h, v20.h, v21.h, v22.h}[6], [x16] add v29.16b, v29.16b, v10.16b // next base_y ld4 {v19.h, v20.h, v21.h, v22.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v20.4h, v28.4h // left[base_y]*(64-frac_y) umlal v12.4s, v21.4h, v27.4h // + left[base_y+1]*frac_y umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h umull v14.4s, v21.4h, v28.4h umlal v14.4s, v22.4h, v27.4h umull2 v18.4s, v21.8h, v28.8h umlal2 v18.4s, v22.8h, v27.8h rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 st1 {v10.8h}, [x0], x1 cmp w5, #2 st1 {v11.8h}, [x13], x1 st1 {v12.8h}, [x0], x1 st1 {v13.8h}, [x13], x1 b.lt 4f mov v18.16b, v22.16b b.gt 33b 34: // h == 2, preserving v18 from the previous round, loading v19-v20. 
smov w10, v29.b[0] smov w15, v29.b[2] movi v21.16b, #4 smov w16, v29.b[4] add x10, x3, w10, sxtw smov w17, v29.b[6] add x15, x3, w15, sxtw ld2 {v19.h, v20.h}[0], [x10] smov w10, v29.b[8] add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[1], [x15] smov w15, v29.b[10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[2], [x16] smov w16, v29.b[12] add x10, x3, w10, sxtw ld2 {v19.h, v20.h}[3], [x17] smov w17, v29.b[14] add x15, x3, w15, sxtw add x16, x3, w16, sxtw ld2 {v19.h, v20.h}[4], [x10] add x17, x3, w17, sxtw ld2 {v19.h, v20.h}[5], [x15] ld2 {v19.h, v20.h}[6], [x16] add v29.16b, v29.16b, v21.16b // next base_y ld2 {v19.h, v20.h}[7], [x17] umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 st1 {v10.8h}, [x0], x1 st1 {v11.8h}, [x13], x1 // The h==2 case only happens once at the end, if at all. 4: subs w4, w4, #8 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w5, w12 // reset h b 1b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret L(ipred_z2_fill1_tbl): .hword L(ipred_z2_fill1_tbl) - 640b .hword L(ipred_z2_fill1_tbl) - 320b .hword L(ipred_z2_fill1_tbl) - 160b .hword L(ipred_z2_fill1_tbl) - 80b .hword L(ipred_z2_fill1_tbl) - 40b endfunc function ipred_z2_fill2_16bpc_neon, export=1 cmp w4, #8 mov w8, #(2 << 6) // xpos = 2 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. ld1 {v0.8h, v1.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... 
add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) tbl v18.8b, {v0.16b}, v29.8b // left[base_y] trn1 v30.2d, v30.2d, v28.2d // base_y + 1, base_y + 2 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v29.16b, #4 add v31.8h, v31.8h, v31.8h // {0,2,4,6,0,2,4,6} 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-8 // base_x <= -8 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] sshr v20.8h, v16.8h, #6 // first base_x for each row uzp2 v5.8h, v4.8h, v6.8h // top[base_x+1] uzp1 v4.8h, v4.8h, v6.8h // top[base_x] and v16.16b, v16.16b, v25.16b // frac_x trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 4b 49: tbl v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2] trn1 v18.2d, v18.2d, v19.2d // left[base_y], left[base_y+1] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f ext v18.16b, v19.16b, v19.16b, #8 add v30.16b, v30.16b, v29.16b // base_y += 2 (*2) b 49b 9: ret 80: stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy movi v17.8b, #1 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements // from left. ld1 {v0.8h, v1.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 1 movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y zip1 v29.16b, v29.16b, v29.16b // duplicate elements movi v17.16b, #2 add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... // Cut corners here; for the first row we don't expect to need to // read outside of v0. 
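        // fill2 is the upsample_top variant: pairs of adjacent top[] entries
        // are split with uzp1/uzp2 into top[base_x] (even positions) and
        // top[base_x+1] (odd positions), and the per-lane base_x increments in
        // v31 are doubled to {0,2,4,...} just below.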
tbl v18.16b, {v0.16b}, v29.16b // left[base_y] add v30.16b, v29.16b, v19.16b // base_y + 2 (*2) add v29.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 add v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14} 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] ld1 {v6.8h, v7.8h}, [x11] tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] sshr v21.8h, v16.8h, #6 // first base_x sshr v22.8h, v17.8h, #6 tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] uzp2 v2.8h, v4.8h, v5.8h // top[base_x+1] uzp1 v4.8h, v4.8h, v5.8h // top[base_x] uzp2 v3.8h, v6.8h, v7.8h uzp1 v6.8h, v6.8h, v7.8h mov v5.16b, v2.16b mov v7.16b, v3.16b and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v21.8h, v21.8h, v31.8h // actual base_x add v22.8h, v22.8h, v31.8h umull v12.4s, v19.4h, v28.4h umlal v12.4s, v20.4h, v27.4h umull2 v13.4s, v19.8h, v28.8h umlal2 v13.4s, v20.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v21.8h, v21.8h, #0 cmge v22.8h, v22.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v21.16b bit v11.16b, v13.16b, v22.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 8b 89: tbl v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1] tbl v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v19.4h, v28.4h umlal v6.4s, v20.4h, v27.4h umull2 v7.4s, v19.8h, v28.8h umlal2 v7.4s, v20.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f mov v18.16b, v20.16b add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b // base_y += 2 (*2) b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret endfunc function ipred_z2_fill3_16bpc_neon, export=1 cmp w4, #8 mov w8, #(1 << 6) // xpos = 1 << 6 sub w8, w8, w6 // xpos -= dx movrel x11, increments ld1 {v31.8h}, [x11] // increments neg w7, w7 // -dy b.eq 80f 40: dup v30.4h, w7 // -dy movi v17.8b, #1 mul v16.4h, v31.4h, v30.4h // {0,1,2,3}* -dy movi v25.8h, #0x3e add v30.4h, v16.4h, v30.4h // -= dy // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
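        // fill3 is the upsample_left counterpart: left[] is read at double
        // density, so base_y starts at (ypos >> 6) + 2 and the index vectors
        // advance by four left[] entries (doubled for byte indexing) per pair
        // of output rows, while the top-side blend stays the same as in fill1.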
ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #2 shrn v29.8b, v30.8h, #6 // ypos >> 6 and v27.8b, v30.8b, v25.8b // frac_y add v29.8b, v29.8b, v19.8b // base_y = (ypos >> 6) + 2 movi v23.4h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y movi v19.16b, #4 zip1 v29.8b, v29.8b, v29.8b // duplicate elements movi v17.8b, #2 add v29.8b, v29.8b, v23.8b // 2*base, 2*base+1, ... add v30.8b, v29.8b, v17.8b // base_y + 1 (*2) add v28.8b, v29.8b, v19.8b // base_y + 2 (*2) trn1 v31.2d, v31.2d, v31.2d // {0,1,2,3,0,1,2,3} add v24.8b, v30.8b, v19.8b // base_y + 3 (*2) trn1 v29.2d, v29.2d, v28.2d // base_y + 0, base_y + 2 trn1 v30.2d, v30.2d, v24.2d // base_y + 1, base_y + 3 sub v28.4h, v26.4h, v27.4h // 64 - frac_y trn1 v27.2d, v27.2d, v27.2d // frac_y trn1 v28.2d, v28.2d, v28.2d // 64 - frac_y movi v24.16b, #8 4: asr w9, w8, #6 // base_x dup v16.4h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-4 // base_x <= -4 asr w11, w8, #6 // base_x b.le 49f lsl w9, w9, #1 lsl w11, w11, #1 dup v17.4h, w8 // xpos ldr q4, [x2, w9, sxtw] // top[base_x] ldr q6, [x2, w11, sxtw] trn1 v16.2d, v16.2d, v17.2d // xpos tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] sshr v20.8h, v16.8h, #6 // first base_x for each row ext v5.16b, v4.16b, v4.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v6.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x trn1 v4.2d, v4.2d, v6.2d // top[base_x] trn1 v5.2d, v5.2d, v7.2d // top[base_x+1] sub v17.8h, v26.8h, v16.8h // 64 - frac_x add v20.8h, v20.8h, v31.8h // actual base_x umull v21.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v21.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v22.4s, v18.8h, v28.8h umlal2 v22.4s, v19.8h, v27.8h umull v23.4s, v4.4h, v17.4h // top[base_x]-*(64-frac_x) umlal v23.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v24.4s, v4.8h, v17.8h umlal2 v24.4s, v5.8h, v16.8h cmge v20.8h, v20.8h, #0 rshrn v21.4h, v21.4s, #6 rshrn2 v21.8h, v22.4s, #6 rshrn v22.4h, v23.4s, #6 rshrn2 v22.8h, v24.4s, #6 movi v24.16b, #8 bit v21.16b, v22.16b, v20.16b st1 {v21.d}[0], [x0], x1 sub w8, w8, w6 // xpos -= dx subs w5, w5, #2 st1 {v21.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 4b 49: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2] tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3] umull v20.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v20.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v21.4s, v18.8h, v28.8h umlal2 v21.4s, v19.8h, v27.8h rshrn v20.4h, v20.4s, #6 rshrn2 v20.8h, v21.4s, #6 st1 {v20.d}[0], [x0], x1 subs w5, w5, #2 st1 {v20.d}[1], [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 4 (*2) add v30.16b, v30.16b, v24.16b // base_y += 4 (*2) b 49b 9: ret 80: stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] dup v18.8h, w7 // -dy movi v17.16b, #2 mul v16.8h, v31.8h, v18.8h // {0,1,2,3,4,5,6,7}* -dy movi v25.8h, #0x3e add v16.8h, v16.8h, v18.8h // -= dy // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements. 
ld1 {v0.8h, v1.8h, v2.8h}, [x3] // left[] movi v26.8h, #64 movi v19.16b, #4 shrn v29.8b, v16.8h, #6 // ypos >> 6 and v27.16b, v16.16b, v25.16b // frac_y add v29.8b, v29.8b, v17.8b // base_y = (ypos >> 6) + 2 movi v23.8h, #1, lsl #8 shl v29.8b, v29.8b, #1 // 2*base_y mov v18.16b, v15.16b // left[0] zip1 v29.16b, v29.16b, v29.16b // duplicate elements add v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... add v30.16b, v29.16b, v17.16b // base_y + 1 (*2) sub v28.8h, v26.8h, v27.8h // 64 - frac_y movi v24.16b, #4 8: asr w9, w8, #6 // base_x dup v16.8h, w8 // xpos sub w8, w8, w6 // xpos -= dx cmp w9, #-16 // base_x <= -16 asr w11, w8, #6 // base_x b.le 89f dup v17.8h, w8 // xpos add x9, x2, w9, sxtw #1 add x11, x2, w11, sxtw #1 ld1 {v4.8h, v5.8h}, [x9] // top[base_x] ld1 {v6.8h, v7.8h}, [x11] tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b sshr v22.8h, v16.8h, #6 // first base_x tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] sshr v23.8h, v17.8h, #6 tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] ext v5.16b, v4.16b, v5.16b, #2 // top[base_x+1] ext v7.16b, v6.16b, v7.16b, #2 and v16.16b, v16.16b, v25.16b // frac_x and v17.16b, v17.16b, v25.16b umull v10.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v10.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y sub v8.8h, v26.8h, v16.8h // 64 - frac_x sub v9.8h, v26.8h, v17.8h umull2 v11.4s, v18.8h, v28.8h umlal2 v11.4s, v19.8h, v27.8h add v22.8h, v22.8h, v31.8h // actual base_x add v23.8h, v23.8h, v31.8h umull v12.4s, v20.4h, v28.4h umlal v12.4s, v21.4h, v27.4h umull2 v13.4s, v20.8h, v28.8h umlal2 v13.4s, v21.8h, v27.8h rshrn v10.4h, v10.4s, #6 rshrn2 v10.8h, v11.4s, #6 rshrn v11.4h, v12.4s, #6 rshrn2 v11.8h, v13.4s, #6 umull v12.4s, v4.4h, v8.4h // top[base_x]-*(64-frac_x) umlal v12.4s, v5.4h, v16.4h // + top[base_x+1]*frac_x umull2 v13.4s, v4.8h, v8.8h umlal2 v13.4s, v5.8h, v16.8h umull v14.4s, v6.4h, v9.4h umlal v14.4s, v7.4h, v17.4h umull2 v18.4s, v6.8h, v9.8h umlal2 v18.4s, v7.8h, v17.8h cmge v22.8h, v22.8h, #0 cmge v23.8h, v23.8h, #0 rshrn v12.4h, v12.4s, #6 rshrn2 v12.8h, v13.4s, #6 rshrn v13.4h, v14.4s, #6 rshrn2 v13.8h, v18.4s, #6 bit v10.16b, v12.16b, v22.16b bit v11.16b, v13.16b, v23.16b st1 {v10.8h}, [x0], x1 subs w5, w5, #2 sub w8, w8, w6 // xpos -= dx st1 {v11.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 8b 89: tbl v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0] add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) tbl v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1] add v30.16b, v30.16b, v24.16b tbl v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2] tbl v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3] umull v4.4s, v18.4h, v28.4h // left[base_y]*(64-frac_y) umlal v4.4s, v19.4h, v27.4h // + left[base_y+1]*frac_y umull2 v5.4s, v18.8h, v28.8h umlal2 v5.4s, v19.8h, v27.8h umull v6.4s, v20.4h, v28.4h umlal v6.4s, v21.4h, v27.4h umull2 v7.4s, v20.8h, v28.8h umlal2 v7.4s, v21.8h, v27.8h rshrn v4.4h, v4.4s, #6 rshrn2 v4.8h, v5.4s, #6 rshrn v5.4h, v6.4s, #6 rshrn2 v5.8h, v7.4s, #6 st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x0], x1 b.le 9f add v29.16b, v29.16b, v24.16b // base_y += 2 (*2) add v30.16b, v30.16b, v24.16b b 89b 9: ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret endfunc // void 
ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const left, // const int width, const int height, // const int dy, const int max_base_y); function ipred_z3_fill1_16bpc_neon, export=1 clz w9, w4 adr x8, L(ipred_z3_fill1_tbl) sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] add x10, x2, w6, uxtw #1 // left[max_base_y] sub x8, x8, w9, uxtw ld1r {v31.8h}, [x10] // padding mov w7, w5 mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 br x8 40: AARCH64_VALID_JUMP_TARGET 4: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // left[base] ldr q2, [x2, w10, uxtw] dup v4.8h, w9 // frac dup v5.8h, w11 ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] ext v3.16b, v2.16b, v2.16b, #2 sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 add w7, w7, w5 // xpos += dx st1 {v18.s}[2], [x0] st1 {v18.s}[3], [x13] b.le 9f sub x0, x0, x1 // ptr -= 4 * (2*stride) sub x13, x13, x1 add x0, x0, #4 add x13, x13, #4 b 4b 9: ret 80: AARCH64_VALID_JUMP_TARGET 8: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h}, [x8] // left[base] ld1 {v2.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 ldr h1, [x8, #16] ldr h3, [x10, #16] dup v6.8h, w9 // 64 - frac dup v7.8h, w11 ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] ext v3.16b, v2.16b, v3.16b, #2 umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v1.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v3.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v3.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h zip2 v19.8h, v16.8h, v17.8h add w7, w7, w5 // xpos += dx st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 b.le 9f sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) sub x13, x13, x1, lsl #2 add x0, x0, #4 add x13, x13, #4 b 8b 9: ret 160: 320: 640: AARCH64_VALID_JUMP_TARGET mov w12, w4 1: lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // ypos += dy cmp w8, w6 // base >= max_base_y lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v6.8h, w9 // frac dup v7.8h, w11 ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v16.8h, w9 // 64 - frac dup v17.8h, w11 add w7, w7, w5 // ypos += dy 2: ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] ext v19.16b, v1.16b, v2.16b, #2 ext v20.16b, v3.16b, v4.16b, #2 ext v21.16b, v4.16b, v5.16b, #2 subs w4, w4, #16 umull v22.4s, v0.4h, v16.4h // 
left[base]*(64-frac) umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac umull2 v23.4s, v0.8h, v16.8h umlal2 v23.4s, v18.8h, v6.8h umull v24.4s, v1.4h, v16.4h umlal v24.4s, v19.4h, v6.4h umull2 v25.4s, v1.8h, v16.8h umlal2 v25.4s, v19.8h, v6.8h umull v26.4s, v3.4h, v17.4h umlal v26.4s, v20.4h, v7.4h umull2 v27.4s, v3.8h, v17.8h umlal2 v27.4s, v20.8h, v7.8h umull v28.4s, v4.4h, v17.4h umlal v28.4s, v21.4h, v7.4h umull2 v29.4s, v4.8h, v17.8h umlal2 v29.4s, v21.8h, v7.8h rshrn v22.4h, v22.4s, #6 rshrn2 v22.8h, v23.4s, #6 rshrn v23.4h, v24.4s, #6 rshrn2 v23.8h, v25.4s, #6 rshrn v24.4h, v26.4s, #6 rshrn2 v24.8h, v27.4s, #6 rshrn v25.4h, v28.4s, #6 rshrn2 v25.8h, v29.4s, #6 zip1 v18.8h, v22.8h, v24.8h zip2 v19.8h, v22.8h, v24.8h zip1 v20.8h, v23.8h, v25.8h zip2 v21.8h, v23.8h, v25.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 st1 {v20.s}[0], [x0], x1 st1 {v20.s}[1], [x13], x1 st1 {v20.s}[2], [x0], x1 st1 {v20.s}[3], [x13], x1 st1 {v21.s}[0], [x0], x1 st1 {v21.s}[1], [x13], x1 st1 {v21.s}[2], [x0], x1 st1 {v21.s}[3], [x13], x1 b.le 3f mov v0.16b, v2.16b ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] mov v3.16b, v5.16b ld1 {v4.8h, v5.8h}, [x10], #32 b 2b 3: subs w3, w3, #2 b.le 9f lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 9: ret L(ipred_z3_fill1_tbl): .hword L(ipred_z3_fill1_tbl) - 640b .hword L(ipred_z3_fill1_tbl) - 320b .hword L(ipred_z3_fill1_tbl) - 160b .hword L(ipred_z3_fill1_tbl) - 80b .hword L(ipred_z3_fill1_tbl) - 40b endfunc function ipred_z3_fill_padding_neon, export=0 cmp w3, #8 adr x8, L(ipred_z3_fill_padding_tbl) b.gt L(ipred_z3_fill_padding_wide) // w3 = remaining width, w4 = constant height mov w12, w4 1: // Fill a WxH rectangle with padding. W can be any number; // this fills the exact width by filling in the largest // power of two in the remaining width, and repeating. clz w9, w3 sub w9, w9, #25 ldrh w9, [x8, w9, uxtw #1] sub x9, x8, w9, uxtw br x9 2: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 st1 {v31.s}[0], [x13], x1 st1 {v31.s}[0], [x0], x1 st1 {v31.s}[0], [x13], x1 b.gt 2b subs w3, w3, #2 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #4 add x13, x13, #4 mov w4, w12 b 1b 4: st1 {v31.4h}, [x0], x1 subs w4, w4, #4 st1 {v31.4h}, [x13], x1 st1 {v31.4h}, [x0], x1 st1 {v31.4h}, [x13], x1 b.gt 4b subs w3, w3, #4 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #8 add x13, x13, #8 mov w4, w12 b 1b 8: 16: 32: 64: st1 {v31.8h}, [x0], x1 subs w4, w4, #4 st1 {v31.8h}, [x13], x1 st1 {v31.8h}, [x0], x1 st1 {v31.8h}, [x13], x1 b.gt 4b subs w3, w3, #8 lsr x1, x1, #1 msub x0, x1, x12, x0 // ptr -= h * stride msub x13, x1, x12, x13 b.le 9f lsl x1, x1, #1 add x0, x0, #16 add x13, x13, #16 mov w4, w12 b 1b 9: ret L(ipred_z3_fill_padding_tbl): .hword L(ipred_z3_fill_padding_tbl) - 64b .hword L(ipred_z3_fill_padding_tbl) - 32b .hword L(ipred_z3_fill_padding_tbl) - 16b .hword L(ipred_z3_fill_padding_tbl) - 8b .hword L(ipred_z3_fill_padding_tbl) - 4b .hword L(ipred_z3_fill_padding_tbl) - 2b L(ipred_z3_fill_padding_wide): // Fill a WxH rectangle with padding, with W > 8. 
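        // Roughly, per output row (illustrative C; store8 is a stand-in for
        // an 8-pixel st1 of v31, the replicated padding value):
        //   if (w & 7) { store8(dst); dst += w & 7; w -= w & 7; }
        //   for (; w > 0; w -= 8, dst += 8) store8(dst);
        // The first, possibly overlapping, 8-pixel store handles the odd
        // remainder so the rest of the line can use plain 8-pixel stores.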
lsr x1, x1, #1 mov w12, w3 sub x1, x1, w3, uxtw #1 1: ands w5, w3, #7 b.eq 2f // If the width isn't aligned to 8, first do one 8 pixel write // and align the start pointer. sub w3, w3, w5 st1 {v31.8h}, [x0] add x0, x0, w5, uxtw #1 2: // Fill the rest of the line with aligned 8 pixel writes. subs w3, w3, #8 st1 {v31.8h}, [x0], #16 b.gt 2b subs w4, w4, #1 add x0, x0, x1 b.le 9f mov w3, w12 b 1b 9: ret endfunc function ipred_z3_fill2_16bpc_neon, export=1 cmp w4, #8 add x10, x2, w6, uxtw // left[max_base_y] ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 add x13, x0, x1 lsl x1, x1, #1 b.eq 8f 4: // h == 4 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon lsl w8, w8, #1 lsl w10, w10, #1 ldr q0, [x2, w8, uxtw] // top[base] ldr q2, [x2, w10, uxtw] dup v4.4h, w9 // frac dup v5.4h, w11 uzp2 v1.8h, v0.8h, v0.8h // top[base+1] uzp1 v0.8h, v0.8h, v0.8h // top[base] uzp2 v3.8h, v2.8h, v2.8h uzp1 v2.8h, v2.8h, v2.8h sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] sub v7.4h, v3.4h, v2.4h ushll v16.4s, v0.4h, #6 // top[base]*64 ushll v17.4s, v2.4h, #6 smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac smlal v17.4s, v7.4h, v5.4h rshrn v16.4h, v16.4s, #6 rshrn v17.4h, v17.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 add w7, w7, w5 // xpos += dx st1 {v18.s}[2], [x0] st1 {v18.s}[3], [x13] b.le 9f sub x0, x0, x1 // ptr -= 4 * (2*stride) sub x13, x13, x1 add x0, x0, #4 add x13, x13, #4 b 4b 9: ret 8: // h == 8 lsr w8, w7, #6 // base and w9, w7, #0x3e // frac add w7, w7, w5 // xpos += dx cmp w8, w6 // base >= max_base_x lsr w10, w7, #6 // base and w11, w7, #0x3e // frac b.ge ipred_z3_fill_padding_neon add x8, x2, w8, uxtw #1 add x10, x2, w10, uxtw #1 dup v4.8h, w9 // frac dup v5.8h, w11 ld1 {v0.8h, v1.8h}, [x8] // top[base] ld1 {v2.8h, v3.8h}, [x10] sub w9, w15, w9 // 64 - frac sub w11, w15, w11 dup v6.8h, w9 // 64 - frac dup v7.8h, w11 uzp2 v20.8h, v0.8h, v1.8h // top[base+1] uzp1 v0.8h, v0.8h, v1.8h // top[base] uzp2 v21.8h, v2.8h, v3.8h uzp1 v2.8h, v2.8h, v3.8h umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac umull2 v17.4s, v0.8h, v6.8h umlal2 v17.4s, v20.8h, v4.8h umull v18.4s, v2.4h, v7.4h umlal v18.4s, v21.4h, v5.4h umull2 v19.4s, v2.8h, v7.8h umlal2 v19.4s, v21.8h, v5.8h rshrn v16.4h, v16.4s, #6 rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 subs w3, w3, #2 zip1 v18.8h, v16.8h, v17.8h zip2 v19.8h, v16.8h, v17.8h add w7, w7, w5 // xpos += dx st1 {v18.s}[0], [x0], x1 st1 {v18.s}[1], [x13], x1 st1 {v18.s}[2], [x0], x1 st1 {v18.s}[3], [x13], x1 st1 {v19.s}[0], [x0], x1 st1 {v19.s}[1], [x13], x1 st1 {v19.s}[2], [x0], x1 st1 {v19.s}[3], [x13], x1 b.le 9f sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) sub x13, x13, x1, lsl #2 add x0, x0, #4 add x13, x13, #4 b 8b 9: ret endfunc // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, // const int max_width, const int max_height, // const int bitdepth_max); .macro filter_fn bpc function ipred_filter_\bpc\()bpc_neon and w5, w5, #511 movrel x6, X(filter_intra_taps) lsl w5, w5, #6 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 adr x5, L(ipred_filter\bpc\()_tbl) ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 ldrh w9, [x5, w9, uxtw #1] sxtl v16.8h, 
v16.8b sxtl v17.8h, v17.8b sub x5, x5, w9, uxtw sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 lsl x1, x1, #1 sxtl v20.8h, v20.8b sxtl v21.8h, v21.8b sxtl v22.8h, v22.8b dup v31.8h, w8 .if \bpc == 10 movi v30.8h, #0 .endif br x5 40: AARCH64_VALID_JUMP_TARGET ldur d0, [x2, #2] // top (0-3) sub x2, x2, #4 mov x7, #-4 4: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 .endif smin v2.8h, v2.8h, v31.8h subs w4, w4, #2 st1 {v2.d}[0], [x0], x1 ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] st1 {v2.d}[1], [x6], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ldur q0, [x2, #2] // top (0-7) sub x2, x2, #4 mov x7, #-4 8: ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) .if \bpc == 10 mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) srshr v2.8h, v2.8h, #4 smax v2.8h, v2.8h, v30.8h smin v2.8h, v2.8h, v31.8h mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h .else smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) smlal2 v3.4s, v21.8h, 
v1.h[1] // p5(left[0]) * filter(5) smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) sqrshrun v2.4h, v2.4s, #4 sqrshrun2 v2.8h, v3.4s, #4 smin v2.8h, v2.8h, v31.8h smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) * filter(0) smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) sqrshrun v3.4h, v4.4s, #4 sqrshrun2 v3.8h, v5.4s, #4 .endif smin v3.8h, v3.8h, v31.8h subs w4, w4, #2 st2 {v2.d, v3.d}[0], [x0], x1 zip2 v0.2d, v2.2d, v3.2d st2 {v2.d, v3.d}[1], [x6], x1 b.gt 8b ret 160: 320: AARCH64_VALID_JUMP_TARGET add x8, x2, #2 sub x2, x2, #4 mov x7, #-4 sub x1, x1, w3, uxtw #1 mov w9, w3 1: ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) 2: ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) .if \bpc == 10 mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) srshr v3.8h, v3.8h, #4 smax v3.8h, v3.8h, v30.8h smin v3.8h, v3.8h, v31.8h mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * filter(3) srshr v4.8h, v4.8h, #4 smax v4.8h, v4.8h, v30.8h smin v4.8h, v4.8h, v31.8h mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) srshr v5.8h, v5.8h, #4 smax v5.8h, v5.8h, v30.8h smin v5.8h, v5.8h, v31.8h mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 srshr v6.8h, v6.8h, #4 smax v6.8h, v6.8h, v30.8h .else smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * 
filter(4) smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) sqrshrun v3.4h, v3.4s, #4 sqrshrun2 v3.8h, v4.4s, #4 smin v3.8h, v3.8h, v31.8h smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) smlal v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) sqrshrun v4.4h, v5.4s, #4 sqrshrun2 v4.8h, v6.4s, #4 smin v4.8h, v4.8h, v31.8h smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) sqrshrun v5.4h, v24.4s, #4 sqrshrun2 v5.8h, v25.4s, #4 smin v5.8h, v5.8h, v31.8h smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) smull2 v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) subs w3, w3, #16 sqrshrun v6.4h, v26.4s, #4 sqrshrun2 v6.8h, v27.4s, #4 .endif smin v6.8h, v6.8h, v31.8h ins v0.h[2], v2.h[7] st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 ins v0.h[0], v6.h[7] st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 ins v0.h[1], v6.h[3] b.gt 2b subs w4, w4, #2 b.le 9f sub x8, x6, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b 1b 9: ret L(ipred_filter\bpc\()_tbl): .hword L(ipred_filter\bpc\()_tbl) - 320b .hword L(ipred_filter\bpc\()_tbl) - 160b .hword L(ipred_filter\bpc\()_tbl) - 80b .hword 
L(ipred_filter\bpc\()_tbl) - 40b endfunc .endm filter_fn 10 filter_fn 12 function ipred_filter_16bpc_neon, export=1 ldr w8, [sp] cmp w8, 0x3ff b.le ipred_filter_10bpc_neon b ipred_filter_12bpc_neon endfunc // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint16_t *const pal, const uint8_t *idx, // const int w, const int h); function pal_pred_16bpc_neon, export=1 ld1 {v30.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) sub w9, w9, #25 ldrh w9, [x6, w9, uxtw #1] movi v31.8h, #1, lsl #8 sub x6, x6, w9, uxtw br x6 40: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 4: ld1 {v1.16b}, [x3], #16 subs w5, w5, #4 // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... add v1.16b, v1.16b, v1.16b zip1 v0.16b, v1.16b, v1.16b zip2 v1.16b, v1.16b, v1.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b st1 {v0.d}[0], [x0], x1 tbl v1.16b, {v30.16b}, v1.16b st1 {v0.d}[1], [x2], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x2], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 8: ld1 {v2.16b, v3.16b}, [x3], #32 subs w5, w5, #4 add v2.16b, v2.16b, v2.16b add v3.16b, v3.16b, v3.16b zip1 v0.16b, v2.16b, v2.16b zip2 v1.16b, v2.16b, v2.16b zip1 v2.16b, v3.16b, v3.16b zip2 v3.16b, v3.16b, v3.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b tbl v1.16b, {v30.16b}, v1.16b st1 {v0.8h}, [x0], x1 tbl v2.16b, {v30.16b}, v2.16b st1 {v1.8h}, [x2], x1 tbl v3.16b, {v30.16b}, v3.16b st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x2], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 16: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #4 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b st1 {v2.8h, v3.8h}, [x2], x1 tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h}, [x0], x1 st1 {v6.8h, v7.8h}, [x2], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET add x2, x0, x1 lsl x1, x1, #1 32: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #2 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, 
{v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x2, x0, #64 64: ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 subs w5, w5, #1 add v4.16b, v4.16b, v4.16b add v5.16b, v5.16b, v5.16b add v6.16b, v6.16b, v6.16b add v7.16b, v7.16b, v7.16b zip1 v0.16b, v4.16b, v4.16b zip2 v1.16b, v4.16b, v4.16b zip1 v2.16b, v5.16b, v5.16b zip2 v3.16b, v5.16b, v5.16b zip1 v4.16b, v6.16b, v6.16b zip2 v5.16b, v6.16b, v6.16b zip1 v6.16b, v7.16b, v7.16b zip2 v7.16b, v7.16b, v7.16b add v0.8h, v0.8h, v31.8h add v1.8h, v1.8h, v31.8h add v2.8h, v2.8h, v31.8h add v3.8h, v3.8h, v31.8h add v4.8h, v4.8h, v31.8h tbl v0.16b, {v30.16b}, v0.16b add v5.8h, v5.8h, v31.8h tbl v1.16b, {v30.16b}, v1.16b add v6.8h, v6.8h, v31.8h tbl v2.16b, {v30.16b}, v2.16b add v7.8h, v7.8h, v31.8h tbl v3.16b, {v30.16b}, v3.16b tbl v4.16b, {v30.16b}, v4.16b tbl v5.16b, {v30.16b}, v5.16b st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 tbl v6.16b, {v30.16b}, v6.16b tbl v7.16b, {v30.16b}, v7.16b st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 b.gt 64b ret L(pal_pred_tbl): .hword L(pal_pred_tbl) - 640b .hword L(pal_pred_tbl) - 320b .hword L(pal_pred_tbl) - 160b .hword L(pal_pred_tbl) - 80b .hword L(pal_pred_tbl) - 40b endfunc // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_128_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 adr x7, L(ipred_cfl_128_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] urshr v0.8h, v31.8h, #1 dup v1.8h, w6 // alpha sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_splat_w4): AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #4 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v2.d}[0], [x0], x1 st1 {v2.d}[1], [x6], x1 st1 {v3.d}[0], [x0], x1 st1 {v3.d}[1], [x6], x1 b.gt L(ipred_cfl_splat_w4) ret L(ipred_cfl_splat_w8): AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x5], #32 subs w4, w4, #2 smull v2.4s, v4.4h, v1.4h // diff = ac * alpha smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v16.4s, v2.4s, #0 // sign cmlt v17.4s, v3.4s, #0 cmlt v18.4s, v4.4s, #0 cmlt v19.4s, v5.4s, #0 add v2.4s, v2.4s, v16.4s // diff + sign add v3.4s, v3.4s, v17.4s add v4.4s, v4.4s, v18.4s add v5.4s, v5.4s, v19.4s rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 add v2.8h, v2.8h, v0.8h // dc + apply_sign() add v3.8h, v3.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x6], x1 b.gt L(ipred_cfl_splat_w8) ret L(ipred_cfl_splat_w16): AARCH64_VALID_JUMP_TARGET add x7, x5, w3, uxtw #1 sub x1, x1, w3, uxtw #1 mov w9, w3 1: 
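        // What the cfl splat loops compute per pixel, in rough C form
        // (mirroring the per-instruction comments; names illustrative):
        //   diff = ac * alpha;
        //   px   = clip(dc + ((diff + (diff < 0 ? -1 : 0) + 32) >> 6),
        //               0, bitdepth_max);
        // cmlt supplies the -1 for negative products, rshrn #6 adds the +32
        // rounding, and smax/smin perform the final clip.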
ld1 {v2.8h, v3.8h}, [x5], #32 ld1 {v4.8h, v5.8h}, [x7], #32 subs w3, w3, #16 smull v16.4s, v2.4h, v1.4h // diff = ac * alpha smull2 v17.4s, v2.8h, v1.8h smull v18.4s, v3.4h, v1.4h smull2 v19.4s, v3.8h, v1.8h smull v2.4s, v4.4h, v1.4h smull2 v3.4s, v4.8h, v1.8h smull v4.4s, v5.4h, v1.4h smull2 v5.4s, v5.8h, v1.8h cmlt v20.4s, v16.4s, #0 // sign cmlt v21.4s, v17.4s, #0 cmlt v22.4s, v18.4s, #0 cmlt v23.4s, v19.4s, #0 cmlt v24.4s, v2.4s, #0 cmlt v25.4s, v3.4s, #0 cmlt v26.4s, v4.4s, #0 cmlt v27.4s, v5.4s, #0 add v16.4s, v16.4s, v20.4s // diff + sign add v17.4s, v17.4s, v21.4s add v18.4s, v18.4s, v22.4s add v19.4s, v19.4s, v23.4s add v2.4s, v2.4s, v24.4s add v3.4s, v3.4s, v25.4s add v4.4s, v4.4s, v26.4s add v5.4s, v5.4s, v27.4s rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() rshrn2 v16.8h, v17.4s, #6 rshrn v17.4h, v18.4s, #6 rshrn2 v17.8h, v19.4s, #6 rshrn v6.4h, v2.4s, #6 rshrn2 v6.8h, v3.4s, #6 rshrn v7.4h, v4.4s, #6 rshrn2 v7.8h, v5.4s, #6 add v2.8h, v16.8h, v0.8h // dc + apply_sign() add v3.8h, v17.8h, v0.8h add v4.8h, v6.8h, v0.8h add v5.8h, v7.8h, v0.8h smax v2.8h, v2.8h, v30.8h smax v3.8h, v3.8h, v30.8h smax v4.8h, v4.8h, v30.8h smax v5.8h, v5.8h, v30.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h smin v4.8h, v4.8h, v31.8h smin v5.8h, v5.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], #32 st1 {v4.8h, v5.8h}, [x6], #32 b.gt 1b subs w4, w4, #2 add x5, x5, w9, uxtw #1 add x7, x7, w9, uxtw #1 add x0, x0, x1 add x6, x6, x1 mov w3, w9 b.gt 1b ret L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_top_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max clz w9, w3 adr x7, L(ipred_cfl_top_tbl) sub w9, w9, #26 ldrh w9, [x7, w9, uxtw #1] dup v1.8h, w6 // alpha add x2, x2, #2 sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_top_tbl): .hword L(ipred_cfl_top_tbl) - 32b .hword L(ipred_cfl_top_tbl) - 16b .hword L(ipred_cfl_top_tbl) - 8b .hword L(ipred_cfl_top_tbl) - 4b endfunc // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_left_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 clz w9, w3 clz w8, w4 adr x10, L(ipred_cfl_splat_tbl) adr x7, L(ipred_cfl_left_tbl) sub w9, w9, #26 sub w8, w8, #26 ldrh w9, [x10, w9, uxtw #1] ldrh w8, [x7, w8, uxtw #1] dup v1.8h, w6 // alpha sub x9, x10, w9, uxtw 
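        // x9 now points at the width-specific splat loop
        // (L(ipred_cfl_splat_w*)); x7, computed next, selects the
        // height-specific left-edge averaging block below, which finishes by
        // branching to x9.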
sub x7, x7, w8, uxtw add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_left_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2] addv h0, v0.4h urshr v0.4h, v0.4h, #2 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2] addv h0, v0.8h urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] addp v0.8h, v2.8h, v3.8h addv h0, v0.8h urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h uaddlv s0, v0.8h rshrn v0.4h, v0.4s, #5 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_tbl): .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, // const int16_t *ac, const int alpha, // const int bitdepth_max); function ipred_cfl_16bpc_neon, export=1 dup v31.8h, w7 // bitdepth_max sub x2, x2, w4, uxtw #1 add w8, w3, w4 // width + height dup v1.8h, w6 // alpha clz w9, w3 clz w6, w4 dup v16.4s, w8 // width + height adr x7, L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) ldrh w9, [x7, w9, uxtw #1] ldrh w6, [x7, w6, uxtw #1] neg w8, w8 // -ctz(width + height) sub x9, x7, w9, uxtw sub x7, x7, w6, uxtw ushr v16.4s, v16.4s, #1 // (width + height) >> 1 dup v17.4s, w8 // -ctz(width + height) add x6, x0, x1 lsl x1, x1, #1 movi v30.8h, #0 br x7 L(ipred_cfl_h4): AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], #8 uaddlv s0, v0.4h add x2, x2, #2 br x9 L(ipred_cfl_w4): AARCH64_VALID_JUMP_TARGET ld1 {v2.4h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.4h cmp w4, #4 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #16 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) L(ipred_cfl_h8): AARCH64_VALID_JUMP_TARGET ld1 {v0.8h}, [x2], #16 uaddlv s0, v0.8h add x2, x2, #2 br x9 L(ipred_cfl_w8): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h}, [x2] add v0.2s, v0.2s, v16.2s uaddlv s2, v2.8h cmp w4, #8 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/16/32 cmp w4, #32 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) L(ipred_cfl_h16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2], #32 addp v0.8h, v2.8h, v3.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w16): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h}, [x2] add v0.2s, v0.2s, v16.2s addp v2.8h, v2.8h, v3.8h uaddlv s2, v2.8h cmp w4, #16 add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 4/8/32 tst w4, #(32+16+8) // 16 added to make a consecutive bitmask mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_h32): AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v0.8h, v2.8h, v4.8h add x2, x2, #2 uaddlv s0, v0.8h br x9 L(ipred_cfl_w32): 
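        // Like the w4/w8/w16 handlers above: the dc sum is first shifted by
        // ctz(width + height); for rectangular blocks width + height is
        // 3 << k or 5 << k, so the shift alone is not enough, and the sum is
        // additionally multiplied by 0xAAAB (~2^17/3) or 0x6667 (~2^17/5)
        // and shifted right by 17 to complete the division by 3 or 5.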
AARCH64_VALID_JUMP_TARGET ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] add v0.4s, v0.4s, v16.4s addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v2.8h, v2.8h, v4.8h cmp w4, #32 uaddlv s2, v2.8h add v0.2s, v0.2s, v2.2s ushl v0.2s, v0.2s, v17.2s b.eq 1f // h = 8/16 cmp w4, #8 mov w16, #0x6667 mov w17, #0xAAAB csel w16, w16, w17, eq dup v16.2s, w16 mul v0.2s, v0.2s, v16.2s ushr v0.2s, v0.2s, #17 1: dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) L(ipred_cfl_tbl): .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_420_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, v0.2d, v0.2d L(ipred_cfl_ac_420_w4_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w4_calc_subtract_dc): // Aggregate the sums add v24.4s, v24.4s, v25.4s add v26.4s, v26.4s, v27.4s add v0.4s, v24.4s, v26.4s addv s0, v0.4s // sum sub x0, x0, w6, uxtw #3 urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 6b ret L(ipred_cfl_ac_420_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_420_w8_wpad) 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v1.8h, v4.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_420_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v2.8h addp v1.8h, v1.8h, v3.8h add v0.8h, v0.8h, v1.8h shl v0.8h, v0.8h, #1 dup v1.4h, v0.h[3] dup v3.4h, v0.h[7] trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 
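        // Store two 8-wide ac rows: in each row the low half holds the
        // subsampled samples and the high half repeats the last valid sample
        // to cover the horizontal padding.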
st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw v25.4s, v25.4s, v1.4h uaddw v26.4s, v26.4s, v2.4h uaddw v27.4s, v27.4s, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d L(ipred_cfl_ac_420_w8_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 2b 3: // Double the height and reuse the w4 summing/subtracting lsl w6, w6, #1 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_420_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_420_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 add v0.8h, v0.8h, v4.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 add v2.8h, v2.8h, v6.8h addp v16.8h, v16.8h, v17.8h addp v18.8h, v18.8h, v19.8h addp v20.8h, v20.8h, v21.8h addp v22.8h, v22.8h, v23.8h add v16.8h, v16.8h, v20.8h add v18.8h, v18.8h, v22.8h shl v0.8h, v0.8h, #1 shl v1.8h, v2.8h, #1 shl v2.8h, v16.8h, #1 shl v3.8h, v18.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q5, [x10, #32] ld1 {v3.8h, v4.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v5.8h, v5.8h, v5.8h addp v3.8h, v3.8h, v4.8h ldr q18, [x1, #32] add v2.4h, v2.4h, v5.4h ld1 {v16.8h, v17.8h}, [x1], x2 add v0.8h, v0.8h, v3.8h ldr q21, [x10, #32] ld1 {v19.8h, v20.8h}, [x10], x2 addp v18.8h, v18.8h, v18.8h addp v16.8h, v16.8h, v17.8h addp v21.8h, v21.8h, v21.8h addp v19.8h, v19.8h, v20.8h add v18.4h, v18.4h, v21.4h add v16.8h, v16.8h, v19.8h shl v1.4h, v2.4h, #1 shl v0.8h, v0.8h, #1 shl v3.4h, v18.4h, #1 shl v2.8h, v16.8h, #1 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h add v0.8h, v0.8h, v2.8h add v4.8h, v4.8h, v6.8h shl v0.8h, v0.8h, #1 shl v2.8h, v4.8h, #1 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_420_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 ld1 {v4.8h}, [x1], x2 ld1 {v6.8h}, [x10], x2 addp v0.8h, v0.8h, v4.8h addp v2.8h, v2.8h, v6.8h add v0.8h, v0.8h, v2.8h shl v0.8h, v0.8h, #1 dup v1.8h, v0.h[3] dup v3.8h, v0.h[7] trn2 v2.2d, v0.2d, v3.2d trn1 v0.2d, v0.2d, v1.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b L(ipred_cfl_ac_420_w16_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Quadruple the height and reuse the w4 summing/subtracting lsl w6, w6, #2 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_420_tbl): .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) .hword 0 L(ipred_cfl_ac_420_w16_tbl): .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_422_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_422_w8): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_422_w8_wpad) 1: 
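        // 4:2:2 only subsamples horizontally: each ac sample is a 1x2 luma
        // sum shifted left by 2, so every cfl_ac variant (420/422/444) ends
        // up with the (averaged) luma scaled by 8 before the dc is
        // subtracted.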
// Copy and subsample input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 ld1 {v4.8h, v5.8h}, [x1], x2 addp v0.8h, v0.8h, v1.8h ld1 {v6.8h, v7.8h}, [x10], x2 addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w8_wpad): 1: // Copy and subsample input, padding 4 ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 ld1 {v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v4.4h, v0.h[3] dup v5.8h, v0.h[7] dup v6.4h, v2.h[3] dup v7.8h, v2.h[7] trn2 v1.2d, v0.2d, v5.2d trn1 v0.2d, v0.2d, v4.2d trn2 v3.2d, v2.2d, v7.2d trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_422_w16_tbl) ldrh w3, [x7, w3, uxtw #1] sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_422_w16_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h shl v0.8h, v0.8h, #2 shl v1.8h, v2.8h, #2 shl v2.8h, v4.8h, #2 shl v3.8h, v6.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad1): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 4 ldr q2, [x1, #32] ld1 {v0.8h, v1.8h}, [x1], x2 ldr q6, [x10, #32] ld1 {v4.8h, v5.8h}, [x10], x2 addp v2.8h, v2.8h, v2.8h addp v0.8h, v0.8h, v1.8h addp v6.8h, v6.8h, v6.8h addp v4.8h, v4.8h, v5.8h shl v1.4h, v2.4h, #2 shl v0.8h, v0.8h, #2 shl v3.4h, v6.4h, #2 shl v2.8h, v4.8h, #2 dup v4.4h, v1.h[3] dup v5.4h, v3.h[3] trn1 v1.2d, v1.2d, v4.2d trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 8 ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h shl v0.8h, v0.8h, #2 shl v2.8h, v2.8h, #2 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_w16_wpad3): AARCH64_VALID_JUMP_TARGET 1: // Copy and subsample input, padding 12 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 addp v0.8h, v0.8h, v0.8h addp v2.8h, v2.8h, v2.8h shl v0.4h, v0.4h, #2 shl v2.4h, v2.4h, #2 dup v1.8h, v0.h[3] dup v3.8h, v2.h[3] trn1 v0.2d, v0.2d, v1.2d trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_422_tbl): .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) .hword 0 L(ipred_cfl_ac_422_w16_tbl): .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) endfunc // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx, // const ptrdiff_t stride, const int w_pad, // const int h_pad, const int cw, const int ch); function ipred_cfl_ac_444_16bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_444_tbl) sub w8, w8, #26 ldrh w8, [x7, w8, uxtw #1] movi v24.4s, #0 movi v25.4s, #0 movi v26.4s, #0 movi v27.4s, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz add x10, x1, x2 dup v31.4s, w9 lsl x2, x2, #1 neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_444_w4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.4h}, [x1], x2 ld1 {v0.d}[1], [x10], x2 ld1 {v1.4h}, [x1], x2 ld1 {v1.d}[1], [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h}, [x0], #32 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h b.gt 1b trn2 v0.2d, v1.2d, v1.2d trn2 v1.2d, v1.2d, v1.2d b L(ipred_cfl_ac_420_w4_hpad) L(ipred_cfl_ac_444_w8): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x10], x2 ld1 {v2.8h}, [x1], x2 shl v0.8h, v0.8h, #3 ld1 {v3.8h}, [x10], x2 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w8_hpad) L(ipred_cfl_ac_444_w16): AARCH64_VALID_JUMP_TARGET cbnz w3, L(ipred_cfl_ac_444_w16_wpad) 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h}, [x1], x2 ld1 {v2.8h, v3.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #2 st1 {v0.8h, 
v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w16_wpad): 1: // Copy and expand input, padding 8 ld1 {v0.8h}, [x1], x2 ld1 {v2.8h}, [x10], x2 shl v0.8h, v0.8h, #3 shl v2.8h, v2.8h, #3 dup v1.8h, v0.h[7] dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET adr x7, L(ipred_cfl_ac_444_w32_tbl) ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 lsr x2, x2, #1 // Restore the stride to one line increments sub x7, x7, w3, uxtw br x7 L(ipred_cfl_ac_444_w32_wpad0): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, without padding ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 shl v2.8h, v2.8h, #3 shl v3.8h, v3.8h, #3 subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad2): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 8 ld1 {v0.8h, v1.8h, v2.8h}, [x1], x2 shl v2.8h, v2.8h, #3 shl v0.8h, v0.8h, #3 shl v1.8h, v1.8h, #3 dup v3.8h, v2.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad4): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 16 ld1 {v0.8h, v1.8h}, [x1], x2 shl v1.8h, v1.8h, #3 shl v0.8h, v0.8h, #3 dup v2.8h, v1.h[7] dup v3.8h, v1.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b b L(ipred_cfl_ac_444_w32_hpad) L(ipred_cfl_ac_444_w32_wpad6): AARCH64_VALID_JUMP_TARGET 1: // Copy and expand input, padding 24 ld1 {v0.8h}, [x1], x2 shl v0.8h, v0.8h, #3 dup v1.8h, v0.h[7] dup v2.8h, v0.h[7] dup v3.8h, v0.h[7] subs w8, w8, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 1b L(ipred_cfl_ac_444_w32_hpad): cbz w4, 3f 2: // Vertical padding (h_pad > 0) subs w4, w4, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, 
v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 uaddw v24.4s, v24.4s, v0.4h uaddw2 v25.4s, v25.4s, v0.8h uaddw v26.4s, v26.4s, v1.4h uaddw2 v27.4s, v27.4s, v1.8h uaddw v24.4s, v24.4s, v2.4h uaddw2 v25.4s, v25.4s, v2.8h uaddw v26.4s, v26.4s, v3.4h uaddw2 v27.4s, v27.4s, v3.8h b.gt 2b 3: // Multiply the height by eight and reuse the w4 subtracting lsl w6, w6, #3 b L(ipred_cfl_ac_420_w4_calc_subtract_dc) L(ipred_cfl_ac_444_tbl): .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) L(ipred_cfl_ac_444_w32_tbl): .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) endfunc rav1e-0.7.1/src/arm/64/itx.S000064400000000000000000003561061046102023000134110ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); // Most of the functions use the following register layout: // x0-x3 external parameters // x4 function pointer to first transform // x5 function pointer to second transform // x6 output parameter for helper function // x7 input parameter for helper function // x8 input stride for helper function // x9-x12 scratch variables for helper functions // x13 pointer to list of eob thresholds // x14 return pointer for helper function // x15 return pointer for main function // The SIMD registers most often use the following layout: // v0-v1 multiplication coefficients // v2-v7 scratch registers // v8-v15 unused // v16-v31 inputs/outputs of transforms // Potential further optimizations, that are left unimplemented for now: // - Trying to keep multiplication coefficients in registers across multiple // transform functions. (The register layout is designed to potentially // allow this.) // - Use a simplified version of the transforms themselves for cases where // we know a significant number of inputs are zero. E.g. if the eob value // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. const idct_coeffs, align=4 // idct4 .short 2896, 2896*8, 1567, 3784 // idct8 .short 799, 4017, 3406, 2276 // idct16 .short 401, 4076, 3166, 2598 .short 1931, 3612, 3920, 1189 // idct32 .short 201, 4091, 3035, 2751 .short 1751, 3703, 3857, 1380 .short 995, 3973, 3513, 2106 .short 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .short 101*8, 4095*8, 2967*8, -2824*8 .short 1660*8, 3745*8, 3822*8, -1474*8 .short 4076, 401, 4017, 799 .short 0, 0, 0, 0 .short 4036*8, -700*8, 2359*8, 3349*8 .short 3461*8, -2191*8, 897*8, 3996*8 .short -3166, -2598, -799, -4017 .short 0, 0, 0, 0 .short 501*8, 4065*8, 3229*8, -2520*8 .short 2019*8, 3564*8, 3948*8, -1092*8 .short 3612, 1931, 2276, 3406 .short 0, 0, 0, 0 .short 4085*8, -301*8, 2675*8, 3102*8 .short 3659*8, -1842*8, 1285*8, 3889*8 .short -3920, -1189, -3406, -2276 .short 0, 0, 0, 0 endconst const iadst4_coeffs, align=4 // .h[4-5] can be interpreted as .s[2] .short 1321, 3803, 2482, 3344, 3344, 0 endconst const iadst8_coeffs, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 // idct_coeffs .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst const iadst16_coeffs, align=4 .short 4091, 201, 3973, 995 .short 3703, 1751, 3290, 2440 .short 2751, 3035, 2106, 3513 .short 1380, 3857, 601, 4052 endconst .macro smull_smlal d0, d1, s0, s1, c0, c1, sz smull \d0\().4s, \s0\().4h, \c0 smlal \d0\().4s, \s1\().4h, \c1 .ifc \sz, .8h smull2 \d1\().4s, \s0\().8h, \c0 smlal2 \d1\().4s, \s1\().8h, \c1 .endif .endm .macro smull_smlsl d0, d1, s0, s1, c0, c1, sz smull \d0\().4s, \s0\().4h, \c0 smlsl \d0\().4s, \s1\().4h, \c1 .ifc \sz, .8h smull2 \d1\().4s, \s0\().8h, \c0 smlsl2 \d1\().4s, \s1\().8h, \c1 .endif .endm .macro sqrshrn_sz d0, s0, s1, shift, sz sqrshrn \d0\().4h, \s0\().4s, \shift .ifc \sz, .8h sqrshrn2 \d0\().8h, \s1\().4s, \shift .endif .endm .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 sqrdmulh \r0\sz, \r0\sz, \c sqrdmulh \r1\sz, \r1\sz, \c sqrdmulh \r2\sz, \r2\sz, \c sqrdmulh \r3\sz, \r3\sz, \c .ifnb \r4 sqrdmulh \r4\sz, \r4\sz, \c sqrdmulh \r5\sz, \r5\sz, \c 
sqrdmulh \r6\sz, \r6\sz, \c sqrdmulh \r7\sz, \r7\sz, \c .endif .endm .macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 .ifnb \load ld1 {\load}, [\src], x1 .endif .ifnb \shift srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc uaddw \adddst, \adddst, \addsrc .endif .ifnb \narrowsrc sqxtun \narrowdst, \narrowsrc .endif .ifnb \store st1 {\store}, [\dst], x1 .endif .endm .macro load_add_store_8x16 dst, src mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src load_add_store v3.8b, v17.8h, , , , , , \dst, \src load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src load_add_store v4.8b, v24.8h, v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src load_add_store v5.8b, v25.8h, v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src load_add_store v6.8b, v26.8h, v4.8b, v24.8h, v23.8h, v3.8b, v2.8b, \dst, \src load_add_store v7.8b, v27.8h, v5.8b, v25.8h, v24.8h, v4.8b, v3.8b, \dst, \src load_add_store v2.8b, v28.8h, v6.8b, v26.8h, v25.8h, v5.8b, v4.8b, \dst, \src load_add_store v3.8b, v29.8h, v7.8b, v27.8h, v26.8h, v6.8b, v5.8b, \dst, \src load_add_store v4.8b, v30.8h, v2.8b, v28.8h, v27.8h, v7.8b, v6.8b, \dst, \src load_add_store v5.8b, v31.8h, v3.8b, v29.8h, v28.8h, v2.8b, v7.8b, \dst, \src load_add_store , , v4.8b, v30.8h, v29.8h, v3.8b, v2.8b, \dst, \src load_add_store , , v5.8b, v31.8h, v30.8h, v4.8b, v3.8b, \dst, \src load_add_store , , , , v31.8h, v5.8b, v4.8b, \dst, \src load_add_store , , , , , , v5.8b, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src, \shiftbits load_add_store v3.8b, v17.8h, , , , , , \dst, \src, \shiftbits load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src, \shiftbits load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src, \shiftbits load_add_store v6.8b, v20.8h, v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src, \shiftbits load_add_store v7.8b, v21.8h, v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src, \shiftbits load_add_store v2.8b, v22.8h, v6.8b, v20.8h, v19.8h, v5.8b, v4.8b, \dst, \src, \shiftbits load_add_store v3.8b, v23.8h, v7.8b, v21.8h, v20.8h, v6.8b, v5.8b, \dst, \src, \shiftbits load_add_store , , v2.8b, v22.8h, v21.8h, v7.8b, v6.8b, \dst, \src, \shiftbits load_add_store , , v3.8b, v23.8h, v22.8h, v2.8b, v7.8b, \dst, \src, \shiftbits load_add_store , , , , v23.8h, v3.8b, v2.8b, \dst, \src, \shiftbits load_add_store , , , , , , v3.8b, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src mov \src, \dst load_add_store v2.8b, v16.8h, , , , , , \dst, \src load_add_store v3.8b, v17.8h, , , , , , \dst, \src load_add_store v4.8b, v18.8h, v2.8b, v16.8h, , , , \dst, \src load_add_store v5.8b, v19.8h, v3.8b, v17.8h, v16.8h, v2.8b, , \dst, \src load_add_store , , v4.8b, v18.8h, v17.8h, v3.8b, v2.8b, \dst, \src load_add_store , , v5.8b, v19.8h, v18.8h, v4.8b, v3.8b, \dst, \src load_add_store , , , , v19.8h, v5.8b, v4.8b, \dst, \src load_add_store , , , , , , v5.8b, \dst, \src .endm .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src .ifnb \load ld1 {\load}[0], 
[\src], x1 .endif .ifnb \inssrc ins \insdst\().d[1], \inssrc\().d[0] .endif .ifnb \shift srshr \shift, \shift, #4 .endif .ifnb \load ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc uaddw \adddst, \adddst, \addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif .ifnb \narrowsrc sqxtun \narrowdst, \narrowsrc .endif .ifnb \store st1 {\store}[1], [\dst], x1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src load_add_store4 v4.s, v25, v24, v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src load_add_store4 v5.s, v27, v26, v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src load_add_store4 v6.s, v29, v28, v24.8h, v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src load_add_store4 v7.s, v31, v30, v26.8h, v4.8b, v24.8h, v22.8h, v3.8b, v2.s, \dst, \src load_add_store4 , , , v28.8h, v5.8b, v26.8h, v24.8h, v4.8b, v3.s, \dst, \src load_add_store4 , , , v30.8h, v6.8b, v28.8h, v26.8h, v5.8b, v4.s, \dst, \src load_add_store4 , , , , v7.8b, v30.8h, v28.8h, v6.8b, v5.s, \dst, \src load_add_store4 , , , , , , v30.8h, v7.8b, v6.s, \dst, \src load_add_store4 , , , , , , , , v7.s, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst load_add_store4 v0.s, v17, v16, , , , , , , \dst, \src load_add_store4 v1.s, v19, v18, , , , , , , \dst, \src load_add_store4 v2.s, v21, v20, v16.8h, , , , , , \dst, \src load_add_store4 v3.s, v23, v22, v18.8h, v0.8b, v16.8h, , , , \dst, \src load_add_store4 , , , v20.8h, v1.8b, v18.8h, v16.8h, v0.8b, , \dst, \src load_add_store4 , , , v22.8h, v2.8b, v20.8h, v18.8h, v1.8b, v0.s, \dst, \src load_add_store4 , , , , v3.8b, v22.8h, v20.8h, v2.8b, v1.s, \dst, \src load_add_store4 , , , , , , v22.8h, v3.8b, v2.s, \dst, \src load_add_store4 , , , , , , , , v3.s, \dst, \src .endm .macro idct_dc w, h, shift cbnz w3, 1f mov w16, #2896*8 ld1r {v16.8h}, [x2] dup v0.4h, w16 sqrdmulh v16.8h, v16.8h, v0.h[0] strh wzr, [x2] .if (\w == 2*\h) || (2*\w == \h) sqrdmulh v16.8h, v16.8h, v0.h[0] .endif .if \shift > 0 srshr v16.8h, v16.8h, #\shift .endif sqrdmulh v16.8h, v16.8h, v0.h[0] srshr v16.8h, v16.8h, #4 mov w4, #\h b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon 1: ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 subs w4, w4, #4 sub x0, x0, x1, lsl #2 uaddw v0.8h, v16.8h, v0.8b sqxtun v0.8b, v0.8h uaddw v1.8h, v16.8h, v1.8b st1 {v0.s}[0], [x0], x1 sqxtun v1.8b, v1.8h st1 {v0.s}[1], [x0], x1 st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 b.gt 1b ret endfunc function idct_dc_w8_neon 1: ld1 {v0.8b}, [x0], x1 ld1 {v1.8b}, [x0], x1 ld1 {v2.8b}, [x0], x1 uaddw v20.8h, v16.8h, v0.8b ld1 {v3.8b}, [x0], x1 sub x0, x0, x1, lsl #2 subs w4, w4, #4 uaddw v21.8h, v16.8h, v1.8b sqxtun v0.8b, v20.8h uaddw v22.8h, v16.8h, v2.8b sqxtun v1.8b, v21.8h uaddw v23.8h, v16.8h, v3.8b st1 {v0.8b}, [x0], x1 sqxtun v2.8b, v22.8h st1 {v1.8b}, [x0], x1 sqxtun v3.8b, v23.8h st1 {v2.8b}, [x0], x1 st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w16_neon 1: ld1 {v0.16b}, [x0], x1 ld1 {v1.16b}, [x0], x1 ld1 {v2.16b}, [x0], x1 subs w4, w4, #4 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b ld1 {v3.16b}, [x0], x1 uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b sub x0, x0, x1, lsl #2 uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h sqxtun2 v0.16b, 
v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h st1 {v0.16b}, [x0], x1 sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v1.16b}, [x0], x1 st1 {v2.16b}, [x0], x1 st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w32_neon 1: ld1 {v0.16b, v1.16b}, [x0], x1 subs w4, w4, #2 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b ld1 {v2.16b, v3.16b}, [x0] uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b sub x0, x0, x1 uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h sqxtun2 v0.16b, v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h st1 {v0.16b, v1.16b}, [x0], x1 sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v2.16b, v3.16b}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w64_neon 1: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] subs w4, w4, #1 uaddw v20.8h, v16.8h, v0.8b uaddw2 v21.8h, v16.8h, v0.16b uaddw v22.8h, v16.8h, v1.8b uaddw2 v23.8h, v16.8h, v1.16b uaddw v24.8h, v16.8h, v2.8b uaddw2 v25.8h, v16.8h, v2.16b sqxtun v0.8b, v20.8h sqxtun2 v0.16b, v21.8h uaddw v26.8h, v16.8h, v3.8b uaddw2 v27.8h, v16.8h, v3.16b sqxtun v1.8b, v22.8h sqxtun2 v1.16b, v23.8h sqxtun v2.8b, v24.8h sqxtun2 v2.16b, v25.8h sqxtun v3.8b, v26.8h sqxtun2 v3.16b, v27.8h st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 b.gt 1b ret endfunc .macro iwht4 add v16.4h, v16.4h, v17.4h sub v21.4h, v18.4h, v19.4h sub v20.4h, v16.4h, v21.4h sshr v20.4h, v20.4h, #1 sub v18.4h, v20.4h, v17.4h sub v17.4h, v20.4h, v19.4h add v19.4h, v21.4h, v18.4h sub v16.4h, v16.4h, v17.4h .endm .macro idct_4 r0, r1, r2, r3, sz smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz sqrshrn_sz v6, v6, v7, #12, \sz sqrshrn_sz v7, v4, v5, #12, \sz smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz sqrshrn_sz v2, v2, v3, #12, \sz sqrshrn_sz v3, v4, v5, #12, \sz sqadd \r0\sz, v2\sz, v6\sz sqsub \r3\sz, v2\sz, v6\sz sqadd \r1\sz, v3\sz, v7\sz sqsub \r2\sz, v3\sz, v7\sz .endm function inv_dct_4h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .4h ret endfunc function inv_dct_8h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .8h ret endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.8h}, [x16] ssubl v3.4s, v16.4h, v18.4h smull v4.4s, v16.4h, v0.h[0] smlal v4.4s, v18.4h, v0.h[1] smlal v4.4s, v19.4h, v0.h[2] smull v7.4s, v17.4h, v0.h[3] saddw v3.4s, v3.4s, v19.4h smull v5.4s, v16.4h, v0.h[2] smlsl v5.4s, v18.4h, v0.h[0] smlsl v5.4s, v19.4h, v0.h[1] add \o3\().4s, v4.4s, v5.4s mul \o2\().4s, v3.4s, v0.s[2] add \o0\().4s, v4.4s, v7.4s add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s sqrshrn \o0\().4h, \o0\().4s, #12 sqrshrn \o2\().4h, \o2\().4s, #12 sqrshrn \o1\().4h, \o1\().4s, #12 sqrshrn \o3\().4h, \o3\().4s, #12 .endm function inv_adst_4h_x4_neon, export=1 iadst_4x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 v19, v18, v17, v16 ret endfunc .macro iadst_8x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.8h}, [x16] ssubl v2.4s, v16.4h, v18.4h ssubl2 v3.4s, v16.8h, v18.8h smull v4.4s, v16.4h, v0.h[0] smlal v4.4s, v18.4h, v0.h[1] smlal v4.4s, v19.4h, v0.h[2] smull2 v5.4s, v16.8h, v0.h[0] smlal2 v5.4s, v18.8h, v0.h[1] smlal2 v5.4s, v19.8h, v0.h[2] saddw v2.4s, 
v2.4s, v19.4h saddw2 v3.4s, v3.4s, v19.8h smull v6.4s, v16.4h, v0.h[2] smlsl v6.4s, v18.4h, v0.h[0] smlsl v6.4s, v19.4h, v0.h[1] smull2 v7.4s, v16.8h, v0.h[2] smlsl2 v7.4s, v18.8h, v0.h[0] smlsl2 v7.4s, v19.8h, v0.h[1] mul v18.4s, v2.4s, v0.s[2] mul v19.4s, v3.4s, v0.s[2] smull v2.4s, v17.4h, v0.h[3] smull2 v3.4s, v17.8h, v0.h[3] add v16.4s, v4.4s, v2.4s // out0 add v17.4s, v5.4s, v3.4s add v4.4s, v4.4s, v6.4s // out3 add v5.4s, v5.4s, v7.4s add v6.4s, v6.4s, v2.4s // out1 add v7.4s, v7.4s, v3.4s sub v4.4s, v4.4s, v2.4s // out3 sub v5.4s, v5.4s, v3.4s sqrshrn v18.4h, v18.4s, #12 sqrshrn2 v18.8h, v19.4s, #12 sqrshrn \o0\().4h, v16.4s, #12 sqrshrn2 \o0\().8h, v17.4s, #12 .ifc \o2, v17 mov v17.16b, v18.16b .endif sqrshrn \o1\().4h, v6.4s, #12 sqrshrn2 \o1\().8h, v7.4s, #12 sqrshrn \o3\().4h, v4.4s, #12 sqrshrn2 \o3\().8h, v5.4s, #12 .endm function inv_adst_8h_x4_neon, export=1 iadst_8x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 v19, v18, v17, v16 ret endfunc function inv_identity_4h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.4h, v16.4h, v0.h[0] sqrdmulh v5.4h, v17.4h, v0.h[0] sqrdmulh v6.4h, v18.4h, v0.h[0] sqrdmulh v7.4h, v19.4h, v0.h[0] sqadd v16.4h, v16.4h, v4.4h sqadd v17.4h, v17.4h, v5.4h sqadd v18.4h, v18.4h, v6.4h sqadd v19.4h, v19.4h, v7.4h ret endfunc function inv_identity_8h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.8h, v16.8h, v0.h[0] sqrdmulh v5.8h, v17.8h, v0.h[0] sqrdmulh v6.8h, v18.8h, v0.h[0] sqrdmulh v7.8h, v19.8h, v0.h[0] sqadd v16.8h, v16.8h, v4.8h sqadd v17.8h, v17.8h, v5.8h sqadd v18.8h, v18.8h, v6.8h sqadd v19.8h, v19.8h, v7.8h ret endfunc .macro identity_8x4_shift1 r0, r1, r2, r3, c .irp i, \r0\().8h, \r1\().8h, \r2\().8h, \r3\().8h sqrdmulh v2.8h, \i, \c srhadd \i, \i, v2.8h .endr .endm function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 mov x15, x30 movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v31.8h}, [x2], #16 sshr v16.4h, v16.4h, #2 sshr v17.4h, v17.4h, #2 sshr v18.4h, v18.4h, #2 sshr v19.4h, v19.4h, #2 iwht4 st1 {v31.8h}, [x2], #16 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 iwht4 ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v31.8h}, [x2], #16 blr x4 st1 {v31.8h}, [x2], #16 transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 blr x5 ld1 {v0.s}[0], [x0], x1 ld1 {v0.s}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.s}[0], [x0], x1 ld1 {v1.s}[1], [x0], x1 srshr v16.8h, v16.8h, #4 srshr v18.8h, v18.8h, #4 L(itx_4x4_end): sub x0, x0, x1, lsl #2 uaddw v16.8h, v16.8h, v0.8b sqxtun v0.8b, v16.8h uaddw v18.8h, v18.8h, v1.8b st1 {v0.s}[0], [x0], x1 sqxtun v1.8b, v18.8h st1 {v0.s}[1], [x0], x1 st1 {v1.s}[0], [x0], x1 st1 {v1.s}[1], [x0], x1 ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct cbnz w3, 1f mov w16, #2896*8 ld1r {v16.8h}, [x2] dup v4.8h, w16 strh wzr, [x2] sqrdmulh v16.8h, v16.8h, v4.h[0] ld1 {v0.s}[0], [x0], x1 sqrdmulh v20.8h, v16.8h, v4.h[0] ld1 {v0.s}[1], [x0], x1 srshr v16.8h, v20.8h, #4 ld1 {v1.s}[0], [x0], x1 srshr v18.8h, v20.8h, #4 ld1 {v1.s}[1], [x0], x1 b L(itx_4x4_end) 1: .endif adr x4, inv_\txfm1\()_4h_x4_neon adr x5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, 
dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7, sz, szb idct_4 \r0, \r2, \r4, \r6, \sz smull_smlsl v2, v3, \r1, \r7, v0.h[4], v0.h[5], \sz // -> t4a smull_smlal v4, v5, \r1, \r7, v0.h[5], v0.h[4], \sz // -> t7a smull_smlsl v6, v7, \r5, \r3, v0.h[6], v0.h[7], \sz // -> t5a sqrshrn_sz \r1, v2, v3, #12, \sz // t4a sqrshrn_sz \r7, v4, v5, #12, \sz // t7a smull_smlal v2, v3, \r5, \r3, v0.h[7], v0.h[6], \sz // -> t6a sqrshrn_sz \r3, v6, v7, #12, \sz // t5a sqrshrn_sz \r5, v2, v3, #12, \sz // t6a sqadd v2\sz, \r1\sz, \r3\sz // t4 sqsub \r1\sz, \r1\sz, \r3\sz // t5a sqadd v3\sz, \r7\sz, \r5\sz // t7 sqsub \r3\sz, \r7\sz, \r5\sz // t6a smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5 smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6 sqrshrn_sz v4, v4, v5, #12, \sz // t5 sqrshrn_sz v5, v6, v7, #12, \sz // t6 sqsub \r7\sz, \r0\sz, v3\sz // out7 sqadd \r0\sz, \r0\sz, v3\sz // out0 sqadd \r1\sz, \r2\sz, v5\sz // out1 sqsub v6\sz, \r2\sz, v5\sz // out6 sqadd \r2\sz, \r4\sz, v4\sz // out2 sqsub \r5\sz, \r4\sz, v4\sz // out5 sqadd \r3\sz, \r6\sz, v2\sz // out3 sqsub \r4\sz, \r6\sz, v2\sz // out4 mov \r6\szb, v6\szb // out6 .endm function inv_dct_8h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b ret endfunc function inv_dct_4h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b ret endfunc .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7, sz movrel x16, iadst8_coeffs ld1 {v0.8h, v1.8h}, [x16] smull_smlal v2, v3, v23, v16, v0.h[0], v0.h[1], \sz smull_smlsl v4, v5, v23, v16, v0.h[1], v0.h[0], \sz smull_smlal v6, v7, v21, v18, v0.h[2], v0.h[3], \sz sqrshrn_sz v16, v2, v3, #12, \sz // t0a sqrshrn_sz v23, v4, v5, #12, \sz // t1a smull_smlsl v2, v3, v21, v18, v0.h[3], v0.h[2], \sz smull_smlal v4, v5, v19, v20, v0.h[4], v0.h[5], \sz sqrshrn_sz v18, v6, v7, #12, \sz // t2a sqrshrn_sz v21, v2, v3, #12, \sz // t3a smull_smlsl v6, v7, v19, v20, v0.h[5], v0.h[4], \sz smull_smlal v2, v3, v17, v22, v0.h[6], v0.h[7], \sz sqrshrn_sz v20, v4, v5, #12, \sz // t4a sqrshrn_sz v19, v6, v7, #12, \sz // t5a smull_smlsl v4, v5, v17, v22, v0.h[7], v0.h[6], \sz sqrshrn_sz v22, v2, v3, #12, \sz // t6a sqrshrn_sz v17, v4, v5, #12, \sz // t7a sqadd v2\sz, v16\sz, v20\sz // t0 sqsub v3\sz, v16\sz, v20\sz // t4 sqadd v4\sz, v23\sz, v19\sz // t1 sqsub v5\sz, v23\sz, v19\sz // t5 sqadd v6\sz, v18\sz, v22\sz // t2 sqsub v7\sz, v18\sz, v22\sz // t6 sqadd v18\sz, v21\sz, v17\sz // t3 sqsub v19\sz, v21\sz, v17\sz // t7 smull_smlal v16, v17, v3, v5, v1.h[3], v1.h[2], \sz smull_smlsl v20, v21, v3, v5, v1.h[2], v1.h[3], \sz smull_smlsl v22, v23, v19, v7, v1.h[3], v1.h[2], \sz sqrshrn_sz v3, v16, v17, #12, \sz // t4a sqrshrn_sz v5, v20, v21, #12, \sz // t5a smull_smlal v16, v17, v19, v7, v1.h[2], v1.h[3], \sz sqrshrn_sz v7, v22, v23, #12, \sz // t6a sqrshrn_sz v19, v16, v17, #12, \sz // t7a sqadd \o0\()\sz, v2\sz, v6\sz // out0 sqsub v2\sz, v2\sz, v6\sz // t2 sqadd \o7\()\sz, v4\sz, v18\sz // out7 sqsub v4\sz, v4\sz, v18\sz // t3 sqneg \o7\()\sz, \o7\()\sz // out7 sqadd \o1\()\sz, v3\sz, v7\sz // out1 sqsub 
v3\sz, v3\sz, v7\sz // t6 sqadd \o6\()\sz, v5\sz, v19\sz // out6 sqsub v5\sz, v5\sz, v19\sz // t7 sqneg \o1\()\sz, \o1\()\sz // out1 smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20) smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19) smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18) sqrshrn_sz v2, v18, v19, #12, \sz // out3 smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21) sqrshrn_sz v3, v20, v21, #12, \sz // out5 sqrshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21) sqrshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19) sqneg \o3\()\sz, v2\sz // out3 sqneg \o5\()\sz, v3\sz // out5 .endm function inv_adst_8h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h ret endfunc function inv_flipadst_8h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h ret endfunc function inv_adst_4h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h ret endfunc function inv_flipadst_4h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h ret endfunc function inv_identity_8h_x8_neon, export=1 sqshl v16.8h, v16.8h, #1 sqshl v17.8h, v17.8h, #1 sqshl v18.8h, v18.8h, #1 sqshl v19.8h, v19.8h, #1 sqshl v20.8h, v20.8h, #1 sqshl v21.8h, v21.8h, #1 sqshl v22.8h, v22.8h, #1 sqshl v23.8h, v23.8h, #1 ret endfunc function inv_identity_4h_x8_neon, export=1 sqshl v16.4h, v16.4h, #1 sqshl v17.4h, v17.4h, #1 sqshl v18.4h, v18.4h, #1 sqshl v19.4h, v19.4h, #1 sqshl v20.4h, v20.4h, #1 sqshl v21.4h, v21.4h, #1 sqshl v22.4h, v22.4h, #1 sqshl v23.4h, v23.4h, #1 ret endfunc .macro def_fn_8x8_base variant function inv_txfm_\variant\()add_8x8_neon movi v28.8h, #0 movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2], #64 ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out .else blr x4 srshr v16.8h, v16.8h, #1 srshr v17.8h, v17.8h, #1 srshr v18.8h, v18.8h, #1 srshr v19.8h, v19.8h, #1 srshr v20.8h, v20.8h, #1 srshr v21.8h, v21.8h, #1 srshr v22.8h, v22.8h, #1 srshr v23.8h, v23.8h, #1 .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 blr x5 load_add_store_8x8 x0, x7 ret x15 endfunc .endm def_fn_8x8_base def_fn_8x8_base identity_ .macro def_fn_8x8 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif adr x5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else adr x4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc .endm def_fn_8x8 dct, dct def_fn_8x8 identity, identity def_fn_8x8 dct, adst def_fn_8x8 dct, flipadst def_fn_8x8 dct, identity def_fn_8x8 adst, dct def_fn_8x8 adst, adst def_fn_8x8 adst, flipadst def_fn_8x8 flipadst, dct def_fn_8x8 flipadst, adst def_fn_8x8 flipadst, flipadst def_fn_8x8 identity, dct def_fn_8x8 adst, identity def_fn_8x8 flipadst, identity def_fn_8x8 identity, adst def_fn_8x8 identity, flipadst function inv_txfm_add_8x4_neon movi v30.8h, #0 movi v31.8h, #0 mov w16, #2896*8 dup v0.4h, w16 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] st1 {v30.8h,v31.8h}, [x2], #32 ld1 {v20.4h,v21.4h,v22.4h,v23.4h}, [x2] st1 {v30.8h,v31.8h}, [x2] scale_input .4h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 ins v16.d[1], 
v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] blr x5 load_add_store_8x4 x0, x7 ret x15 endfunc function inv_txfm_add_4x8_neon movi v28.8h, #0 movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 mov w16, #2896*8 dup v0.4h, w16 ld1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x2] st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x2] scale_input .8h, v0.h[0], v16, v17, v18, v19 blr x4 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] ins v22.d[0], v18.d[1] ins v23.d[0], v19.d[1] blr x5 load_add_store_4x8 x0, x7 ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct def_fn_48 \w, \h, identity, identity def_fn_48 \w, \h, dct, adst def_fn_48 \w, \h, dct, flipadst def_fn_48 \w, \h, dct, identity def_fn_48 \w, \h, adst, dct def_fn_48 \w, \h, adst, adst def_fn_48 \w, \h, adst, flipadst def_fn_48 \w, \h, flipadst, dct def_fn_48 \w, \h, flipadst, adst def_fn_48 \w, \h, flipadst, flipadst def_fn_48 \w, \h, identity, dct def_fn_48 \w, \h, adst, identity def_fn_48 \w, \h, flipadst, identity def_fn_48 \w, \h, identity, adst def_fn_48 \w, \h, identity, flipadst .endm def_fns_48 4, 8 def_fns_48 8, 4 .macro idct_16 sz, szb idct_8 v16, v18, v20, v22, v24, v26, v28, v30, \sz, \szb smull_smlsl v2, v3, v17, v31, v1.h[0], v1.h[1], \sz // -> t8a smull_smlal v4, v5, v17, v31, v1.h[1], v1.h[0], \sz // -> t15a smull_smlsl v6, v7, v25, v23, v1.h[2], v1.h[3], \sz // -> t9a sqrshrn_sz v17, v2, v3, #12, \sz // t8a sqrshrn_sz v31, v4, v5, #12, \sz // t15a smull_smlal v2, v3, v25, v23, v1.h[3], v1.h[2], \sz // -> t14a smull_smlsl v4, v5, v21, v27, v1.h[4], v1.h[5], \sz // -> t10a sqrshrn_sz v23, v6, v7, #12, \sz // t9a sqrshrn_sz v25, v2, v3, #12, \sz // t14a smull_smlal v6, v7, v21, v27, v1.h[5], v1.h[4], \sz // -> t13a smull_smlsl v2, v3, v29, v19, v1.h[6], v1.h[7], \sz // -> t11a sqrshrn_sz v21, v4, v5, #12, \sz // t10a sqrshrn_sz v27, v6, v7, #12, \sz // t13a smull_smlal v4, v5, v29, v19, v1.h[7], v1.h[6], \sz // -> t12a sqrshrn_sz v19, v2, v3, #12, \sz // t11a sqrshrn_sz v29, v4, v5, #12, \sz // t12a sqsub v2\sz, v17\sz, v23\sz // t9 sqadd v17\sz, v17\sz, v23\sz // t8 sqsub v3\sz, v31\sz, v25\sz // t14 sqadd v31\sz, v31\sz, v25\sz // t15 sqsub v23\sz, v19\sz, v21\sz // t10 sqadd v19\sz, v19\sz, v21\sz // t11 sqadd v25\sz, v29\sz, v27\sz // t12 sqsub v29\sz, v29\sz, v27\sz // t13 smull_smlsl v4, v5, v3, v2, v0.h[2], v0.h[3], \sz // -> t9a smull_smlal v6, v7, v3, v2, v0.h[3], v0.h[2], \sz // -> t14a sqrshrn_sz v21, v4, v5, #12, \sz // t9a sqrshrn_sz v27, v6, v7, #12, \sz // t14a smull_smlsl v4, v5, v29, v23, v0.h[2], v0.h[3], \sz // -> t13a smull_smlal v6, v7, v29, v23, v0.h[3], v0.h[2], \sz // -> t10a sqrshrn_sz v29, v4, v5, #12, \sz // t13a neg v6.4s, v6.4s .ifc \sz, .8h neg v7.4s, v7.4s .endif sqrshrn_sz v23, v6, v7, #12, \sz // t10a sqsub v2\sz, v17\sz, v19\sz // t11a sqadd v17\sz, v17\sz, v19\sz // t8a sqsub v3\sz, v31\sz, v25\sz // t12a sqadd v31\sz, v31\sz, v25\sz // t15a sqadd v19\sz, v21\sz, v23\sz // t9 sqsub v21\sz, v21\sz, v23\sz // t10 sqsub v25\sz, v27\sz, v29\sz // t13 sqadd v27\sz, v27\sz, v29\sz // t14 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11 smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12 smull_smlsl v2, v3, v25, 
v21, v0.h[0], v0.h[0], \sz // -> t10a sqrshrn_sz v4, v4, v5, #12, \sz // t11 sqrshrn_sz v5, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t13a sqrshrn_sz v2, v2, v3, #12, \sz // t10a sqrshrn_sz v3, v6, v7, #12, \sz // t13a sqadd v6\sz, v16\sz, v31\sz // out0 sqsub v31\sz, v16\sz, v31\sz // out15 mov v16\szb, v6\szb sqadd v23\sz, v30\sz, v17\sz // out7 sqsub v7\sz, v30\sz, v17\sz // out8 sqadd v17\sz, v18\sz, v27\sz // out1 sqsub v30\sz, v18\sz, v27\sz // out14 sqadd v18\sz, v20\sz, v3\sz // out2 sqsub v29\sz, v20\sz, v3\sz // out13 sqadd v3\sz, v28\sz, v19\sz // out6 sqsub v25\sz, v28\sz, v19\sz // out9 sqadd v19\sz, v22\sz, v5\sz // out3 sqsub v28\sz, v22\sz, v5\sz // out12 sqadd v20\sz, v24\sz, v4\sz // out4 sqsub v27\sz, v24\sz, v4\sz // out11 sqadd v21\sz, v26\sz, v2\sz // out5 sqsub v26\sz, v26\sz, v2\sz // out10 mov v24\szb, v7\szb mov v22\szb, v3\szb .endm function inv_dct_8h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .8h, .16b ret endfunc function inv_dct_4h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .4h, .8b ret endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15, sz, szb movrel x16, iadst16_coeffs ld1 {v0.8h, v1.8h}, [x16] movrel x16, idct_coeffs smull_smlal v2, v3, v31, v16, v0.h[0], v0.h[1], \sz // -> t0 smull_smlsl v4, v5, v31, v16, v0.h[1], v0.h[0], \sz // -> t1 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t2 sqrshrn_sz v16, v2, v3, #12, \sz // t0 sqrshrn_sz v31, v4, v5, #12, \sz // t1 smull_smlsl v2, v3, v29, v18, v0.h[3], v0.h[2], \sz // -> t3 smull_smlal v4, v5, v27, v20, v0.h[4], v0.h[5], \sz // -> t4 sqrshrn_sz v18, v6, v7, #12, \sz // t2 sqrshrn_sz v29, v2, v3, #12, \sz // t3 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t5 smull_smlal v2, v3, v25, v22, v0.h[6], v0.h[7], \sz // -> t6 sqrshrn_sz v20, v4, v5, #12, \sz // t4 sqrshrn_sz v27, v6, v7, #12, \sz // t5 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t7 smull_smlal v6, v7, v23, v24, v1.h[0], v1.h[1], \sz // -> t8 sqrshrn_sz v22, v2, v3, #12, \sz // t6 sqrshrn_sz v25, v4, v5, #12, \sz // t7 smull_smlsl v2, v3, v23, v24, v1.h[1], v1.h[0], \sz // -> t9 smull_smlal v4, v5, v21, v26, v1.h[2], v1.h[3], \sz // -> t10 sqrshrn_sz v23, v6, v7, #12, \sz // t8 sqrshrn_sz v24, v2, v3, #12, \sz // t9 smull_smlsl v6, v7, v21, v26, v1.h[3], v1.h[2], \sz // -> t11 smull_smlal v2, v3, v19, v28, v1.h[4], v1.h[5], \sz // -> t12 sqrshrn_sz v21, v4, v5, #12, \sz // t10 sqrshrn_sz v26, v6, v7, #12, \sz // t11 smull_smlsl v4, v5, v19, v28, v1.h[5], v1.h[4], \sz // -> t13 smull_smlal v6, v7, v17, v30, v1.h[6], v1.h[7], \sz // -> t14 sqrshrn_sz v19, v2, v3, #12, \sz // t12 sqrshrn_sz v28, v4, v5, #12, \sz // t13 smull_smlsl v2, v3, v17, v30, v1.h[7], v1.h[6], \sz // -> t15 sqrshrn_sz v17, v6, v7, #12, \sz // t14 sqrshrn_sz v30, v2, v3, #12, \sz // t15 ld1 {v0.8h}, [x16] sqsub v2\sz, v16\sz, v23\sz // t8a sqadd v16\sz, v16\sz, v23\sz // t0a sqsub v3\sz, v31\sz, v24\sz // t9a sqadd v31\sz, v31\sz, v24\sz // t1a sqadd v23\sz, v18\sz, v21\sz // t2a sqsub v18\sz, v18\sz, v21\sz // t10a sqadd v24\sz, v29\sz, v26\sz // t3a sqsub v29\sz, v29\sz, v26\sz // t11a sqadd v21\sz, v20\sz, v19\sz // t4a sqsub v20\sz, v20\sz, v19\sz // t12a sqadd v26\sz, v27\sz, v28\sz // t5a sqsub v27\sz, v27\sz, v28\sz // t13a sqadd v19\sz, v22\sz, v17\sz // t6a sqsub v22\sz, v22\sz, v17\sz // t14a sqadd v28\sz, v25\sz, v30\sz // t7a sqsub v25\sz, v25\sz, v30\sz // t15a smull_smlal v4, v5, v2, 
v3, v0.h[5], v0.h[4], \sz // -> t8 smull_smlsl v6, v7, v2, v3, v0.h[4], v0.h[5], \sz // -> t9 smull_smlal v2, v3, v18, v29, v0.h[7], v0.h[6], \sz // -> t10 sqrshrn_sz v17, v4, v5, #12, \sz // t8 sqrshrn_sz v30, v6, v7, #12, \sz // t9 smull_smlsl v4, v5, v18, v29, v0.h[6], v0.h[7], \sz // -> t11 smull_smlsl v6, v7, v27, v20, v0.h[5], v0.h[4], \sz // -> t12 sqrshrn_sz v18, v2, v3, #12, \sz // t10 sqrshrn_sz v29, v4, v5, #12, \sz // t11 smull_smlal v2, v3, v27, v20, v0.h[4], v0.h[5], \sz // -> t13 smull_smlsl v4, v5, v25, v22, v0.h[7], v0.h[6], \sz // -> t14 sqrshrn_sz v27, v6, v7, #12, \sz // t12 sqrshrn_sz v20, v2, v3, #12, \sz // t13 smull_smlal v6, v7, v25, v22, v0.h[6], v0.h[7], \sz // -> t15 sqrshrn_sz v25, v4, v5, #12, \sz // t14 sqrshrn_sz v22, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t4 sqadd v16\sz, v16\sz, v21\sz // t0 sqsub v3\sz, v31\sz, v26\sz // t5 sqadd v31\sz, v31\sz, v26\sz // t1 sqadd v21\sz, v23\sz, v19\sz // t2 sqsub v23\sz, v23\sz, v19\sz // t6 sqadd v26\sz, v24\sz, v28\sz // t3 sqsub v24\sz, v24\sz, v28\sz // t7 sqadd v19\sz, v17\sz, v27\sz // t8a sqsub v17\sz, v17\sz, v27\sz // t12a sqadd v28\sz, v30\sz, v20\sz // t9a sqsub v30\sz, v30\sz, v20\sz // t13a sqadd v27\sz, v18\sz, v25\sz // t10a sqsub v18\sz, v18\sz, v25\sz // t14a sqadd v20\sz, v29\sz, v22\sz // t11a sqsub v29\sz, v29\sz, v22\sz // t15a smull_smlal v4, v5, v2, v3, v0.h[3], v0.h[2], \sz // -> t4a smull_smlsl v6, v7, v2, v3, v0.h[2], v0.h[3], \sz // -> t5a smull_smlsl v2, v3, v24, v23, v0.h[3], v0.h[2], \sz // -> t6a sqrshrn_sz v22, v4, v5, #12, \sz // t4a sqrshrn_sz v25, v6, v7, #12, \sz // t5a smull_smlal v4, v5, v24, v23, v0.h[2], v0.h[3], \sz // -> t7a smull_smlal v6, v7, v17, v30, v0.h[3], v0.h[2], \sz // -> t12 sqrshrn_sz v24, v2, v3, #12, \sz // t6a sqrshrn_sz v23, v4, v5, #12, \sz // t7a smull_smlsl v2, v3, v17, v30, v0.h[2], v0.h[3], \sz // -> t13 smull_smlsl v4, v5, v29, v18, v0.h[3], v0.h[2], \sz // -> t14 sqrshrn_sz v17, v6, v7, #12, \sz // t12 smull_smlal v6, v7, v29, v18, v0.h[2], v0.h[3], \sz // -> t15 sqrshrn_sz v29, v2, v3, #12, \sz // t13 sqrshrn_sz v30, v4, v5, #12, \sz // t14 sqrshrn_sz v18, v6, v7, #12, \sz // t15 sqsub v2\sz, v16\sz, v21\sz // t2a .ifc \o0, v16 sqadd \o0\sz, v16\sz, v21\sz // out0 sqsub v21\sz, v31\sz, v26\sz // t3a sqadd \o15\sz, v31\sz, v26\sz // out15 .else sqadd v4\sz, v16\sz, v21\sz // out0 sqsub v21\sz, v31\sz, v26\sz // t3a sqadd \o15\sz, v31\sz, v26\sz // out15 mov \o0\szb, v4\szb .endif sqneg \o15\sz, \o15\sz // out15 sqsub v3\sz, v29\sz, v18\sz // t15a sqadd \o13\sz, v29\sz, v18\sz // out13 sqadd \o2\sz, v17\sz, v30\sz // out2 sqsub v26\sz, v17\sz, v30\sz // t14a sqneg \o13\sz, \o13\sz // out13 sqadd \o1\sz, v19\sz, v27\sz // out1 sqsub v27\sz, v19\sz, v27\sz // t10 sqadd \o14\sz, v28\sz, v20\sz // out14 sqsub v20\sz, v28\sz, v20\sz // t11 sqneg \o1\sz, \o1\sz // out1 sqadd \o3\sz, v22\sz, v24\sz // out3 sqsub v22\sz, v22\sz, v24\sz // t6 sqadd \o12\sz, v25\sz, v23\sz // out12 sqsub v23\sz, v25\sz, v23\sz // t7 sqneg \o3\sz, \o3\sz // out3 smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23) smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24) smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26) sqrshrn_sz v24, v24, v25, #12, \sz // out8 sqrshrn_sz v4, v4, v5, #12, \sz // out7 sqrshrn_sz v5, v6, v7, #12, \sz // out5 smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21) smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27) sqrshrn_sz 
v26, v6, v7, #12, \sz // out10 smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20) smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25) smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22) sqrshrn_sz \o4, v2, v3, #12, \sz // out4 sqrshrn_sz v6, v6, v7, #12, \sz // out11 sqrshrn_sz v7, v21, v25, #12, \sz // out9 sqrshrn_sz \o6, v22, v23, #12, \sz // out6 .ifc \o8, v23 mov \o8\szb, v24\szb mov \o10\szb, v26\szb .endif sqneg \o7\sz, v4\sz // out7 sqneg \o5\sz, v5\sz // out5 sqneg \o11\sz, v6\sz // out11 sqneg \o9\sz, v7\sz // out9 .endm function inv_adst_8h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b ret endfunc function inv_flipadst_8h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b ret endfunc function inv_adst_4h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b ret endfunc function inv_flipadst_4h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b ret endfunc function inv_identity_8h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 sqrdmulh v2.8h, v\i\().8h, v0.h[0] sqadd v\i\().8h, v\i\().8h, v\i\().8h sqadd v\i\().8h, v\i\().8h, v2.8h .endr ret endfunc function inv_identity_4h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 sqrdmulh v2.4h, v\i\().4h, v0.h[0] sqadd v\i\().4h, v\i\().4h, v\i\().4h sqadd v\i\().4h, v\i\().4h, v2.4h .endr ret endfunc .macro identity_8x16_shift2 c .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h sqrdmulh v2.8h, \i, \c sshr v2.8h, v2.8h, #1 srhadd \i, \i, v2.8h .endr .endm .macro identity_8x16_shift1 c .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h sqrdmulh v2.8h, \i, \c srshr v2.8h, v2.8h, #1 sqadd \i, \i, v2.8h .endr .endm .macro identity_8x8_shift1 c .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h sqrdmulh v2.8h, \i, \c srshr v2.8h, v2.8h, #1 sqadd \i, \i, v2.8h .endr .endm .macro identity_8x8 c .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h sqrdmulh v2.8h, \i, \c sqadd \i, \i, \i sqadd \i, \i, v2.8h .endr .endm .macro def_horz_16 scale=0, identity=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x8_neon AARCH64_VALID_CALL_TARGET mov x14, x30 movi v7.8h, #0 .if \identity mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .elseif \scale mov w16, #2896*8 dup v0.4h, w16 .endif .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr .if \scale scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif .if \identity identity_8x16_shift2 v0.h[0] .else blr x4 .endif .if \shift > 0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h srshr \i, \i, #\shift .endr .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, 
v29, v30, v31, v4, v5 .irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h st1 {\i}, [x6], #16 .endr ret x14 endfunc .endm def_horz_16 scale=0, identity=0, shift=2 def_horz_16 scale=1, identity=0, shift=1, suffix=_scale def_horz_16 scale=0, identity=1, shift=0, suffix=_identity function inv_txfm_add_vert_8x16_neon mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr blr x5 load_add_store_8x16 x6, x7 ret x14 endfunc function inv_txfm_add_16x16_neon mov x15, x30 sub sp, sp, #512 .irp i, 0, 8 add x6, sp, #(\i*16*2) .if \i == 8 cmp w3, w13 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #16*2 blr x9 .endr b 2f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr 2: .irp i, 0, 8 add x6, x0, #(\i) add x7, sp, #(\i*2) mov x8, #32 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #512 ret x15 endfunc .macro def_fn_16x16 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif .ifc \txfm1, identity adr x9, inv_txfm_horz_identity_16x8_neon .else adr x9, inv_txfm_horz_16x8_neon adr x4, inv_\txfm1\()_8h_x16_neon .endif adr x5, inv_\txfm2\()_8h_x16_neon mov x13, #\eob_half b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct, 36 def_fn_16x16 identity, identity, 36 def_fn_16x16 dct, adst, 36 def_fn_16x16 dct, flipadst, 36 def_fn_16x16 dct, identity, 8 def_fn_16x16 adst, dct, 36 def_fn_16x16 adst, adst, 36 def_fn_16x16 adst, flipadst, 36 def_fn_16x16 flipadst, dct, 36 def_fn_16x16 flipadst, adst, 36 def_fn_16x16 flipadst, flipadst, 36 def_fn_16x16 identity, dct, 8 .macro def_fn_416_base variant function inv_txfm_\variant\()add_16x4_neon mov x15, x30 movi v4.8h, #0 .ifc \variant, identity_ .irp i, v16.4h, v17.4h, v18.4h, v19.4h ld1 {\i}, [x2] st1 {v4.4h}, [x2], #8 .endr .irp i, v16.d, v17.d, v18.d, v19.d ld1 {\i}[1], [x2] st1 {v4.4h}, [x2], #8 .endr mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, v20.4h, v21.4h, v22.4h, v23.4h ld1 {\i}, [x2] st1 {v4.4h}, [x2], #8 .endr .irp i, v20.d, v21.d, v22.d, v23.d ld1 {\i}[1], [x2] st1 {v4.4h}, [x2], #8 .endr identity_8x16_shift1 v0.h[0] .else .irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h ld1 {\i}, [x2] st1 {v4.4h}, [x2], #8 .endr blr x4 ins v16.d[1], v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr .endif transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x4 x6, x7 .ifc \variant, identity_ mov v16.16b, v20.16b mov v17.16b, v21.16b mov v18.16b, v22.16b mov v19.16b, v23.16b .else ins v24.d[1], v28.d[0] ins v25.d[1], v29.d[0] ins v26.d[1], v30.d[0] ins v27.d[1], v31.d[0] srshr v16.8h, v24.8h, #1 srshr v17.8h, v25.8h, #1 srshr v18.8h, v26.8h, #1 srshr v19.8h, v27.8h, #1 .endif transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 add x6, x0, #8 load_add_store_8x4 x6, x7 ret x15 endfunc function inv_txfm_\variant\()add_4x16_neon mov x15, x30 movi v2.8h, #0 mov x11, #32 cmp w3, w13 b.lt 1f add x6, x2, #16 .ifc \variant, identity_ .irp i, v24.8h, v25.8h, v26.8h, v27.8h ld1 {\i}, [x6] st1 {v2.8h}, [x6], x11 .endr mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v24, v25, v26, v27, v0.h[0] .else .irp i, v16.8h, v17.8h, v18.8h, v19.8h ld1 {\i}, [x6] 
st1 {v2.8h}, [x6], x11 .endr blr x4 srshr v24.8h, v16.8h, #1 srshr v25.8h, v17.8h, #1 srshr v26.8h, v18.8h, #1 srshr v27.8h, v19.8h, #1 .endif transpose_4x8h v24, v25, v26, v27, v4, v5, v6, v7 ins v28.d[0], v24.d[1] ins v29.d[0], v25.d[1] ins v30.d[0], v26.d[1] ins v31.d[0], v27.d[1] b 2f 1: .irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h movi \i, #0 .endr 2: movi v2.8h, #0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h ld1 {\i}, [x2] st1 {v2.8h}, [x2], x11 .endr .ifc \variant, identity_ mov w16, #(5793-4096)*8 dup v0.4h, w16 identity_8x4_shift1 v16, v17, v18, v19, v0.h[0] .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h srshr \i, \i, #1 .endr .endif transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 ins v20.d[0], v16.d[1] ins v21.d[0], v17.d[1] ins v22.d[0], v18.d[1] ins v23.d[0], v19.d[1] blr x5 load_add_store_4x16 x0, x6 ret x15 endfunc .endm def_fn_416_base def_fn_416_base identity_ .macro def_fn_416 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 adr x4, inv_\txfm1\()_8h_x\w\()_neon adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else adr x4, inv_\txfm1\()_4h_x\w\()_neon adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct, 29 def_fn_416 \w, \h, identity, identity, 29 def_fn_416 \w, \h, dct, adst, 29 def_fn_416 \w, \h, dct, flipadst, 29 def_fn_416 \w, \h, dct, identity, 8 def_fn_416 \w, \h, adst, dct, 29 def_fn_416 \w, \h, adst, adst, 29 def_fn_416 \w, \h, adst, flipadst, 29 def_fn_416 \w, \h, flipadst, dct, 29 def_fn_416 \w, \h, flipadst, adst, 29 def_fn_416 \w, \h, flipadst, flipadst, 29 def_fn_416 \w, \h, identity, dct, 32 def_fn_416 \w, \h, adst, identity, 8 def_fn_416 \w, \h, flipadst, identity, 8 def_fn_416 \w, \h, identity, adst, 32 def_fn_416 \w, \h, identity, flipadst, 32 .endm def_fns_416 4, 16 def_fns_416 16, 4 .macro def_fn_816_base variant function inv_txfm_\variant\()add_16x8_neon mov x15, x30 movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x2] st1 {v4.8h}, [x2], #16 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .ifc \variant, identity_ mov w16, #2*(5793-4096)*8 dup v0.4h, w16 identity_8x16_shift1 v0.h[0] .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 mov x6, x0 load_add_store_8x8 x6, x7 .ifc \variant, identity_ mov v16.16b, v24.16b mov v17.16b, v25.16b mov v18.16b, v26.16b mov v19.16b, v27.16b mov v20.16b, v28.16b mov v21.16b, v29.16b mov v22.16b, v30.16b mov v23.16b, v31.16b .else srshr v16.8h, v24.8h, #1 srshr v17.8h, v25.8h, #1 srshr v18.8h, v26.8h, #1 srshr v19.8h, v27.8h, #1 srshr v20.8h, v28.8h, #1 srshr v21.8h, v29.8h, #1 srshr v22.8h, v30.8h, #1 srshr v23.8h, v31.8h, #1 .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 add x0, x0, #8 load_add_store_8x8 x0, x7 ret x15 endfunc function inv_txfm_\variant\()add_8x16_neon mov x15, x30 movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 mov x11, #32 cmp w3, w13 b.lt 1f add x6, x2, #16 .ifc \variant, identity_ .irp i, v24.8h, 
v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x6] st1 {v4.8h}, [x6], x11 .endr scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 // The identity shl #1 and downshift srshr #1 cancel out .else .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x6] st1 {v4.8h}, [x6], x11 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 srshr v24.8h, v16.8h, #1 srshr v25.8h, v17.8h, #1 srshr v26.8h, v18.8h, #1 srshr v27.8h, v19.8h, #1 srshr v28.8h, v20.8h, #1 srshr v29.8h, v21.8h, #1 srshr v30.8h, v22.8h, #1 srshr v31.8h, v23.8h, #1 .endif transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v2, v3 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr 2: movi v4.8h, #0 mov w16, #2896*8 dup v0.4h, w16 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v4.8h}, [x2], x11 .endr scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .ifc \variant, identity_ // The identity shl #1 and downshift srshr #1 cancel out .else blr x4 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h srshr \i, \i, #1 .endr .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3 blr x5 load_add_store_8x16 x0, x6 ret x15 endfunc .endm def_fn_816_base def_fn_816_base identity_ .macro def_fn_816 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif adr x4, inv_\txfm1\()_8h_x\w\()_neon adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon .else b inv_txfm_add_\w\()x\h\()_neon .endif endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct, 43 def_fn_816 \w, \h, identity, identity, 43 def_fn_816 \w, \h, dct, adst, 43 def_fn_816 \w, \h, dct, flipadst, 43 def_fn_816 \w, \h, dct, identity, 8 def_fn_816 \w, \h, adst, dct, 43 def_fn_816 \w, \h, adst, adst, 43 def_fn_816 \w, \h, adst, flipadst, 43 def_fn_816 \w, \h, flipadst, dct, 43 def_fn_816 \w, \h, flipadst, adst, 43 def_fn_816 \w, \h, flipadst, flipadst, 43 def_fn_816 \w, \h, identity, dct, 64 def_fn_816 \w, \h, adst, identity, 8 def_fn_816 \w, \h, flipadst, identity, 8 def_fn_816 \w, \h, identity, adst, 64 def_fn_816 \w, \h, identity, flipadst, 64 .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_8h_x16_neon, export=1 movrel x16, idct_coeffs, 2*16 ld1 {v0.8h, v1.8h}, [x16] sub x16, x16, #2*16 smull_smlsl v2, v3, v16, v31, v0.h[0], v0.h[1], .8h // -> t16a smull_smlal v4, v5, v16, v31, v0.h[1], v0.h[0], .8h // -> t31a smull_smlsl v6, v7, v24, v23, v0.h[2], v0.h[3], .8h // -> t17a sqrshrn_sz v16, v2, v3, #12, .8h // t16a sqrshrn_sz v31, v4, v5, #12, .8h // t31a smull_smlal v2, v3, v24, v23, v0.h[3], v0.h[2], .8h // -> t30a smull_smlsl v4, v5, v20, v27, v0.h[4], v0.h[5], .8h // -> t18a sqrshrn_sz v24, v6, v7, #12, .8h // t17a sqrshrn_sz v23, v2, v3, #12, .8h // t30a smull_smlal v6, v7, v20, v27, v0.h[5], v0.h[4], .8h // -> t29a smull_smlsl v2, v3, v28, v19, v0.h[6], v0.h[7], .8h // -> t19a sqrshrn_sz v20, v4, v5, #12, .8h // t18a sqrshrn_sz v27, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v28, v19, v0.h[7], v0.h[6], .8h // -> t28a smull_smlsl v6, v7, v18, v29, v1.h[0], v1.h[1], .8h // -> t20a sqrshrn_sz v28, v2, v3, #12, .8h // t19a sqrshrn_sz v19, v4, v5, #12, .8h // t28a smull_smlal v2, v3, v18, v29, v1.h[1], v1.h[0], .8h // -> t27a smull_smlsl v4, v5, v26, v21, 
v1.h[2], v1.h[3], .8h // -> t21a sqrshrn_sz v18, v6, v7, #12, .8h // t20a sqrshrn_sz v29, v2, v3, #12, .8h // t27a smull_smlal v6, v7, v26, v21, v1.h[3], v1.h[2], .8h // -> t26a smull_smlsl v2, v3, v22, v25, v1.h[4], v1.h[5], .8h // -> t22a sqrshrn_sz v26, v4, v5, #12, .8h // t21a sqrshrn_sz v21, v6, v7, #12, .8h // t26a smull_smlal v4, v5, v22, v25, v1.h[5], v1.h[4], .8h // -> t25a smull_smlsl v6, v7, v30, v17, v1.h[6], v1.h[7], .8h // -> t23a sqrshrn_sz v22, v2, v3, #12, .8h // t22a sqrshrn_sz v25, v4, v5, #12, .8h // t25a smull_smlal v2, v3, v30, v17, v1.h[7], v1.h[6], .8h // -> t24a sqrshrn_sz v30, v6, v7, #12, .8h // t23a sqrshrn_sz v17, v2, v3, #12, .8h // t24a ld1 {v0.8h}, [x16] sqsub v2.8h, v16.8h, v24.8h // t17 sqadd v16.8h, v16.8h, v24.8h // t16 sqsub v3.8h, v31.8h, v23.8h // t30 sqadd v31.8h, v31.8h, v23.8h // t31 sqsub v24.8h, v28.8h, v20.8h // t18 sqadd v28.8h, v28.8h, v20.8h // t19 sqadd v23.8h, v18.8h, v26.8h // t20 sqsub v18.8h, v18.8h, v26.8h // t21 sqsub v20.8h, v30.8h, v22.8h // t22 sqadd v30.8h, v30.8h, v22.8h // t23 sqadd v26.8h, v17.8h, v25.8h // t24 sqsub v17.8h, v17.8h, v25.8h // t25 sqsub v22.8h, v29.8h, v21.8h // t26 sqadd v29.8h, v29.8h, v21.8h // t27 sqadd v25.8h, v19.8h, v27.8h // t28 sqsub v19.8h, v19.8h, v27.8h // t29 smull_smlsl v4, v5, v3, v2, v0.h[4], v0.h[5], .8h // -> t17a smull_smlal v6, v7, v3, v2, v0.h[5], v0.h[4], .8h // -> t30a smull_smlal v2, v3, v19, v24, v0.h[5], v0.h[4], .8h // -> t18a sqrshrn_sz v21, v4, v5, #12, .8h // t17a sqrshrn_sz v27, v6, v7, #12, .8h // t30a neg v2.4s, v2.4s // -> t18a neg v3.4s, v3.4s // -> t18a smull_smlsl v4, v5, v19, v24, v0.h[4], v0.h[5], .8h // -> t29a smull_smlsl v6, v7, v22, v18, v0.h[6], v0.h[7], .8h // -> t21a sqrshrn_sz v19, v2, v3, #12, .8h // t18a sqrshrn_sz v24, v4, v5, #12, .8h // t29a smull_smlal v2, v3, v22, v18, v0.h[7], v0.h[6], .8h // -> t26a smull_smlal v4, v5, v17, v20, v0.h[7], v0.h[6], .8h // -> t22a sqrshrn_sz v22, v6, v7, #12, .8h // t21a sqrshrn_sz v18, v2, v3, #12, .8h // t26a neg v4.4s, v4.4s // -> t22a neg v5.4s, v5.4s // -> t22a smull_smlsl v6, v7, v17, v20, v0.h[6], v0.h[7], .8h // -> t25a sqrshrn_sz v17, v4, v5, #12, .8h // t22a sqrshrn_sz v20, v6, v7, #12, .8h // t25a sqsub v2.8h, v27.8h, v24.8h // t29 sqadd v27.8h, v27.8h, v24.8h // t30 sqsub v3.8h, v21.8h, v19.8h // t18 sqadd v21.8h, v21.8h, v19.8h // t17 sqsub v24.8h, v16.8h, v28.8h // t19a sqadd v16.8h, v16.8h, v28.8h // t16a sqsub v19.8h, v30.8h, v23.8h // t20a sqadd v30.8h, v30.8h, v23.8h // t23a sqsub v28.8h, v17.8h, v22.8h // t21 sqadd v17.8h, v17.8h, v22.8h // t22 sqadd v23.8h, v26.8h, v29.8h // t24a sqsub v26.8h, v26.8h, v29.8h // t27a sqadd v22.8h, v20.8h, v18.8h // t25 sqsub v20.8h, v20.8h, v18.8h // t26 sqsub v29.8h, v31.8h, v25.8h // t28a sqadd v31.8h, v31.8h, v25.8h // t31a smull_smlsl v4, v5, v2, v3, v0.h[2], v0.h[3], .8h // -> t18a smull_smlal v6, v7, v2, v3, v0.h[3], v0.h[2], .8h // -> t29a smull_smlsl v2, v3, v29, v24, v0.h[2], v0.h[3], .8h // -> t19 sqrshrn_sz v18, v4, v5, #12, .8h // t18a sqrshrn_sz v25, v6, v7, #12, .8h // t29a smull_smlal v4, v5, v29, v24, v0.h[3], v0.h[2], .8h // -> t28 smull_smlal v6, v7, v26, v19, v0.h[3], v0.h[2], .8h // -> t20 sqrshrn_sz v29, v2, v3, #12, .8h // t19 sqrshrn_sz v24, v4, v5, #12, .8h // t28 neg v6.4s, v6.4s // -> t20 neg v7.4s, v7.4s // -> t20 smull_smlsl v2, v3, v26, v19, v0.h[2], v0.h[3], .8h // -> t27 smull_smlal v4, v5, v20, v28, v0.h[3], v0.h[2], .8h // -> t21a sqrshrn_sz v26, v6, v7, #12, .8h // t20 sqrshrn_sz v19, v2, v3, #12, .8h // t27 neg v4.4s, v4.4s // -> t21a neg 
v5.4s, v5.4s // -> t21a smull_smlsl v6, v7, v20, v28, v0.h[2], v0.h[3], .8h // -> t26a sqrshrn_sz v20, v4, v5, #12, .8h // t21a sqrshrn_sz v28, v6, v7, #12, .8h // t26a sqsub v2.8h, v16.8h, v30.8h // t23 sqadd v16.8h, v16.8h, v30.8h // t16 = out16 sqsub v3.8h, v31.8h, v23.8h // t24 sqadd v31.8h, v31.8h, v23.8h // t31 = out31 sqsub v23.8h, v21.8h, v17.8h // t22a sqadd v17.8h, v21.8h, v17.8h // t17a = out17 sqadd v30.8h, v27.8h, v22.8h // t30a = out30 sqsub v21.8h, v27.8h, v22.8h // t25a sqsub v27.8h, v18.8h, v20.8h // t21 sqadd v18.8h, v18.8h, v20.8h // t18 = out18 sqadd v4.8h, v29.8h, v26.8h // t19a = out19 sqsub v26.8h, v29.8h, v26.8h // t20a sqadd v29.8h, v25.8h, v28.8h // t29 = out29 sqsub v25.8h, v25.8h, v28.8h // t26 sqadd v28.8h, v24.8h, v19.8h // t28a = out28 sqsub v24.8h, v24.8h, v19.8h // t27a mov v19.16b, v4.16b // out19 smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20 smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27 sqrshrn_sz v20, v4, v5, #12, .8h // t20 sqrshrn_sz v22, v6, v7, #12, .8h // t27 smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a mov v27.16b, v22.16b // t27 sqrshrn_sz v26, v4, v5, #12, .8h // t26a smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22 smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25 sqrshrn_sz v21, v6, v7, #12, .8h // t21a sqrshrn_sz v22, v24, v25, #12, .8h // t22 sqrshrn_sz v25, v4, v5, #12, .8h // t25 smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a sqrshrn_sz v23, v4, v5, #12, .8h // t23a sqrshrn_sz v24, v6, v7, #12, .8h // t24a ret endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x8_neon mov x14, x30 movi v7.8h, #0 lsl x8, x8, #1 .if \scale mov w16, #2896*8 dup v0.4h, w16 .endif .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .if \scale scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_8h_x16_neon transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 .macro store1 r0, r1 st1 {\r0}, [x6], #16 st1 {\r1}, [x6], #16 add x6, x6, #32 .endm store1 v16.8h, v24.8h store1 v17.8h, v25.8h store1 v18.8h, v26.8h store1 v19.8h, v27.8h store1 v20.8h, v28.8h store1 v21.8h, v29.8h store1 v22.8h, v30.8h store1 v23.8h, v31.8h .purgem store1 sub x6, x6, #64*8 movi v7.8h, #0 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in v0.h[1] scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct32_odd_8h_x16_neon transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 .macro store2 r0, r1, shift ld1 {v4.8h, v5.8h}, [x6] sqsub v7.8h, v4.8h, \r0 sqsub v6.8h, v5.8h, \r1 sqadd v4.8h, v4.8h, \r0 sqadd v5.8h, v5.8h, \r1 rev64 v6.8h, v6.8h rev64 v7.8h, v7.8h srshr v4.8h, v4.8h, #\shift srshr v5.8h, v5.8h, #\shift srshr v6.8h, v6.8h, #\shift srshr 
v7.8h, v7.8h, #\shift ext v6.16b, v6.16b, v6.16b, #8 st1 {v4.8h, v5.8h}, [x6], #32 ext v7.16b, v7.16b, v7.16b, #8 st1 {v6.8h, v7.8h}, [x6], #32 .endm store2 v31.8h, v23.8h, \shift store2 v30.8h, v22.8h, \shift store2 v29.8h, v21.8h, \shift store2 v28.8h, v20.8h, \shift store2 v27.8h, v19.8h, \shift store2 v26.8h, v18.8h, \shift store2 v25.8h, v17.8h, \shift store2 v24.8h, v16.8h, \shift .purgem store2 ret x14 endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_8x32_neon mov x14, x30 lsl x8, x8, #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 bl inv_dct_8h_x16_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 bl inv_dct32_odd_8h_x16_neon neg x9, x8 mov x10, x6 .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride ld1 {v2.8b}, [x10], x1 ld1 {v6.8h}, [x7], \stride ld1 {v3.8b}, [x10], x1 \op v5.8h, v5.8h, \r0 ld1 {v7.8h}, [x7], \stride ld1 {v4.8b}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 uaddw v5.8h, v5.8h, v2.8b srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 sqxtun v2.8b, v5.8h ld1 {v5.8h}, [x7], \stride uaddw v6.8h, v6.8h, v3.8b srshr v7.8h, v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8b}, [x6], x1 ld1 {v2.8b}, [x10], x1 sqxtun v3.8b, v6.8h uaddw v7.8h, v7.8h, v4.8b srshr v5.8h, v5.8h, #4 st1 {v3.8b}, [x6], x1 sqxtun v4.8b, v7.8h uaddw v5.8h, v5.8h, v2.8b st1 {v4.8b}, [x6], x1 sqxtun v2.8b, v5.8h st1 {v2.8b}, [x6], x1 .endm combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 sub x7, x7, x8 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine ret x14 endfunc const eob_32x32 .short 36, 136, 300, 1024 endconst const eob_16x32 .short 36, 151, 279, 512 endconst const eob_16x32_shortside .short 36, 512 endconst const eob_8x32 .short 43, 107, 171, 256 endconst function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_32x32 mov x8, #2*32 1: mov w9, #0 movrel x12, eob_32x32 2: add w9, w9, #8 .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().8h}, [x2] st1 {v0.8h}, [x2], x8 .endr transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 load_add_store_8x8 x0, x7, shiftbits=2 ldrh w11, [x12], #2 sub x0, x0, x1, lsl #3 add x0, x0, #8 cmp w3, w11 b.ge 2b ldrh w11, [x13], #2 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #2*8 b 1b 9: ret endfunc .macro shift_8_regs op, shift .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 mov w16, #2896*8 mov w17, #2*(5793-4096)*8 dup v1.4h, w16 movi v0.8h, #0 mov v1.h[1], w17 movrel x13, eob_16x32\hshort mov x8, #2*\h 1: mov w9, #0 movrel x12, eob_16x32\wshort 2: add w9, w9, #8 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v0.8h}, [x2], x8 .endr 
scale_input .8h, v1.h[0], v16, v17, v18, v19, v20, v21, v22, v23 .if \w == 16 // 16x32 identity_8x8_shift1 v1.h[1] .else // 32x16 shift_8_regs sqshl, 1 identity_8x8 v1.h[1] .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 .if \w == 16 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=4 .endif ldrh w11, [x12], #2 sub x0, x0, x1, lsl #3 add x0, x0, #8 cmp w3, w11 b.ge 2b ldrh w11, [x13], #2 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #2*8 b 1b 9: ret endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_8x32 mov w8, #2*\h 1: ldrh w12, [x13], #2 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v0.8h}, [x2], x8 .endr .if \w == 8 // 8x32 shift_8_regs srshr, 1 .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 cmp w3, w12 .if \w == 8 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=3 .endif b.lt 9f .if \w == 8 sub x2, x2, x8, lsl #3 add x2, x2, #2*8 .else sub x0, x0, x1, lsl #3 add x0, x0, #8 .endif b 1b 9: ret endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 idct_dc 32, 32, 2 mov x15, x30 sub sp, sp, #2048 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 8, 16, 24 add x6, sp, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_dct_32x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #2048 ret x15 endfunc function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 idct_dc 16, 32, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, sp, #(\i*16*2) add x7, x2, #(\i*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif mov x8, #2*32 bl inv_txfm_horz_scale_16x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #8 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x6, x0, #(\i) add x7, sp, #(\i*2) mov x8, #16*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 idct_dc 32, 16, 1 mov x15, x30 sub sp, sp, #1024 adr x5, inv_dct_8h_x16_neon .irp i, 0, 8 add x6, sp, #(\i*32*2) add x7, x2, #(\i*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, #36 b.lt 1f .endif mov x8, #2*16 bl inv_txfm_horz_scale_dct_32x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 idct_dc 8, 32, 2 mov x15, x30 sub sp, sp, #512 movrel x13, eob_8x32 movi v28.8h, #0 mov x8, #2*32 mov w9, #32 mov x6, sp 1: .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().8h}, [x2] st1 {v28.8h}, [x2], 
x8 .endr ldrh w12, [x13], #2 sub x2, x2, x8, lsl #3 sub w9, w9, #8 add x2, x2, #2*8 bl inv_dct_8h_x8_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23 srshr v\i\().8h, v\i\().8h, #2 .endr transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 cmp w3, w12 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 b.ge 1b cbz w9, 3f movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 2: subs w9, w9, #8 .rept 2 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 .endr b.gt 2b 3: mov x6, x0 mov x7, sp mov x8, #8*2 bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 ret x15 endfunc function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 idct_dc 32, 8, 2 mov x15, x30 sub sp, sp, #512 mov x6, sp mov x7, x2 mov x8, #8*2 bl inv_txfm_horz_dct_32x8_neon mov x8, #2*32 mov w9, #0 1: add x6, x0, x9 add x7, sp, x9, lsl #1 // #(\i*2) .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().8h}, [x7], x8 .endr add w9, w9, #8 bl inv_dct_8h_x8_neon cmp w9, #32 load_add_store_8x8 x6, x7 b.lt 1b add sp, sp, #512 ret x15 endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a ld1 {v0.8h, v1.8h}, [x17], #32 sqrdmulh v23.8h, v16.8h, v0.h[1] // t63a sqrdmulh v16.8h, v16.8h, v0.h[0] // t32a sqrdmulh v22.8h, v17.8h, v0.h[2] // t62a sqrdmulh v17.8h, v17.8h, v0.h[3] // t33a sqrdmulh v21.8h, v18.8h, v0.h[5] // t61a sqrdmulh v18.8h, v18.8h, v0.h[4] // t34a sqrdmulh v20.8h, v19.8h, v0.h[6] // t60a sqrdmulh v19.8h, v19.8h, v0.h[7] // t35a sqadd v24.8h, v16.8h, v17.8h // t32 sqsub v25.8h, v16.8h, v17.8h // t33 sqsub v26.8h, v19.8h, v18.8h // t34 sqadd v27.8h, v19.8h, v18.8h // t35 sqadd v28.8h, v20.8h, v21.8h // t60 sqsub v29.8h, v20.8h, v21.8h // t61 sqsub v30.8h, v23.8h, v22.8h // t62 sqadd v31.8h, v23.8h, v22.8h // t63 smull_smlal v2, v3, v29, v26, v1.h[0], v1.h[1], .8h // -> t34a smull_smlsl v4, v5, v29, v26, v1.h[1], v1.h[0], .8h // -> t61a neg v2.4s, v2.4s // t34a neg v3.4s, v3.4s // t34a smull_smlsl v6, v7, v30, v25, v1.h[1], v1.h[0], .8h // -> t33a sqrshrn_sz v26, v2, v3, #12, .8h // t34a smull_smlal v2, v3, v30, v25, v1.h[0], v1.h[1], .8h // -> t62a sqrshrn_sz v29, v4, v5, #12, .8h // t61a sqrshrn_sz v25, v6, v7, #12, .8h // t33a sqrshrn_sz v30, v2, v3, #12, .8h // t62a sqadd v16.8h, v24.8h, v27.8h // t32a sqsub v19.8h, v24.8h, v27.8h // t35a sqadd v17.8h, v25.8h, v26.8h // t33 sqsub v18.8h, v25.8h, v26.8h // t34 sqsub v20.8h, v31.8h, v28.8h // t60a sqadd v23.8h, v31.8h, v28.8h // t63a sqsub v21.8h, v30.8h, v29.8h // t61 sqadd v22.8h, v30.8h, v29.8h // t62 smull_smlal v2, v3, v21, v18, v1.h[2], v1.h[3], .8h // -> t61a smull_smlsl v4, v5, v21, v18, v1.h[3], v1.h[2], .8h // -> t34a smull_smlal v6, v7, v20, v19, v1.h[2], v1.h[3], .8h // -> t60 sqrshrn_sz v21, v2, v3, #12, .8h // t61a sqrshrn_sz v18, v4, v5, #12, .8h // t34a smull_smlsl v2, v3, v20, v19, v1.h[3], v1.h[2], .8h // -> t35 sqrshrn_sz v20, v6, v7, #12, .8h // t60 sqrshrn_sz v19, v2, v3, #12, .8h // t35 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 ret endfunc function inv_dct64_step2_neon movrel x16, idct_coeffs ld1 {v0.4h}, [x16] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a ldr q16, [x6, #2*8*0] // t32a ldr q17, [x9, #2*8*8] // t39a ldr q18, [x9, #2*8*0] // t63a ldr q19, [x6, #2*8*8] // t56a ldr q20, [x6, #2*8*16] // t40a ldr 
q21, [x9, #2*8*24] // t47a ldr q22, [x9, #2*8*16] // t55a ldr q23, [x6, #2*8*24] // t48a sqadd v24.8h, v16.8h, v17.8h // t32 sqsub v25.8h, v16.8h, v17.8h // t39 sqadd v26.8h, v18.8h, v19.8h // t63 sqsub v27.8h, v18.8h, v19.8h // t56 sqsub v28.8h, v21.8h, v20.8h // t40 sqadd v29.8h, v21.8h, v20.8h // t47 sqadd v30.8h, v23.8h, v22.8h // t48 sqsub v31.8h, v23.8h, v22.8h // t55 smull_smlal v2, v3, v27, v25, v0.h[3], v0.h[2], .8h // -> t56a smull_smlsl v4, v5, v27, v25, v0.h[2], v0.h[3], .8h // -> t39a smull_smlal v6, v7, v31, v28, v0.h[3], v0.h[2], .8h // -> t40a sqrshrn_sz v25, v2, v3, #12, .8h // t56a sqrshrn_sz v27, v4, v5, #12, .8h // t39a neg v6.4s, v6.4s // t40a neg v7.4s, v7.4s // t40a smull_smlsl v2, v3, v31, v28, v0.h[2], v0.h[3], .8h // -> t55a sqrshrn_sz v31, v6, v7, #12, .8h // t40a sqrshrn_sz v28, v2, v3, #12, .8h // t55a sqadd v16.8h, v24.8h, v29.8h // t32a sqsub v19.8h, v24.8h, v29.8h // t47a sqadd v17.8h, v27.8h, v31.8h // t39 sqsub v18.8h, v27.8h, v31.8h // t40 sqsub v20.8h, v26.8h, v30.8h // t48a sqadd v23.8h, v26.8h, v30.8h // t63a sqsub v21.8h, v25.8h, v28.8h // t55 sqadd v22.8h, v25.8h, v28.8h // t56 smull_smlsl v2, v3, v21, v18, v0.h[0], v0.h[0], .8h // -> t40a smull_smlal v4, v5, v21, v18, v0.h[0], v0.h[0], .8h // -> t55a smull_smlsl v6, v7, v20, v19, v0.h[0], v0.h[0], .8h // -> t47 sqrshrn_sz v18, v2, v3, #12, .8h // t40a sqrshrn_sz v21, v4, v5, #12, .8h // t55a smull_smlal v2, v3, v20, v19, v0.h[0], v0.h[0], .8h // -> t48 sqrshrn_sz v19, v6, v7, #12, .8h // t47 sqrshrn_sz v20, v2, v3, #12, .8h // t48 str q16, [x6, #2*8*0] // t32a str q17, [x9, #2*8*0] // t39 str q18, [x6, #2*8*8] // t40a str q19, [x9, #2*8*8] // t47 str q20, [x6, #2*8*16] // t48 str q21, [x9, #2*8*16] // t55a str q22, [x6, #2*8*24] // t56 str q23, [x9, #2*8*24] // t63a add x6, x6, #2*8 sub x9, x9, #2*8 cmp x6, x9 b.lt 1b ret endfunc .macro load8 src, strd, zero, clear .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h .if \clear ld1 {\i}, [\src] st1 {\zero}, [\src], \strd .else ld1 {\i}, [\src], \strd .endif .endr .endm .macro store16 dst .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h st1 {\i}, [\dst], #16 .endr .endm .macro clear_upper8 .irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr .endm .macro movi_if reg, val, cond .if \cond movi \reg, \val .endif .endm .macro movdup_if reg, gpr, val, cond .if \cond mov \gpr, \val dup \reg, \gpr .endif .endm .macro st1_if regs, dst, cond .if \cond st1 \regs, \dst .endif .endm .macro str_if reg, dst, cond .if \cond str \reg, \dst .endif .endm .macro stroff_if reg, dst, dstoff, cond .if \cond str \reg, \dst, \dstoff .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input .8h, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 mov x14, x30 mov x6, sp lsl x8, x8, #2 movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.8h, \clear clear_upper8 sub x7, x7, x8, lsl #3 add x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct_8h_x16_neon store16 x6 movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.8h, \clear clear_upper8 sub x7, x7, x8, lsl #3 lsr x8, x8, #1 sub x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct32_odd_8h_x16_neon add x10, x6, 
#16*15 sub x6, x6, #16*16 mov x9, #-16 .macro store_addsub r0, r1, r2, r3 ld1 {v2.8h}, [x6], #16 ld1 {v3.8h}, [x6], #16 sqadd v6.8h, v2.8h, \r0 sqsub \r0, v2.8h, \r0 ld1 {v4.8h}, [x6], #16 sqadd v7.8h, v3.8h, \r1 sqsub \r1, v3.8h, \r1 ld1 {v5.8h}, [x6], #16 sqadd v2.8h, v4.8h, \r2 sub x6, x6, #16*4 sqsub \r2, v4.8h, \r2 st1 {v6.8h}, [x6], #16 st1 {\r0}, [x10], x9 sqadd v3.8h, v5.8h, \r3 sqsub \r3, v5.8h, \r3 st1 {v7.8h}, [x6], #16 st1 {\r1}, [x10], x9 st1 {v2.8h}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.8h}, [x6], #16 st1 {\r3}, [x10], x9 .endm store_addsub v31.8h, v30.8h, v29.8h, v28.8h store_addsub v27.8h, v26.8h, v25.8h, v24.8h store_addsub v23.8h, v22.8h, v21.8h, v20.8h store_addsub v19.8h, v18.8h, v17.8h, v16.8h .purgem store_addsub add x6, x6, #2*8*16 movrel x17, idct64_coeffs movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear add x9, x7, x8, lsl #4 // offset 16 add x10, x7, x8, lsl #3 // offset 8 sub x9, x9, x8 // offset 15 sub x11, x10, x8 // offset 7 ld1 {v16.8h}, [x7] // in1 (offset 0) ld1 {v17.8h}, [x9] // in31 (offset 15) ld1 {v18.8h}, [x10] // in17 (offset 8) ld1 {v19.8h}, [x11] // in15 (offset 7) st1_if {v7.8h}, [x7], \clear st1_if {v7.8h}, [x9], \clear st1_if {v7.8h}, [x10], \clear st1_if {v7.8h}, [x11], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear add x7, x7, x8, lsl #2 // offset 4 sub x9, x9, x8, lsl #2 // offset 11 sub x10, x7, x8 // offset 3 add x11, x9, x8 // offset 12 ld1 {v16.8h}, [x10] // in7 (offset 3) ld1 {v17.8h}, [x11] // in25 (offset 12) ld1 {v18.8h}, [x9] // in23 (offset 11) ld1 {v19.8h}, [x7] // in9 (offset 4) st1_if {v7.8h}, [x7], \clear st1_if {v7.8h}, [x9], \clear st1_if {v7.8h}, [x10], \clear st1_if {v7.8h}, [x11], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear sub x10, x10, x8, lsl #1 // offset 1 sub x9, x9, x8, lsl #1 // offset 9 add x7, x7, x8 // offset 5 add x11, x11, x8 // offset 13 ldr q16, [x10, x8] // in5 (offset 2) ldr q17, [x11] // in27 (offset 13) ldr q18, [x9, x8] // in21 (offset 10) ldr q19, [x7] // in11 (offset 5) stroff_if q7, [x10, x8], \clear str_if q7, [x11], \clear stroff_if q7, [x9, x8], \clear str_if q7, [x7], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movdup_if v0.4h, w16, #2896*8, \scale movi_if v7.8h, #0, \clear ldr q16, [x10] // in3 (offset 1) ldr q17, [x11, x8] // in29 (offset 14) ldr q18, [x9] // in19 (offset 9) ldr q19, [x7, x8] // in13 (offset 6) str_if q7, [x10], \clear stroff_if q7, [x11, x8], \clear str_if q7, [x9], \clear stroff_if q7, [x7, x8], \clear scale_if \scale, v0.h[0], v16, v17, v18, v19 bl inv_dct64_step1_neon sub x6, x6, #2*8*32 add x9, x6, #2*8*7 bl inv_dct64_step2_neon ret x14 endfunc .endm def_dct64_func def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x8_neon mov x14, x30 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, #2*56 mov x10, #2*64 mov x11, #-2*8*4 dup v7.8h, w12 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 .macro store_addsub src0, src1, src2, src3 sqsub v1.8h, \src0, \src1 sqadd v0.8h, \src0, \src1 sqsub v3.8h, \src2, \src3 srshl v1.8h, v1.8h, 
v7.8h sqadd v2.8h, \src2, \src3 srshl v0.8h, v0.8h, v7.8h srshl v3.8h, v3.8h, v7.8h rev64 v1.8h, v1.8h srshl v2.8h, v2.8h, v7.8h rev64 v3.8h, v3.8h ext v1.16b, v1.16b, v1.16b, #8 st1 {v0.8h}, [x6], x10 ext v3.16b, v3.16b, v3.16b, #8 st1 {v1.8h}, [x9], x10 st1 {v2.8h}, [x6], x10 st1 {v3.8h}, [x9], x10 .endm store_addsub v16.8h, v31.8h, v17.8h, v30.8h store_addsub v18.8h, v29.8h, v19.8h, v28.8h store_addsub v20.8h, v27.8h, v21.8h, v26.8h store_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem store_addsub sub x6, x6, x10, lsl #3 sub x9, x9, x10, lsl #3 add x6, x6, #16 sub x9, x9, #16 cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon mov x14, x30 lsl x8, x8, #1 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, x1, lsl #6 sub x9, x9, x1 neg x10, x1 mov x11, #-2*8*4 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8b}, [x6], x1 ld1 {v1.8b}, [x9], x10 sqadd v4.8h, \src0, \src1 ld1 {v2.8b}, [x6] sqsub v5.8h, \src0, \src1 ld1 {v3.8b}, [x9] sqadd v6.8h, \src2, \src3 sqsub v7.8h, \src2, \src3 sub x6, x6, x1 sub x9, x9, x10 srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr v6.8h, v6.8h, #4 uaddw v4.8h, v4.8h, v0.8b srshr v7.8h, v7.8h, #4 uaddw v5.8h, v5.8h, v1.8b uaddw v6.8h, v6.8h, v2.8b sqxtun v0.8b, v4.8h uaddw v7.8h, v7.8h, v3.8b sqxtun v1.8b, v5.8h st1 {v0.8b}, [x6], x1 sqxtun v2.8b, v6.8h st1 {v1.8b}, [x9], x10 sqxtun v3.8b, v7.8h st1 {v2.8b}, [x6], x1 st1 {v3.8b}, [x9], x10 .endm add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem add_dest_addsub cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 sub_sp 64*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-2 // shift bl inv_txfm_dct_clear_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 sub_sp 64*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-1 // shift bl inv_txfm_dct_clear_scale_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 sub_sp 32*32*2+64*8*2 add x5, sp, 
#64*8*2 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 8, 16, 24 add x6, x5, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_scale_dct_32x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #32*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 sub_sp 64*16*2+64*8*2 add x4, sp, #64*8*2 movrel x13, eob_16x32 .irp i, 0, 8 add x6, x4, #(\i*64*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*2) mov x8, #16*2 mov x12, #-2 // shift bl inv_txfm_dct_clear_8h_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 8 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: adr x5, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, x4, #64*16*2 ret x15 endfunc function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 sub_sp 16*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, x5, #(\i*16*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 24 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*2) mov x8, #32*2 bl inv_txfm_horz_16x8_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #8 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #16*32*2 ret x15 endfunc rav1e-0.7.1/src/arm/64/itx16.S000064400000000000000000004066721046102023000135640ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "src/arm/asm.S" #include "util.S" // The exported functions in this file have got the following signature: // void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, // int bitdepth_max); // Most of the functions use the following register layout: // x0-x3 external parameters // x4 function pointer to first transform // x5 function pointer to second transform // x6 output parameter for helper function // x7 input parameter for helper function // x8 input stride for helper function // x9-x12 scratch variables for helper functions // x13 pointer to list of eob thresholds // x14 return pointer for helper function // x15 return pointer for main function // The SIMD registers most often use the following layout: // v0-v1 multiplication coefficients // v2-v7 scratch registers // v8-v15 unused // v16-v31 inputs/outputs of transforms const idct_coeffs, align=4 // idct4 .int 2896, 2896*8*(1<<16), 1567, 3784 // idct8 .int 799, 4017, 3406, 2276 // idct16 .int 401, 4076, 3166, 2598 .int 1931, 3612, 3920, 1189 // idct32 .int 201, 4091, 3035, 2751 .int 1751, 3703, 3857, 1380 .int 995, 3973, 3513, 2106 .int 2440, 3290, 4052, 601 endconst const idct64_coeffs, align=4 .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) .int 4076, 401, 4017, 799 .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) .int -3166, -2598, -799, -4017 .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) .int 3612, 1931, 2276, 3406 .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) .int -3920, -1189, -3406, -2276 endconst const iadst4_coeffs, align=4 .int 1321, 3803, 2482, 3344 endconst const iadst8_coeffs, align=4 .int 4076, 401, 3612, 1931 .int 2598, 3166, 1189, 3920 // idct_coeffs .int 2896, 0, 1567, 3784 endconst const iadst16_coeffs, align=4 .int 4091, 201, 3973, 995 .int 3703, 1751, 3290, 2440 .int 2751, 3035, 2106, 3513 .int 1380, 3857, 601, 4052 endconst .macro mul_mla d, s0, s1, c0, c1 mul \d\().4s, \s0\().4s, \c0 mla \d\().4s, \s1\().4s, \c1 .endm .macro mul_mls d, s0, s1, c0, c1 mul \d\().4s, \s0\().4s, \c0 mls \d\().4s, \s1\().4s, \c1 .endm .macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 sqrdmulh \r0\sz, \r0\sz, \c sqrdmulh \r1\sz, \r1\sz, \c sqrdmulh \r2\sz, \r2\sz, \c sqrdmulh \r3\sz, \r3\sz, \c .ifnb \r4 sqrdmulh \r4\sz, \r4\sz, \c sqrdmulh \r5\sz, \r5\sz, \c sqrdmulh \r6\sz, \r6\sz, \c sqrdmulh \r7\sz, \r7\sz, \c .endif .endm .macro smin_4s r0, r1, r2 smin \r0\().4s, \r1\().4s, \r2\().4s .endm .macro smax_4s r0, r1, r2 smax \r0\().4s, \r1\().4s, \r2\().4s .endm .macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4 .ifnb \load ld1 
{\load}, [\src], x1 .endif .ifnb \shift srshr \shift, \shift, #\shiftbits .endif .ifnb \addsrc usqadd \adddst, \addsrc .endif .ifnb \min smin \min, \min, v7.8h .endif .ifnb \store st1 {\store}, [\dst], x1 .endif .endm .macro load_add_store_8x16 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src load_add_store v3.8h, v17.8h, , , , , \dst, \src load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src load_add_store , , , , v27.8h, v26.8h, \dst, \src load_add_store , , , , , v27.8h, \dst, \src .endm .macro load_add_store_8x8 dst, src, shiftbits=4 mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits load_add_store , , , , , v19.8h, \dst, \src, \shiftbits .endm .macro load_add_store_8x4 dst, src, shiftbits=4 mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits load_add_store , , , , , v5.8h, \dst, \src, \shiftbits .endm .macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src .ifnb \load ld1 {\load}[0], [\src], x1 .endif .ifnb \inssrc ins \insdst\().d[1], \inssrc\().d[0] .endif .ifnb \shift srshr \shift, \shift, #4 .endif .ifnb \load ld1 {\load}[1], [\src], x1 .endif .ifnb \addsrc usqadd \adddst, 
\addsrc .endif .ifnb \store st1 {\store}[0], [\dst], x1 .endif .ifnb \min smin \min, \min, v7.8h .endif .ifnb \store st1 {\store}[1], [\dst], x1 .endif .endm .macro load_add_store_4x16 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store4 v0.d, v17, v16, , , , , , \dst, \src load_add_store4 v1.d, v19, v18, , , , , , \dst, \src load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src load_add_store4 , , , , , , , v23.d, \dst, \src .endm .macro load_add_store_4x8 dst, src mov \src, \dst mvni v7.8h, #0xfc, lsl #8 // 0x3ff load_add_store4 v0.d, v17, v16, , , , , , \dst, \src load_add_store4 v1.d, v19, v18, , , , , , \dst, \src load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src load_add_store4 , , , , , , , v3.d, \dst, \src .endm .macro idct_dc w, h, shift cbnz w3, 1f movz w16, #2896*8, lsl #16 ld1r {v16.4s}, [x2] dup v0.2s, w16 sqrdmulh v20.4s, v16.4s, v0.s[0] str wzr, [x2] .if (\w == 2*\h) || (2*\w == \h) sqrdmulh v20.4s, v20.4s, v0.s[0] .endif .if \shift > 0 sqrshrn v16.4h, v20.4s, #\shift sqrshrn2 v16.8h, v20.4s, #\shift .else sqxtn v16.4h, v20.4s sqxtn2 v16.8h, v20.4s .endif sqrdmulh v16.8h, v16.8h, v0.h[1] srshr v16.8h, v16.8h, #4 mov w4, #\h b idct_dc_w\w\()_neon 1: .endm function idct_dc_w4_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 ld1 {v1.d}[0], [x0], x1 subs w4, w4, #4 ld1 {v1.d}[1], [x0], x1 usqadd v0.8h, v16.8h sub x0, x0, x1, lsl #2 usqadd v1.8h, v16.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h st1 {v0.d}[1], [x0], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 b.gt 1b ret endfunc function idct_dc_w8_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h}, [x0], x1 subs w4, w4, #4 ld1 {v1.8h}, [x0], x1 usqadd v0.8h, v16.8h ld1 {v2.8h}, [x0], x1 usqadd v1.8h, v16.8h ld1 {v3.8h}, [x0], x1 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h sub x0, x0, x1, lsl #2 smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h st1 {v0.8h}, [x0], x1 smin v2.8h, v2.8h, v31.8h st1 {v1.8h}, [x0], x1 smin v3.8h, v3.8h, v31.8h st1 {v2.8h}, [x0], x1 st1 {v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w16_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h}, [x0], x1 subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x0], x1 usqadd v0.8h, v16.8h usqadd v1.8h, v16.8h sub x0, x0, x1, lsl #1 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h st1 {v0.8h, v1.8h}, [x0], x1 smin v3.8h, v3.8h, v31.8h st1 {v2.8h, v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w32_neon mvni 
v31.8h, #0xfc, lsl #8 // 0x3ff 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w4, w4, #1 usqadd v0.8h, v16.8h usqadd v1.8h, v16.8h usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 1b ret endfunc function idct_dc_w64_neon mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x1, x1, #64 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 subs w4, w4, #1 usqadd v0.8h, v16.8h ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] usqadd v1.8h, v16.8h sub x0, x0, #64 usqadd v2.8h, v16.8h usqadd v3.8h, v16.8h usqadd v4.8h, v16.8h usqadd v5.8h, v16.8h usqadd v6.8h, v16.8h usqadd v7.8h, v16.8h smin v0.8h, v0.8h, v31.8h smin v1.8h, v1.8h, v31.8h smin v2.8h, v2.8h, v31.8h smin v3.8h, v3.8h, v31.8h smin v4.8h, v4.8h, v31.8h smin v5.8h, v5.8h, v31.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 smin v6.8h, v6.8h, v31.8h smin v7.8h, v7.8h, v31.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 b.gt 1b ret endfunc .macro iwht4 add v16.4s, v16.4s, v17.4s sub v21.4s, v18.4s, v19.4s sub v20.4s, v16.4s, v21.4s sshr v20.4s, v20.4s, #1 sub v18.4s, v20.4s, v17.4s sub v17.4s, v20.4s, v19.4s add v19.4s, v21.4s, v18.4s sub v16.4s, v16.4s, v17.4s .endm .macro idct_4 r0, r1, r2, r3 mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] srshr v6.4s, v6.4s, #12 srshr v2.4s, v2.4s, #12 srshr v7.4s, v4.4s, #12 srshr v3.4s, v3.4s, #12 sqadd \r0\().4s, v2.4s, v6.4s sqsub \r3\().4s, v2.4s, v6.4s sqadd \r1\().4s, v3.4s, v7.4s sqsub \r2\().4s, v3.4s, v7.4s .endm function inv_dct_4s_x4_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s}, [x16] idct_4 v16, v17, v18, v19 ret endfunc .macro iadst_4x4 o0, o1, o2, o3 movrel x16, iadst4_coeffs ld1 {v0.4s}, [x16] sub v3.4s, v16.4s, v18.4s mul v4.4s, v16.4s, v0.s[0] mla v4.4s, v18.4s, v0.s[1] mla v4.4s, v19.4s, v0.s[2] mul v7.4s, v17.4s, v0.s[3] add v3.4s, v3.4s, v19.4s mul v5.4s, v16.4s, v0.s[2] mls v5.4s, v18.4s, v0.s[0] mls v5.4s, v19.4s, v0.s[1] add \o3\().4s, v4.4s, v5.4s mul \o2\().4s, v3.4s, v0.s[3] add \o0\().4s, v4.4s, v7.4s add \o1\().4s, v5.4s, v7.4s sub \o3\().4s, \o3\().4s, v7.4s srshr \o0\().4s, \o0\().4s, #12 srshr \o2\().4s, \o2\().4s, #12 srshr \o1\().4s, \o1\().4s, #12 srshr \o3\().4s, \o3\().4s, #12 .endm function inv_adst_4s_x4_neon AARCH64_VALID_CALL_TARGET iadst_4x4 v16, v17, v18, v19 ret endfunc function inv_flipadst_4s_x4_neon AARCH64_VALID_CALL_TARGET iadst_4x4 v19, v18, v17, v16 ret endfunc function inv_identity_4s_x4_neon AARCH64_VALID_CALL_TARGET movz w16, #(5793-4096)*8, lsl #16 dup v0.2s, w16 sqrdmulh v4.4s, v16.4s, v0.s[0] sqrdmulh v5.4s, v17.4s, v0.s[0] sqrdmulh v6.4s, v18.4s, v0.s[0] sqrdmulh v7.4s, v19.4s, v0.s[0] sqadd v16.4s, v16.4s, v4.4s sqadd v17.4s, v17.4s, v5.4s sqadd v18.4s, v18.4s, v6.4s sqadd v19.4s, v19.4s, v7.4s ret endfunc function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 mov x15, x30 movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 sshr v16.4s, v16.4s, #2 sshr v17.4s, v17.4s, #2 sshr v18.4s, v18.4s, #2 sshr v19.4s, v19.4s, #2 iwht4 st1 {v30.4s, v31.4s}, [x2], #32 transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 iwht4 ld1 {v0.d}[0], [x0], x1 sqxtn v16.4h, v16.4s ld1 {v0.d}[1], [x0], x1 sqxtn2 v16.8h, v17.4s ld1 {v1.d}[0], [x0], x1 sqxtn v18.4h, v18.4s ld1 {v1.d}[1], [x0], x1 sqxtn2 v18.8h, v19.4s b L(itx_4x4_end) endfunc function inv_txfm_add_4x4_neon movi 
v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v30.4s, v31.4s}, [x2], #32 blr x4 st1 {v30.4s, v31.4s}, [x2], #32 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 blr x5 ld1 {v0.d}[0], [x0], x1 ld1 {v0.d}[1], [x0], x1 ins v16.d[1], v17.d[0] ins v18.d[1], v19.d[0] ld1 {v1.d}[0], [x0], x1 ld1 {v1.d}[1], [x0], x1 srshr v16.8h, v16.8h, #4 srshr v18.8h, v18.8h, #4 L(itx_4x4_end): mvni v31.8h, #0xfc, lsl #8 // 0x3ff sub x0, x0, x1, lsl #2 usqadd v0.8h, v16.8h usqadd v1.8h, v18.8h smin v0.8h, v0.8h, v31.8h st1 {v0.d}[0], [x0], x1 smin v1.8h, v1.8h, v31.8h st1 {v0.d}[1], [x0], x1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 ret x15 endfunc .macro def_fn_4x4 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct cbnz w3, 1f movz w16, #2896*8, lsl #16 ld1r {v16.4s}, [x2] dup v4.2s, w16 str wzr, [x2] sqrdmulh v16.4s, v16.4s, v4.s[0] ld1 {v0.d}[0], [x0], x1 sqxtn v20.4h, v16.4s sqxtn2 v20.8h, v16.4s ld1 {v0.d}[1], [x0], x1 sqrdmulh v20.8h, v20.8h, v4.h[1] ld1 {v1.d}[0], [x0], x1 srshr v16.8h, v20.8h, #4 ld1 {v1.d}[1], [x0], x1 srshr v18.8h, v20.8h, #4 movi v30.8h, #0 b L(itx_4x4_end) 1: .endif adr x4, inv_\txfm1\()_4s_x4_neon movrel x5, X(inv_\txfm2\()_4h_x4_neon) b inv_txfm_add_4x4_neon endfunc .endm def_fn_4x4 dct, dct def_fn_4x4 identity, identity def_fn_4x4 dct, adst def_fn_4x4 dct, flipadst def_fn_4x4 dct, identity def_fn_4x4 adst, dct def_fn_4x4 adst, adst def_fn_4x4 adst, flipadst def_fn_4x4 flipadst, dct def_fn_4x4 flipadst, adst def_fn_4x4 flipadst, flipadst def_fn_4x4 identity, dct def_fn_4x4 adst, identity def_fn_4x4 flipadst, identity def_fn_4x4 identity, adst def_fn_4x4 identity, flipadst .macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 idct_4 \r0, \r2, \r4, \r6 movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 .irp r, \r0, \r2, \r4, \r6 smin_4s \r, \r, v5 .endr .irp r, \r0, \r2, \r4, \r6 smax_4s \r, \r, v4 .endr mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a srshr \r1\().4s, v2.4s, #12 // t4a srshr \r7\().4s, v3.4s, #12 // t7a srshr \r3\().4s, v6.4s, #12 // t5a srshr \r5\().4s, v7.4s, #12 // t6a sqadd v2.4s, \r1\().4s, \r3\().4s // t4 sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a sqadd v3.4s, \r7\().4s, \r5\().4s // t7 sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a .irp r, v2, \r1, v3, \r3 smin_4s \r, \r, v5 .endr .irp r, v2, \r1, v3, \r3 smax_4s \r, \r, v4 .endr mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5 mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 srshr v7.4s, v7.4s, #12 // t5 srshr v6.4s, v6.4s, #12 // t6 sqsub \r7\().4s, \r0\().4s, v3.4s // out7 sqadd \r0\().4s, \r0\().4s, v3.4s // out0 sqadd \r1\().4s, \r2\().4s, v6.4s // out1 sqsub v6.4s, \r2\().4s, v6.4s // out6 sqadd \r2\().4s, \r4\().4s, v7.4s // out2 sqsub \r5\().4s, \r4\().4s, v7.4s // out5 sqadd \r3\().4s, \r6\().4s, v2.4s // out3 sqsub \r4\().4s, \r6\().4s, v2.4s // out4 mov \r6\().16b, v6.16b // out6 .endm function inv_dct_4s_x8_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s, v1.4s}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23 ret endfunc .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 movrel x16, iadst8_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 mul_mla v2, v23, v16, v0.s[0], v0.s[1] mul_mls 
v4, v23, v16, v0.s[1], v0.s[0] mul_mla v6, v21, v18, v0.s[2], v0.s[3] srshr v16.4s, v2.4s, #12 // t0a srshr v23.4s, v4.4s, #12 // t1a mul_mls v2, v21, v18, v0.s[3], v0.s[2] mul_mla v4, v19, v20, v1.s[0], v1.s[1] srshr v18.4s, v6.4s, #12 // t2a srshr v21.4s, v2.4s, #12 // t3a mul_mls v6, v19, v20, v1.s[1], v1.s[0] mul_mla v2, v17, v22, v1.s[2], v1.s[3] srshr v20.4s, v4.4s, #12 // t4a srshr v19.4s, v6.4s, #12 // t5a mul_mls v4, v17, v22, v1.s[3], v1.s[2] srshr v22.4s, v2.4s, #12 // t6a srshr v17.4s, v4.4s, #12 // t7a ld1 {v0.4s}, [x16] movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff sqadd v2.4s, v16.4s, v20.4s // t0 sqsub v3.4s, v16.4s, v20.4s // t4 mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd v4.4s, v23.4s, v19.4s // t1 sqsub v5.4s, v23.4s, v19.4s // t5 sqadd v6.4s, v18.4s, v22.4s // t2 sqsub v7.4s, v18.4s, v22.4s // t6 sqadd v18.4s, v21.4s, v17.4s // t3 sqsub v19.4s, v21.4s, v17.4s // t7 .irp r, v2, v3, v4, v5, v6, v7, v18, v19 smin_4s \r, \r, v1 .endr .irp r, v2, v3, v4, v5, v6, v7, v18, v19 smax_4s \r, \r, v20 .endr mul_mla v16, v3, v5, v0.s[3], v0.s[2] mul_mls v20, v3, v5, v0.s[2], v0.s[3] mul_mls v22, v19, v7, v0.s[3], v0.s[2] srshr v3.4s, v16.4s, #12 // t4a srshr v5.4s, v20.4s, #12 // t5a mul_mla v16, v19, v7, v0.s[2], v0.s[3] srshr v7.4s, v22.4s, #12 // t6a srshr v19.4s, v16.4s, #12 // t7a sqadd \o0\().4s, v2.4s, v6.4s // out0 sqsub v2.4s, v2.4s, v6.4s // t2 sqadd \o7\().4s, v4.4s, v18.4s // out7 sqsub v4.4s, v4.4s, v18.4s // t3 mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqadd \o1\().4s, v3.4s, v7.4s // out1 sqsub v3.4s, v3.4s, v7.4s // t6 sqadd \o6\().4s, v5.4s, v19.4s // out6 sqsub v5.4s, v5.4s, v19.4s // t7 // Not clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. 
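// The t2/t3/t6/t7 intermediates in v2, v4, v3 and v5 are still clamped below,
// since they feed the final 2896 (sqrt(2)/2 in Q12) multiplies producing out2-out5.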
.irp r, v2, v4, v3, v5 smin_4s \r, \r, v1 .endr .irp r, v2, v4, v3, v5 smax_4s \r, \r, v18 .endr sqneg \o7\().4s, \o7\().4s // out7 sqneg \o1\().4s, \o1\().4s // out1 mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) srshr v2.4s, v18.4s, #12 // out3 mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) srshr v3.4s, v20.4s, #12 // out5 srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) sqneg \o3\().4s, v2.4s // out3 sqneg \o5\().4s, v3.4s // out5 .endm function inv_adst_4s_x8_neon AARCH64_VALID_CALL_TARGET iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 ret endfunc function inv_flipadst_4s_x8_neon AARCH64_VALID_CALL_TARGET iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x8_neon AARCH64_VALID_CALL_TARGET sqshl v16.4s, v16.4s, #1 sqshl v17.4s, v17.4s, #1 sqshl v18.4s, v18.4s, #1 sqshl v19.4s, v19.4s, #1 sqshl v20.4s, v20.4s, #1 sqshl v21.4s, v21.4s, #1 sqshl v22.4s, v22.4s, #1 sqshl v23.4s, v23.4s, #1 ret endfunc function inv_txfm_add_8x8_neon movi v31.4s, #0 cmp w3, w13 mov x11, #32 b.lt 1f add x6, x2, #16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v31.4s}, [x6], x11 .endr blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 sqrshrn2 v24.8h, v20.4s, #1 sqrshrn2 v25.8h, v21.4s, #1 sqrshrn2 v26.8h, v22.4s, #1 sqrshrn2 v27.8h, v23.4s, #1 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h movi \i, #0 .endr 2: .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x2] st1 {v31.4s}, [x2], x11 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 mov v20.16b, v24.16b mov v21.16b, v25.16b mov v22.16b, v26.16b mov v23.16b, v27.16b blr x5 load_add_store_8x8 x0, x7 ret x15 endfunc .macro def_fn_8x8 txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif movrel x5, X(inv_\txfm2\()_8h_x8_neon) mov w13, #\eob_half adr x4, inv_\txfm1\()_4s_x8_neon b inv_txfm_add_8x8_neon endfunc .endm def_fn_8x8 dct, dct, 10 def_fn_8x8 identity, identity, 10 def_fn_8x8 dct, adst, 10 def_fn_8x8 dct, flipadst, 10 def_fn_8x8 dct, identity, 4 def_fn_8x8 adst, dct, 10 def_fn_8x8 adst, adst, 10 def_fn_8x8 adst, flipadst, 10 def_fn_8x8 flipadst, dct, 10 def_fn_8x8 flipadst, adst, 10 def_fn_8x8 flipadst, flipadst, 10 def_fn_8x8 identity, dct, 4 def_fn_8x8 adst, identity, 4 def_fn_8x8 flipadst, identity, 4 def_fn_8x8 identity, adst, 4 def_fn_8x8 identity, flipadst, 4 function inv_txfm_add_8x4_neon movi v28.4s, #0 movi v29.4s, #0 movi v30.4s, #0 movi v31.4s, #0 ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 movz w16, #2896*8, lsl #16 dup v0.2s, w16 ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s sqxtn v20.4h, v20.4s sqxtn v21.4h, v21.4s sqxtn v22.4h, v22.4s sqxtn v23.4h, v23.4s 
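// Transpose the two narrowed 4x4 halves and pack them into 8-wide rows, so that
// the second transform (blr x5) processes all eight columns at once.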
transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 ins v16.d[1], v20.d[0] ins v17.d[1], v21.d[0] ins v18.d[1], v22.d[0] ins v19.d[1], v23.d[0] blr x5 load_add_store_8x4 x0, x7 ret x15 endfunc function inv_txfm_add_4x8_neon movz w16, #2896*8, lsl #16 movi v31.4s, #0 dup v30.2s, w16 cmp w3, w13 mov x11, #32 b.lt 1f add x6, x2, #16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v31.4s}, [x6], x11 .endr scale_input .4s, v30.s[0], v16, v17, v18, v19 blr x4 sqxtn v20.4h, v16.4s sqxtn v21.4h, v17.4s sqxtn v22.4h, v18.4s sqxtn v23.4h, v19.4s transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f 1: .irp i, v20, v21, v22, v23 movi \i\().4h, #0 .endr 2: .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x2] st1 {v31.4s}, [x2], x11 .endr scale_input .4s, v30.s[0], v16, v17, v18, v19 blr x4 sqxtn v16.4h, v16.4s sqxtn v17.4h, v17.4s sqxtn v18.4h, v18.4s sqxtn v19.4h, v19.4s transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 load_add_store_4x8 x0, x7 ret x15 endfunc .macro def_fn_48 w, h, txfm1, txfm2, eob_half function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif adr x4, inv_\txfm1\()_4s_x\w\()_neon .if \w == 4 mov w13, #\eob_half .endif movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_48 w, h def_fn_48 \w, \h, dct, dct, 13 def_fn_48 \w, \h, identity, identity, 13 def_fn_48 \w, \h, dct, adst, 13 def_fn_48 \w, \h, dct, flipadst, 13 def_fn_48 \w, \h, dct, identity, 4 def_fn_48 \w, \h, adst, dct, 13 def_fn_48 \w, \h, adst, adst, 13 def_fn_48 \w, \h, adst, flipadst, 13 def_fn_48 \w, \h, flipadst, dct, 13 def_fn_48 \w, \h, flipadst, adst, 13 def_fn_48 \w, \h, flipadst, flipadst, 13 def_fn_48 \w, \h, identity, dct, 16 def_fn_48 \w, \h, adst, identity, 4 def_fn_48 \w, \h, flipadst, identity, 4 def_fn_48 \w, \h, identity, adst, 16 def_fn_48 \w, \h, identity, flipadst, 16 .endm def_fns_48 4, 8 def_fns_48 8, 4 function inv_dct_4s_x16_neon AARCH64_VALID_CALL_TARGET movrel x16, idct_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 idct_8 v16, v18, v20, v22, v24, v26, v28, v30 // idct_8 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v18, v20, v22, v24, v26, v28, v30 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v16, v18, v20, v22, v24, v26, v28, v30 smax \r\().4s, \r\().4s, v4.4s .endr ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #32 mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a srshr v17.4s, v2.4s, #12 // t8a srshr v31.4s, v3.4s, #12 // t15a mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a srshr v23.4s, v6.4s, #12 // t9a srshr v25.4s, v2.4s, #12 // t14a mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a srshr v21.4s, v3.4s, #12 // t10a srshr v27.4s, v6.4s, #12 // t13a mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a srshr v19.4s, v2.4s, #12 // t11a srshr v29.4s, v3.4s, #12 // t12a ld1 {v0.4s}, [x16] sqsub v2.4s, v17.4s, v23.4s // t9 sqadd v17.4s, v17.4s, v23.4s // t8 sqsub v3.4s, v31.4s, v25.4s // t14 sqadd v31.4s, v31.4s, v25.4s // t15 sqsub v23.4s, v19.4s, v21.4s // t10 sqadd v19.4s, v19.4s, v21.4s // t11 sqadd v25.4s, v29.4s, v27.4s // t12 sqsub v29.4s, v29.4s, v27.4s // t13 .irp r, v2, v17, v3, v31, v23, v19, v25, v29 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v17, v3, v31, v23, v19, v25, v29 
smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a srshr v21.4s, v7.4s, #12 // t9a srshr v27.4s, v6.4s, #12 // t14a mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a srshr v29.4s, v7.4s, #12 // t13a neg v6.4s, v6.4s srshr v23.4s, v6.4s, #12 // t10a sqsub v2.4s, v17.4s, v19.4s // t11a sqadd v17.4s, v17.4s, v19.4s // t8a sqsub v3.4s, v31.4s, v25.4s // t12a sqadd v31.4s, v31.4s, v25.4s // t15a sqadd v19.4s, v21.4s, v23.4s // t9 sqsub v21.4s, v21.4s, v23.4s // t10 sqsub v25.4s, v27.4s, v29.4s // t13 sqadd v27.4s, v27.4s, v29.4s // t14 .irp r, v2, v17, v3, v31, v19, v21, v25, v27 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v17, v3, v31, v19, v21, v25, v27 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11 mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a srshr v7.4s, v7.4s, #12 // t11 srshr v6.4s, v6.4s, #12 // t12 mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a srshr v2.4s, v2.4s, #12 // t10a srshr v3.4s, v3.4s, #12 // t13a sqadd v1.4s, v16.4s, v31.4s // out0 sqsub v31.4s, v16.4s, v31.4s // out15 mov v16.16b, v1.16b sqadd v23.4s, v30.4s, v17.4s // out7 sqsub v1.4s, v30.4s, v17.4s // out8 sqadd v17.4s, v18.4s, v27.4s // out1 sqsub v30.4s, v18.4s, v27.4s // out14 sqadd v18.4s, v20.4s, v3.4s // out2 sqsub v29.4s, v20.4s, v3.4s // out13 sqadd v3.4s, v28.4s, v19.4s // out6 sqsub v25.4s, v28.4s, v19.4s // out9 sqadd v19.4s, v22.4s, v6.4s // out3 sqsub v28.4s, v22.4s, v6.4s // out12 sqadd v20.4s, v24.4s, v7.4s // out4 sqsub v27.4s, v24.4s, v7.4s // out11 sqadd v21.4s, v26.4s, v2.4s // out5 sqsub v26.4s, v26.4s, v2.4s // out10 mov v24.16b, v1.16b mov v22.16b, v3.16b ret endfunc .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 movrel x16, iadst16_coeffs ld1 {v0.4s, v1.4s}, [x16], #32 mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2 srshr v16.4s, v2.4s, #12 // t0 srshr v31.4s, v4.4s, #12 // t1 mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 srshr v18.4s, v6.4s, #12 // t2 srshr v29.4s, v2.4s, #12 // t3 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 srshr v20.4s, v4.4s, #12 // t4 srshr v27.4s, v6.4s, #12 // t5 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 ld1 {v0.4s, v1.4s}, [x16] movrel x16, idct_coeffs mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 srshr v22.4s, v2.4s, #12 // t6 srshr v25.4s, v4.4s, #12 // t7 mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 srshr v23.4s, v6.4s, #12 // t8 srshr v24.4s, v2.4s, #12 // t9 mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 srshr v21.4s, v4.4s, #12 // t10 srshr v26.4s, v6.4s, #12 // t11 mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 srshr v19.4s, v2.4s, #12 // t12 srshr v28.4s, v4.4s, #12 // t13 mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 srshr v17.4s, v6.4s, #12 // t14 srshr v30.4s, v2.4s, #12 // t15 ld1 {v0.4s, v1.4s}, [x16] movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqsub v2.4s, v16.4s, v23.4s // t8a sqadd v16.4s, v16.4s, v23.4s // t0a sqsub v3.4s, v31.4s, v24.4s // t9a sqadd 
v31.4s, v31.4s, v24.4s // t1a sqadd v23.4s, v18.4s, v21.4s // t2a sqsub v18.4s, v18.4s, v21.4s // t10a sqadd v24.4s, v29.4s, v26.4s // t3a sqsub v29.4s, v29.4s, v26.4s // t11a sqadd v21.4s, v20.4s, v19.4s // t4a sqsub v20.4s, v20.4s, v19.4s // t12a sqadd v26.4s, v27.4s, v28.4s // t5a sqsub v27.4s, v27.4s, v28.4s // t13a sqadd v19.4s, v22.4s, v17.4s // t6a sqsub v22.4s, v22.4s, v17.4s // t14a sqadd v28.4s, v25.4s, v30.4s // t7a sqsub v25.4s, v25.4s, v30.4s // t15a .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 smin_4s \r, \r, v5 .endr .irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25 smax_4s \r, \r, v7 .endr mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9 mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 srshr v17.4s, v4.4s, #12 // t8 srshr v30.4s, v6.4s, #12 // t9 mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 srshr v18.4s, v2.4s, #12 // t10 srshr v29.4s, v4.4s, #12 // t11 mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 srshr v27.4s, v6.4s, #12 // t12 srshr v20.4s, v2.4s, #12 // t13 mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 srshr v25.4s, v4.4s, #12 // t14 srshr v22.4s, v6.4s, #12 // t15 sqsub v2.4s, v16.4s, v21.4s // t4 sqadd v16.4s, v16.4s, v21.4s // t0 sqsub v3.4s, v31.4s, v26.4s // t5 sqadd v31.4s, v31.4s, v26.4s // t1 sqadd v21.4s, v23.4s, v19.4s // t2 sqsub v23.4s, v23.4s, v19.4s // t6 sqadd v26.4s, v24.4s, v28.4s // t3 sqsub v24.4s, v24.4s, v28.4s // t7 sqadd v19.4s, v17.4s, v27.4s // t8a sqsub v17.4s, v17.4s, v27.4s // t12a sqadd v28.4s, v30.4s, v20.4s // t9a sqsub v30.4s, v30.4s, v20.4s // t13a sqadd v27.4s, v18.4s, v25.4s // t10a sqsub v18.4s, v18.4s, v25.4s // t14a sqadd v20.4s, v29.4s, v22.4s // t11a sqsub v29.4s, v29.4s, v22.4s // t15a .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 smin_4s \r, \r, v5 .endr .irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29 smax_4s \r, \r, v7 .endr mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a srshr v22.4s, v4.4s, #12 // t4a srshr v25.4s, v6.4s, #12 // t5a mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 srshr v24.4s, v2.4s, #12 // t6a srshr v23.4s, v4.4s, #12 // t7a mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 srshr v17.4s, v6.4s, #12 // t12 mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 srshr v29.4s, v2.4s, #12 // t13 srshr v30.4s, v4.4s, #12 // t14 srshr v18.4s, v6.4s, #12 // t15 sqsub v2.4s, v16.4s, v21.4s // t2a .ifc \o0, v16 sqadd \o0\().4s, v16.4s, v21.4s // out0 sqsub v21.4s, v31.4s, v26.4s // t3a sqadd \o15\().4s, v31.4s, v26.4s // out15 .else sqadd v4.4s, v16.4s, v21.4s // out0 sqsub v21.4s, v31.4s, v26.4s // t3a sqadd \o15\().4s, v31.4s, v26.4s // out15 mov \o0\().16b, v4.16b .endif sqsub v3.4s, v29.4s, v18.4s // t15a sqadd \o13\().4s, v29.4s, v18.4s // out13 sqadd \o2\().4s, v17.4s, v30.4s // out2 sqsub v26.4s, v17.4s, v30.4s // t14a sqadd \o1\().4s, v19.4s, v27.4s // out1 sqsub v27.4s, v19.4s, v27.4s // t10 sqadd \o14\().4s, v28.4s, v20.4s // out14 sqsub v20.4s, v28.4s, v20.4s // t11 sqadd \o3\().4s, v22.4s, v24.4s // out3 sqsub v22.4s, v22.4s, v24.4s // t6 sqadd \o12\().4s, v25.4s, v23.4s // out12 sqsub v23.4s, v25.4s, v23.4s // t7 // Not 
clipping the output registers, as they will be downshifted and // narrowed afterwards anyway. .irp r, v2, v21, v3, v26, v27, v20, v22, v23 smin_4s \r, \r, v5 .endr .irp r, v2, v21, v3, v26, v27, v20, v22, v23 smax_4s \r, \r, v7 .endr sqneg \o15\().4s, \o15\().4s // out15 sqneg \o13\().4s, \o13\().4s // out13 sqneg \o1\().4s, \o1\().4s // out1 sqneg \o3\().4s, \o3\().4s // out3 mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) srshr v24.4s, v24.4s, #12 // out8 srshr v4.4s, v4.4s, #12 // out7 srshr v5.4s, v6.4s, #12 // out5 mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) srshr v26.4s, v6.4s, #12 // out10 mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) srshr \o4\().4s, v2.4s, #12 // out4 srshr v6.4s, v6.4s, #12 // out11 srshr v7.4s, v21.4s, #12 // out9 srshr \o6\().4s, v22.4s, #12 // out6 .ifc \o8, v23 mov \o8\().16b, v24.16b mov \o10\().16b, v26.16b .endif sqneg \o7\().4s, v4.4s // out7 sqneg \o5\().4s, v5.4s // out5 sqneg \o11\().4s, v6.4s // out11 sqneg \o9\().4s, v7.4s // out9 .endm function inv_adst_4s_x16_neon AARCH64_VALID_CALL_TARGET iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 ret endfunc function inv_flipadst_4s_x16_neon AARCH64_VALID_CALL_TARGET iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 ret endfunc function inv_identity_4s_x16_neon AARCH64_VALID_CALL_TARGET movz w16, #2*(5793-4096)*8, lsl #16 dup v0.2s, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 sqrdmulh v2.4s, v\i\().4s, v0.s[0] sqadd v\i\().4s, v\i\().4s, v\i\().4s sqadd v\i\().4s, v\i\().4s, v2.4s .endr ret endfunc .macro identity_4x16_shift1 c .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s sqrdmulh v3.4s, \i, \c srshr v3.4s, v3.4s, #1 sqadd \i, \i, v3.4s .endr .endm .macro identity_4x16 c .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s sqrdmulh v3.4s, \i, \c sqadd \i, \i, \i sqadd \i, \i, v3.4s .endr .endm .macro def_horz_16 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_16x4_neon mov x14, x30 movi v7.4s, #0 .if \scale movz w16, #2896*8, lsl #16 dup v0.2s, w16 .endif .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr .if \scale scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif blr x4 sqrshrn v16.4h, v16.4s, #\shift sqrshrn v17.4h, v17.4s, #\shift sqrshrn v18.4h, v18.4s, #\shift sqrshrn v19.4h, v19.4s, #\shift sqrshrn2 v16.8h, v20.4s, #\shift sqrshrn2 v17.8h, v21.4s, #\shift sqrshrn2 v18.8h, v22.4s, #\shift sqrshrn2 v19.8h, v23.4s, #\shift sqrshrn v20.4h, v24.4s, #\shift sqrshrn v21.4h, v25.4s, #\shift sqrshrn v22.4h, v26.4s, #\shift sqrshrn v23.4h, v27.4s, #\shift sqrshrn2 v20.8h, v28.4s, #\shift sqrshrn2 v21.8h, v29.4s, #\shift sqrshrn2 v22.8h, v30.4s, #\shift sqrshrn2 v23.8h, v31.4s, #\shift transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 transpose_4x8h v20, 
v21, v22, v23, v4, v5, v6, v7 .irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h st1 {\i}, [x6], #16 .endr ret x14 endfunc .endm def_horz_16 scale=0, shift=2 def_horz_16 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_8x16_neon mov x14, x30 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr blr x5 load_add_store_8x16 x6, x7 ret x14 endfunc function inv_txfm_add_16x16_neon mov x15, x30 sub sp, sp, #512 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12 add x6, sp, #(\i*16*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .if \i < 12 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #16*4 bl inv_txfm_horz_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #512 ret x15 endfunc const eob_16x16 .short 10, 36, 78, 256 endconst const eob_16x16_identity .short 4, 8, 12, 256 endconst .macro def_fn_16x16 txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif adr x4, inv_\txfm1\()_4s_x16_neon movrel x5, X(inv_\txfm2\()_8h_x16_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_16x16 .else movrel x13, eob_16x16_identity .endif .else .ifc \txfm2, identity movrel x13, eob_16x16_identity .else movrel x13, eob_16x16 .endif .endif b inv_txfm_add_16x16_neon endfunc .endm def_fn_16x16 dct, dct def_fn_16x16 identity, identity def_fn_16x16 dct, adst def_fn_16x16 dct, flipadst def_fn_16x16 dct, identity def_fn_16x16 adst, dct def_fn_16x16 adst, adst def_fn_16x16 adst, flipadst def_fn_16x16 flipadst, dct def_fn_16x16 flipadst, adst def_fn_16x16 flipadst, flipadst def_fn_16x16 identity, dct function inv_txfm_add_16x4_neon mov x15, x30 movi v4.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], #16 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x4 x6, x7 sqrshrn v16.4h, v24.4s, #1 sqrshrn v17.4h, v25.4s, #1 sqrshrn v18.4h, v26.4s, #1 sqrshrn v19.4h, v27.4s, #1 sqrshrn2 v16.8h, v28.4s, #1 sqrshrn2 v17.8h, v29.4s, #1 sqrshrn2 v18.8h, v30.4s, #1 sqrshrn2 v19.8h, v31.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 blr x5 add x6, x0, #16 load_add_store_8x4 x6, x7 ret x15 endfunc function inv_txfm_add_4x16_neon ldrh w12, [x13, #4] mov x15, x30 mov x11, #64 cmp w3, w12 ldrh w12, [x13, #2] b.lt 1f add x6, x2, #48 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v28.4h, v16.4s, #1 sqrshrn v29.4h, v17.4s, #1 sqrshrn v30.4h, v18.4s, #1 sqrshrn v31.4h, v19.4s, #1 transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 b 2f 1: .irp i, v28.4h, v29.4h, v30.4h, v31.4h movi \i, #0 .endr 2: cmp w3, w12 ldrh w12, [x13, #0] b.lt 1f add x6, x2, #32 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 transpose_4x4h 
v24, v25, v26, v27, v4, v5, v6, v7 b 2f 1: .irp i, v24.4h, v25.4h, v26.4h, v27.4h movi \i, #0 .endr 2: cmp w3, w12 b.lt 1f add x6, x2, #16 movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x6] st1 {v2.4s}, [x6], x11 .endr blr x4 sqrshrn v20.4h, v16.4s, #1 sqrshrn v21.4h, v17.4s, #1 sqrshrn v22.4h, v18.4s, #1 sqrshrn v23.4h, v19.4s, #1 transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 b 2f 1: .irp i, v20.4h, v21.4h, v22.4h, v23.4h movi \i, #0 .endr 2: movi v2.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s ld1 {\i}, [x2] st1 {v2.4s}, [x2], x11 .endr blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 blr x5 load_add_store_4x16 x0, x6 ret x15 endfunc const eob_4x16 .short 13, 29, 45, 64 endconst const eob_4x16_identity1 .short 16, 32, 48, 64 endconst const eob_4x16_identity2 .short 4, 8, 12, 64 endconst .macro def_fn_416 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_4x16 .else movrel x13, eob_4x16_identity1 .endif .else .ifc \txfm2, identity movrel x13, eob_4x16_identity2 .else movrel x13, eob_4x16 .endif .endif .else adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_416 w, h def_fn_416 \w, \h, dct, dct def_fn_416 \w, \h, identity, identity def_fn_416 \w, \h, dct, adst def_fn_416 \w, \h, dct, flipadst def_fn_416 \w, \h, dct, identity def_fn_416 \w, \h, adst, dct def_fn_416 \w, \h, adst, adst def_fn_416 \w, \h, adst, flipadst def_fn_416 \w, \h, flipadst, dct def_fn_416 \w, \h, flipadst, adst def_fn_416 \w, \h, flipadst, flipadst def_fn_416 \w, \h, identity, dct def_fn_416 \w, \h, adst, identity def_fn_416 \w, \h, flipadst, identity def_fn_416 \w, \h, identity, adst def_fn_416 \w, \h, identity, flipadst .endm def_fns_416 4, 16 def_fns_416 16, 4 function inv_txfm_add_16x8_neon mov x15, x30 stp d8, d9, [sp, #-0x40]! 
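// The low 64 bits of v8-v15 are callee-saved in AAPCS64; this function uses
// them as extra scratch, so the remaining pairs are spilled below.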
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] cmp w3, w13 mov x11, #32 b.lt 1f movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 add x6, x2, #16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 blr x4 sqrshrn v8.4h, v16.4s, #1 sqrshrn v9.4h, v17.4s, #1 sqrshrn v10.4h, v18.4s, #1 sqrshrn v11.4h, v19.4s, #1 sqrshrn2 v8.8h, v20.4s, #1 sqrshrn2 v9.8h, v21.4s, #1 sqrshrn2 v10.8h, v22.4s, #1 sqrshrn2 v11.8h, v23.4s, #1 sqrshrn v12.4h, v24.4s, #1 sqrshrn v13.4h, v25.4s, #1 sqrshrn v14.4h, v26.4s, #1 sqrshrn v15.4h, v27.4s, #1 sqrshrn2 v12.8h, v28.4s, #1 sqrshrn2 v13.8h, v29.4s, #1 sqrshrn2 v14.8h, v30.4s, #1 sqrshrn2 v15.8h, v31.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 b 2f 1: .irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h movi \i, #0 .endr 2: movz w16, #2896*8, lsl #16 dup v0.2s, w16 movi v4.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 mov v20.16b, v8.16b mov v21.16b, v9.16b mov v22.16b, v10.16b mov v23.16b, v11.16b transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 sqrshrn v8.4h, v24.4s, #1 sqrshrn v9.4h, v25.4s, #1 sqrshrn v10.4h, v26.4s, #1 sqrshrn v11.4h, v27.4s, #1 sqrshrn2 v8.8h, v28.4s, #1 sqrshrn2 v9.8h, v29.4s, #1 sqrshrn2 v10.8h, v30.4s, #1 sqrshrn2 v11.8h, v31.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 blr x5 mov x6, x0 load_add_store_8x8 x6, x7 mov v16.16b, v8.16b mov v17.16b, v9.16b mov v18.16b, v10.16b mov v19.16b, v11.16b mov v20.16b, v12.16b mov v21.16b, v13.16b mov v22.16b, v14.16b mov v23.16b, v15.16b blr x5 add x0, x0, #16 load_add_store_8x8 x0, x7 ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x15 endfunc function inv_txfm_add_8x16_neon mov x15, x30 stp d8, d9, [sp, #-0x20]! 
stp d10, d11, [sp, #0x10] ldrh w12, [x13, #4] mov x11, #64 cmp w3, w12 ldrh w12, [x13, #2] b.lt 1f add x6, x2, #48 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v28.4h, v16.4s, #1 sqrshrn v29.4h, v17.4s, #1 sqrshrn v30.4h, v18.4s, #1 sqrshrn v31.4h, v19.4s, #1 sqrshrn2 v28.8h, v20.4s, #1 sqrshrn2 v29.8h, v21.4s, #1 sqrshrn2 v30.8h, v22.4s, #1 sqrshrn2 v31.8h, v23.4s, #1 transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 b 2f 1: .irp i, v28.8h, v29.8h, v30.8h, v31.8h movi \i, #0 .endr 2: cmp w3, w12 ldrh w12, [x13, #0] b.lt 1f add x6, x2, #32 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v24.4h, v16.4s, #1 sqrshrn v25.4h, v17.4s, #1 sqrshrn v26.4h, v18.4s, #1 sqrshrn v27.4h, v19.4s, #1 sqrshrn2 v24.8h, v20.4s, #1 sqrshrn2 v25.8h, v21.4s, #1 sqrshrn2 v26.8h, v22.4s, #1 sqrshrn2 v27.8h, v23.4s, #1 transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 b 2f 1: .irp i, v24.8h, v25.8h, v26.8h, v27.8h movi \i, #0 .endr 2: cmp w3, w12 b.lt 1f add x6, x2, #16 movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x6] st1 {v4.4s}, [x6], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v8.4h, v16.4s, #1 sqrshrn v9.4h, v17.4s, #1 sqrshrn v10.4h, v18.4s, #1 sqrshrn v11.4h, v19.4s, #1 sqrshrn2 v8.8h, v20.4s, #1 sqrshrn2 v9.8h, v21.4s, #1 sqrshrn2 v10.8h, v22.4s, #1 sqrshrn2 v11.8h, v23.4s, #1 transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 b 2f 1: .irp i, v8.8h, v9.8h, v10.8h, v11.8h movi \i, #0 .endr 2: movi v4.4s, #0 movz w16, #2896*8, lsl #16 dup v0.2s, w16 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s ld1 {\i}, [x2] st1 {v4.4s}, [x2], x11 .endr scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 blr x4 sqrshrn v16.4h, v16.4s, #1 sqrshrn v17.4h, v17.4s, #1 sqrshrn v18.4h, v18.4s, #1 sqrshrn v19.4h, v19.4s, #1 sqrshrn2 v16.8h, v20.4s, #1 sqrshrn2 v17.8h, v21.4s, #1 sqrshrn2 v18.8h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 mov v20.16b, v8.16b mov v21.16b, v9.16b mov v22.16b, v10.16b mov v23.16b, v11.16b blr x5 load_add_store_8x16 x0, x6 ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x20 ret x15 endfunc const eob_8x16 .short 10, 43, 75, 128 endconst const eob_8x16_identity1 .short 4, 64, 96, 128 endconst const eob_8x16_identity2 .short 4, 8, 12, 128 endconst .macro def_fn_816 w, h, txfm1, txfm2 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif adr x4, inv_\txfm1\()_4s_x\w\()_neon movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) .ifc \txfm1, identity .ifc \txfm2, identity movrel x13, eob_8x16 .else movrel x13, eob_8x16_identity1 .endif .else .ifc \txfm2, identity movrel x13, eob_8x16_identity2 .else movrel x13, eob_8x16 .endif .endif .if \h == 8 ldrh w13, [x13] .endif b inv_txfm_add_\w\()x\h\()_neon endfunc .endm .macro def_fns_816 w, h def_fn_816 \w, \h, dct, dct def_fn_816 \w, \h, identity, identity def_fn_816 \w, \h, dct, adst def_fn_816 \w, \h, dct, flipadst def_fn_816 \w, \h, dct, identity def_fn_816 \w, \h, adst, dct 
def_fn_816 \w, \h, adst, adst def_fn_816 \w, \h, adst, flipadst def_fn_816 \w, \h, flipadst, dct def_fn_816 \w, \h, flipadst, adst def_fn_816 \w, \h, flipadst, flipadst def_fn_816 \w, \h, identity, dct def_fn_816 \w, \h, adst, identity def_fn_816 \w, \h, flipadst, identity def_fn_816 \w, \h, identity, adst def_fn_816 \w, \h, identity, flipadst .endm def_fns_816 8, 16 def_fns_816 16, 8 function inv_dct32_odd_4s_x16_neon movrel x16, idct_coeffs, 4*16 ld1 {v0.4s, v1.4s}, [x16], #32 mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a srshr v16.4s, v2.4s, #12 // t16a srshr v31.4s, v4.4s, #12 // t31a mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a srshr v24.4s, v6.4s, #12 // t17a srshr v23.4s, v2.4s, #12 // t30a mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a srshr v20.4s, v4.4s, #12 // t18a srshr v27.4s, v6.4s, #12 // t29a mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a ld1 {v0.4s, v1.4s}, [x16] sub x16, x16, #4*24 mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a srshr v28.4s, v2.4s, #12 // t19a srshr v19.4s, v4.4s, #12 // t28a mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a srshr v18.4s, v6.4s, #12 // t20a srshr v29.4s, v2.4s, #12 // t27a mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a srshr v26.4s, v4.4s, #12 // t21a srshr v21.4s, v6.4s, #12 // t26a mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a srshr v22.4s, v2.4s, #12 // t22a srshr v25.4s, v4.4s, #12 // t25a mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a srshr v30.4s, v6.4s, #12 // t23a srshr v17.4s, v2.4s, #12 // t24a ld1 {v0.4s, v1.4s}, [x16] movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 sqsub v2.4s, v16.4s, v24.4s // t17 sqadd v16.4s, v16.4s, v24.4s // t16 sqsub v3.4s, v31.4s, v23.4s // t30 sqadd v31.4s, v31.4s, v23.4s // t31 sqsub v24.4s, v28.4s, v20.4s // t18 sqadd v28.4s, v28.4s, v20.4s // t19 sqadd v23.4s, v18.4s, v26.4s // t20 sqsub v18.4s, v18.4s, v26.4s // t21 sqsub v20.4s, v30.4s, v22.4s // t22 sqadd v30.4s, v30.4s, v22.4s // t23 sqadd v26.4s, v17.4s, v25.4s // t24 sqsub v17.4s, v17.4s, v25.4s // t25 sqsub v22.4s, v29.4s, v21.4s // t26 sqadd v29.4s, v29.4s, v21.4s // t27 sqadd v25.4s, v19.4s, v27.4s // t28 sqsub v19.4s, v19.4s, v27.4s // t29 .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a srshr v21.4s, v7.4s, #12 // t17a srshr v27.4s, v6.4s, #12 // t30a neg v2.4s, v2.4s // -> t18a mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a srshr v19.4s, v2.4s, #12 // t18a srshr v24.4s, v7.4s, #12 // t29a mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a srshr v22.4s, v6.4s, #12 // t21a srshr v18.4s, v2.4s, #12 // t26a neg v7.4s, v7.4s // -> t22a mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a srshr v17.4s, v7.4s, #12 // t22a srshr v20.4s, v6.4s, #12 // t25a 
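// Next butterfly stage of the odd (t16-t31) half; further clamping and rotations
// follow before the caller combines these outputs with the even half.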
sqsub v2.4s, v27.4s, v24.4s // t29 sqadd v27.4s, v27.4s, v24.4s // t30 sqsub v3.4s, v21.4s, v19.4s // t18 sqadd v21.4s, v21.4s, v19.4s // t17 sqsub v24.4s, v16.4s, v28.4s // t19a sqadd v16.4s, v16.4s, v28.4s // t16a sqsub v19.4s, v30.4s, v23.4s // t20a sqadd v30.4s, v30.4s, v23.4s // t23a sqsub v28.4s, v17.4s, v22.4s // t21 sqadd v17.4s, v17.4s, v22.4s // t22 sqadd v23.4s, v26.4s, v29.4s // t24a sqsub v26.4s, v26.4s, v29.4s // t27a sqadd v22.4s, v20.4s, v18.4s // t25 sqsub v20.4s, v20.4s, v18.4s // t26 sqsub v29.4s, v31.4s, v25.4s // t28a sqadd v31.4s, v31.4s, v25.4s // t31a .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 srshr v18.4s, v7.4s, #12 // t18a srshr v25.4s, v6.4s, #12 // t29a mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28 mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 srshr v29.4s, v2.4s, #12 // t19 srshr v24.4s, v7.4s, #12 // t28 neg v6.4s, v6.4s // -> t20 mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a srshr v26.4s, v6.4s, #12 // t20 srshr v19.4s, v2.4s, #12 // t27 neg v7.4s, v7.4s // -> t21a mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a srshr v20.4s, v7.4s, #12 // t21a srshr v28.4s, v6.4s, #12 // t26a sqsub v2.4s, v16.4s, v30.4s // t23 sqadd v16.4s, v16.4s, v30.4s // t16 = out16 sqsub v3.4s, v31.4s, v23.4s // t24 sqadd v31.4s, v31.4s, v23.4s // t31 = out31 sqsub v23.4s, v21.4s, v17.4s // t22a sqadd v17.4s, v21.4s, v17.4s // t17a = out17 sqadd v30.4s, v27.4s, v22.4s // t30a = out30 sqsub v21.4s, v27.4s, v22.4s // t25a sqsub v27.4s, v18.4s, v20.4s // t21 sqadd v18.4s, v18.4s, v20.4s // t18 = out18 sqadd v7.4s, v29.4s, v26.4s // t19a = out19 sqsub v26.4s, v29.4s, v26.4s // t20a sqadd v29.4s, v25.4s, v28.4s // t29 = out29 sqsub v25.4s, v25.4s, v28.4s // t26 sqadd v28.4s, v24.4s, v19.4s // t28a = out28 sqsub v24.4s, v24.4s, v19.4s // t27a mov v19.16b, v7.16b // out19 .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 smin \r\().4s, \r\().4s, v5.4s .endr .irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24 smax \r\().4s, \r\().4s, v4.4s .endr mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20 mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 srshr v20.4s, v7.4s, #12 // t20 srshr v22.4s, v6.4s, #12 // t27 mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a mov v27.16b, v22.16b // t27 srshr v26.4s, v7.4s, #12 // t26a mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25 srshr v21.4s, v6.4s, #12 // t21a srshr v22.4s, v24.4s, #12 // t22 srshr v25.4s, v7.4s, #12 // t25 mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a srshr v23.4s, v7.4s, #12 // t23a srshr v24.4s, v6.4s, #12 // t24a ret endfunc .macro def_horz_32 scale=0, shift=2, suffix function inv_txfm_horz\suffix\()_dct_32x4_neon mov x14, x30 movi v7.4s, #0 lsl x8, x8, #1 .if \scale movz w16, #2896*8, lsl #16 dup v0.2s, w16 .endif .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, 
x8, lsr #1 .if \scale scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct_4s_x16_neon // idct_16 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 .macro store1 r0, r1, r2, r3 st1 {\r0}, [x6], #16 st1 {\r1}, [x6], #16 st1 {\r2}, [x6], #16 st1 {\r3}, [x6], #16 .endm store1 v16.4s, v20.4s, v24.4s, v28.4s store1 v17.4s, v21.4s, v25.4s, v29.4s store1 v18.4s, v22.4s, v26.4s, v30.4s store1 v19.4s, v23.4s, v27.4s, v31.4s .purgem store1 sub x6, x6, #64*4 movi v7.4s, #0 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s ld1 {\i}, [x7] st1 {v7.4s}, [x7], x8 .endr .if \scale // This relies on the fact that the idct also leaves the right coeff in v0.s[1] scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif bl inv_dct32_odd_4s_x16_neon transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 .macro store2 r0, r1, r2, r3, shift ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] sqsub v4.4s, v0.4s, \r0 sqadd v0.4s, v0.4s, \r0 sqsub v5.4s, v1.4s, \r1 sqadd v1.4s, v1.4s, \r1 sqsub v6.4s, v2.4s, \r2 sqadd v2.4s, v2.4s, \r2 sqsub v7.4s, v3.4s, \r3 sqadd v3.4s, v3.4s, \r3 sqrshrn v0.4h, v0.4s, #\shift sqrshrn2 v0.8h, v1.4s, #\shift sqrshrn v1.4h, v2.4s, #\shift sqrshrn2 v1.8h, v3.4s, #\shift sqrshrn v2.4h, v7.4s, #\shift sqrshrn2 v2.8h, v6.4s, #\shift sqrshrn v3.4h, v5.4s, #\shift sqrshrn2 v3.8h, v4.4s, #\shift st1 {v0.8h, v1.8h}, [x6], #32 rev64 v2.8h, v2.8h rev64 v3.8h, v3.8h st1 {v2.8h, v3.8h}, [x6], #32 .endm store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift .purgem store2 ret x14 endfunc .endm def_horz_32 scale=0, shift=2 def_horz_32 scale=1, shift=1, suffix=_scale function inv_txfm_add_vert_dct_8x32_neon mov x14, x30 lsl x8, x8, #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 bl X(inv_dct_8h_x16_neon) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 add x7, x7, x8, lsr #1 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 ld1 {v\i\().8h}, [x7], x8 .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 bl X(inv_dct32_odd_8h_x16_neon) neg x9, x8 mov x10, x6 mvni v1.8h, #0xfc, lsl #8 // 0x3ff .macro combine r0, r1, r2, r3, op, stride ld1 {v5.8h}, [x7], \stride ld1 {v2.8h}, [x10], x1 ld1 {v6.8h}, [x7], \stride ld1 {v3.8h}, [x10], x1 \op v5.8h, v5.8h, \r0 ld1 {v7.8h}, [x7], \stride ld1 {v4.8h}, [x10], x1 srshr v5.8h, v5.8h, #4 \op v6.8h, v6.8h, \r1 usqadd v2.8h, v5.8h srshr v6.8h, v6.8h, #4 \op v7.8h, v7.8h, \r2 ld1 {v5.8h}, [x7], \stride usqadd v3.8h, v6.8h smin v2.8h, v2.8h, v1.8h srshr v7.8h, 
v7.8h, #4 \op v5.8h, v5.8h, \r3 st1 {v2.8h}, [x6], x1 ld1 {v2.8h}, [x10], x1 usqadd v4.8h, v7.8h smin v3.8h, v3.8h, v1.8h srshr v5.8h, v5.8h, #4 st1 {v3.8h}, [x6], x1 usqadd v2.8h, v5.8h smin v4.8h, v4.8h, v1.8h st1 {v4.8h}, [x6], x1 smin v2.8h, v2.8h, v1.8h st1 {v2.8h}, [x6], x1 .endm combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 sub x7, x7, x8 combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 .purgem combine ret x14 endfunc const eob_32x32 .short 10, 36, 78, 136, 210, 300, 406, 1024 endconst const eob_16x32 .short 10, 36, 78, 151, 215, 279, 343, 512 endconst const eob_16x32_shortside .short 10, 36, 78, 512 endconst const eob_8x32 .short 10, 43, 75, 107, 139, 171, 203, 256 endconst function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 movi v0.8h, #0 movi v1.8h, #0 movrel x13, eob_32x32, 2 mov x8, #4*32 1: mov w9, #0 movrel x12, eob_32x32, 2 2: add w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 load_add_store_8x8 x0, x7, shiftbits=2 ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #2*8 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .macro shift_16_regs op, shift .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s \op \i, \i, #\shift .endr .endm .macro def_identity_1632 w, h, wshort, hshort function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 movz w16, #2896*8, lsl #16 movz w17, #2*(5793-4096)*8, lsl #16 movi v0.4s, #0 movi v1.4s, #0 movrel x13, eob_16x32\hshort, 2 mov x8, #4*\h 1: mov w9, #0 movrel x12, eob_16x32\wshort, 2 2: add w9, w9, #8 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 dup v2.2s, w16 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 mov v2.s[1], w17 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 .if \w == 16 // 16x32 identity_4x16_shift1 v2.s[1] .else // 32x16 shift_16_regs sqshl, 1 identity_4x16 v2.s[1] .endif 
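// Narrow the scaled 32-bit coefficients to 16 bits, transpose the 8x8 block,
// then round, add and clamp it into the destination.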
sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 .if \w == 16 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=4 .endif ldrh w11, [x12], #4 sub x0, x0, x1, lsl #3 add x0, x0, #16 cmp w3, w11 b.ge 2b ldrh w11, [x13], #4 cmp w3, w11 b.lt 9f sub x0, x0, w9, uxtw #1 add x0, x0, x1, lsl #3 msub x2, x8, x9, x2 add x2, x2, #4*8 b 1b 9: ret endfunc .endm def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 movi v0.4s, #0 movi v1.4s, #0 // Working on 8x8 blocks, read every other entry from eob_8x32 movrel x13, eob_8x32, 2 mov w8, #4*\h 1: // Working on 8x8 blocks, read every other entry from eob_8x32 ldrh w12, [x13], #4 ld1 {v16.4s, v17.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v18.4s, v19.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v20.4s, v21.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v22.4s, v23.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v24.4s, v25.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v26.4s, v27.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v28.4s, v29.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 ld1 {v30.4s, v31.4s}, [x2] st1 {v0.4s, v1.4s}, [x2], x8 .if \w == 8 sqrshrn v16.4h, v16.4s, #1 sqrshrn2 v16.8h, v17.4s, #1 sqrshrn v17.4h, v18.4s, #1 sqrshrn2 v17.8h, v19.4s, #1 sqrshrn v18.4h, v20.4s, #1 sqrshrn2 v18.8h, v21.4s, #1 sqrshrn v19.4h, v22.4s, #1 sqrshrn2 v19.8h, v23.4s, #1 sqrshrn v20.4h, v24.4s, #1 sqrshrn2 v20.8h, v25.4s, #1 sqrshrn v21.4h, v26.4s, #1 sqrshrn2 v21.8h, v27.4s, #1 sqrshrn v22.4h, v28.4s, #1 sqrshrn2 v22.8h, v29.4s, #1 sqrshrn v23.4h, v30.4s, #1 sqrshrn2 v23.8h, v31.4s, #1 .else sqxtn v16.4h, v16.4s sqxtn2 v16.8h, v17.4s sqxtn v17.4h, v18.4s sqxtn2 v17.8h, v19.4s sqxtn v18.4h, v20.4s sqxtn2 v18.8h, v21.4s sqxtn v19.4h, v22.4s sqxtn2 v19.8h, v23.4s sqxtn v20.4h, v24.4s sqxtn2 v20.8h, v25.4s sqxtn v21.4h, v26.4s sqxtn2 v21.8h, v27.4s sqxtn v22.4h, v28.4s sqxtn2 v22.8h, v29.4s sqxtn v23.4h, v30.4s sqxtn2 v23.8h, v31.4s .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 cmp w3, w12 .if \w == 8 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=3 .endif b.lt 9f .if \w == 8 sub x2, x2, x8, lsl #3 add x2, x2, #4*8 .else sub x0, x0, x1, lsl #3 add x0, x0, #2*8 .endif b 1b 9: ret endfunc .endm def_identity_832 8, 32 def_identity_832 32, 8 function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 idct_dc 32, 32, 2 mov x15, x30 sub sp, sp, #2048 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, sp, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #2048 ret x15 endfunc function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 idct_dc 16, 32, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, 
eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_4s_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, sp, #(\i*16*2) add x7, x2, #(\i*4) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif mov x8, #4*32 bl inv_txfm_horz_scale_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #16*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 idct_dc 32, 16, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 movrel x5, X(inv_dct_8h_x16_neon) ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12 add x6, sp, #(\i*32*2) add x7, x2, #(\i*4) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .if \i < 12 ldrh w12, [x13], #2 .endif .endif mov x8, #4*16 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x6, x0, #(\i*2) add x7, sp, #(\i*2) mov x8, #32*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, sp, #1024 ret x15 endfunc function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 idct_dc 8, 32, 2 mov x15, x30 sub sp, sp, #512 movrel x13, eob_8x32 movi v28.4s, #0 mov x8, #4*32 mov w9, #32 mov x6, sp mov x7, x2 1: .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().4s}, [x7] st1 {v28.4s}, [x7], x8 .endr ldrh w12, [x13], #2 sub w9, w9, #4 sub x7, x7, x8, lsl #3 add x7, x7, #4*4 bl inv_dct_4s_x8_neon sqrshrn v16.4h, v16.4s, #2 sqrshrn v17.4h, v17.4s, #2 sqrshrn v18.4h, v18.4s, #2 sqrshrn v19.4h, v19.4s, #2 sqrshrn2 v16.8h, v20.4s, #2 sqrshrn2 v17.8h, v21.4s, #2 sqrshrn2 v18.8h, v22.4s, #2 sqrshrn2 v19.8h, v23.4s, #2 transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 cmp w3, w12 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 b.ge 1b cbz w9, 3f movi v29.8h, #0 movi v30.8h, #0 movi v31.8h, #0 2: subs w9, w9, #4 st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 b.gt 2b 3: mov x6, x0 mov x7, sp mov x8, #8*2 bl inv_txfm_add_vert_dct_8x32_neon add sp, sp, #512 ret x15 endfunc function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 idct_dc 32, 8, 2 mov x15, x30 sub sp, sp, #512 .irp i, 0, 4 add x6, sp, #(\i*32*2) add x7, x2, #(\i*4) .if \i > 0 cmp w3, #10 b.lt 1f .endif mov x8, #8*4 bl inv_txfm_horz_dct_32x4_neon .endr b 2f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr 2: mov x8, #2*32 mov w9, #0 1: add x6, x0, x9, lsl #1 add x7, sp, x9, lsl #1 // #(\i*2) .irp i, 16, 17, 18, 19, 20, 21, 22, 23 ld1 {v\i\().8h}, [x7], x8 .endr add w9, w9, #8 bl X(inv_dct_8h_x8_neon) cmp w9, #32 load_add_store_8x8 x6, x7 b.lt 1b add sp, sp, #512 ret x15 endfunc function inv_dct64_step1_neon // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a ld1 {v0.4s, v1.4s}, [x17], #32 sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a ld1 {v0.4s}, [x17], #16 sqadd v24.4s, v16.4s, v17.4s // t32 sqsub v25.4s, v16.4s, 
v17.4s // t33 sqsub v26.4s, v19.4s, v18.4s // t34 sqadd v27.4s, v19.4s, v18.4s // t35 sqadd v28.4s, v20.4s, v21.4s // t60 sqsub v29.4s, v20.4s, v21.4s // t61 sqsub v30.4s, v23.4s, v22.4s // t62 sqadd v31.4s, v23.4s, v22.4s // t63 .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a neg v2.4s, v2.4s // t34a mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a srshr v26.4s, v2.4s, #12 // t34a mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a srshr v29.4s, v7.4s, #12 // t61a srshr v25.4s, v6.4s, #12 // t33a srshr v30.4s, v2.4s, #12 // t62a sqadd v16.4s, v24.4s, v27.4s // t32a sqsub v19.4s, v24.4s, v27.4s // t35a sqadd v17.4s, v25.4s, v26.4s // t33 sqsub v18.4s, v25.4s, v26.4s // t34 sqsub v20.4s, v31.4s, v28.4s // t60a sqadd v23.4s, v31.4s, v28.4s // t63a sqsub v21.4s, v30.4s, v29.4s // t61 sqadd v22.4s, v30.4s, v29.4s // t62 .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smin_4s \r, \r, v5 .endr .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smax_4s \r, \r, v4 .endr mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 srshr v21.4s, v2.4s, #12 // t61a srshr v18.4s, v7.4s, #12 // t34a mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 srshr v20.4s, v6.4s, #12 // t60 srshr v19.4s, v2.4s, #12 // t35 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 ret endfunc function inv_dct64_step2_neon movrel x16, idct_coeffs ld1 {v0.4s}, [x16] 1: // t32a/33/34a/35/60/61a/62/63a // t56a/57/58a/59/36/37a/38/39a // t40a/41/42a/43/52/53a/54/55a // t48a/49/50a/51/44/45a/46/47a ldr q16, [x6, #4*4*0] // t32a ldr q17, [x9, #4*4*8] // t39a ldr q18, [x9, #4*4*0] // t63a ldr q19, [x6, #4*4*8] // t56a ldr q20, [x6, #4*4*16] // t40a ldr q21, [x9, #4*4*24] // t47a ldr q22, [x9, #4*4*16] // t55a ldr q23, [x6, #4*4*24] // t48a sqadd v24.4s, v16.4s, v17.4s // t32 sqsub v25.4s, v16.4s, v17.4s // t39 sqadd v26.4s, v18.4s, v19.4s // t63 sqsub v27.4s, v18.4s, v19.4s // t56 sqsub v28.4s, v21.4s, v20.4s // t40 sqadd v29.4s, v21.4s, v20.4s // t47 sqadd v30.4s, v23.4s, v22.4s // t48 sqsub v31.4s, v23.4s, v22.4s // t55 .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a srshr v25.4s, v2.4s, #12 // t56a srshr v27.4s, v7.4s, #12 // t39a neg v6.4s, v6.4s // t40a mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a srshr v31.4s, v6.4s, #12 // t40a srshr v28.4s, v2.4s, #12 // t55a sqadd v16.4s, v24.4s, v29.4s // t32a sqsub v19.4s, v24.4s, v29.4s // t47a sqadd v17.4s, v27.4s, v31.4s // t39 sqsub v18.4s, v27.4s, v31.4s // t40 sqsub v20.4s, v26.4s, v30.4s // t48a sqadd v23.4s, v26.4s, v30.4s // t63a sqsub v21.4s, v25.4s, v28.4s // t55 sqadd v22.4s, v25.4s, v28.4s // t56 .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smin_4s \r, \r, v5 .endr .irp r, v16, v19, v17, v18, v20, v23, v21, v22 smax_4s \r, \r, v4 .endr mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 srshr v18.4s, v2.4s, #12 // t40a srshr v21.4s, v7.4s, #12 // t55a mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 srshr v19.4s, 
v6.4s, #12 // t47 srshr v20.4s, v2.4s, #12 // t48 str q16, [x6, #4*4*0] // t32a str q17, [x9, #4*4*0] // t39 str q18, [x6, #4*4*8] // t40a str q19, [x9, #4*4*8] // t47 str q20, [x6, #4*4*16] // t48 str q21, [x9, #4*4*16] // t55a str q22, [x6, #4*4*24] // t56 str q23, [x9, #4*4*24] // t63a add x6, x6, #4*4 sub x9, x9, #4*4 cmp x6, x9 b.lt 1b ret endfunc .macro load8 src, strd, zero, clear .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s .if \clear ld1 {\i}, [\src] st1 {\zero}, [\src], \strd .else ld1 {\i}, [\src], \strd .endif .endr .endm .macro store16 dst .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s st1 {\i}, [\dst], #16 .endr .endm .macro clear_upper8 .irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s movi \i, #0 .endr .endm .macro movi_if reg, val, cond .if \cond movi \reg, \val .endif .endm .macro movz16dup_if reg, gpr, val, cond .if \cond movz \gpr, \val, lsl #16 dup \reg, \gpr .endif .endm .macro st1_if regs, dst, cond .if \cond st1 \regs, \dst .endif .endm .macro str_if reg, dst, cond .if \cond str \reg, \dst .endif .endm .macro stroff_if reg, dst, dstoff, cond .if \cond str \reg, \dst, \dstoff .endif .endm .macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 .if \cond scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endif .endm .macro def_dct64_func suffix, clear=0, scale=0 function inv_txfm_dct\suffix\()_4s_x64_neon mov x14, x30 mov x6, sp lsl x8, x8, #2 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear load8 x7, x8, v7.4s, \clear clear_upper8 sub x7, x7, x8, lsl #3 add x7, x7, x8, lsr #1 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct_4s_x16_neon // idct_16 leaves the row_clip_max/min constants in v5 and v4 .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smin_4s \r, \r, v5 .endr .irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 smax_4s \r, \r, v4 .endr store16 x6 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.8h, #0, \clear load8 x7, x8, v7.4s, \clear clear_upper8 sub x7, x7, x8, lsl #3 lsr x8, x8, #1 sub x7, x7, x8, lsr #1 scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 bl inv_dct32_odd_4s_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 mov x9, #-16 movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 .macro store_addsub r0, r1, r2, r3 ld1 {v2.4s}, [x6], #16 ld1 {v3.4s}, [x6], #16 sqadd v6.4s, v2.4s, \r0 sqsub \r0, v2.4s, \r0 ld1 {v4.4s}, [x6], #16 sqadd v7.4s, v3.4s, \r1 sqsub \r1, v3.4s, \r1 smin v6.4s, v6.4s, v1.4s smin \r0, \r0, v1.4s ld1 {v5.4s}, [x6], #16 sqadd v2.4s, v4.4s, \r2 sub x6, x6, #16*4 smax v6.4s, v6.4s, v0.4s smax \r0, \r0, v0.4s sqsub \r2, v4.4s, \r2 smin v7.4s, v7.4s, v1.4s smin \r1, \r1, v1.4s st1 {v6.4s}, [x6], #16 st1 {\r0}, [x10], x9 smin v2.4s, v2.4s, v1.4s smin \r2, \r2, v1.4s smax v7.4s, v7.4s, v0.4s smax \r1, \r1, v0.4s sqadd v3.4s, v5.4s, \r3 sqsub \r3, v5.4s, \r3 smax v2.4s, v2.4s, v0.4s smax \r2, \r2, v0.4s smin v3.4s, v3.4s, v1.4s smin \r3, \r3, v1.4s st1 {v7.4s}, [x6], #16 st1 {\r1}, [x10], x9 smax v3.4s, v3.4s, v0.4s smax \r3, \r3, v0.4s st1 {v2.4s}, [x6], #16 st1 {\r2}, [x10], x9 st1 {v3.4s}, [x6], #16 st1 {\r3}, [x10], x9 .endm store_addsub v31.4s, v30.4s, v29.4s, v28.4s store_addsub v27.4s, v26.4s, v25.4s, v24.4s store_addsub v23.4s, v22.4s, v21.4s, v20.4s store_addsub v19.4s, 
v18.4s, v17.4s, v16.4s .purgem store_addsub add x6, x6, #4*4*16 movrel x17, idct64_coeffs movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000 movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x9, x7, x8, lsl #4 // offset 16 add x10, x7, x8, lsl #3 // offset 8 sub x9, x9, x8 // offset 15 sub x11, x10, x8 // offset 7 ld1 {v16.4s}, [x7] // in1 (offset 0) ld1 {v17.4s}, [x9] // in31 (offset 15) ld1 {v18.4s}, [x10] // in17 (offset 8) ld1 {v19.4s}, [x11] // in15 (offset 7) st1_if {v7.4s}, [x7], \clear st1_if {v7.4s}, [x9], \clear st1_if {v7.4s}, [x10], \clear st1_if {v7.4s}, [x11], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear add x7, x7, x8, lsl #2 // offset 4 sub x9, x9, x8, lsl #2 // offset 11 sub x10, x7, x8 // offset 3 add x11, x9, x8 // offset 12 ld1 {v16.4s}, [x10] // in7 (offset 3) ld1 {v17.4s}, [x11] // in25 (offset 12) ld1 {v18.4s}, [x9] // in23 (offset 11) ld1 {v19.4s}, [x7] // in9 (offset 4) st1_if {v7.4s}, [x7], \clear st1_if {v7.4s}, [x9], \clear st1_if {v7.4s}, [x10], \clear st1_if {v7.4s}, [x11], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear sub x10, x10, x8, lsl #1 // offset 1 sub x9, x9, x8, lsl #1 // offset 9 add x7, x7, x8 // offset 5 add x11, x11, x8 // offset 13 ldr q16, [x10, x8] // in5 (offset 2) ldr q17, [x11] // in27 (offset 13) ldr q18, [x9, x8] // in21 (offset 10) ldr q19, [x7] // in11 (offset 5) stroff_if q7, [x10, x8], \clear str_if q7, [x11], \clear stroff_if q7, [x9, x8], \clear str_if q7, [x7], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon movz16dup_if v0.2s, w16, #2896*8, \scale movi_if v7.4s, #0, \clear ldr q16, [x10] // in3 (offset 1) ldr q17, [x11, x8] // in29 (offset 14) ldr q18, [x9] // in19 (offset 9) ldr q19, [x7, x8] // in13 (offset 6) str_if q7, [x10], \clear stroff_if q7, [x11, x8], \clear str_if q7, [x9], \clear stroff_if q7, [x7, x8], \clear scale_if \scale, v0.s[0], v16, v17, v18, v19 bl inv_dct64_step1_neon sub x6, x6, #4*4*32 add x9, x6, #4*4*7 bl inv_dct64_step2_neon ret x14 endfunc .endm def_dct64_func _clear, clear=1 def_dct64_func _clear_scale, clear=1, scale=1 function inv_txfm_horz_dct_64x4_neon mov x14, x30 mov x7, sp add x8, sp, #4*4*(64 - 4) add x9, x6, #2*56 mov x10, #2*64 mov x11, #-4*4*4 dup v7.4s, w12 1: ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 .macro store_addsub src0, src1, src2, src3 sqsub v1.4s, \src0, \src1 sqadd v0.4s, \src0, \src1 sqsub v3.4s, \src2, \src3 srshl v1.4s, v1.4s, v7.4s sqadd v2.4s, \src2, \src3 srshl v3.4s, v3.4s, v7.4s srshl v0.4s, v0.4s, v7.4s srshl v2.4s, v2.4s, v7.4s sqxtn v3.4h, v3.4s sqxtn2 v3.8h, v1.4s sqxtn v0.4h, v0.4s sqxtn2 v0.8h, v2.4s rev64 v3.8h, v3.8h st1 {v0.8h}, [x6], x10 st1 {v3.8h}, [x9], x10 .endm store_addsub v16.4s, v31.4s, v20.4s, v27.4s store_addsub v17.4s, v30.4s, v21.4s, v26.4s store_addsub v18.4s, v29.4s, v22.4s, v25.4s store_addsub v19.4s, v28.4s, v23.4s, v24.4s .purgem store_addsub sub x6, x6, x10, lsl #2 sub x9, x9, x10, 
lsl #2 add x6, x6, #16 sub x9, x9, #16 cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_vert_dct_8x64_neon mov x14, x30 lsl x8, x8, #1 mov x7, sp add x8, sp, #2*8*(64 - 4) add x9, x6, x1, lsl #6 sub x9, x9, x1 neg x10, x1 mov x11, #-2*8*4 1: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 mvni v7.8h, #0xfc, lsl #8 // 0x3ff .macro add_dest_addsub src0, src1, src2, src3 ld1 {v0.8h}, [x6], x1 ld1 {v1.8h}, [x9], x10 sqadd v4.8h, \src0, \src1 ld1 {v2.8h}, [x6] sqsub \src0, \src0, \src1 ld1 {v3.8h}, [x9] sqadd v5.8h, \src2, \src3 sqsub \src2, \src2, \src3 sub x6, x6, x1 sub x9, x9, x10 srshr v4.8h, v4.8h, #4 srshr v5.8h, v5.8h, #4 srshr \src0, \src0, #4 usqadd v0.8h, v4.8h srshr \src2, \src2, #4 usqadd v1.8h, \src0 usqadd v2.8h, v5.8h smin v0.8h, v0.8h, v7.8h usqadd v3.8h, \src2 smin v1.8h, v1.8h, v7.8h st1 {v0.8h}, [x6], x1 smin v2.8h, v2.8h, v7.8h st1 {v1.8h}, [x9], x10 smin v3.8h, v3.8h, v7.8h st1 {v2.8h}, [x6], x1 st1 {v3.8h}, [x9], x10 .endm add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h .purgem add_dest_addsub cmp x7, x8 b.lt 1b ret x14 endfunc function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 sub_sp 64*32*2+64*4*4 add x5, sp, #64*4*4 movrel x13, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #32*4 mov x12, #-2 // shift bl inv_txfm_dct_clear_4s_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 bl X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 sub_sp 64*32*2+64*4*4 add x5, sp, #64*4*4 movrel x13, eob_32x32 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*64*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #32*4 mov x12, #-1 // shift bl inv_txfm_dct_clear_scale_4s_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 28 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i*2) add x7, x5, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_dct_8x32_neon .endr add sp, x5, #64*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 sub_sp 32*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_32x32 ldrh w12, [x13], #2 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*32*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f ldrh w12, [x13], #2 .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_scale_dct_32x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 bl 
X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #32*32*2 ret x15 endfunc function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 sub_sp 64*16*2+64*4*4 add x4, sp, #64*4*4 movrel x13, eob_16x32 .irp i, 0, 4, 8, 12 add x6, x4, #(\i*64*2) .if \i > 0 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f .endif add x7, x2, #(\i*4) mov x8, #16*4 mov x12, #-2 // shift bl inv_txfm_dct_clear_4s_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x4_neon .if \i < 12 ldrh w12, [x13], #2 .endif .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #2 .rept 4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: movrel x5, X(inv_dct_8h_x16_neon) .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i*2) add x7, x4, #(\i*2) mov x8, #64*2 bl inv_txfm_add_vert_8x16_neon .endr add sp, x4, #64*16*2 ret x15 endfunc function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 sub_sp 16*32*2+64*8*2 add x5, sp, #64*8*2 movrel x13, eob_16x32 ldrh w12, [x13], #2 adr x4, inv_dct_4s_x16_neon .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add x6, x5, #(\i*16*2) .if \i > 0 mov w8, #(32 - \i) cmp w3, w12 b.lt 1f .if \i < 28 ldrh w12, [x13], #2 .endif .endif add x7, x2, #(\i*4) mov x8, #32*4 bl inv_txfm_horz_16x4_neon .endr b 3f 1: movi v4.8h, #0 movi v5.8h, #0 movi v6.8h, #0 movi v7.8h, #0 2: subs w8, w8, #4 .rept 2 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 .endr b.gt 2b 3: .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 bl X(inv_txfm_dct_8h_x64_neon) add x6, x0, #(\i*2) bl inv_txfm_add_vert_dct_8x64_neon .endr add sp, x5, #16*32*2 ret x15 endfunc rav1e-0.7.1/src/arm/64/loopfilter.S000064400000000000000000001350031046102023000147530ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" // depending on how many pixels need to be stored, returns: // x14 = (1 << 0) : 0 pixels // x14 = (1 << 4) : inner 4 pixels // x14 = (1 << 6) : inner 6 pixels // x14 = 0 : all pixels .macro loop_filter wd function lpf_16_wd\wd\()_neon uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0) uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0) uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0) uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1) .if \wd >= 6 uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1) uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1) .endif .if \wd >= 8 uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2) uabd v7.16b, v27.16b, v26.16b // abs(q3 - q3) .endif .if \wd >= 6 umax v4.16b, v4.16b, v5.16b .endif uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2 .if \wd >= 8 umax v6.16b, v6.16b, v7.16b .endif ushr v3.16b, v3.16b, #1 .if \wd >= 8 umax v4.16b, v4.16b, v6.16b .endif .if \wd >= 6 and v4.16b, v4.16b, v14.16b .endif umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0)) uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 umax v4.16b, v0.16b, v4.16b cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I .else cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E and v1.16b, v1.16b, v2.16b // fm and v1.16b, v1.16b, v13.16b // fm && wd >= 4 .if \wd >= 6 and v14.16b, v14.16b, v1.16b // fm && wd > 4 .endif .if \wd >= 16 and v15.16b, v15.16b, v1.16b // fm && wd == 16 .endif mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 b.ne 9f // if (!fm || wd < 4) return; mov x14, #(1 << 0) ret 9: .if \wd >= 6 movi v10.16b, #1 uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0) uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0) uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0) uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0) .if \wd >= 8 uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0) uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0) .endif umax v2.16b, v2.16b, v3.16b umax v4.16b, v4.16b, v5.16b .if \wd >= 8 umax v6.16b, v6.16b, v7.16b .endif umax v2.16b, v2.16b, v4.16b .if \wd >= 8 umax v2.16b, v2.16b, v6.16b .endif .if \wd == 16 uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0) uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0) uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0) .endif cmhs v2.16b, v10.16b, v2.16b // flat8in .if \wd == 16 uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0) uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0) uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0) .endif and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in .if \wd == 16 umax v3.16b, v3.16b, v4.16b umax v5.16b, v5.16b, v6.16b .endif mov x16, v1.d[0] mov x17, v1.d[1] .if \wd == 16 umax v7.16b, v7.16b, v8.16b umax v3.16b, v3.16b, v5.16b umax v3.16b, v3.16b, v7.16b cmhs v3.16b, v10.16b, v3.16b // flat8out .endif adds x16, x16, x17 .if \wd == 16 and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out .endif b.eq 1f // skip wd == 4 case .endif movi v3.16b, #128 eor v2.16b, v22.16b, v3.16b // p1 - 128 eor v3.16b, v25.16b, v3.16b // q1 - 128 cmhi v0.16b, v0.16b, v12.16b // hev sqsub v2.16b, v2.16b, v3.16b // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) usubl v2.8h, v24.8b, v23.8b movi v5.8h, #3 
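        // Descriptive sketch (not in the original source): in C terms, the wd == 4
        // narrow filter computed around here is roughly the following, gated on the
        // fm mask built above, with p1/q1 only touched when !hev:
        //   f  = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
        //   f1 = imin(f + 4, 127) >> 3;  f2 = imin(f + 3, 127) >> 3;
        //   p0 += f2;  q0 -= f1;
        //   if (!hev) { p1 += (f1 + 1) >> 1;  q1 -= (f1 + 1) >> 1; }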
usubl2 v3.8h, v24.16b, v23.16b mul v2.8h, v2.8h, v5.8h mul v3.8h, v3.8h, v5.8h movi v6.16b, #4 saddw v2.8h, v2.8h, v4.8b saddw2 v3.8h, v3.8h, v4.16b movi v7.16b, #3 sqxtn v2.8b, v2.8h // f sqxtn2 v2.16b, v3.8h sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 127) sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 127) sshr v4.16b, v4.16b, #3 // f1 sshr v5.16b, v5.16b, #3 // f2 mov v2.16b, v23.16b // p0 mov v3.16b, v24.16b // q0 neg v6.16b, v4.16b // -f1 srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1 // p0 + f2, q0 - f1 usqadd v2.16b, v5.16b // out p0 usqadd v3.16b, v6.16b // out q0 neg v6.16b, v4.16b // -((f1 + 1) >> 1) bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) bit v24.16b, v3.16b, v1.16b // if (fm && wd >= 4) mov v2.16b, v22.16b // p1 mov v3.16b, v25.16b // q1 // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1) usqadd v2.16b, v4.16b // out p1 usqadd v3.16b, v6.16b // out q1 bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) bit v25.16b, v3.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 2f // skip if there's no flat8in uaddl v0.8h, v21.8b, v21.8b // p2 * 2 uaddl2 v1.8h, v21.16b, v21.16b uaddl v2.8h, v21.8b, v22.8b // p2 + p1 uaddl2 v3.8h, v21.16b, v22.16b uaddl v4.8h, v22.8b, v23.8b // p1 + p0 uaddl2 v5.8h, v22.16b, v23.16b uaddl v6.8h, v23.8b, v24.8b // p0 + q0 uaddl2 v7.8h, v23.16b, v24.16b add v8.8h, v0.8h, v2.8h add v9.8h, v1.8h, v3.8h add v10.8h, v4.8h, v6.8h add v11.8h, v5.8h, v7.8h uaddl v12.8h, v24.8b, v25.8b // q0 + q1 uaddl2 v13.8h, v24.16b, v25.16b add v8.8h, v8.8h, v10.8h add v9.8h, v9.8h, v11.8h sub v12.8h, v12.8h, v0.8h sub v13.8h, v13.8h, v1.8h uaddl v10.8h, v25.8b, v26.8b // q1 + q2 uaddl2 v11.8h, v25.16b, v26.16b rshrn v0.8b, v8.8h, #3 // out p1 rshrn2 v0.16b, v9.8h, #3 add v8.8h, v8.8h, v12.8h add v9.8h, v9.8h, v13.8h sub v10.8h, v10.8h, v2.8h sub v11.8h, v11.8h, v3.8h uaddl v12.8h, v26.8b, v26.8b // q2 + q2 uaddl2 v13.8h, v26.16b, v26.16b rshrn v1.8b, v8.8h, #3 // out p0 rshrn2 v1.16b, v9.8h, #3 add v8.8h, v8.8h, v10.8h add v9.8h, v9.8h, v11.8h sub v12.8h, v12.8h, v4.8h sub v13.8h, v13.8h, v5.8h rshrn v2.8b, v8.8h, #3 // out q0 rshrn2 v2.16b, v9.8h, #3 bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) add v8.8h, v8.8h, v12.8h add v9.8h, v9.8h, v13.8h bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) rshrn v3.8b, v8.8h, #3 // out q1 rshrn2 v3.16b, v9.8h, #3 bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) .elseif \wd >= 8 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 .if \wd == 8 b.eq 8f // skip if there's no flat8in .else b.eq 2f // skip if there's no flat8in .endif uaddl v0.8h, v20.8b, v21.8b // p3 + p2 uaddl2 v1.8h, v20.16b, v21.16b uaddl v2.8h, v22.8b, v25.8b // p1 + q1 uaddl2 v3.8h, v22.16b, v25.16b uaddl v4.8h, v20.8b, v22.8b // p3 + p1 uaddl2 v5.8h, v20.16b, v22.16b uaddl v6.8h, v23.8b, v26.8b // p0 + q2 uaddl2 v7.8h, v23.16b, v26.16b add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) add v9.8h, v1.8h, v1.8h uaddw v8.8h, v8.8h, v23.8b // + p0 uaddw2 v9.8h, v9.8h, v23.16b uaddw v8.8h, v8.8h, v24.8b // + q0 uaddw2 v9.8h, v9.8h, v24.16b add v8.8h, v8.8h, v4.8h add v9.8h, v9.8h, v5.8h // + p3 + p1 sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 sub v3.8h, v3.8h, v1.8h sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 sub v7.8h, v7.8h, v5.8h rshrn v10.8b, v8.8h, #3 // out p2 rshrn2 v10.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h, v9.8h, v3.8h uaddl v0.8h, v20.8b, v23.8b // p3 + p0 uaddl2 v1.8h, v20.16b, v23.16b uaddl v2.8h, v24.8b, v27.8b // q0 + q3 uaddl2 v3.8h, 
v24.16b, v27.16b rshrn v11.8b, v8.8h, #3 // out p1 rshrn2 v11.16b, v9.8h, #3 add v8.8h, v8.8h, v6.8h add v9.8h, v9.8h, v7.8h sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 sub v3.8h, v3.8h, v1.8h uaddl v4.8h, v21.8b, v24.8b // p2 + q0 uaddl2 v5.8h, v21.16b, v24.16b uaddl v6.8h, v25.8b, v27.8b // q1 + q3 uaddl2 v7.8h, v25.16b, v27.16b rshrn v12.8b, v8.8h, #3 // out p0 rshrn2 v12.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h, v9.8h, v3.8h sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 sub v7.8h, v7.8h, v5.8h uaddl v0.8h, v22.8b, v25.8b // p1 + q1 uaddl2 v1.8h, v22.16b, v25.16b uaddl v2.8h, v26.8b, v27.8b // q2 + q3 uaddl2 v3.8h, v26.16b, v27.16b rshrn v13.8b, v8.8h, #3 // out q0 rshrn2 v13.16b, v9.8h, #3 add v8.8h, v8.8h, v6.8h add v9.8h, v9.8h, v7.8h sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 sub v3.8h, v3.8h, v1.8h rshrn v0.8b, v8.8h, #3 // out q1 rshrn2 v0.16b, v9.8h, #3 add v8.8h, v8.8h, v2.8h add v9.8h , v9.8h, v3.8h bit v21.16b, v10.16b, v14.16b bit v22.16b, v11.16b, v14.16b bit v23.16b, v12.16b, v14.16b rshrn v1.8b, v8.8h, #3 // out q2 rshrn2 v1.16b, v9.8h, #3 bit v24.16b, v13.16b, v14.16b bit v25.16b, v0.16b, v14.16b bit v26.16b, v1.16b, v14.16b .endif 2: .if \wd == 16 mov x16, v15.d[0] mov x17, v15.d[1] adds x16, x16, x17 b.ne 1f // check if flat8out is needed mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: uaddl v2.8h, v17.8b, v17.8b // p6 + p6 uaddl2 v3.8h, v17.16b, v17.16b uaddl v4.8h, v17.8b, v18.8b // p6 + p5 uaddl2 v5.8h, v17.16b, v18.16b uaddl v6.8h, v17.8b, v19.8b // p6 + p4 uaddl2 v7.8h, v17.16b, v19.16b uaddl v8.8h, v17.8b, v20.8b // p6 + p3 uaddl2 v9.8h, v17.16b, v20.16b add v12.8h, v2.8h, v4.8h add v13.8h, v3.8h, v5.8h add v10.8h, v6.8h, v8.8h add v11.8h, v7.8h, v9.8h uaddl v6.8h, v17.8b, v21.8b // p6 + p2 uaddl2 v7.8h, v17.16b, v21.16b add v12.8h, v12.8h, v10.8h add v13.8h, v13.8h, v11.8h uaddl v8.8h, v17.8b, v22.8b // p6 + p1 uaddl2 v9.8h, v17.16b, v22.16b uaddl v10.8h, v18.8b, v23.8b // p5 + p0 uaddl2 v11.8h, v18.16b, v23.16b add v6.8h, v6.8h, v8.8h add v7.8h, v7.8h, v9.8h uaddl v8.8h, v19.8b, v24.8b // p4 + q0 uaddl2 v9.8h, v19.16b, v24.16b add v12.8h, v12.8h, v6.8h add v13.8h, v13.8h, v7.8h add v10.8h, v10.8h, v8.8h add v11.8h, v11.8h, v9.8h uaddl v6.8h, v20.8b, v25.8b // p3 + q1 uaddl2 v7.8h, v20.16b, v25.16b add v12.8h, v12.8h, v10.8h add v13.8h, v13.8h, v11.8h sub v6.8h, v6.8h, v2.8h sub v7.8h, v7.8h, v3.8h uaddl v2.8h, v21.8b, v26.8b // p2 + q2 uaddl2 v3.8h, v21.16b, v26.16b rshrn v0.8b, v12.8h, #4 // out p5 rshrn2 v0.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) add v13.8h, v13.8h, v7.8h sub v2.8h, v2.8h, v4.8h sub v3.8h, v3.8h, v5.8h uaddl v4.8h, v22.8b, v27.8b // p1 + q3 uaddl2 v5.8h, v22.16b, v27.16b uaddl v6.8h, v17.8b, v19.8b // p6 + p4 uaddl2 v7.8h, v17.16b, v19.16b rshrn v1.8b, v12.8h, #4 // out p4 rshrn2 v1.16b, v13.8h, #4 add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) add v13.8h, v13.8h, v3.8h sub v4.8h, v4.8h, v6.8h sub v5.8h, v5.8h, v7.8h uaddl v6.8h, v23.8b, v28.8b // p0 + q4 uaddl2 v7.8h, v23.16b, v28.16b uaddl v8.8h, v17.8b, v20.8b // p6 + p3 uaddl2 v9.8h, v17.16b, v20.16b rshrn v2.8b, v12.8h, #4 // out p3 rshrn2 v2.16b, v13.8h, #4 add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) add v13.8h, v13.8h, v5.8h sub v6.8h, v6.8h, v8.8h sub v7.8h, v7.8h, v9.8h uaddl v8.8h, v24.8b, v29.8b // q0 + q5 uaddl2 v9.8h, v24.16b, v29.16b uaddl v4.8h, v17.8b, v21.8b // p6 + p2 uaddl2 v5.8h, v17.16b, 
v21.16b rshrn v3.8b, v12.8h, #4 // out p2 rshrn2 v3.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) add v13.8h, v13.8h, v7.8h sub v8.8h, v8.8h, v4.8h sub v9.8h, v9.8h, v5.8h uaddl v6.8h, v25.8b, v30.8b // q1 + q6 uaddl2 v7.8h, v25.16b, v30.16b uaddl v10.8h, v17.8b, v22.8b // p6 + p1 uaddl2 v11.8h, v17.16b, v22.16b rshrn v4.8b, v12.8h, #4 // out p1 rshrn2 v4.16b, v13.8h, #4 add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) add v13.8h, v13.8h, v9.8h sub v6.8h, v6.8h, v10.8h sub v7.8h, v7.8h, v11.8h uaddl v8.8h, v26.8b, v30.8b // q2 + q6 uaddl2 v9.8h, v26.16b, v30.16b bif v0.16b, v18.16b, v15.16b // out p5 uaddl v10.8h, v18.8b, v23.8b // p5 + p0 uaddl2 v11.8h, v18.16b, v23.16b rshrn v5.8b, v12.8h, #4 // out p0 rshrn2 v5.16b, v13.8h, #4 add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) add v13.8h, v13.8h, v7.8h sub v8.8h, v8.8h, v10.8h sub v9.8h, v9.8h, v11.8h uaddl v10.8h, v27.8b, v30.8b // q3 + q6 uaddl2 v11.8h, v27.16b, v30.16b bif v1.16b, v19.16b, v15.16b // out p4 uaddl v18.8h, v19.8b, v24.8b // p4 + q0 uaddl2 v19.8h, v19.16b, v24.16b rshrn v6.8b, v12.8h, #4 // out q0 rshrn2 v6.16b, v13.8h, #4 add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) add v13.8h, v13.8h, v9.8h sub v10.8h, v10.8h, v18.8h sub v11.8h, v11.8h, v19.8h uaddl v8.8h, v28.8b, v30.8b // q4 + q6 uaddl2 v9.8h, v28.16b, v30.16b bif v2.16b, v20.16b, v15.16b // out p3 uaddl v18.8h, v20.8b, v25.8b // p3 + q1 uaddl2 v19.8h, v20.16b, v25.16b rshrn v7.8b, v12.8h, #4 // out q1 rshrn2 v7.16b, v13.8h, #4 add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) add v13.8h, v13.8h, v11.8h sub v18.8h, v8.8h, v18.8h sub v19.8h, v9.8h, v19.8h uaddl v10.8h, v29.8b, v30.8b // q5 + q6 uaddl2 v11.8h, v29.16b, v30.16b bif v3.16b, v21.16b, v15.16b // out p2 uaddl v20.8h, v21.8b, v26.8b // p2 + q2 uaddl2 v21.8h, v21.16b, v26.16b rshrn v8.8b, v12.8h, #4 // out q2 rshrn2 v8.16b, v13.8h, #4 add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) add v13.8h, v13.8h, v19.8h sub v10.8h, v10.8h, v20.8h sub v11.8h, v11.8h, v21.8h uaddl v18.8h, v30.8b, v30.8b // q6 + q6 uaddl2 v19.8h, v30.16b, v30.16b bif v4.16b, v22.16b, v15.16b // out p1 uaddl v20.8h, v22.8b, v27.8b // p1 + q3 uaddl2 v21.8h, v22.16b, v27.16b rshrn v9.8b, v12.8h, #4 // out q3 rshrn2 v9.16b, v13.8h, #4 add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) add v13.8h, v13.8h, v11.8h sub v18.8h, v18.8h, v20.8h sub v19.8h, v19.8h, v21.8h bif v5.16b, v23.16b, v15.16b // out p0 rshrn v10.8b, v12.8h, #4 // out q4 rshrn2 v10.16b, v13.8h, #4 add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) add v13.8h, v13.8h, v19.8h rshrn v11.8b, v12.8h, #4 // out q5 rshrn2 v11.16b, v13.8h, #4 bif v6.16b, v24.16b, v15.16b // out q0 bif v7.16b, v25.16b, v15.16b // out q1 bif v8.16b, v26.16b, v15.16b // out q2 bif v9.16b, v27.16b, v15.16b // out q3 bif v10.16b, v28.16b, v15.16b // out q4 bif v11.16b, v29.16b, v15.16b // out q5 .endif mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels mov x14, #(1 << 6) ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels mov x14, #(1 << 4) ret .endif endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_16_wd16 bl lpf_16_wd16_neon cbz x14, 1f tbnz x14, #6, 7f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_16_wd8 bl lpf_16_wd8_neon cbz x14, 1f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_16_wd6 bl lpf_16_wd6_neon cbz x14, 1f ret x15 1: .endm .macro lpf_16_wd4 bl lpf_16_wd4_neon cbz x14, 1f ret x15 1: .endm function lpf_v_4_16_neon mov x15, x30 
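        // Descriptive note (not in the original source): this filters a 16 pixel wide
        // strip across a row boundary; p1 and p0 are the two rows above x0, q0 and q1
        // the two rows from x0 downwards. They are run through the wd=4 filter and the
        // (possibly) modified rows are stored back.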
sub x16, x0, x1, lsl #1 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 lpf_16_wd4 sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_4_16_neon mov x15, x30 sub x16, x0, #2 add x0, x16, x1, lsl #3 ld1 {v22.s}[0], [x16], x1 ld1 {v22.s}[2], [x0], x1 ld1 {v23.s}[0], [x16], x1 ld1 {v23.s}[2], [x0], x1 ld1 {v24.s}[0], [x16], x1 ld1 {v24.s}[2], [x0], x1 ld1 {v25.s}[0], [x16], x1 ld1 {v25.s}[2], [x0], x1 ld1 {v22.s}[1], [x16], x1 ld1 {v22.s}[3], [x0], x1 ld1 {v23.s}[1], [x16], x1 ld1 {v23.s}[3], [x0], x1 ld1 {v24.s}[1], [x16], x1 ld1 {v24.s}[3], [x0], x1 ld1 {v25.s}[1], [x16], x1 ld1 {v25.s}[3], [x0], x1 add x0, x0, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd4 sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_6_16_neon mov x15, x30 sub x16, x0, x1, lsl #1 sub x16, x16, x1 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 lpf_16_wd6 sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_6_16_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #3 ld1 {v20.d}[0], [x16], x1 ld1 {v20.d}[1], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v21.d}[1], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v26.d}[0], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v27.d}[0], [x16], x1 ld1 {v27.d}[1], [x0], x1 add x0, x0, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd6 sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_8_16_neon mov x15, x30 sub x16, x0, x1, lsl #2 ld1 {v20.16b}, [x16], x1 // p3 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v26.16b}, [x0], x1 // q2 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v27.16b}, [x0], x1 // q3 sub x0, x0, x1, lsl #2 lpf_16_wd8 sub x16, x0, x1, lsl #1 sub x16, x16, x1 st1 {v21.16b}, [x16], x1 // p2 st1 {v24.16b}, 
[x0], x1 // q0 st1 {v22.16b}, [x16], x1 // p1 st1 {v25.16b}, [x0], x1 // q1 st1 {v23.16b}, [x16], x1 // p0 st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_8_16_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #3 ld1 {v20.d}[0], [x16], x1 ld1 {v20.d}[1], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v21.d}[1], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v26.d}[0], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v27.d}[0], [x16], x1 ld1 {v27.d}[1], [x0], x1 add x0, x0, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_16_wd8 sub x16, x0, x1, lsl #4 sub x16, x16, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v20.d}[0], [x16], x1 st1 {v20.d}[1], [x0], x1 st1 {v21.d}[0], [x16], x1 st1 {v21.d}[1], [x0], x1 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 st1 {v26.d}[0], [x16], x1 st1 {v26.d}[1], [x0], x1 st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc function lpf_v_16_16_neon mov x15, x30 sub x16, x0, x1, lsl #3 add x16, x16, x1 ld1 {v17.16b}, [x16], x1 // p6 ld1 {v24.16b}, [x0], x1 // q0 ld1 {v18.16b}, [x16], x1 // p5 ld1 {v25.16b}, [x0], x1 // q1 ld1 {v19.16b}, [x16], x1 // p4 ld1 {v26.16b}, [x0], x1 // q2 ld1 {v20.16b}, [x16], x1 // p3 ld1 {v27.16b}, [x0], x1 // q3 ld1 {v21.16b}, [x16], x1 // p2 ld1 {v28.16b}, [x0], x1 // q4 ld1 {v22.16b}, [x16], x1 // p1 ld1 {v29.16b}, [x0], x1 // q5 ld1 {v23.16b}, [x16], x1 // p0 ld1 {v30.16b}, [x0], x1 // q6 sub x0, x0, x1, lsl #3 add x0, x0, x1 lpf_16_wd16 sub x16, x0, x1, lsl #2 sub x16, x16, x1, lsl #1 st1 {v0.16b}, [x16], x1 // p5 st1 {v6.16b}, [x0], x1 // q0 st1 {v1.16b}, [x16], x1 // p4 st1 {v7.16b}, [x0], x1 // q1 st1 {v2.16b}, [x16], x1 // p3 st1 {v8.16b}, [x0], x1 // q2 st1 {v3.16b}, [x16], x1 // p2 st1 {v9.16b}, [x0], x1 // q3 st1 {v4.16b}, [x16], x1 // p1 st1 {v10.16b}, [x0], x1 // q4 st1 {v5.16b}, [x16], x1 // p0 st1 {v11.16b}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 st1 {v21.16b}, [x16], x1 // p2 st1 {v24.16b}, [x0], x1 // q0 st1 {v22.16b}, [x16], x1 // p1 st1 {v25.16b}, [x0], x1 // q1 st1 {v23.16b}, [x16], x1 // p0 st1 {v26.16b}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.16b}, [x16], x1 // p1 st1 {v24.16b}, [x0], x1 // q0 st1 {v23.16b}, [x16], x1 // p0 st1 {v25.16b}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc 
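// Illustrative only (not part of dav1d): the lpf_{v,h}_*_16_neon wrappers choose
// their store epilogue from the x14 value documented before the loop_filter
// macro. A rough C-style sketch of that dispatch, with hypothetical helper names:
//
//   int r = filter_core();              // x14 from lpf_16_wd*_neon
//   if (r == 0)             store_all_pixels();
//   else if (r & (1 << 6))  store_inner_6_pixels();   // label 7:
//   else if (r & (1 << 4))  store_inner_4_pixels();   // label 8:
//   else                    store_nothing();          // (1 << 0)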
function lpf_h_16_16_neon mov x15, x30 sub x16, x0, #8 ld1 {v16.d}[0], [x16], x1 ld1 {v24.d}[0], [x0], x1 ld1 {v17.d}[0], [x16], x1 ld1 {v25.d}[0], [x0], x1 ld1 {v18.d}[0], [x16], x1 ld1 {v26.d}[0], [x0], x1 ld1 {v19.d}[0], [x16], x1 ld1 {v27.d}[0], [x0], x1 ld1 {v20.d}[0], [x16], x1 ld1 {v28.d}[0], [x0], x1 ld1 {v21.d}[0], [x16], x1 ld1 {v29.d}[0], [x0], x1 ld1 {v22.d}[0], [x16], x1 ld1 {v30.d}[0], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v31.d}[0], [x0], x1 ld1 {v16.d}[1], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v17.d}[1], [x16], x1 ld1 {v25.d}[1], [x0], x1 ld1 {v18.d}[1], [x16], x1 ld1 {v26.d}[1], [x0], x1 ld1 {v19.d}[1], [x16], x1 ld1 {v27.d}[1], [x0], x1 ld1 {v20.d}[1], [x16], x1 ld1 {v28.d}[1], [x0], x1 ld1 {v21.d}[1], [x16], x1 ld1 {v29.d}[1], [x0], x1 ld1 {v22.d}[1], [x16], x1 ld1 {v30.d}[1], [x0], x1 ld1 {v23.d}[1], [x16], x1 ld1 {v31.d}[1], [x0], x1 transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 lpf_16_wd16 sub x0, x0, x1, lsl #4 sub x16, x0, #8 transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 st1 {v16.d}[0], [x16], x1 st1 {v6.d}[0], [x0], x1 st1 {v17.d}[0], [x16], x1 st1 {v7.d}[0], [x0], x1 st1 {v0.d}[0], [x16], x1 st1 {v8.d}[0], [x0], x1 st1 {v1.d}[0], [x16], x1 st1 {v9.d}[0], [x0], x1 st1 {v2.d}[0], [x16], x1 st1 {v10.d}[0], [x0], x1 st1 {v3.d}[0], [x16], x1 st1 {v11.d}[0], [x0], x1 st1 {v4.d}[0], [x16], x1 st1 {v30.d}[0], [x0], x1 st1 {v5.d}[0], [x16], x1 st1 {v31.d}[0], [x0], x1 st1 {v16.d}[1], [x16], x1 st1 {v6.d}[1], [x0], x1 st1 {v17.d}[1], [x16], x1 st1 {v7.d}[1], [x0], x1 st1 {v0.d}[1], [x16], x1 st1 {v8.d}[1], [x0], x1 st1 {v1.d}[1], [x16], x1 st1 {v9.d}[1], [x0], x1 st1 {v2.d}[1], [x16], x1 st1 {v10.d}[1], [x0], x1 st1 {v3.d}[1], [x16], x1 st1 {v11.d}[1], [x0], x1 st1 {v4.d}[1], [x16], x1 st1 {v30.d}[1], [x0], x1 st1 {v5.d}[1], [x16], x1 st1 {v31.d}[1], [x0], x1 ret x15 7: sub x16, x0, x1, lsl #4 sub x16, x16, #4 transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v20.d}[0], [x16], x1 st1 {v20.d}[1], [x0], x1 st1 {v21.d}[0], [x16], x1 st1 {v21.d}[1], [x0], x1 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 st1 {v26.d}[0], [x16], x1 st1 {v26.d}[1], [x0], x1 st1 {v27.d}[0], [x16], x1 st1 {v27.d}[1], [x0], x1 add x0, x0, #4 ret x15 8: sub x16, x0, x1, lsl #4 sub x16, x16, #2 transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #3 st1 {v22.s}[0], [x16], x1 st1 {v22.s}[2], [x0], x1 st1 {v23.s}[0], [x16], x1 st1 {v23.s}[2], [x0], x1 st1 {v24.s}[0], [x16], x1 st1 {v24.s}[2], [x0], x1 st1 {v25.s}[0], [x16], x1 st1 {v25.s}[2], [x0], x1 st1 {v22.s}[1], [x16], x1 st1 {v22.s}[3], [x0], x1 st1 {v23.s}[1], [x16], x1 st1 {v23.s}[3], [x0], x1 st1 {v24.s}[1], [x16], x1 st1 {v24.s}[3], [x0], x1 st1 {v25.s}[1], [x16], x1 st1 {v25.s}[3], [x0], x1 add x0, x0, #2 ret x15 endfunc // void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av1FilterLUT *lut, const int w) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1 mov x11, x30 stp d8, d9, [sp, #-0x40]! 
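        // Descriptive note (not in the original source): the callee-saved SIMD
        // registers d8-d15 are saved here and restored before returning. The loop
        // below walks the edge masks: for each group of edges flagged in vmask[],
        // the filter level L is read from the l[] array, then H = L >> 4,
        // I = imax(imin(L >> sharp[0], sharp[1]), 1) and E = 2*(L + 2) + I are
        // derived from the sharpness values in the lut, and the widest applicable
        // filter width (16, 8, 6 or 4) is invoked.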
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] ldp w6, w7, [x2] // vmask[0], vmask[1] .ifc \type, y ldr w2, [x2, #8] // vmask[2] .endif add x5, x5, #128 // Move to sharp part of lut .ifc \type, y orr w7, w7, w2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub x4, x3, x4, lsl #2 .else sub x3, x3, #4 lsl x4, x4, #2 .endif orr w6, w6, w7 // vmask[0] |= vmask[1] 1: tst w6, #0x0f .ifc \dir, v ld1 {v0.16b}, [x4], #16 ld1 {v1.16b}, [x3], #16 .else ld2 {v0.s,v1.s}[0], [x3], x4 ld2 {v0.s,v1.s}[1], [x3], x4 ld2 {v0.s,v1.s}[2], [x3], x4 ld2 {v0.s,v1.s}[3], [x3], x4 .endif b.eq 7f // if (!(vm & bits)) continue; ld1r {v5.16b}, [x5] // sharp[0] add x5, x5, #8 movi v2.4s, #0xff dup v13.4s, w6 // vmask[0] and v0.16b, v0.16b, v2.16b // Keep only lowest byte in each 32 bit word and v1.16b, v1.16b, v2.16b cmtst v3.16b, v1.16b, v2.16b // Check for nonzero values in l[0][0] movi v4.16b, #1 ld1r {v6.16b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.16b, v0.16b, v3.16b // if (!l[0][0]) L = l[offset][0] cmtst v2.4s, v1.4s, v2.4s // L != 0 mul v1.4s, v1.4s, v4.4s // L .ifc \type, y dup v15.4s, w2 // vmask[2] .endif dup v14.4s, w7 // vmask[1] mov x16, v2.d[0] mov x17, v2.d[1] adds x16, x16, x17 b.eq 7f // if (!L) continue; neg v5.16b, v5.16b // -sharp[0] movrel x16, word_1248 ushr v12.16b, v1.16b, #4 // H ld1 {v16.4s}, [x16] sshl v3.16b, v1.16b, v5.16b // L >> sharp[0] .ifc \type, y cmtst v15.4s, v15.4s, v16.4s // if (vmask[2] & bits) .endif movi v7.16b, #2 umin v3.16b, v3.16b, v6.16b // imin(L >> sharp[0], sharp[1]) add v0.16b, v1.16b, v7.16b // L + 2 umax v11.16b, v3.16b, v4.16b // imax(imin(), 1) = limit = I add v0.16b, v0.16b, v0.16b // 2*(L + 2) cmtst v14.4s, v14.4s, v16.4s // if (vmask[1] & bits) add v10.16b, v0.16b, v11.16b // 2*(L + 2) + limit = E cmtst v13.4s, v13.4s, v16.4s // if (vmask[0] & bits) and v13.16b, v13.16b, v2.16b // vmask[0] &= L != 0 .ifc \type, y tst w2, #0x0f b.eq 2f // wd16 bl lpf_\dir\()_16_16_neon b 8f 2: .endif tst w7, #0x0f b.eq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_16_neon .else // wd6 bl lpf_\dir\()_6_16_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_16_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment x0. // If the whole function is skipped, increment it here instead. add x0, x0, x1, lsl #4 .else 7: .endif 8: lsr w6, w6, #4 // vmask[0] >>= 4 lsr w7, w7, #4 // vmask[1] >>= 4 .ifc \type, y lsr w2, w2, #4 // vmask[2] >>= 4 .endif .ifc \dir, v add x0, x0, #16 .else // For dir h, x0 is returned incremented .endif cbnz w6, 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x11 endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_1248 .word 1, 2, 4, 8 endconst rav1e-0.7.1/src/arm/64/loopfilter16.S000064400000000000000000001122421046102023000151220ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" // depending on how many pixels need to be stored, returns: // x14 = (1 << 0) : 0 pixels // x14 = (1 << 4) : inner 4 pixels // x14 = (1 << 6) : inner 6 pixels // x14 = 0 : all pixels .macro loop_filter wd function lpf_8_wd\wd\()_neon uabd v0.8h, v22.8h, v23.8h // abs(p1 - p0) uabd v1.8h, v25.8h, v24.8h // abs(q1 - q0) uabd v2.8h, v23.8h, v24.8h // abs(p0 - q0) uabd v3.8h, v22.8h, v25.8h // abs(p1 - q1) .if \wd >= 6 uabd v4.8h, v21.8h, v22.8h // abs(p2 - p1) uabd v5.8h, v26.8h, v25.8h // abs(q2 - q1) .endif .if \wd >= 8 uabd v6.8h, v20.8h, v21.8h // abs(p3 - p2) uabd v7.8h, v27.8h, v26.8h // abs(q3 - q3) .endif .if \wd >= 6 umax v4.8h, v4.8h, v5.8h .endif uqadd v2.8h, v2.8h, v2.8h // abs(p0 - q0) * 2 .if \wd >= 8 umax v6.8h, v6.8h, v7.8h .endif ushr v3.8h, v3.8h, #1 .if \wd >= 8 umax v4.8h, v4.8h, v6.8h .endif .if \wd >= 6 and v4.16b, v4.16b, v14.16b .endif umax v0.8h, v0.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0)) uqadd v2.8h, v2.8h, v3.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 .if \wd >= 6 umax v4.8h, v0.8h, v4.8h cmhs v1.8h, v11.8h, v4.8h // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) 
<= I .else cmhs v1.8h, v11.8h, v0.8h // max(abs(p1 - p0), abs(q1 - q0)) <= I .endif cmhs v2.8h, v10.8h, v2.8h // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E and v1.16b, v1.16b, v2.16b // fm and v1.16b, v1.16b, v13.16b // fm && wd >= 4 .if \wd >= 6 and v14.16b, v14.16b, v1.16b // fm && wd > 4 .endif .if \wd >= 16 and v15.16b, v15.16b, v1.16b // fm && wd == 16 .endif mov x16, v1.d[0] mov x17, v1.d[1] adds x16, x16, x17 b.ne 9f // if (!fm || wd < 4) return; mov x14, #(1 << 0) ret 9: .if \wd >= 6 movi v10.8h, #1 uabd v2.8h, v21.8h, v23.8h // abs(p2 - p0) uabd v3.8h, v22.8h, v23.8h // abs(p1 - p0) uabd v4.8h, v25.8h, v24.8h // abs(q1 - q0) uabd v5.8h, v26.8h, v24.8h // abs(q2 - q0) dup v9.8h, w9 // bitdepth_min_8 .if \wd >= 8 uabd v6.8h, v20.8h, v23.8h // abs(p3 - p0) uabd v7.8h, v27.8h, v24.8h // abs(q3 - q0) .endif umax v2.8h, v2.8h, v3.8h umax v4.8h, v4.8h, v5.8h .if \wd >= 8 umax v6.8h, v6.8h, v7.8h .endif umax v2.8h, v2.8h, v4.8h ushl v10.8h, v10.8h, v9.8h // F = 1 << bitdepth_min_8 .if \wd >= 8 umax v2.8h, v2.8h, v6.8h .endif .if \wd == 16 uabd v3.8h, v17.8h, v23.8h // abs(p6 - p0) uabd v4.8h, v18.8h, v23.8h // abs(p5 - p0) uabd v5.8h, v19.8h, v23.8h // abs(p4 - p0) .endif cmhs v2.8h, v10.8h, v2.8h // flat8in .if \wd == 16 uabd v6.8h, v28.8h, v24.8h // abs(q4 - q0) uabd v7.8h, v29.8h, v24.8h // abs(q5 - q0) uabd v8.8h, v30.8h, v24.8h // abs(q6 - q0) .endif and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4 bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in .if \wd == 16 umax v3.8h, v3.8h, v4.8h umax v5.8h, v5.8h, v6.8h .endif mov x16, v1.d[0] mov x17, v1.d[1] .if \wd == 16 umax v7.8h, v7.8h, v8.8h umax v3.8h, v3.8h, v5.8h umax v3.8h, v3.8h, v7.8h cmhs v3.8h, v10.8h, v3.8h // flat8out .endif adds x16, x16, x17 .if \wd == 16 and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16 and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16 bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out .endif b.eq 1f // skip wd == 4 case .endif dup v3.8h, w8 // bitdepth_max sub v2.8h, v22.8h, v25.8h // p1 - q1 ushr v3.8h, v3.8h, #1 // 128 << bitdepth_min_8 - 1 cmhi v0.8h, v0.8h, v12.8h // hev not v9.16b, v3.16b // - 128 * (1 << bitdepth_min_8) smin v2.8h, v2.8h, v3.8h // iclip_diff(p1 - q1) smax v2.8h, v2.8h, v9.8h // iclip_diff(p1 - q1) and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1) sub v2.8h, v24.8h, v23.8h movi v5.8h, #3 bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev) mul v2.8h, v2.8h, v5.8h movi v6.8h, #4 add v2.8h, v2.8h, v4.8h smin v2.8h, v2.8h, v3.8h // f = iclip_diff() smax v2.8h, v2.8h, v9.8h // f = iclip_diff() sqadd v4.8h, v6.8h, v2.8h // f + 4 sqadd v5.8h, v5.8h, v2.8h // f + 3 smin v4.8h, v4.8h, v3.8h // imin(f + 4, 128 << bitdepth_min_8 - 1) smin v5.8h, v5.8h, v3.8h // imin(f + 3, 128 << bitdepth_min_8 - 1) sshr v4.8h, v4.8h, #3 // f1 sshr v5.8h, v5.8h, #3 // f2 movi v9.8h, #0 dup v3.8h, w8 // bitdepth_max sqadd v2.8h, v23.8h, v5.8h // p0 + f2 sqsub v6.8h, v24.8h, v4.8h // q0 - f1 srshr v4.8h, v4.8h, #1 // (f1 + 1) >> 1 smin v2.8h, v2.8h, v3.8h // out p0 = iclip_pixel() smin v6.8h, v6.8h, v3.8h // out q0 = iclip_pixel() smax v2.8h, v2.8h, v9.8h // out p0 = iclip_pixel() smax v6.8h, v6.8h, v9.8h // out q0 = iclip_pixel() bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4) bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4) sqadd v2.8h, v22.8h, v4.8h // p1 + f sqsub v6.8h, v25.8h, v4.8h // q1 - f smin v2.8h, v2.8h, v3.8h // out p1 = iclip_pixel() smin v6.8h, v6.8h, v3.8h // out q1 = iclip_pixel() smax v2.8h, v2.8h, v9.8h // out p1 = 
iclip_pixel() smax v6.8h, v6.8h, v9.8h // out q1 = iclip_pixel() bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev) bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev) 1: .if \wd == 6 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 2f // skip if there's no flat8in add v0.8h, v21.8h, v21.8h // p2 * 2 add v2.8h, v21.8h, v22.8h // p2 + p1 add v4.8h, v22.8h, v23.8h // p1 + p0 add v6.8h, v23.8h, v24.8h // p0 + q0 add v8.8h, v0.8h, v2.8h add v10.8h, v4.8h, v6.8h add v12.8h, v24.8h, v25.8h // q0 + q1 add v8.8h, v8.8h, v10.8h sub v12.8h, v12.8h, v0.8h add v10.8h, v25.8h, v26.8h // q1 + q2 urshr v0.8h, v8.8h, #3 // out p1 add v8.8h, v8.8h, v12.8h sub v10.8h, v10.8h, v2.8h add v12.8h, v26.8h, v26.8h // q2 + q2 urshr v1.8h, v8.8h, #3 // out p0 add v8.8h, v8.8h, v10.8h sub v12.8h, v12.8h, v4.8h urshr v2.8h, v8.8h, #3 // out q0 bit v22.16b, v0.16b, v14.16b // p1 if (flat8in) add v8.8h, v8.8h, v12.8h bit v23.16b, v1.16b, v14.16b // p0 if (flat8in) urshr v3.8h, v8.8h, #3 // out q1 bit v24.16b, v2.16b, v14.16b // q0 if (flat8in) bit v25.16b, v3.16b, v14.16b // q1 if (flat8in) .elseif \wd >= 8 mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 .if \wd == 8 b.eq 8f // skip if there's no flat8in .else b.eq 2f // skip if there's no flat8in .endif add v0.8h, v20.8h, v21.8h // p3 + p2 add v2.8h, v22.8h, v25.8h // p1 + q1 add v4.8h, v20.8h, v22.8h // p3 + p1 add v6.8h, v23.8h, v26.8h // p0 + q2 add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2) add v9.8h, v23.8h, v24.8h // p0 + q0 add v8.8h, v8.8h, v4.8h // + p3 + p1 sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2 add v8.8h, v8.8h, v9.8h // + p0 + q0 sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1 urshr v10.8h, v8.8h, #3 // out p2 add v8.8h, v8.8h, v2.8h add v0.8h, v20.8h, v23.8h // p3 + p0 add v2.8h, v24.8h, v27.8h // q0 + q3 urshr v11.8h, v8.8h, #3 // out p1 add v8.8h, v8.8h, v6.8h sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0 add v4.8h, v21.8h, v24.8h // p2 + q0 add v6.8h, v25.8h, v27.8h // q1 + q3 urshr v12.8h, v8.8h, #3 // out p0 add v8.8h, v8.8h, v2.8h sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0 add v0.8h, v22.8h, v25.8h // p1 + q1 add v2.8h, v26.8h, v27.8h // q2 + q3 urshr v13.8h, v8.8h, #3 // out q0 add v8.8h, v8.8h, v6.8h sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1 urshr v0.8h, v8.8h, #3 // out q1 add v8.8h, v8.8h, v2.8h bit v21.16b, v10.16b, v14.16b bit v22.16b, v11.16b, v14.16b bit v23.16b, v12.16b, v14.16b urshr v1.8h, v8.8h, #3 // out q2 bit v24.16b, v13.16b, v14.16b bit v25.16b, v0.16b, v14.16b bit v26.16b, v1.16b, v14.16b .endif 2: .if \wd == 16 mov x16, v15.d[0] mov x17, v15.d[1] adds x16, x16, x17 b.ne 1f // check if flat8out is needed mov x16, v14.d[0] mov x17, v14.d[1] adds x16, x16, x17 b.eq 8f // if there was no flat8in, just write the inner 4 pixels b 7f // if flat8in was used, write the inner 6 pixels 1: add v2.8h, v17.8h, v17.8h // p6 + p6 add v4.8h, v17.8h, v18.8h // p6 + p5 add v6.8h, v17.8h, v19.8h // p6 + p4 add v8.8h, v17.8h, v20.8h // p6 + p3 add v12.8h, v2.8h, v4.8h add v10.8h, v6.8h, v8.8h add v6.8h, v17.8h, v21.8h // p6 + p2 add v12.8h, v12.8h, v10.8h add v8.8h, v17.8h, v22.8h // p6 + p1 add v10.8h, v18.8h, v23.8h // p5 + p0 add v6.8h, v6.8h, v8.8h add v8.8h, v19.8h, v24.8h // p4 + q0 add v12.8h, v12.8h, v6.8h add v10.8h, v10.8h, v8.8h add v6.8h, v20.8h, v25.8h // p3 + q1 add v12.8h, v12.8h, v10.8h sub v6.8h, v6.8h, v2.8h add v2.8h, v21.8h, v26.8h // p2 + q2 urshr v0.8h, v12.8h, #4 // out p5 add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1) sub v2.8h, v2.8h, v4.8h add v4.8h, v22.8h, v27.8h // p1 + q3 add v6.8h, 
v17.8h, v19.8h // p6 + p4 urshr v1.8h, v12.8h, #4 // out p4 add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2) sub v4.8h, v4.8h, v6.8h add v6.8h, v23.8h, v28.8h // p0 + q4 add v8.8h, v17.8h, v20.8h // p6 + p3 urshr v2.8h, v12.8h, #4 // out p3 add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3) sub v6.8h, v6.8h, v8.8h add v8.8h, v24.8h, v29.8h // q0 + q5 add v4.8h, v17.8h, v21.8h // p6 + p2 urshr v3.8h, v12.8h, #4 // out p2 add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4) sub v8.8h, v8.8h, v4.8h add v6.8h, v25.8h, v30.8h // q1 + q6 add v10.8h, v17.8h, v22.8h // p6 + p1 urshr v4.8h, v12.8h, #4 // out p1 add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5) sub v6.8h, v6.8h, v10.8h add v8.8h, v26.8h, v30.8h // q2 + q6 bif v0.16b, v18.16b, v15.16b // out p5 add v10.8h, v18.8h, v23.8h // p5 + p0 urshr v5.8h, v12.8h, #4 // out p0 add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6) sub v8.8h, v8.8h, v10.8h add v10.8h, v27.8h, v30.8h // q3 + q6 bif v1.16b, v19.16b, v15.16b // out p4 add v18.8h, v19.8h, v24.8h // p4 + q0 urshr v6.8h, v12.8h, #4 // out q0 add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6) sub v10.8h, v10.8h, v18.8h add v8.8h, v28.8h, v30.8h // q4 + q6 bif v2.16b, v20.16b, v15.16b // out p3 add v18.8h, v20.8h, v25.8h // p3 + q1 urshr v7.8h, v12.8h, #4 // out q1 add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6) sub v18.8h, v8.8h, v18.8h add v10.8h, v29.8h, v30.8h // q5 + q6 bif v3.16b, v21.16b, v15.16b // out p2 add v20.8h, v21.8h, v26.8h // p2 + q2 urshr v8.8h, v12.8h, #4 // out q2 add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6) sub v10.8h, v10.8h, v20.8h add v18.8h, v30.8h, v30.8h // q6 + q6 bif v4.16b, v22.16b, v15.16b // out p1 add v20.8h, v22.8h, v27.8h // p1 + q3 urshr v9.8h, v12.8h, #4 // out q3 add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6) sub v18.8h, v18.8h, v20.8h bif v5.16b, v23.16b, v15.16b // out p0 urshr v10.8h, v12.8h, #4 // out q4 add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6) urshr v11.8h, v12.8h, #4 // out q5 bif v6.16b, v24.16b, v15.16b // out q0 bif v7.16b, v25.16b, v15.16b // out q1 bif v8.16b, v26.16b, v15.16b // out q2 bif v9.16b, v27.16b, v15.16b // out q3 bif v10.16b, v28.16b, v15.16b // out q4 bif v11.16b, v29.16b, v15.16b // out q5 .endif mov x14, #0 ret .if \wd == 16 7: // Return to a shorter epilogue, writing only the inner 6 pixels mov x14, #(1 << 6) ret .endif .if \wd >= 8 8: // Return to a shorter epilogue, writing only the inner 4 pixels mov x14, #(1 << 4) ret .endif endfunc .endm loop_filter 16 loop_filter 8 loop_filter 6 loop_filter 4 .macro lpf_8_wd16 bl lpf_8_wd16_neon cbz x14, 1f tbnz x14, #6, 7f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_8_wd8 bl lpf_8_wd8_neon cbz x14, 1f tbnz x14, #4, 8f ret x15 1: .endm .macro lpf_8_wd6 bl lpf_8_wd6_neon cbz x14, 1f ret x15 1: .endm .macro lpf_8_wd4 bl lpf_8_wd4_neon cbz x14, 1f ret x15 1: .endm function lpf_v_4_8_neon mov x15, x30 sub x16, x0, x1, lsl #1 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 lpf_8_wd4 sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_4_8_neon mov x15, x30 sub x16, x0, #4 add x0, x16, x1, lsl #2 ld1 {v22.d}[0], [x16], x1 ld1 {v22.d}[1], [x0], x1 ld1 {v23.d}[0], [x16], x1 ld1 {v23.d}[1], [x0], x1 ld1 {v24.d}[0], [x16], x1 ld1 {v24.d}[1], [x0], x1 ld1 {v25.d}[0], [x16], x1 ld1 {v25.d}[1], [x0], x1 add x0, x0, #4 
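// Added note (not in the original source): the lpf_h_* variants, like this one,
// load the pixels column-wise and transpose them (transpose_4x8h / transpose_8x8h
// below) so the shared wd4/wd6/wd8/wd16 filter core can operate on contiguous
// rows; the result is transposed back before being stored.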
transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd4 sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_6_8_neon mov x15, x30 sub x16, x0, x1, lsl #1 sub x16, x16, x1 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 lpf_8_wd6 sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_6_8_neon mov x15, x30 sub x16, x0, #8 add x0, x16, x1, lsl #2 ld1 {v20.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 add x0, x0, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd6 sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_8_8_neon mov x15, x30 sub x16, x0, x1, lsl #2 ld1 {v20.8h}, [x16], x1 // p3 ld1 {v24.8h}, [x0], x1 // q0 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v26.8h}, [x0], x1 // q2 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v27.8h}, [x0], x1 // q3 sub x0, x0, x1, lsl #2 lpf_8_wd8 sub x16, x0, x1, lsl #1 sub x16, x16, x1 st1 {v21.8h}, [x16], x1 // p2 st1 {v24.8h}, [x0], x1 // q0 st1 {v22.8h}, [x16], x1 // p1 st1 {v25.8h}, [x0], x1 // q1 st1 {v23.8h}, [x16], x1 // p0 st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_8_8_neon mov x15, x30 sub x16, x0, #8 add x0, x16, x1, lsl #2 ld1 {v20.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 add x0, x0, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 lpf_8_wd8 sub x16, x0, x1, lsl #3 sub x16, x16, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v20.8h}, [x16], x1 st1 {v24.8h}, [x0], x1 st1 {v21.8h}, [x16], x1 st1 {v25.8h}, [x0], x1 st1 {v22.8h}, [x16], x1 st1 {v26.8h}, [x0], x1 st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc function lpf_v_16_8_neon mov x15, x30 sub x16, x0, x1, lsl #3 add x16, x16, x1 ld1 {v17.8h}, [x16], x1 // p6 ld1 
{v24.8h}, [x0], x1 // q0 ld1 {v18.8h}, [x16], x1 // p5 ld1 {v25.8h}, [x0], x1 // q1 ld1 {v19.8h}, [x16], x1 // p4 ld1 {v26.8h}, [x0], x1 // q2 ld1 {v20.8h}, [x16], x1 // p3 ld1 {v27.8h}, [x0], x1 // q3 ld1 {v21.8h}, [x16], x1 // p2 ld1 {v28.8h}, [x0], x1 // q4 ld1 {v22.8h}, [x16], x1 // p1 ld1 {v29.8h}, [x0], x1 // q5 ld1 {v23.8h}, [x16], x1 // p0 ld1 {v30.8h}, [x0], x1 // q6 sub x0, x0, x1, lsl #3 add x0, x0, x1 lpf_8_wd16 sub x16, x0, x1, lsl #2 sub x16, x16, x1, lsl #1 st1 {v0.8h}, [x16], x1 // p5 st1 {v6.8h}, [x0], x1 // q0 st1 {v1.8h}, [x16], x1 // p4 st1 {v7.8h}, [x0], x1 // q1 st1 {v2.8h}, [x16], x1 // p3 st1 {v8.8h}, [x0], x1 // q2 st1 {v3.8h}, [x16], x1 // p2 st1 {v9.8h}, [x0], x1 // q3 st1 {v4.8h}, [x16], x1 // p1 st1 {v10.8h}, [x0], x1 // q4 st1 {v5.8h}, [x16], x1 // p0 st1 {v11.8h}, [x0], x1 // q5 sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #1 ret x15 7: sub x16, x0, x1 sub x16, x16, x1, lsl #1 st1 {v21.8h}, [x16], x1 // p2 st1 {v24.8h}, [x0], x1 // q0 st1 {v22.8h}, [x16], x1 // p1 st1 {v25.8h}, [x0], x1 // q1 st1 {v23.8h}, [x16], x1 // p0 st1 {v26.8h}, [x0], x1 // q2 sub x0, x0, x1, lsl #1 sub x0, x0, x1 ret x15 8: sub x16, x0, x1, lsl #1 st1 {v22.8h}, [x16], x1 // p1 st1 {v24.8h}, [x0], x1 // q0 st1 {v23.8h}, [x16], x1 // p0 st1 {v25.8h}, [x0], x1 // q1 sub x0, x0, x1, lsl #1 ret x15 endfunc function lpf_h_16_8_neon mov x15, x30 sub x16, x0, #16 ld1 {v16.8h}, [x16], x1 ld1 {v24.8h}, [x0], x1 ld1 {v17.8h}, [x16], x1 ld1 {v25.8h}, [x0], x1 ld1 {v18.8h}, [x16], x1 ld1 {v26.8h}, [x0], x1 ld1 {v19.8h}, [x16], x1 ld1 {v27.8h}, [x0], x1 ld1 {v20.8h}, [x16], x1 ld1 {v28.8h}, [x0], x1 ld1 {v21.8h}, [x16], x1 ld1 {v29.8h}, [x0], x1 ld1 {v22.8h}, [x16], x1 ld1 {v30.8h}, [x0], x1 ld1 {v23.8h}, [x16], x1 ld1 {v31.8h}, [x0], x1 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v0, v1 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v0, v1 lpf_8_wd16 sub x0, x0, x1, lsl #3 sub x16, x0, #16 transpose_8x8h v16, v17, v0, v1, v2, v3, v4, v5, v18, v19 transpose_8x8h v6, v7, v8, v9, v10, v11, v30, v31, v18, v19 st1 {v16.8h}, [x16], x1 st1 {v6.8h}, [x0], x1 st1 {v17.8h}, [x16], x1 st1 {v7.8h}, [x0], x1 st1 {v0.8h}, [x16], x1 st1 {v8.8h}, [x0], x1 st1 {v1.8h}, [x16], x1 st1 {v9.8h}, [x0], x1 st1 {v2.8h}, [x16], x1 st1 {v10.8h}, [x0], x1 st1 {v3.8h}, [x16], x1 st1 {v11.8h}, [x0], x1 st1 {v4.8h}, [x16], x1 st1 {v30.8h}, [x0], x1 st1 {v5.8h}, [x16], x1 st1 {v31.8h}, [x0], x1 ret x15 7: sub x16, x0, x1, lsl #3 sub x16, x16, #8 transpose_8x8h v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v20.8h}, [x16], x1 st1 {v24.8h}, [x0], x1 st1 {v21.8h}, [x16], x1 st1 {v25.8h}, [x0], x1 st1 {v22.8h}, [x16], x1 st1 {v26.8h}, [x0], x1 st1 {v23.8h}, [x16], x1 st1 {v27.8h}, [x0], x1 add x0, x0, #8 ret x15 8: sub x16, x0, x1, lsl #3 sub x16, x16, #4 transpose_4x8h v22, v23, v24, v25, v26, v27, v28, v29 add x0, x16, x1, lsl #2 st1 {v22.d}[0], [x16], x1 st1 {v22.d}[1], [x0], x1 st1 {v23.d}[0], [x16], x1 st1 {v23.d}[1], [x0], x1 st1 {v24.d}[0], [x16], x1 st1 {v24.d}[1], [x0], x1 st1 {v25.d}[0], [x16], x1 st1 {v25.d}[1], [x0], x1 add x0, x0, #4 ret x15 endfunc // void dav1d_lpf_v_sb_y_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const uint32_t *const vmask, // const uint8_t (*l)[4], ptrdiff_t b4_stride, // const Av1FilterLUT *lut, const int w, // const int bitdepth_max) .macro lpf_func dir, type function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1 mov x11, x30 mov w8, w7 // bitdepth_max clz w9, w8 mov w10, #24 sub w9, w10, w9 // bitdepth_min_8 stp d8, d9, [sp, #-0x40]! 
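// Added note (not in the original source): the clz sequence above computes
// bitdepth_min_8 = 24 - clz(bitdepth_max). For example, 10 bpc input has
// bitdepth_max = 0x3ff, so clz = 22 and bitdepth_min_8 = 2 (12 bpc gives 4);
// the E/I/H limits derived below are shifted left by this amount
// (ushl ..., v31.8h) to match the widened pixel range.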
stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] ldp w6, w7, [x2] // vmask[0], vmask[1] .ifc \type, y ldr w2, [x2, #8] // vmask[2] .endif add x5, x5, #128 // Move to sharp part of lut .ifc \type, y orr w7, w7, w2 // vmask[1] |= vmask[2] .endif .ifc \dir, v sub x4, x3, x4, lsl #2 .else sub x3, x3, #4 lsl x4, x4, #2 .endif orr w6, w6, w7 // vmask[0] |= vmask[1] 1: tst w6, #0x03 .ifc \dir, v ld1 {v0.8b}, [x4], #8 ld1 {v1.8b}, [x3], #8 .else ld2 {v0.s,v1.s}[0], [x3], x4 ld2 {v0.s,v1.s}[1], [x3], x4 .endif b.eq 7f // if (!(vm & bits)) continue; ld1r {v5.8b}, [x5] // sharp[0] add x5, x5, #8 movi v2.2s, #0xff dup v13.2s, w6 // vmask[0] dup v31.8h, w9 // bitdepth_min_8 and v0.8b, v0.8b, v2.8b // Keep only lowest byte in each 32 bit word and v1.8b, v1.8b, v2.8b cmtst v3.8b, v1.8b, v2.8b // Check for nonzero values in l[0][0] movi v4.8b, #1 ld1r {v6.8b}, [x5] // sharp[1] sub x5, x5, #8 bif v1.8b, v0.8b, v3.8b // if (!l[0][0]) L = l[offset][0] cmtst v2.2s, v1.2s, v2.2s // L != 0 mul v1.2s, v1.2s, v4.2s // L .ifc \type, y dup v15.2s, w2 // vmask[2] .endif dup v14.2s, w7 // vmask[1] mov x16, v2.d[0] cmp x16, #0 b.eq 7f // if (!L) continue; neg v5.8b, v5.8b // -sharp[0] movrel x16, word_12 ushr v12.8b, v1.8b, #4 // H ld1 {v16.2s}, [x16] sshl v3.8b, v1.8b, v5.8b // L >> sharp[0] .ifc \type, y cmtst v15.2s, v15.2s, v16.2s // if (vmask[2] & bits) .endif movi v7.8b, #2 umin v3.8b, v3.8b, v6.8b // imin(L >> sharp[0], sharp[1]) add v0.8b, v1.8b, v7.8b // L + 2 umax v11.8b, v3.8b, v4.8b // imax(imin(), 1) = limit = I add v0.8b, v0.8b, v0.8b // 2*(L + 2) cmtst v14.2s, v14.2s, v16.2s // if (vmask[1] & bits) uxtl v12.8h, v12.8b add v10.8b, v0.8b, v11.8b // 2*(L + 2) + limit = E cmtst v13.2s, v13.2s, v16.2s // if (vmask[0] & bits) uxtl v11.8h, v11.8b uxtl v10.8h, v10.8b and v13.8b, v13.8b, v2.8b // vmask[0] &= L != 0 sxtl v14.8h, v14.8b sxtl v13.8h, v13.8b .ifc \type, y sxtl v15.8h, v15.8b .endif ushl v12.8h, v12.8h, v31.8h ushl v11.8h, v11.8h, v31.8h ushl v10.8h, v10.8h, v31.8h .ifc \type, y tst w2, #0x03 b.eq 2f // wd16 bl lpf_\dir\()_16_8_neon b 8f 2: .endif tst w7, #0x03 b.eq 3f .ifc \type, y // wd8 bl lpf_\dir\()_8_8_neon .else // wd6 bl lpf_\dir\()_6_8_neon .endif b 8f 3: // wd4 bl lpf_\dir\()_4_8_neon .ifc \dir, h b 8f 7: // For dir h, the functions above increment x0. // If the whole function is skipped, increment it here instead. add x0, x0, x1, lsl #3 .else 7: .endif 8: lsr w6, w6, #2 // vmask[0] >>= 2 lsr w7, w7, #2 // vmask[1] >>= 2 .ifc \type, y lsr w2, w2, #2 // vmask[2] >>= 2 .endif .ifc \dir, v add x0, x0, #16 .else // For dir h, x0 is returned incremented .endif cbnz w6, 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x11 endfunc .endm lpf_func v, y lpf_func h, y lpf_func v, uv lpf_func h, uv const word_12 .word 1, 2 endconst rav1e-0.7.1/src/arm/64/looprestoration.S000064400000000000000000001521501046102023000160410ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride, // const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter7_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! 
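// Added overview note (not in the original source): x9-x15 hold pointers to
// the 384*2-byte rows of horizontally filtered data reserved on the stack by
// sub_sp below (t6..t0, as listed further down); wiener_filter7_h_8bpc_neon
// fills one such row per call via x14 (t1), and the _v/_hv helpers combine
// the buffered rows vertically, rotating the pointers by restoring them one
// slot shifted (see their "Backing up/restoring registers shifted" comments).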
mov x29, sp ld1 {v0.8h, v1.8h}, [x6] tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 mov w17, #(1 << 14) - (1 << 2) dup v30.8h, w17 movi v31.8h, #8, lsl #8 // x9 - t6 // x10 - t5 // x11 - t4 // x12 - t3 // x13 - t2 // x14 - t1 // x15 - t0 mov x14, sp // t1 b.eq L(no_top_7) mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_8bpc_neon add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon add x3, x3, x1, lsl #2 add x3, x3, x1 // lpf += stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon mov x13, x14 // t2 subs w5, w5, #1 // h-- b.eq L(v2_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_8bpc_neon subs w5, w5, #1 // h-- b.ne L(main_loop_7) tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL bl wiener_filter7_hv_8bpc_neon bl wiener_filter7_hv_8bpc_neon L(v1_7): bl wiener_filter7_v_8bpc_neon mov sp, x29 ldp x29, x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_7): add x3, x3, x1, lsl #2 add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_8bpc_neon subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_8bpc_neon subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_8bpc_neon L(v2_7): bl wiener_filter7_v_8bpc_neon b L(v1_7) endfunc function wiener_filter7_h_8bpc_neon stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #3 ld1 {v3.16b}, [x3], #16 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v3.16b}, [x3], #16 ld1 {v2.s}[3], [x2], #4 // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #3 ext v3.16b, v2.16b, v3.16b, #13 b 2f 1: ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 3x the first byte at the front. dup v2.16b, v3.b[0] // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. sub x3, x3, #3 ext v3.16b, v2.16b, v3.16b, #13 2: ld1 {v4.8b}, [x3], #8 uxtl v2.8h, v3.8b uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. 
For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. ext v17.16b, v2.16b, v3.16b, #4 ext v19.16b, v2.16b, v3.16b, #8 ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 ext v18.16b, v2.16b, v3.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h shl v22.8h, v18.8h, #7 mul v6.8h, v18.8h, v0.h[3] mla v6.8h, v19.8h, v0.h[4] mla v6.8h, v20.8h, v0.h[5] mla v6.8h, v21.8h, v0.h[6] ext v17.16b, v3.16b, v4.16b, #4 ext v19.16b, v3.16b, v4.16b, #8 ext v16.16b, v3.16b, v4.16b, #2 ext v20.16b, v3.16b, v4.16b, #10 ext v21.16b, v3.16b, v4.16b, #12 ext v18.16b, v3.16b, v4.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v3.8h shl v23.8h, v18.8h, #7 mul v7.8h, v18.8h, v0.h[3] mla v7.8h, v19.8h, v0.h[4] mla v7.8h, v20.8h, v0.h[5] mla v7.8h, v21.8h, v0.h[6] sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h sqadd v6.8h, v6.8h, v22.8h sqadd v7.8h, v7.8h, v23.8h sshr v6.8h, v6.8h, #3 sshr v7.8h, v7.8h, #3 add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] ldp x3, x4, [sp], #32 ret endfunc function wiener_filter7_v_8bpc_neon // Backing up/restoring registers shifted, so that x9 gets the value // of x10, etc, afterwards. stp x10, x11, [sp, #-64]! 
stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] stp x0, x4, [sp, #48] 1: ld1 {v20.8h, v21.8h}, [x11], #32 ld1 {v24.8h, v25.8h}, [x13], #32 ld1 {v18.8h, v19.8h}, [x10], #32 add v24.8h, v24.8h, v20.8h ld1 {v26.8h, v27.8h}, [x14], #32 ld1 {v16.8h, v17.8h}, [x9], #32 add v28.8h, v26.8h, v18.8h ld1 {v22.8h, v23.8h}, [x12], #32 add v16.8h, v26.8h, v16.8h add v25.8h, v25.8h, v21.8h smull v2.4s, v22.4h, v1.h[3] smlal v2.4s, v24.4h, v1.h[4] smlal v2.4s, v28.4h, v1.h[5] smlal v2.4s, v16.4h, v1.h[6] add v29.8h, v27.8h, v19.8h smull2 v3.4s, v22.8h, v1.h[3] smlal2 v3.4s, v24.8h, v1.h[4] smlal2 v3.4s, v28.8h, v1.h[5] smlal2 v3.4s, v16.8h, v1.h[6] add v17.8h, v27.8h, v17.8h smull v4.4s, v23.4h, v1.h[3] smlal v4.4s, v25.4h, v1.h[4] smlal v4.4s, v29.4h, v1.h[5] smlal v4.4s, v17.4h, v1.h[6] smull2 v5.4s, v23.8h, v1.h[3] smlal2 v5.4s, v25.8h, v1.h[4] smlal2 v5.4s, v29.8h, v1.h[5] smlal2 v5.4s, v17.8h, v1.h[6] sqrshrun v2.4h, v2.4s, #11 sqrshrun2 v2.8h, v3.4s, #11 sqrshrun v3.4h, v4.4s, #11 sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 add x0, x0, x1 ret endfunc function wiener_filter7_hv_8bpc_neon // Backing up/restoring registers shifted, so that x9 gets the value // of x10, etc, and x15==x9, afterwards. stp x10, x11, [sp, #-80]! stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #3 ld1 {v3.16b}, [x3], #16 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v3.16b}, [x3], #16 ld1 {v2.s}[3], [x2], #4 // Move x3 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #3 ext v3.16b, v2.16b, v3.16b, #13 b 2f 1: ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 3x the first byte at the front. dup v2.16b, v3.b[0] // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. sub x3, x3, #3 ext v3.16b, v2.16b, v3.16b, #13 2: ld1 {v4.8b}, [x3], #8 uxtl v2.8h, v3.8b uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
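// Added illustrative sketch (not in the original source): the mask-driven BIT
// sequence below performs, on the 24 halfword pixels held in v2-v4, the
// scalar right-edge extension
//     pad = row[w + 2];
//     for (int i = w + 3; i < 24; i++)
//         row[i] = pad;
// using a 0x00/0xff byte mask fetched from right_ext_mask - 2*(w+3), whose
// 0xff run starts exactly at lane h[w+3].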
movrel x6, right_ext_mask, -6 ldr b28, [x3, w17, sxtw] sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally ext v17.16b, v2.16b, v3.16b, #4 ext v19.16b, v2.16b, v3.16b, #8 ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 ext v18.16b, v2.16b, v3.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h shl v22.8h, v18.8h, #7 mul v6.8h, v18.8h, v0.h[3] mla v6.8h, v19.8h, v0.h[4] mla v6.8h, v20.8h, v0.h[5] mla v6.8h, v21.8h, v0.h[6] ext v17.16b, v3.16b, v4.16b, #4 ext v19.16b, v3.16b, v4.16b, #8 ext v16.16b, v3.16b, v4.16b, #2 ext v20.16b, v3.16b, v4.16b, #10 ext v21.16b, v3.16b, v4.16b, #12 ext v18.16b, v3.16b, v4.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v3.8h shl v23.8h, v18.8h, #7 mul v7.8h, v18.8h, v0.h[3] mla v7.8h, v19.8h, v0.h[4] mla v7.8h, v20.8h, v0.h[5] mla v7.8h, v21.8h, v0.h[6] ld1 {v20.8h, v21.8h}, [x11], #32 sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h ld1 {v26.8h, v27.8h}, [x13], #32 sqadd v6.8h, v6.8h, v22.8h sqadd v7.8h, v7.8h, v23.8h ld1 {v18.8h, v19.8h}, [x10], #32 sshr v6.8h, v6.8h, #3 sshr v7.8h, v7.8h, #3 ld1 {v28.8h, v29.8h}, [x14], #32 add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h ld1 {v16.8h, v17.8h}, [x9], #32 add v26.8h, v20.8h, v26.8h ld1 {v24.8h, v25.8h}, [x12], #32 add v28.8h, v18.8h, v28.8h add v16.8h, v16.8h, v6.8h add v27.8h, v21.8h, v27.8h smull v18.4s, v24.4h, v1.h[3] smlal v18.4s, v26.4h, v1.h[4] smlal v18.4s, v28.4h, v1.h[5] smlal v18.4s, v16.4h, v1.h[6] add v29.8h, v19.8h, v29.8h smull2 v19.4s, v24.8h, v1.h[3] smlal2 v19.4s, v26.8h, v1.h[4] smlal2 v19.4s, v28.8h, v1.h[5] smlal2 v19.4s, v16.8h, v1.h[6] add v17.8h, v17.8h, v7.8h smull v20.4s, v25.4h, v1.h[3] smlal v20.4s, v27.4h, v1.h[4] smlal v20.4s, v29.4h, v1.h[5] smlal v20.4s, v17.4h, v1.h[6] smull2 v21.4s, v25.8h, v1.h[3] smlal2 v21.4s, v27.8h, v1.h[4] smlal2 v21.4s, v29.8h, v1.h[5] smlal2 v21.4s, v17.8h, v1.h[6] sqrshrun v18.4h, v18.4s, #11 sqrshrun2 v18.8h, v19.4s, #11 sqrshrun v19.4h, v20.4s, #11 sqrshrun2 v19.8h, v21.4s, #11 st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #80 add x3, x3, x1 add x0, x0, x1 ret endfunc // void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride, // const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges); function wiener_filter5_8bpc_neon, export=1 AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-16]! 
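// Added note (not in the original source): the 5-tap filter needs fewer
// buffered intermediate rows than the 7-tap one, hence sub_sp 384*2*4 below
// with pointers t4..t0 in x11-x15, versus 384*2*6 and t6..t0 in x9-x15 for
// wiener_filter7_8bpc_neon above.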
mov x29, sp ld1 {v0.8h, v1.8h}, [x6] tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 mov w17, #(1 << 14) - (1 << 2) dup v30.8h, w17 movi v31.8h, #8, lsl #8 // x11 - t4 // x12 - t3 // x13 - t2 // x14 - t1 // x15 - t0 mov x14, sp // t1 b.eq L(no_top_5) mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_8bpc_neon add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon add x3, x3, x1, lsl #2 add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_8bpc_neon subs w5, w5, #1 // h-- b.ne L(main_loop_5) tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL bl wiener_filter5_hv_8bpc_neon bl wiener_filter5_hv_8bpc_neon L(end_5): mov sp, x29 ldp x29, x30, [sp], #16 AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_5): add x3, x3, x1, lsl #2 add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_8bpc_neon subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_8bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_8bpc_neon subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_8bpc_neon add x0, x0, x1 mov x11, x12 mov x12, x13 mov x13, x14 L(v1_5): bl wiener_filter5_v_8bpc_neon b L(end_5) endfunc function wiener_filter5_h_8bpc_neon stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #2 ld1 {v3.16b}, [x3], #16 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v3.16b}, [x3], #16 ld1 {v2.s}[3], [x2], #4 // Move x3 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #2 ext v3.16b, v2.16b, v3.16b, #14 b 2f 1: ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 3x the first byte at the front. dup v2.16b, v3.b[0] // Move x3 back to account for the last 2 bytes we loaded before, // which we shifted out. sub x3, x3, #2 ext v3.16b, v2.16b, v3.16b, #14 2: ld1 {v4.8b}, [x3], #8 uxtl v2.8h, v3.8b uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. 
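// Added note (not in the original source): for this 5-tap filter the first
// padded lane is h[w+2], so the mask base is biased by -4 (= -2*2) below,
// versus -6 (= -2*3) in the 7-tap wiener_filter7 functions above.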
movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. ext v16.16b, v2.16b, v3.16b, #2 ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v17.16b, v2.16b, v3.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v2.8h shl v22.8h, v17.8h, #7 mul v6.8h, v17.8h, v0.h[3] mla v6.8h, v18.8h, v0.h[4] mla v6.8h, v19.8h, v0.h[5] ext v16.16b, v3.16b, v4.16b, #2 ext v18.16b, v3.16b, v4.16b, #6 ext v19.16b, v3.16b, v4.16b, #8 ext v17.16b, v3.16b, v4.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v3.8h shl v23.8h, v17.8h, #7 mul v7.8h, v17.8h, v0.h[3] mla v7.8h, v18.8h, v0.h[4] mla v7.8h, v19.8h, v0.h[5] sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h sqadd v6.8h, v6.8h, v22.8h sqadd v7.8h, v7.8h, v23.8h sshr v6.8h, v6.8h, #3 sshr v7.8h, v7.8h, #3 add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_8bpc_neon stp x11, x12, [sp, #-48]! stp x13, x14, [sp, #16] stp x0, x4, [sp, #32] 1: ld1 {v18.8h, v19.8h}, [x12], #32 ld1 {v22.8h, v23.8h}, [x14], #32 ld1 {v16.8h, v17.8h}, [x11], #32 add v24.8h, v22.8h, v18.8h ld1 {v20.8h, v21.8h}, [x13], #32 add v16.8h, v22.8h, v16.8h add v25.8h, v23.8h, v19.8h smull v2.4s, v20.4h, v1.h[3] smlal v2.4s, v24.4h, v1.h[4] smlal v2.4s, v16.4h, v1.h[5] add v17.8h, v23.8h, v17.8h smull2 v3.4s, v20.8h, v1.h[3] smlal2 v3.4s, v24.8h, v1.h[4] smlal2 v3.4s, v16.8h, v1.h[5] smull v4.4s, v21.4h, v1.h[3] smlal v4.4s, v25.4h, v1.h[4] smlal v4.4s, v17.4h, v1.h[5] smull2 v5.4s, v21.8h, v1.h[3] smlal2 v5.4s, v25.8h, v1.h[4] smlal2 v5.4s, v17.8h, v1.h[5] sqrshrun v2.4h, v2.4s, #11 sqrshrun2 v2.8h, v3.4s, #11 sqrshrun v3.4h, v4.4s, #11 sqrshrun2 v3.8h, v5.4s, #11 sqxtun v2.8b, v2.8h sqxtun2 v2.16b, v3.8h subs w4, w4, #16 st1 {v2.16b}, [x0], #16 b.gt 1b ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 ret endfunc function wiener_filter5_hv_8bpc_neon // Backing up/restoring registers shifted, so that x11 gets the value // of x12, etc, and x15==x11, afterwards. stp x12, x13, [sp, #-64]! stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #2 ld1 {v3.16b}, [x3], #16 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v3.16b}, [x3], #16 ld1 {v2.s}[3], [x2], #4 // Move x3 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #2 ext v3.16b, v2.16b, v3.16b, #14 b 2f 1: ld1 {v3.16b}, [x3], #16 // !LR_HAVE_LEFT, fill v2 with the leftmost byte // and shift v3 to have 2x the first byte at the front. dup v2.16b, v3.b[0] // Move x3 back to account for the last 2 bytes we loaded before, // which we shifted out. 
sub x3, x3, #2 ext v3.16b, v2.16b, v3.16b, #14 2: ld1 {v4.8b}, [x3], #8 uxtl v2.8h, v3.8b uxtl2 v3.8h, v3.16b uxtl v4.8h, v4.8b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. movrel x6, right_ext_mask, -4 ldr b28, [x3, w17, sxtw] sub x6, x6, w4, uxtw #1 dup v28.8h, v28.h[0] ld1 {v25.16b, v26.16b, v27.16b}, [x6] bit v2.16b, v28.16b, v25.16b bit v3.16b, v28.16b, v26.16b bit v4.16b, v28.16b, v27.16b 4: // Loop horizontally ext v16.16b, v2.16b, v3.16b, #2 ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v17.16b, v2.16b, v3.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v2.8h shl v22.8h, v17.8h, #7 mul v6.8h, v17.8h, v0.h[3] mla v6.8h, v18.8h, v0.h[4] mla v6.8h, v19.8h, v0.h[5] ext v16.16b, v3.16b, v4.16b, #2 ext v18.16b, v3.16b, v4.16b, #6 ext v19.16b, v3.16b, v4.16b, #8 ext v17.16b, v3.16b, v4.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v3.8h shl v23.8h, v17.8h, #7 mul v7.8h, v17.8h, v0.h[3] mla v7.8h, v18.8h, v0.h[4] mla v7.8h, v19.8h, v0.h[5] ld1 {v18.8h, v19.8h}, [x12], #32 sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h ld1 {v24.8h, v25.8h}, [x14], #32 sqadd v6.8h, v6.8h, v22.8h sqadd v7.8h, v7.8h, v23.8h ld1 {v16.8h, v17.8h}, [x11], #32 sshr v6.8h, v6.8h, #3 sshr v7.8h, v7.8h, #3 ld1 {v20.8h, v21.8h}, [x13], #32 add v6.8h, v6.8h, v31.8h add v7.8h, v7.8h, v31.8h add v24.8h, v24.8h, v18.8h add v16.8h, v16.8h, v6.8h smull v18.4s, v20.4h, v1.h[3] smlal v18.4s, v24.4h, v1.h[4] smlal v18.4s, v16.4h, v1.h[5] add v25.8h, v25.8h, v19.8h smull2 v19.4s, v20.8h, v1.h[3] smlal2 v19.4s, v24.8h, v1.h[4] smlal2 v19.4s, v16.8h, v1.h[5] add v17.8h, v17.8h, v7.8h smull v20.4s, v21.4h, v1.h[3] smlal v20.4s, v25.4h, v1.h[4] smlal v20.4s, v17.4h, v1.h[5] smull2 v21.4s, v21.8h, v1.h[3] smlal2 v21.4s, v25.8h, v1.h[4] smlal2 v21.4s, v17.8h, v1.h[5] sqrshrun v18.4h, v18.4s, #11 sqrshrun2 v18.8h, v19.4s, #11 sqrshrun v19.4h, v20.4s, #11 sqrshrun2 v19.8h, v21.4s, #11 st1 {v6.8h, v7.8h}, [x15], #32 sqxtun v18.8b, v18.8h sqxtun2 v18.16b, v19.8h subs w4, w4, #16 st1 {v18.16b}, [x0], #16 b.le 0f mov v2.16b, v4.16b ld1 {v4.16b}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT uxtl v3.8h, v4.8b uxtl2 v4.8h, v4.16b b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 
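// Added note (not in the original source): the horizontal loop above stores
// the newly filtered intermediate row to t0 (st1 ..., [x15]) and the finished,
// vertically filtered pixels to the current output row (st1 ..., [x0]); the
// epilogue below then steps x0 and x3 to the next row, so each call to this
// _hv helper produces one output row.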
0: ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 add x3, x3, x1 add x0, x0, x1 ret endfunc #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" // void dav1d_sgr_box3_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_h_8bpc_neon, export=1 add w5, w5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add x10, x0, #(4*SUM_STRIDE) // sumsq add x11, x1, #(2*SUM_STRIDE) // sum add x12, x3, x4 // src lsl x4, x4, #1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. add w13, w5, #7 bic w13, w13, #7 sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride add w13, w13, #8 sub x4, x4, w13, uxtw // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 2f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #2 sub x12, x12, #2 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 2 pixels from the src pointer, // but shift it as if we had done that. add x4, x4, #2 1: // Loop vertically ld1 {v0.16b}, [x3], #16 ld1 {v4.16b}, [x12], #16 tst w7, #1 // LR_HAVE_LEFT b.eq 0f cbz x2, 2f // LR_HAVE_LEFT, left != NULL ld1 {v1.s}[3], [x2], #4 // Move x3/x12 back to account for the last 2 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #2 sub x12, x12, #2 ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #14 ext v4.16b, v5.16b, v4.16b, #14 b 2f 0: // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 2x the first byte at the front. dup v1.16b, v0.b[0] dup v5.16b, v4.b[0] // Move x3 back to account for the last 2 bytes we loaded before, // which we shifted out. sub x3, x3, #2 sub x12, x12, #2 ext v0.16b, v1.16b, v0.16b, #14 ext v4.16b, v5.16b, v4.16b, #14 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b umull v5.8h, v4.8b, v4.8b umull2 v6.8h, v4.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 2 + 1) ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel dup v30.16b, v30.b[0] dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. 
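// Added example (not in the original source): if w == 5 at this point, the
// masked BIT below replaces bytes v0.b[5..15] and v4.b[5..15] with the
// padding byte replicated into v30/v31 earlier, since the 0x00/0xff mask is
// loaded from right_ext_mask - w and its 0xff run starts at byte index w.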
// Insert padding in v0/4.b[w] onwards movrel x13, right_ext_mask sub x13, x13, w5, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b umull v5.8h, v4.8b, v4.8b umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v4.16b, v4.16b, #1 ext v19.16b, v4.16b, v4.16b, #2 uaddl v3.8h, v0.8b, v16.8b uaddw v3.8h, v3.8h, v17.8b uaddl v7.8h, v4.8b, v18.8b uaddw v7.8h, v7.8h, v19.8b ext v20.16b, v1.16b, v2.16b, #2 ext v21.16b, v1.16b, v2.16b, #4 ext v22.16b, v5.16b, v6.16b, #2 ext v23.16b, v5.16b, v6.16b, #4 uaddl v26.4s, v1.4h, v20.4h uaddl2 v27.4s, v1.8h, v20.8h uaddw v26.4s, v26.4s, v21.4h uaddw2 v27.4s, v27.4s, v21.8h uaddl v28.4s, v5.4h, v22.4h uaddl2 v29.4s, v5.8h, v22.8h uaddw v28.4s, v28.4s, v23.4h uaddw2 v29.4s, v29.4s, v23.8h subs w5, w5, #8 st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b mov v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b umull v6.8h, v7.8b, v7.8b b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: subs w6, w6, #2 b.le 0f // Jump to the next row and loop horizontally add x0, x0, x9, lsl #1 add x10, x10, x9, lsl #1 add x1, x1, x9 add x11, x11, x9 add x3, x3, x4 add x12, x12, x4 mov w5, w8 b 1b 0: ret endfunc // void dav1d_sgr_box5_h_8bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_h_8bpc_neon, export=1 add w5, w5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add x10, x0, #(4*SUM_STRIDE) // sumsq add x11, x1, #(2*SUM_STRIDE) // sum add x12, x3, x4 // src lsl x4, x4, #1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. add w13, w5, #7 bic w13, w13, #7 sub x9, x9, w13, uxtw #1 add w13, w13, #8 sub x4, x4, w13, uxtw // Store the width for the vertical loop mov w8, w5 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 2f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #3 sub x12, x12, #3 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add x4, x4, #3 1: // Loop vertically ld1 {v0.16b}, [x3], #16 ld1 {v4.16b}, [x12], #16 tst w7, #1 // LR_HAVE_LEFT b.eq 0f cbz x2, 2f // LR_HAVE_LEFT, left != NULL ld1 {v1.s}[3], [x2], #4 // Move x3/x12 back to account for the last 3 bytes we loaded earlier, // which we'll shift out. sub x3, x3, #3 sub x12, x12, #3 ld1 {v5.s}[3], [x2], #4 ext v0.16b, v1.16b, v0.16b, #13 ext v4.16b, v5.16b, v4.16b, #13 b 2f 0: // !LR_HAVE_LEFT, fill v1 with the leftmost byte // and shift v0 to have 3x the first byte at the front. dup v1.16b, v0.b[0] dup v5.16b, v4.b[0] // Move x3 back to account for the last 3 bytes we loaded before, // which we shifted out. 
sub x3, x3, #3 sub x12, x12, #3 ext v0.16b, v1.16b, v0.16b, #13 ext v4.16b, v5.16b, v4.16b, #13 2: umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b umull v5.8h, v4.8b, v4.8b umull2 v6.8h, v4.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that byte to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 3 + 1) ldr b30, [x3, w13, sxtw] ldr b31, [x12, w13, sxtw] // Fill v30/v31 with the right padding pixel dup v30.16b, v30.b[0] dup v31.16b, v31.b[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in v0/4.b[w+1] onwards; fuse the +1 into the // buffer pointer. movrel x13, right_ext_mask, -1 sub x13, x13, w5, uxtw ld1 {v29.16b}, [x13] bit v0.16b, v30.16b, v29.16b bit v4.16b, v31.16b, v29.16b // Update the precalculated squares umull v1.8h, v0.8b, v0.8b umull2 v2.8h, v0.16b, v0.16b umull v5.8h, v4.8b, v4.8b umull2 v6.8h, v4.16b, v4.16b 4: // Loop horizontally ext v16.16b, v0.16b, v0.16b, #1 ext v17.16b, v0.16b, v0.16b, #2 ext v18.16b, v0.16b, v0.16b, #3 ext v19.16b, v0.16b, v0.16b, #4 ext v20.16b, v4.16b, v4.16b, #1 ext v21.16b, v4.16b, v4.16b, #2 ext v22.16b, v4.16b, v4.16b, #3 ext v23.16b, v4.16b, v4.16b, #4 uaddl v3.8h, v0.8b, v16.8b uaddl v24.8h, v17.8b, v18.8b uaddl v7.8h, v4.8b, v20.8b uaddw v3.8h, v3.8h, v19.8b uaddl v25.8h, v21.8b, v22.8b uaddw v7.8h, v7.8h, v23.8b add v3.8h, v3.8h, v24.8h add v7.8h, v7.8h, v25.8h ext v16.16b, v1.16b, v2.16b, #2 ext v17.16b, v1.16b, v2.16b, #4 ext v18.16b, v1.16b, v2.16b, #6 ext v19.16b, v1.16b, v2.16b, #8 ext v20.16b, v5.16b, v6.16b, #2 ext v21.16b, v5.16b, v6.16b, #4 ext v22.16b, v5.16b, v6.16b, #6 ext v23.16b, v5.16b, v6.16b, #8 uaddl v26.4s, v1.4h, v16.4h uaddl2 v27.4s, v1.8h, v16.8h uaddl v16.4s, v17.4h, v18.4h uaddl2 v17.4s, v17.8h, v18.8h uaddl v28.4s, v5.4h, v20.4h uaddl2 v29.4s, v5.8h, v20.8h uaddw v26.4s, v26.4s, v19.4h uaddw2 v27.4s, v27.4s, v19.8h uaddl v20.4s, v21.4h, v22.4h uaddl2 v21.4s, v21.8h, v22.8h uaddw v28.4s, v28.4s, v23.4h uaddw2 v29.4s, v29.4s, v23.8h add v26.4s, v26.4s, v16.4s add v27.4s, v27.4s, v17.4s add v28.4s, v28.4s, v20.4s add v29.4s, v29.4s, v21.4s subs w5, w5, #8 st1 {v3.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v26.4s,v27.4s}, [x0], #32 st1 {v28.4s,v29.4s}, [x10], #32 b.le 9f tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8b}, [x3], #8 ld1 {v7.8b}, [x12], #8 mov v1.16b, v2.16b mov v5.16b, v6.16b ext v0.16b, v0.16b, v3.16b, #8 ext v4.16b, v4.16b, v7.16b, #8 umull v2.8h, v3.8b, v3.8b umull v6.8h, v7.8b, v7.8b b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: subs w6, w6, #2 b.le 0f // Jump to the next row and loop horizontally add x0, x0, x9, lsl #1 add x10, x10, x9, lsl #1 add x1, x1, x9 add x11, x11, x9 add x3, x3, x4 add x12, x12, x4 mov w5, w8 b 1b 0: ret endfunc sgr_funcs 8 rav1e-0.7.1/src/arm/64/looprestoration16.S000064400000000000000000001651401046102023000162130ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" const right_ext_mask_buf .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 right_ext_mask: .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff endconst // void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride, // const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter7_16bpc_neon, export=1 ldr w8, [sp] AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! 
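// Added worked example (not in the original source): the constant setup below
// derives the per-bitdepth shifts from clz(bitdepth_max). For 10 bpc input,
// clz = 22, so bitdepth + 6 = 16 (v30 = 1 << 16), round_bits_h = 3 and
// round_bits_v = 11; for 12 bpc the corresponding values are 1 << 18, 5 and 9.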
stp d8, d9, [sp, #16] mov x29, sp ld1 {v0.8h, v1.8h}, [x6] tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*6 dup v28.8h, w8 // bitdepth_max clz w8, w8 movi v30.4s, #1 sub w10, w8, #38 // -(bitdepth + 6) sub w11, w8, #11 // round_bits_v sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 // x9 - t6 // x10 - t5 // x11 - t4 // x12 - t3 // x13 - t2 // x14 - t1 // x15 - t0 mov x14, sp // t1 b.eq L(no_top_7) mov x16, x2 // backup left mov x2, #0 bl wiener_filter7_h_16bpc_neon add x3, x3, x1 // lpf += stride mov x9, x14 // t6 mov x10, x14 // t5 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon add x3, x3, x1, lsl #2 add x3, x3, x1 // lpf += stride*5 mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon subs w5, w5, #1 // h-- mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon mov x13, x14 // t2 subs w5, w5, #1 // h-- b.eq L(v2_7) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x3, x3, x1 // src += stride L(main_7): add x15, x14, #384*2 // t0 = t1 + 384*2 L(main_loop_7): bl wiener_filter7_hv_16bpc_neon subs w5, w5, #1 // h-- b.ne L(main_loop_7) tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v3_7) mov x3, x16 // restore lpf mov x2, #0 // left = NULL bl wiener_filter7_hv_16bpc_neon bl wiener_filter7_hv_16bpc_neon L(v1_7): bl wiener_filter7_v_16bpc_neon mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_7): add x3, x3, x1, lsl #2 add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter7_h_16bpc_neon subs w5, w5, #1 // h-- mov x9, x14 // t6 mov x10, x14 // t5 mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_7) add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v2_7) add x3, x3, x1 // src += p_stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter7_h_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x3, x3, x1 // src += p_stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter7_hv_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v3_7) add x15, x15, #384*2*4 // t0 += 384*2*4 bl wiener_filter7_hv_16bpc_neon subs w5, w5, #1 // h-- b.ne L(main_7) L(v3_7): bl wiener_filter7_v_16bpc_neon L(v2_7): bl wiener_filter7_v_16bpc_neon b L(v1_7) endfunc function wiener_filter7_h_16bpc_neon stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #6 ld1 {v2.8h, v3.8h}, [x3], #32 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v2.8h, v3.8h}, [x3], #32 ld1 {v4.d}[1], [x2], #8 // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 ext v2.16b, v4.16b, v2.16b, #10 b 2f 1: ld1 {v2.8h, v3.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v4 with the leftmost pixel // and shift v3 to have 3x the first pixel at the front. 
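// Added note (not in the original source): this 16 bpc variant handles the
// left edge in halfword pixels, so the 3-pixel extension moves the pointer by
// 6 bytes and uses ext #10, where the 8 bpc wiener_filter7_h used 3 bytes and
// ext #13.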
dup v4.8h, v2.h[0] // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 ext v2.16b, v4.16b, v2.16b, #10 2: ld1 {v4.8h}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b bit v4.16b, v26.16b, v25.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. ext v17.16b, v2.16b, v3.16b, #4 ext v19.16b, v2.16b, v3.16b, #8 ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 ext v18.16b, v2.16b, v3.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h smull v6.4s, v18.4h, v0.h[3] smlal v6.4s, v19.4h, v0.h[2] smlal v6.4s, v20.4h, v0.h[1] smlal v6.4s, v21.4h, v0.h[0] smull2 v7.4s, v18.8h, v0.h[3] smlal2 v7.4s, v19.8h, v0.h[2] smlal2 v7.4s, v20.8h, v0.h[1] smlal2 v7.4s, v21.8h, v0.h[0] ext v17.16b, v3.16b, v4.16b, #4 ext v19.16b, v3.16b, v4.16b, #8 ext v16.16b, v3.16b, v4.16b, #2 ext v20.16b, v3.16b, v4.16b, #10 ext v21.16b, v3.16b, v4.16b, #12 ext v18.16b, v3.16b, v4.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v3.8h smull v16.4s, v18.4h, v0.h[3] smlal v16.4s, v19.4h, v0.h[2] smlal v16.4s, v20.4h, v0.h[1] smlal v16.4s, v21.4h, v0.h[0] smull2 v17.4s, v18.8h, v0.h[3] smlal2 v17.4s, v19.8h, v0.h[2] smlal2 v17.4s, v20.8h, v0.h[1] smlal2 v17.4s, v21.8h, v0.h[0] mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s add v16.4s, v16.4s, v30.4s add v17.4s, v17.4s, v30.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s srshl v16.4s, v16.4s, v29.4s srshl v17.4s, v17.4s, v29.4s sqxtun v6.4h, v6.4s sqxtun2 v6.8h, v7.4s sqxtun v7.4h, v16.4s sqxtun2 v7.8h, v17.4s umin v6.8h, v6.8h, v24.8h umin v7.8h, v7.8h, v24.8h sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] ldp x3, x4, [sp], #32 ret endfunc function wiener_filter7_v_16bpc_neon // Backing up/restoring registers shifted, so that x9 gets the value // of x10, etc, afterwards. stp x10, x11, [sp, #-64]! 
stp x12, x13, [sp, #16] stp x14, x14, [sp, #32] stp x0, x4, [sp, #48] 1: ld1 {v16.8h, v17.8h}, [x9], #32 ld1 {v18.8h, v19.8h}, [x10], #32 ld1 {v20.8h, v21.8h}, [x11], #32 ld1 {v22.8h, v23.8h}, [x12], #32 ld1 {v24.8h, v25.8h}, [x13], #32 ld1 {v6.8h, v7.8h}, [x14], #32 smull v2.4s, v16.4h, v0.h[4] smlal v2.4s, v18.4h, v0.h[5] smlal v2.4s, v20.4h, v0.h[6] smlal v2.4s, v22.4h, v0.h[7] smlal v2.4s, v24.4h, v0.h[6] smlal v2.4s, v6.4h, v0.h[5] smlal v2.4s, v6.4h, v0.h[4] smull2 v3.4s, v16.8h, v0.h[4] smlal2 v3.4s, v18.8h, v0.h[5] smlal2 v3.4s, v20.8h, v0.h[6] smlal2 v3.4s, v22.8h, v0.h[7] smlal2 v3.4s, v24.8h, v0.h[6] smlal2 v3.4s, v6.8h, v0.h[5] smlal2 v3.4s, v6.8h, v0.h[4] smull v4.4s, v17.4h, v0.h[4] smlal v4.4s, v19.4h, v0.h[5] smlal v4.4s, v21.4h, v0.h[6] smlal v4.4s, v23.4h, v0.h[7] smlal v4.4s, v25.4h, v0.h[6] smlal v4.4s, v7.4h, v0.h[5] smlal v4.4s, v7.4h, v0.h[4] smull2 v5.4s, v17.8h, v0.h[4] smlal2 v5.4s, v19.8h, v0.h[5] smlal2 v5.4s, v21.8h, v0.h[6] smlal2 v5.4s, v23.8h, v0.h[7] smlal2 v5.4s, v25.8h, v0.h[6] smlal2 v5.4s, v7.8h, v0.h[5] smlal2 v5.4s, v7.8h, v0.h[4] srshl v2.4s, v2.4s, v27.4s // -round_bits_v srshl v3.4s, v3.4s, v27.4s srshl v4.4s, v4.4s, v27.4s srshl v5.4s, v5.4s, v27.4s sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b ldp x0, x4, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #64 add x0, x0, x1 ret endfunc function wiener_filter7_hv_16bpc_neon // Backing up/restoring registers shifted, so that x9 gets the value // of x10, etc, and x15==x9, afterwards. stp x10, x11, [sp, #-80]! stp x12, x13, [sp, #16] stp x14, x15, [sp, #32] stp x10, x0, [sp, #48] stp x3, x4, [sp, #64] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #6 ld1 {v2.8h, v3.8h}, [x3], #32 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v2.8h, v3.8h}, [x3], #32 ld1 {v4.d}[1], [x2], #8 // Move x3 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 ext v2.16b, v4.16b, v2.16b, #10 b 2f 1: ld1 {v2.8h, v3.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v4 with the leftmost pixel // and shift v3 to have 3x the first pixel at the front. dup v4.8h, v2.h[0] // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 ext v3.16b, v2.16b, v3.16b, #10 ext v2.16b, v4.16b, v2.16b, #10 2: ld1 {v4.8h}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #19 b.ge 4f // If w >= 19, all used input pixels are valid // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel. sub w17, w4, #22 // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the // buffer pointer. 
movrel x6, right_ext_mask, -6 ldr h26, [x3, w17, sxtw #1] sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b bit v4.16b, v26.16b, v25.16b 4: // Loop horizontally ext v17.16b, v2.16b, v3.16b, #4 ext v19.16b, v2.16b, v3.16b, #8 ext v16.16b, v2.16b, v3.16b, #2 ext v20.16b, v2.16b, v3.16b, #10 ext v21.16b, v2.16b, v3.16b, #12 ext v18.16b, v2.16b, v3.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v2.8h smull v6.4s, v18.4h, v0.h[3] smlal v6.4s, v19.4h, v0.h[2] smlal v6.4s, v20.4h, v0.h[1] smlal v6.4s, v21.4h, v0.h[0] smull2 v7.4s, v18.8h, v0.h[3] smlal2 v7.4s, v19.8h, v0.h[2] smlal2 v7.4s, v20.8h, v0.h[1] smlal2 v7.4s, v21.8h, v0.h[0] ext v17.16b, v3.16b, v4.16b, #4 ext v19.16b, v3.16b, v4.16b, #8 ext v16.16b, v3.16b, v4.16b, #2 ext v20.16b, v3.16b, v4.16b, #10 ext v21.16b, v3.16b, v4.16b, #12 ext v18.16b, v3.16b, v4.16b, #6 add v19.8h, v19.8h, v17.8h add v20.8h, v20.8h, v16.8h add v21.8h, v21.8h, v3.8h smull v24.4s, v18.4h, v0.h[3] smlal v24.4s, v19.4h, v0.h[2] smlal v24.4s, v20.4h, v0.h[1] smlal v24.4s, v21.4h, v0.h[0] smull2 v25.4s, v18.8h, v0.h[3] smlal2 v25.4s, v19.8h, v0.h[2] smlal2 v25.4s, v20.8h, v0.h[1] smlal2 v25.4s, v21.8h, v0.h[0] ld1 {v16.8h, v17.8h}, [x9], #32 mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s add v24.4s, v24.4s, v30.4s add v25.4s, v25.4s, v30.4s ld1 {v18.8h, v19.8h}, [x10], #32 srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s srshl v24.4s, v24.4s, v29.4s srshl v25.4s, v25.4s, v29.4s ld1 {v20.8h, v21.8h}, [x11], #32 sqxtun v6.4h, v6.4s sqxtun2 v6.8h, v7.4s sqxtun v7.4h, v24.4s sqxtun2 v7.8h, v25.4s ld1 {v22.8h, v23.8h}, [x12], #32 umin v6.8h, v6.8h, v26.8h umin v7.8h, v7.8h, v26.8h ld1 {v24.8h, v25.8h}, [x13], #32 sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h ld1 {v8.8h, v9.8h}, [x14], #32 smull v1.4s, v16.4h, v0.h[4] smlal v1.4s, v18.4h, v0.h[5] smlal v1.4s, v20.4h, v0.h[6] smlal v1.4s, v22.4h, v0.h[7] smlal v1.4s, v24.4h, v0.h[6] smlal v1.4s, v8.4h, v0.h[5] smlal v1.4s, v6.4h, v0.h[4] smull2 v5.4s, v16.8h, v0.h[4] smlal2 v5.4s, v18.8h, v0.h[5] smlal2 v5.4s, v20.8h, v0.h[6] smlal2 v5.4s, v22.8h, v0.h[7] smlal2 v5.4s, v24.8h, v0.h[6] smlal2 v5.4s, v8.8h, v0.h[5] smlal2 v5.4s, v6.8h, v0.h[4] smull v26.4s, v17.4h, v0.h[4] smlal v26.4s, v19.4h, v0.h[5] smlal v26.4s, v21.4h, v0.h[6] smlal v26.4s, v23.4h, v0.h[7] smlal v26.4s, v25.4h, v0.h[6] smlal v26.4s, v9.4h, v0.h[5] smlal v26.4s, v7.4h, v0.h[4] smull2 v16.4s, v17.8h, v0.h[4] smlal2 v16.4s, v19.8h, v0.h[5] smlal2 v16.4s, v21.8h, v0.h[6] smlal2 v16.4s, v23.8h, v0.h[7] smlal2 v16.4s, v25.8h, v0.h[6] smlal2 v16.4s, v9.8h, v0.h[5] smlal2 v16.4s, v7.8h, v0.h[4] srshl v1.4s, v1.4s, v27.4s // -round_bits_v srshl v5.4s, v5.4s, v27.4s srshl v26.4s, v26.4s, v27.4s srshl v16.4s, v16.4s, v27.4s sqxtun v18.4h, v1.4s sqxtun2 v18.8h, v5.4s sqxtun v19.4h, v26.4s sqxtun2 v19.8h, v16.4s st1 {v6.8h, v7.8h}, [x15], #32 umin v18.8h, v18.8h, v28.8h // bitdepth_max umin v19.8h, v19.8h, v28.8h subs w4, w4, #16 st1 {v18.8h, v19.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 
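// Editorial note, not part of the original routine: the horizontal stage above
// pairs the mirrored taps (the ext/add sequence), biases the sum with the
// 1 << (bitdepth + 6) value in v30, does a rounding shift by round_bits_h
// (srshl by v29), clamps to [0, 0x7fff] and subtracts 8192 (v31) so the
// intermediate fits a signed 16-bit lane for the vertical smull/smlal chain.
// A hedged scalar sketch with illustrative names (px[], fh[], round_bits_h are
// assumptions, not identifiers from this file):
//
//   static inline int16_t wiener7_h_sample(const uint16_t *px, // window start
//                                          const int16_t fh[8],
//                                          int bitdepth, int round_bits_h) {
//       int sum = fh[3] * px[3];                    // centre tap
//       for (int k = 0; k < 3; k++)                 // mirrored taps share a coeff
//           sum += fh[k] * (px[k] + px[6 - k]);
//       sum += 1 << (bitdepth + 6);                 // bias (v30)
//       int mid = (sum + (1 << (round_bits_h - 1))) >> round_bits_h;
//       if (mid < 0) mid = 0;                       // sqxtun saturation
//       if (mid > 0x7fff) mid = 0x7fff;             // umin with 0x7fff
//       return (int16_t)(mid - 8192);               // sub v31
//   }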
0: ldp x3, x4, [sp, #64] ldp x15, x0, [sp, #48] ldp x13, x14, [sp, #32] ldp x11, x12, [sp, #16] ldp x9, x10, [sp], #80 add x3, x3, x1 add x0, x0, x1 ret endfunc // void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride, // const pixel (*left)[4], const pixel *lpf, // const int w, int h, // const int16_t filter[2][8], // const enum LrEdgeFlags edges, // const int bitdepth_max); function wiener_filter5_16bpc_neon, export=1 ldr w8, [sp] AARCH64_SIGN_LINK_REGISTER stp x29, x30, [sp, #-32]! stp d8, d9, [sp, #16] mov x29, sp ld1 {v0.8h, v1.8h}, [x6] tst w7, #4 // LR_HAVE_TOP sub_sp 384*2*4 dup v28.8h, w8 // bitdepth_max clz w8, w8 movi v30.4s, #1 sub w10, w8, #38 // -(bitdepth + 6) sub w11, w8, #11 // round_bits_v sub w8, w8, #25 // -round_bits_h neg w10, w10 // bitdepth + 6 neg w11, w11 // -round_bits_v dup v2.4s, w10 dup v29.4s, w8 // -round_bits_h dup v27.4s, w11 // -round_bits_v movi v31.8h, #0x20, lsl #8 // 1 << 13 = 8192 ushl v30.4s, v30.4s, v2.4s // 1 << (bitdepth + 6) zip1 v0.2d, v0.2d, v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1 // x11 - t4 // x12 - t3 // x13 - t2 // x14 - t1 // x15 - t0 mov x14, sp // t1 b.eq L(no_top_5) mov x16, x2 // backup left mov x2, #0 bl wiener_filter5_h_16bpc_neon add x3, x3, x1 // lpf += stride mov x11, x14 // t4 add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon add x3, x3, x1, lsl #2 add x3, x3, x1 // lpf += stride*5 mov x12, x14 // t3 add x14, x14, #384*2 // t1 += 384*2 mov x2, x16 // left mov x16, x3 // backup lpf mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon subs w5, w5, #1 // h-- mov x13, x14 // t2 b.eq L(v1_5) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x3, x3, x1 // src += stride L(main_5): mov x15, x11 // t0 = t4 L(main_loop_5): bl wiener_filter5_hv_16bpc_neon subs w5, w5, #1 // h-- b.ne L(main_loop_5) tst w7, #8 // LR_HAVE_BOTTOM b.eq L(v2_5) mov x3, x16 // restore lpf mov x2, #0 // left = NULL bl wiener_filter5_hv_16bpc_neon bl wiener_filter5_hv_16bpc_neon L(end_5): mov sp, x29 ldp d8, d9, [sp, #16] ldp x29, x30, [sp], #32 AARCH64_VALIDATE_LINK_REGISTER ret L(no_top_5): add x3, x3, x1, lsl #2 add x16, x3, x1, lsl #1 // lpf += stride*6, backup mov x3, x0 // lpf = p bl wiener_filter5_h_16bpc_neon subs w5, w5, #1 // h-- mov x11, x14 // t4 mov x12, x14 // t3 mov x13, x14 // t2 b.eq L(v1_5) add x3, x3, x1 // src += stride add x14, x14, #384*2 // t1 += 384*2 bl wiener_filter5_h_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x3, x3, x1 // src += stride add x15, x14, #384*2 // t0 = t1 + 384*2 bl wiener_filter5_hv_16bpc_neon subs w5, w5, #1 // h-- b.eq L(v2_5) add x15, x15, #384*2*3 // t0 += 384*2*3 bl wiener_filter5_hv_16bpc_neon subs w5, w5, #1 // h-- b.ne L(main_5) L(v2_5): bl wiener_filter5_v_16bpc_neon add x0, x0, x1 mov x11, x12 mov x12, x13 mov x13, x14 L(v1_5): bl wiener_filter5_v_16bpc_neon b L(end_5) endfunc function wiener_filter5_h_16bpc_neon stp x3, x4, [sp, #-32]! str x14, [sp, #16] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #4 ld1 {v2.8h, v3.8h}, [x3], #32 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v2.8h, v3.8h}, [x3], #32 ld1 {v4.d}[1], [x2], #8 // Move x3 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. 
sub x3, x3, #4 ext v3.16b, v2.16b, v3.16b, #12 ext v2.16b, v4.16b, v2.16b, #12 b 2f 1: ld1 {v2.8h, v3.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v3 to have 3x the first pixel at the front. dup v4.8h, v2.h[0] // Move x3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub x3, x3, #4 ext v3.16b, v2.16b, v3.16b, #12 ext v2.16b, v4.16b, v2.16b, #12 2: ld1 {v4.8h}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b bit v4.16b, v26.16b, v25.16b 4: // Loop horizontally // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. ext v16.16b, v2.16b, v3.16b, #2 ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v17.16b, v2.16b, v3.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v2.8h smull v6.4s, v17.4h, v0.h[3] smlal v6.4s, v18.4h, v0.h[2] smlal v6.4s, v19.4h, v0.h[1] smull2 v7.4s, v17.8h, v0.h[3] smlal2 v7.4s, v18.8h, v0.h[2] smlal2 v7.4s, v19.8h, v0.h[1] ext v16.16b, v3.16b, v4.16b, #2 ext v18.16b, v3.16b, v4.16b, #6 ext v19.16b, v3.16b, v4.16b, #8 ext v17.16b, v3.16b, v4.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v3.8h smull v16.4s, v17.4h, v0.h[3] smlal v16.4s, v18.4h, v0.h[2] smlal v16.4s, v19.4h, v0.h[1] smull2 v17.4s, v17.8h, v0.h[3] smlal2 v17.4s, v18.8h, v0.h[2] smlal2 v17.4s, v19.8h, v0.h[1] mvni v24.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s add v16.4s, v16.4s, v30.4s add v17.4s, v17.4s, v30.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s srshl v16.4s, v16.4s, v29.4s srshl v17.4s, v17.4s, v29.4s sqxtun v6.4h, v6.4s sqxtun2 v6.8h, v7.4s sqxtun v7.4h, v16.4s sqxtun2 v7.8h, v17.4s umin v6.8h, v6.8h, v24.8h umin v7.8h, v7.8h, v24.8h sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h subs w4, w4, #16 st1 {v6.8h, v7.8h}, [x14], #32 b.le 0f mov v2.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldr x14, [sp, #16] ldp x3, x4, [sp], #32 ret endfunc function wiener_filter5_v_16bpc_neon stp x11, x12, [sp, #-48]! 
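// Editorial note, not part of the original routine: the caller packed the
// coefficients with "zip1 v0.2d, v0.2d, v1.2d", so v0.h[1-3] hold the
// horizontal taps and v0.h[5-7] the vertical taps of the 5-tap kernel (the
// outer taps of the 7-tap layout are unused here). The chain below reads four
// intermediate rows and applies the two bottom-most taps to the row loaded
// from x14, i.e. the bottom two rows are treated as identical. A hedged scalar
// sketch of one output sample, with illustrative names only (mid[], fv[],
// round_bits_v are assumptions):
//
//   static inline int wiener5_v_sample(const int16_t *const mid[5], int x,
//                                      const int16_t fv[8], int round_bits_v,
//                                      int bitdepth_max) {
//       int sum = 0;
//       for (int t = 0; t < 5; t++)                 // taps 5,6,7,6,5 of v0.h[]
//           sum += fv[5 + (t < 3 ? t : 4 - t)] * mid[t][x];
//       int v = (sum + (1 << (round_bits_v - 1))) >> round_bits_v;  // srshl by v27
//       return v < 0 ? 0 : v > bitdepth_max ? bitdepth_max : v;     // sqxtun + umin
//   }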
stp x13, x14, [sp, #16] stp x0, x4, [sp, #32] 1: ld1 {v16.8h, v17.8h}, [x11], #32 ld1 {v18.8h, v19.8h}, [x12], #32 ld1 {v20.8h, v21.8h}, [x13], #32 ld1 {v22.8h, v23.8h}, [x14], #32 smull v2.4s, v16.4h, v0.h[5] smlal v2.4s, v18.4h, v0.h[6] smlal v2.4s, v20.4h, v0.h[7] smlal v2.4s, v22.4h, v0.h[6] smlal v2.4s, v22.4h, v0.h[5] smull2 v3.4s, v16.8h, v0.h[5] smlal2 v3.4s, v18.8h, v0.h[6] smlal2 v3.4s, v20.8h, v0.h[7] smlal2 v3.4s, v22.8h, v0.h[6] smlal2 v3.4s, v22.8h, v0.h[5] smull v4.4s, v17.4h, v0.h[5] smlal v4.4s, v19.4h, v0.h[6] smlal v4.4s, v21.4h, v0.h[7] smlal v4.4s, v23.4h, v0.h[6] smlal v4.4s, v23.4h, v0.h[5] smull2 v5.4s, v17.8h, v0.h[5] smlal2 v5.4s, v19.8h, v0.h[6] smlal2 v5.4s, v21.8h, v0.h[7] smlal2 v5.4s, v23.8h, v0.h[6] smlal2 v5.4s, v23.8h, v0.h[5] srshl v2.4s, v2.4s, v27.4s // -round_bits_v srshl v3.4s, v3.4s, v27.4s srshl v4.4s, v4.4s, v27.4s srshl v5.4s, v5.4s, v27.4s sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v28.8h // bitdepth_max umin v3.8h, v3.8h, v28.8h subs w4, w4, #16 st1 {v2.8h, v3.8h}, [x0], #32 b.gt 1b ldp x0, x4, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #48 ret endfunc function wiener_filter5_hv_16bpc_neon // Backing up/restoring registers shifted, so that x11 gets the value // of x12, etc, and x15==x11, afterwards. stp x12, x13, [sp, #-64]! stp x14, x15, [sp, #16] stp x12, x0, [sp, #32] stp x3, x4, [sp, #48] // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 1f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #4 ld1 {v2.8h, v3.8h}, [x3], #32 b 2f 0: // LR_HAVE_LEFT, left != NULL ld1 {v2.8h, v3.8h}, [x3], #32 ld1 {v4.d}[1], [x2], #8 // Move x3 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #4 ext v3.16b, v2.16b, v3.16b, #12 ext v2.16b, v4.16b, v2.16b, #12 b 2f 1: ld1 {v2.8h, v3.8h}, [x3], #32 // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v3 to have 2x the first pixel at the front. dup v4.8h, v2.h[0] // Move x3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub x3, x3, #4 ext v3.16b, v2.16b, v3.16b, #12 ext v2.16b, v4.16b, v2.16b, #12 2: ld1 {v4.8h}, [x3], #16 tst w7, #2 // LR_HAVE_RIGHT b.ne 4f 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w4, #18 b.ge 4f // If w >= 18, all used input pixels are valid // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel. sub w17, w4, #23 // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the // buffer pointer. 
movrel x6, right_ext_mask, -4 ldr h26, [x3, w17, sxtw #1] sub x6, x6, w4, uxtw #1 dup v26.8h, v26.h[0] ld1 {v23.16b, v24.16b, v25.16b}, [x6] bit v2.16b, v26.16b, v23.16b bit v3.16b, v26.16b, v24.16b bit v4.16b, v26.16b, v25.16b 4: // Loop horizontally ext v16.16b, v2.16b, v3.16b, #2 ext v18.16b, v2.16b, v3.16b, #6 ext v19.16b, v2.16b, v3.16b, #8 ext v17.16b, v2.16b, v3.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v2.8h smull v6.4s, v17.4h, v0.h[3] smlal v6.4s, v18.4h, v0.h[2] smlal v6.4s, v19.4h, v0.h[1] smull2 v7.4s, v17.8h, v0.h[3] smlal2 v7.4s, v18.8h, v0.h[2] smlal2 v7.4s, v19.8h, v0.h[1] ext v16.16b, v3.16b, v4.16b, #2 ext v18.16b, v3.16b, v4.16b, #6 ext v19.16b, v3.16b, v4.16b, #8 ext v17.16b, v3.16b, v4.16b, #4 add v18.8h, v18.8h, v16.8h add v19.8h, v19.8h, v3.8h smull v24.4s, v17.4h, v0.h[3] smlal v24.4s, v18.4h, v0.h[2] smlal v24.4s, v19.4h, v0.h[1] smull2 v25.4s, v17.8h, v0.h[3] smlal2 v25.4s, v18.8h, v0.h[2] smlal2 v25.4s, v19.8h, v0.h[1] ld1 {v16.8h, v17.8h}, [x11], #32 mvni v26.8h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1 add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s add v24.4s, v24.4s, v30.4s add v25.4s, v25.4s, v30.4s ld1 {v18.8h, v19.8h}, [x12], #32 srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s srshl v24.4s, v24.4s, v29.4s srshl v25.4s, v25.4s, v29.4s ld1 {v20.8h, v21.8h}, [x13], #32 sqxtun v6.4h, v6.4s sqxtun2 v6.8h, v7.4s sqxtun v7.4h, v24.4s sqxtun2 v7.8h, v25.4s ld1 {v22.8h, v23.8h}, [x14], #32 umin v6.8h, v6.8h, v26.8h umin v7.8h, v7.8h, v26.8h sub v6.8h, v6.8h, v31.8h sub v7.8h, v7.8h, v31.8h smull v8.4s, v16.4h, v0.h[5] smlal v8.4s, v18.4h, v0.h[6] smlal v8.4s, v20.4h, v0.h[7] smlal v8.4s, v22.4h, v0.h[6] smlal v8.4s, v6.4h, v0.h[5] smull2 v9.4s, v16.8h, v0.h[5] smlal2 v9.4s, v18.8h, v0.h[6] smlal2 v9.4s, v20.8h, v0.h[7] smlal2 v9.4s, v22.8h, v0.h[6] smlal2 v9.4s, v6.8h, v0.h[5] smull v1.4s, v17.4h, v0.h[5] smlal v1.4s, v19.4h, v0.h[6] smlal v1.4s, v21.4h, v0.h[7] smlal v1.4s, v23.4h, v0.h[6] smlal v1.4s, v7.4h, v0.h[5] smull2 v5.4s, v17.8h, v0.h[5] smlal2 v5.4s, v19.8h, v0.h[6] smlal2 v5.4s, v21.8h, v0.h[7] smlal2 v5.4s, v23.8h, v0.h[6] smlal2 v5.4s, v7.8h, v0.h[5] srshl v8.4s, v8.4s, v27.4s // -round_bits_v srshl v9.4s, v9.4s, v27.4s srshl v1.4s, v1.4s, v27.4s srshl v5.4s, v5.4s, v27.4s sqxtun v8.4h, v8.4s sqxtun2 v8.8h, v9.4s sqxtun v9.4h, v1.4s sqxtun2 v9.8h, v5.4s st1 {v6.8h, v7.8h}, [x15], #32 umin v8.8h, v8.8h, v28.8h // bitdepth_max umin v9.8h, v9.8h, v28.8h subs w4, w4, #16 st1 {v8.8h, v9.8h}, [x0], #32 b.le 0f mov v2.16b, v4.16b tst w7, #2 // LR_HAVE_RIGHT ld1 {v3.8h, v4.8h}, [x3], #32 b.ne 4b // If we don't need to pad, just keep filtering. b 3b // If we need to pad, check how many pixels we have left. 0: ldp x3, x4, [sp, #48] ldp x15, x0, [sp, #32] ldp x13, x14, [sp, #16] ldp x11, x12, [sp], #64 add x3, x3, x1 add x0, x0, x1 ret endfunc #define SUM_STRIDE (384+16) #include "looprestoration_tmpl.S" // void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_h_16bpc_neon, export=1 add w5, w5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add x10, x0, #(4*SUM_STRIDE) // sumsq add x11, x1, #(2*SUM_STRIDE) // sum add x12, x3, x4 // src lsl x4, x4, #1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. 
add w13, w5, #7 bic w13, w13, #7 sub x9, x9, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Subtract the number of pixels read from the input from the stride add w13, w13, #8 sub x4, x4, w13, uxtw #1 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 2f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #4 sub x12, x12, #4 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 2 pixels from the src pointer, // but shift it as if we had done that. add x4, x4, #4 1: // Loop vertically ld1 {v0.8h, v1.8h}, [x3], #32 ld1 {v16.8h, v17.8h}, [x12], #32 tst w7, #1 // LR_HAVE_LEFT b.eq 0f cbz x2, 2f // LR_HAVE_LEFT, left != NULL ld1 {v2.d}[1], [x2], #8 // Move x3/x12 back to account for the last 2 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #4 sub x12, x12, #4 ld1 {v18.d}[1], [x2], #8 ext v1.16b, v0.16b, v1.16b, #12 ext v0.16b, v2.16b, v0.16b, #12 ext v17.16b, v16.16b, v17.16b, #12 ext v16.16b, v18.16b, v16.16b, #12 b 2f 0: // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 2x the first pixel at the front. dup v2.8h, v0.h[0] dup v18.8h, v16.h[0] // Move x3 back to account for the last 2 pixels we loaded before, // which we shifted out. sub x3, x3, #4 sub x12, x12, #4 ext v1.16b, v0.16b, v1.16b, #12 ext v0.16b, v2.16b, v0.16b, #12 ext v17.16b, v16.16b, v17.16b, #12 ext v16.16b, v18.16b, v16.16b, #12 2: tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 2 + 1) ldr h30, [x3, w13, sxtw #1] ldr h31, [x12, w13, sxtw #1] // Fill v30/v31 with the right padding pixel dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w5, #10 b.ge 4f // If w >= 10, all used input pixels are valid // 1 <= w < 10, w pixels valid in v0-v1. For w=9, this ends up called // again; it's not strictly needed in those cases (we pad enough here), // but keeping the code as simple as possible. // Insert padding in v0/1.h[w] onwards movrel x13, right_ext_mask sub x13, x13, w5, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b bit v16.16b, v31.16b, v28.16b bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h add v7.8h, v16.8h, v28.8h umull v24.4s, v16.4h, v16.4h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h add v7.8h, v7.8h, v29.8h umull2 v25.4s, v16.8h, v16.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h subs w5, w5, #8 st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 
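// Editorial note, not part of the original routine: for each of the two rows
// handled per iteration, the loop above produces the horizontal 3-tap partial
// sums that the vertical pass (sgr_box3_v) later stacks into full 3x3 box
// sums. A hedged scalar sketch of one row, with illustrative names (src[],
// sum[] as 16-bit, sumsq[] as 32-bit; w is the widened width the function
// works on after the "w += 2" at entry):
//
//   for (int x = 0; x < w; x++) {
//       const int a = src[x], b = src[x + 1], c = src[x + 2];
//       sum[x]   = (int16_t)(a + b + c);            // st1 {v6.8h}/{v7.8h}
//       sumsq[x] = a*a + b*b + c*c;                 // st1 {v22.4s,v23.4s}/...
//   }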
9: subs w6, w6, #2 b.le 0f // Jump to the next row and loop horizontally add x0, x0, x9, lsl #1 add x10, x10, x9, lsl #1 add x1, x1, x9 add x11, x11, x9 add x3, x3, x4 add x12, x12, x4 mov w5, w8 b 1b 0: ret endfunc // void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum, // const pixel (*left)[4], // const pixel *src, const ptrdiff_t stride, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_h_16bpc_neon, export=1 add w5, w5, #2 // w += 2 // Set up pointers for reading/writing alternate rows add x10, x0, #(4*SUM_STRIDE) // sumsq add x11, x1, #(2*SUM_STRIDE) // sum add x12, x3, x4 // src lsl x4, x4, #1 mov x9, #(2*2*SUM_STRIDE) // double sum stride // Subtract the aligned width from the output stride. add w13, w5, #7 bic w13, w13, #7 sub x9, x9, w13, uxtw #1 add w13, w13, #8 sub x4, x4, w13, uxtw #1 // Store the width for the vertical loop mov w8, w5 // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL tst w7, #1 // LR_HAVE_LEFT b.eq 2f // LR_HAVE_LEFT cbnz x2, 0f // left == NULL sub x3, x3, #6 sub x12, x12, #6 b 1f 0: // LR_HAVE_LEFT, left != NULL 2: // !LR_HAVE_LEFT, increase the stride. // For this case we don't read the left 3 pixels from the src pointer, // but shift it as if we had done that. add x4, x4, #6 1: // Loop vertically ld1 {v0.8h, v1.8h}, [x3], #32 ld1 {v16.8h, v17.8h}, [x12], #32 tst w7, #1 // LR_HAVE_LEFT b.eq 0f cbz x2, 2f // LR_HAVE_LEFT, left != NULL ld1 {v2.d}[1], [x2], #8 // Move x3/x12 back to account for the last 3 pixels we loaded earlier, // which we'll shift out. sub x3, x3, #6 sub x12, x12, #6 ld1 {v18.d}[1], [x2], #8 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 ext v17.16b, v16.16b, v17.16b, #10 ext v16.16b, v18.16b, v16.16b, #10 b 2f 0: // !LR_HAVE_LEFT, fill v2 with the leftmost pixel // and shift v0/v1 to have 3x the first pixel at the front. dup v2.8h, v0.h[0] dup v18.8h, v16.h[0] // Move x3 back to account for the last 3 pixels we loaded before, // which we shifted out. sub x3, x3, #6 sub x12, x12, #6 ext v1.16b, v0.16b, v1.16b, #10 ext v0.16b, v2.16b, v0.16b, #10 ext v17.16b, v16.16b, v17.16b, #10 ext v16.16b, v18.16b, v16.16b, #10 2: tst w7, #2 // LR_HAVE_RIGHT b.ne 4f // If we'll need to pad the right edge, load that pixel to pad with // here since we can find it pretty easily from here. sub w13, w5, #(2 + 16 - 3 + 1) ldr h30, [x3, w13, sxtw #1] ldr h31, [x12, w13, sxtw #1] // Fill v30/v31 with the right padding pixel dup v30.8h, v30.h[0] dup v31.8h, v31.h[0] 3: // !LR_HAVE_RIGHT // Check whether we need to pad the right edge cmp w5, #11 b.ge 4f // If w >= 11, all used input pixels are valid // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10, // this ends up called again; it's not strictly needed in those // cases (we pad enough here), but keeping the code as simple as possible. // Insert padding in v0/1.h[w+1] onwards; fuse the +1 into the // buffer pointer. 
movrel x13, right_ext_mask, -2 sub x13, x13, w5, uxtw #1 ld1 {v28.16b, v29.16b}, [x13] bit v0.16b, v30.16b, v28.16b bit v1.16b, v30.16b, v29.16b bit v16.16b, v31.16b, v28.16b bit v17.16b, v31.16b, v29.16b 4: // Loop horizontally ext v26.16b, v0.16b, v1.16b, #2 ext v28.16b, v16.16b, v17.16b, #2 ext v27.16b, v0.16b, v1.16b, #4 ext v29.16b, v16.16b, v17.16b, #4 add v6.8h, v0.8h, v26.8h umull v22.4s, v0.4h, v0.4h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h add v7.8h, v16.8h, v28.8h umull v24.4s, v16.4h, v16.4h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umull2 v23.4s, v0.8h, v0.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h add v7.8h, v7.8h, v29.8h umull2 v25.4s, v16.8h, v16.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h ext v26.16b, v0.16b, v1.16b, #6 ext v28.16b, v16.16b, v17.16b, #6 ext v27.16b, v0.16b, v1.16b, #8 ext v29.16b, v16.16b, v17.16b, #8 add v6.8h, v6.8h, v26.8h umlal v22.4s, v26.4h, v26.4h umlal v22.4s, v27.4h, v27.4h add v7.8h, v7.8h, v28.8h umlal v24.4s, v28.4h, v28.4h umlal v24.4s, v29.4h, v29.4h add v6.8h, v6.8h, v27.8h umlal2 v23.4s, v26.8h, v26.8h umlal2 v23.4s, v27.8h, v27.8h add v7.8h, v7.8h, v29.8h umlal2 v25.4s, v28.8h, v28.8h umlal2 v25.4s, v29.8h, v29.8h subs w5, w5, #8 st1 {v6.8h}, [x1], #16 st1 {v7.8h}, [x11], #16 st1 {v22.4s,v23.4s}, [x0], #32 st1 {v24.4s,v25.4s}, [x10], #32 b.le 9f tst w7, #2 // LR_HAVE_RIGHT mov v0.16b, v1.16b mov v16.16b, v17.16b ld1 {v1.8h}, [x3], #16 ld1 {v17.8h}, [x12], #16 b.ne 4b // If we don't need to pad, just keep summing. b 3b // If we need to pad, check how many pixels we have left. 9: subs w6, w6, #2 b.le 0f // Jump to the next row and loop horizontally add x0, x0, x9, lsl #1 add x10, x10, x9, lsl #1 add x1, x1, x9 add x11, x11, x9 add x3, x3, x4 add x12, x12, x4 mov w5, w8 b 1b 0: ret endfunc sgr_funcs 16 rav1e-0.7.1/src/arm/64/looprestoration_common.S000064400000000000000000000412161046102023000174110ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #define SUM_STRIDE (384+16) // void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box3_v_neon, export=1 add w10, w3, #2 // Number of output rows to move back mov w11, w3 // Number of input rows to move back add w2, w2, #2 // Actual summed width mov x7, #(4*SUM_STRIDE) // sumsq stride mov x8, #(2*SUM_STRIDE) // sum stride sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride sub x1, x1, #(2*SUM_STRIDE) // sum -= stride tst w4, #4 // LR_HAVE_TOP b.eq 0f // If have top, read from row -2. sub x5, x0, #(4*SUM_STRIDE) sub x6, x1, #(2*SUM_STRIDE) add w11, w11, #2 b 1f 0: // !LR_HAVE_TOP // If we don't have top, read from row 0 even if // we start writing to row -1. add x5, x0, #(4*SUM_STRIDE) add x6, x1, #(2*SUM_STRIDE) 1: tst w4, #8 // LR_HAVE_BOTTOM b.eq 1f // LR_HAVE_BOTTOM add w3, w3, #2 // Sum all h+2 lines with the main loop add w11, w11, #2 1: mov w9, w3 // Backup of h for next loops 1: // Start of horizontal loop; start one vertical filter slice. // Start loading rows into v16-v21 and v24-v26 taking top // padding into consideration. tst w4, #4 // LR_HAVE_TOP ld1 {v16.4s, v17.4s}, [x5], x7 ld1 {v24.8h}, [x6], x8 b.eq 2f // LR_HAVE_TOP ld1 {v18.4s, v19.4s}, [x5], x7 ld1 {v25.8h}, [x6], x8 ld1 {v20.4s, v21.4s}, [x5], x7 ld1 {v26.8h}, [x6], x8 b 3f 2: // !LR_HAVE_TOP mov v18.16b, v16.16b mov v19.16b, v17.16b mov v25.16b, v24.16b mov v20.16b, v16.16b mov v21.16b, v17.16b mov v26.16b, v24.16b 3: subs w3, w3, #1 .macro add3 add v16.4s, v16.4s, v18.4s add v17.4s, v17.4s, v19.4s add v24.8h, v24.8h, v25.8h add v16.4s, v16.4s, v20.4s add v17.4s, v17.4s, v21.4s add v24.8h, v24.8h, v26.8h st1 {v16.4s, v17.4s}, [x0], x7 st1 {v24.8h}, [x1], x8 .endm add3 mov v16.16b, v18.16b mov v17.16b, v19.16b mov v24.16b, v25.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v25.16b, v26.16b b.le 4f ld1 {v20.4s, v21.4s}, [x5], x7 ld1 {v26.8h}, [x6], x8 b 3b 4: tst w4, #8 // LR_HAVE_BOTTOM b.ne 5f // !LR_HAVE_BOTTOM // Produce two more rows, extending the already loaded rows. add3 mov v16.16b, v18.16b mov v17.16b, v19.16b mov v24.16b, v25.16b add3 5: // End of one vertical slice. subs w2, w2, #8 b.le 0f // Move pointers back up to the top and loop horizontally. // Input pointers msub x5, x7, x11, x5 msub x6, x8, x11, x6 // Output pointers msub x0, x7, x10, x0 msub x1, x8, x10, x1 add x0, x0, #32 add x1, x1, #16 add x5, x5, #32 add x6, x6, #16 mov w3, w9 b 1b 0: ret .purgem add3 endfunc // void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum, // const int w, const int h, // const enum LrEdgeFlags edges); function sgr_box5_v_neon, export=1 add w10, w3, #2 // Number of output rows to move back mov w11, w3 // Number of input rows to move back add w2, w2, #8 // Actual summed width mov x7, #(4*SUM_STRIDE) // sumsq stride mov x8, #(2*SUM_STRIDE) // sum stride sub x0, x0, #(4*SUM_STRIDE) // sumsq -= stride sub x1, x1, #(2*SUM_STRIDE) // sum -= stride tst w4, #4 // LR_HAVE_TOP b.eq 0f // If have top, read from row -2. sub x5, x0, #(4*SUM_STRIDE) sub x6, x1, #(2*SUM_STRIDE) add w11, w11, #2 b 1f 0: // !LR_HAVE_TOP // If we don't have top, read from row 0 even if // we start writing to row -1. 
add x5, x0, #(4*SUM_STRIDE) add x6, x1, #(2*SUM_STRIDE) 1: tst w4, #8 // LR_HAVE_BOTTOM b.eq 0f // LR_HAVE_BOTTOM add w3, w3, #2 // Handle h+2 lines with the main loop add w11, w11, #2 b 1f 0: // !LR_HAVE_BOTTOM sub w3, w3, #1 // Handle h-1 lines with the main loop 1: mov w9, w3 // Backup of h for next loops 1: // Start of horizontal loop; start one vertical filter slice. // Start loading rows into v16-v25 and v26-v30 taking top // padding into consideration. tst w4, #4 // LR_HAVE_TOP ld1 {v16.4s, v17.4s}, [x5], x7 ld1 {v26.8h}, [x6], x8 b.eq 2f // LR_HAVE_TOP ld1 {v20.4s, v21.4s}, [x5], x7 ld1 {v28.8h}, [x6], x8 mov v18.16b, v16.16b mov v19.16b, v17.16b mov v27.16b, v26.16b ld1 {v22.4s, v23.4s}, [x5], x7 ld1 {v29.8h}, [x6], x8 b 3f 2: // !LR_HAVE_TOP mov v18.16b, v16.16b mov v19.16b, v17.16b mov v27.16b, v26.16b mov v20.16b, v16.16b mov v21.16b, v17.16b mov v28.16b, v26.16b mov v22.16b, v16.16b mov v23.16b, v17.16b mov v29.16b, v26.16b 3: cbz w3, 4f ld1 {v24.4s, v25.4s}, [x5], x7 ld1 {v30.8h}, [x6], x8 3: // Start of vertical loop subs w3, w3, #2 .macro add5 add v16.4s, v16.4s, v18.4s add v17.4s, v17.4s, v19.4s add v26.8h, v26.8h, v27.8h add v0.4s, v20.4s, v22.4s add v1.4s, v21.4s, v23.4s add v2.8h, v28.8h, v29.8h add v16.4s, v16.4s, v24.4s add v17.4s, v17.4s, v25.4s add v26.8h, v26.8h, v30.8h add v16.4s, v16.4s, v0.4s add v17.4s, v17.4s, v1.4s add v26.8h, v26.8h, v2.8h st1 {v16.4s, v17.4s}, [x0], x7 st1 {v26.8h}, [x1], x8 .endm add5 .macro shift2 mov v16.16b, v20.16b mov v17.16b, v21.16b mov v26.16b, v28.16b mov v18.16b, v22.16b mov v19.16b, v23.16b mov v27.16b, v29.16b mov v20.16b, v24.16b mov v21.16b, v25.16b mov v28.16b, v30.16b .endm shift2 add x0, x0, x7 add x1, x1, x8 b.le 5f ld1 {v22.4s, v23.4s}, [x5], x7 ld1 {v29.8h}, [x6], x8 ld1 {v24.4s, v25.4s}, [x5], x7 ld1 {v30.8h}, [x6], x8 b 3b 4: // h == 1, !LR_HAVE_BOTTOM. // Pad the last row with the only content row, and add. mov v24.16b, v22.16b mov v25.16b, v23.16b mov v30.16b, v29.16b add5 shift2 add x0, x0, x7 add x1, x1, x8 add5 b 6f 5: tst w4, #8 // LR_HAVE_BOTTOM b.ne 6f // !LR_HAVE_BOTTOM cbnz w3, 5f // The intended three edge rows left; output the one at h-2 and // the past edge one at h. ld1 {v22.4s, v23.4s}, [x5], x7 ld1 {v29.8h}, [x6], x8 // Pad the past-edge row from the last content row. mov v24.16b, v22.16b mov v25.16b, v23.16b mov v30.16b, v29.16b add5 shift2 add x0, x0, x7 add x1, x1, x8 // The last two rows are already padded properly here. add5 b 6f 5: // w3 == -1, two rows left, output one. // Pad the last two rows from the mid one. mov v22.16b, v20.16b mov v23.16b, v21.16b mov v29.16b, v28.16b mov v24.16b, v20.16b mov v25.16b, v21.16b mov v30.16b, v28.16b add5 add x0, x0, x7 add x1, x1, x8 b 6f 6: // End of one vertical slice. subs w2, w2, #8 b.le 0f // Move pointers back up to the top and loop horizontally. 
// Input pointers msub x5, x7, x11, x5 msub x6, x8, x11, x6 // Output pointers msub x0, x7, x10, x0 msub x1, x8, x10, x1 add x0, x0, #32 add x1, x1, #16 add x5, x5, #32 add x6, x6, #16 mov w3, w9 b 1b 0: ret .purgem add5 endfunc // void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b, // const int w, const int h, const int strength, // const int bitdepth_max); // void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b, // const int w, const int h, const int strength, // const int bitdepth_max); function sgr_calc_ab1_neon, export=1 clz w9, w5 add x3, x3, #2 // h += 2 movi v31.4s, #9 // n mov x5, #455 mov x8, #SUM_STRIDE b sgr_calc_ab_neon endfunc function sgr_calc_ab2_neon, export=1 clz w9, w5 add x3, x3, #3 // h += 3 asr x3, x3, #1 // h /= 2 movi v31.4s, #25 // n mov x5, #164 mov x8, #(2*SUM_STRIDE) endfunc function sgr_calc_ab_neon sub w9, w9, #24 // -bitdepth_min_8 movrel x12, X(sgr_x_by_x) ld1 {v16.16b, v17.16b, v18.16b}, [x12] dup v6.8h, w9 // -bitdepth_min_8 movi v19.16b, #5 movi v20.8b, #55 // idx of last 5 movi v21.8b, #72 // idx of last 4 movi v22.8b, #101 // idx of last 3 movi v23.8b, #169 // idx of last 2 movi v24.8b, #254 // idx of last 1 saddl v7.4s, v6.4h, v6.4h // -2*bitdepth_min_8 add x2, x2, #2 // w += 2 add x7, x2, #7 bic x7, x7, #7 // aligned w sub x7, x8, x7 // increment between rows movi v29.8h, #1, lsl #8 dup v28.4s, w4 dup v30.4s, w5 // one_by_x sub x0, x0, #(4*(SUM_STRIDE)) sub x1, x1, #(2*(SUM_STRIDE)) mov x6, x2 // backup of w sub v16.16b, v16.16b, v19.16b sub v17.16b, v17.16b, v19.16b sub v18.16b, v18.16b, v19.16b 1: subs x2, x2, #8 ld1 {v0.4s, v1.4s}, [x0] // a ld1 {v2.8h}, [x1] // b srshl v0.4s, v0.4s, v7.4s srshl v1.4s, v1.4s, v7.4s srshl v4.8h, v2.8h, v6.8h mul v0.4s, v0.4s, v31.4s // a * n mul v1.4s, v1.4s, v31.4s // a * n umull v3.4s, v4.4h, v4.4h // b * b umull2 v4.4s, v4.8h, v4.8h // b * b uqsub v0.4s, v0.4s, v3.4s // imax(a * n - b * b, 0) uqsub v1.4s, v1.4s, v4.4s // imax(a * n - b * b, 0) mul v0.4s, v0.4s, v28.4s // p * s mul v1.4s, v1.4s, v28.4s // p * s uqshrn v0.4h, v0.4s, #16 uqshrn2 v0.8h, v1.4s, #16 uqrshrn v0.8b, v0.8h, #4 // imin(z, 255) cmhi v25.8b, v0.8b, v20.8b // = -1 if sgr_x_by_x[v0] < 5 cmhi v26.8b, v0.8b, v21.8b // = -1 if sgr_x_by_x[v0] < 4 tbl v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b cmhi v27.8b, v0.8b, v22.8b // = -1 if sgr_x_by_x[v0] < 3 cmhi v4.8b, v0.8b, v23.8b // = -1 if sgr_x_by_x[v0] < 2 add v25.8b, v25.8b, v26.8b cmhi v5.8b, v0.8b, v24.8b // = -1 if sgr_x_by_x[v0] < 1 add v27.8b, v27.8b, v4.8b add v5.8b, v5.8b, v19.8b add v25.8b, v25.8b, v27.8b add v1.8b, v1.8b, v5.8b add v1.8b, v1.8b, v25.8b uxtl v1.8h, v1.8b // x umull v3.4s, v1.4h, v2.4h // x * BB[i] umull2 v4.4s, v1.8h, v2.8h // x * BB[i] mul v3.4s, v3.4s, v30.4s // x * BB[i] * sgr_one_by_x mul v4.4s, v4.4s, v30.4s // x * BB[i] * sgr_one_by_x srshr v3.4s, v3.4s, #12 // AA[i] srshr v4.4s, v4.4s, #12 // AA[i] sub v2.8h, v29.8h, v1.8h // 256 - x st1 {v3.4s, v4.4s}, [x0], #32 st1 {v2.8h}, [x1], #16 b.gt 1b subs x3, x3, #1 b.le 0f add x0, x0, x7, lsl #2 add x1, x1, x7, lsl #1 mov x2, x6 b 1b 0: ret endfunc rav1e-0.7.1/src/arm/64/looprestoration_tmpl.S000064400000000000000000000573041046102023000171020ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #define FILTER_OUT_STRIDE 384 .macro sgr_funcs bpc // void dav1d_sgr_finish_filter1_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); function sgr_finish_filter1_\bpc\()bpc_neon, export=1 sub x7, x3, #(4*SUM_STRIDE) add x8, x3, #(4*SUM_STRIDE) sub x9, x4, #(2*SUM_STRIDE) add x10, x4, #(2*SUM_STRIDE) mov x11, #SUM_STRIDE mov x12, #FILTER_OUT_STRIDE add x13, x5, #7 bic x13, x13, #7 // Aligned width .if \bpc == 8 sub x2, x2, x13 .else sub x2, x2, x13, lsl #1 .endif sub x12, x12, x13 sub x11, x11, x13 sub x11, x11, #4 // We read 4 extra elements from a sub x14, x11, #4 // We read 8 extra elements from b mov x13, x5 movi v6.8h, #3 movi v7.4s, #3 1: ld1 {v0.8h, v1.8h}, [x9], #32 ld1 {v2.8h, v3.8h}, [x4], #32 ld1 {v4.8h, v5.8h}, [x10], #32 ld1 {v16.4s, v17.4s, v18.4s}, [x7], #48 ld1 {v19.4s, v20.4s, v21.4s}, [x3], #48 ld1 {v22.4s, v23.4s, v24.4s}, [x8], #48 2: subs x5, x5, #8 ext v25.16b, v0.16b, v1.16b, #2 // -stride ext v26.16b, v2.16b, v3.16b, #2 // 0 ext v27.16b, v4.16b, v5.16b, #2 // +stride ext v28.16b, v0.16b, v1.16b, #4 // +1-stride ext v29.16b, v2.16b, v3.16b, #4 // +1 ext v30.16b, v4.16b, v5.16b, #4 // +1+stride add v2.8h, v2.8h, v25.8h // -1, -stride add v26.8h, v26.8h, v27.8h // 0, +stride add v0.8h, v0.8h, v28.8h // -1-stride, +1-stride add v2.8h, v2.8h, v26.8h add v4.8h, v4.8h, v30.8h // -1+stride, +1+stride add v2.8h, v2.8h, v29.8h // +1 add v0.8h, v0.8h, v4.8h ext v25.16b, v16.16b, v17.16b, #4 // -stride ext v26.16b, v17.16b, v18.16b, #4 shl v2.8h, v2.8h, #2 ext v27.16b, v16.16b, v17.16b, #8 // +1-stride ext v28.16b, v17.16b, v18.16b, #8 ext v29.16b, v19.16b, v20.16b, #4 // 0 ext v30.16b, v20.16b, v21.16b, #4 mla v2.8h, v0.8h, v6.8h // * 3 -> a add v25.4s, v25.4s, v19.4s // -stride, -1 add v26.4s, v26.4s, v20.4s add v16.4s, v16.4s, v27.4s // -1-stride, +1-stride add v17.4s, v17.4s, v28.4s ext v27.16b, v19.16b, v20.16b, #8 // +1 ext v28.16b, v20.16b, v21.16b, #8 add v16.4s, v16.4s, v22.4s // -1+stride add v17.4s, v17.4s, v23.4s add v29.4s, v29.4s, v27.4s // 0, +1 add v30.4s, v30.4s, v28.4s add v25.4s, v25.4s, v29.4s add v26.4s, v26.4s, v30.4s ext v27.16b, v22.16b, v23.16b, #4 // +stride ext v28.16b, v23.16b, v24.16b, #4 ext v29.16b, v22.16b, v23.16b, #8 // +1+stride ext v30.16b, v23.16b, v24.16b, #8 .if \bpc == 8 ld1 {v19.8b}, [x1], #8 // src .else ld1 {v19.8h}, [x1], #16 
// src .endif add v25.4s, v25.4s, v27.4s // +stride add v26.4s, v26.4s, v28.4s add v16.4s, v16.4s, v29.4s // +1+stride add v17.4s, v17.4s, v30.4s shl v25.4s, v25.4s, #2 shl v26.4s, v26.4s, #2 mla v25.4s, v16.4s, v7.4s // * 3 -> b mla v26.4s, v17.4s, v7.4s .if \bpc == 8 uxtl v19.8h, v19.8b // src .endif mov v0.16b, v1.16b umlal v25.4s, v2.4h, v19.4h // b + a * src umlal2 v26.4s, v2.8h, v19.8h mov v2.16b, v3.16b rshrn v25.4h, v25.4s, #9 rshrn2 v25.8h, v26.4s, #9 mov v4.16b, v5.16b st1 {v25.8h}, [x0], #16 b.le 3f mov v16.16b, v18.16b mov v19.16b, v21.16b mov v22.16b, v24.16b ld1 {v1.8h}, [x9], #16 ld1 {v3.8h}, [x4], #16 ld1 {v5.8h}, [x10], #16 ld1 {v17.4s, v18.4s}, [x7], #32 ld1 {v20.4s, v21.4s}, [x3], #32 ld1 {v23.4s, v24.4s}, [x8], #32 b 2b 3: subs x6, x6, #1 b.le 0f mov x5, x13 add x0, x0, x12, lsl #1 add x1, x1, x2 add x3, x3, x11, lsl #2 add x7, x7, x11, lsl #2 add x8, x8, x11, lsl #2 add x4, x4, x14, lsl #1 add x9, x9, x14, lsl #1 add x10, x10, x14, lsl #1 b 1b 0: ret endfunc // void dav1d_sgr_finish_filter2_Xbpc_neon(int16_t *tmp, // const pixel *src, const ptrdiff_t stride, // const int32_t *a, const int16_t *b, // const int w, const int h); function sgr_finish_filter2_\bpc\()bpc_neon, export=1 add x7, x3, #(4*(SUM_STRIDE)) sub x3, x3, #(4*(SUM_STRIDE)) add x8, x4, #(2*(SUM_STRIDE)) sub x4, x4, #(2*(SUM_STRIDE)) mov x9, #(2*SUM_STRIDE) mov x10, #FILTER_OUT_STRIDE add x11, x5, #7 bic x11, x11, #7 // Aligned width .if \bpc == 8 sub x2, x2, x11 .else sub x2, x2, x11, lsl #1 .endif sub x10, x10, x11 sub x9, x9, x11 sub x9, x9, #4 // We read 4 extra elements from a sub x12, x9, #4 // We read 8 extra elements from b mov x11, x5 movi v4.8h, #5 movi v5.4s, #5 movi v6.8h, #6 movi v7.4s, #6 1: ld1 {v0.8h, v1.8h}, [x4], #32 ld1 {v2.8h, v3.8h}, [x8], #32 ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 ld1 {v19.4s, v20.4s, v21.4s}, [x7], #48 2: subs x5, x5, #8 ext v24.16b, v0.16b, v1.16b, #4 // +1-stride ext v25.16b, v2.16b, v3.16b, #4 // +1+stride ext v22.16b, v0.16b, v1.16b, #2 // -stride ext v23.16b, v2.16b, v3.16b, #2 // +stride add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride add v2.8h, v22.8h, v23.8h // -stride, +stride add v0.8h, v0.8h, v25.8h ext v22.16b, v16.16b, v17.16b, #4 // -stride ext v23.16b, v17.16b, v18.16b, #4 ext v24.16b, v19.16b, v20.16b, #4 // +stride ext v25.16b, v20.16b, v21.16b, #4 ext v26.16b, v16.16b, v17.16b, #8 // +1-stride ext v27.16b, v17.16b, v18.16b, #8 ext v28.16b, v19.16b, v20.16b, #8 // +1+stride ext v29.16b, v20.16b, v21.16b, #8 mul v0.8h, v0.8h, v4.8h // * 5 mla v0.8h, v2.8h, v6.8h // * 6 .if \bpc == 8 ld1 {v31.8b}, [x1], #8 .else ld1 {v31.8h}, [x1], #16 .endif add v16.4s, v16.4s, v26.4s // -1-stride, +1-stride add v17.4s, v17.4s, v27.4s add v19.4s, v19.4s, v28.4s // -1+stride, +1+stride add v20.4s, v20.4s, v29.4s add v16.4s, v16.4s, v19.4s add v17.4s, v17.4s, v20.4s add v22.4s, v22.4s, v24.4s // -stride, +stride add v23.4s, v23.4s, v25.4s // This is, surprisingly, faster than other variants where the // mul+mla pairs are further apart, on Cortex A53. 
mul v16.4s, v16.4s, v5.4s // * 5 mla v16.4s, v22.4s, v7.4s // * 6 mul v17.4s, v17.4s, v5.4s // * 5 mla v17.4s, v23.4s, v7.4s // * 6 .if \bpc == 8 uxtl v31.8h, v31.8b .endif umlal v16.4s, v0.4h, v31.4h // b + a * src umlal2 v17.4s, v0.8h, v31.8h mov v0.16b, v1.16b rshrn v16.4h, v16.4s, #9 rshrn2 v16.8h, v17.4s, #9 mov v2.16b, v3.16b st1 {v16.8h}, [x0], #16 b.le 3f mov v16.16b, v18.16b mov v19.16b, v21.16b ld1 {v1.8h}, [x4], #16 ld1 {v3.8h}, [x8], #16 ld1 {v17.4s, v18.4s}, [x3], #32 ld1 {v20.4s, v21.4s}, [x7], #32 b 2b 3: subs x6, x6, #1 b.le 0f mov x5, x11 add x0, x0, x10, lsl #1 add x1, x1, x2 add x3, x3, x9, lsl #2 add x7, x7, x9, lsl #2 add x4, x4, x12, lsl #1 add x8, x8, x12, lsl #1 mov x13, x3 mov x14, x4 ld1 {v0.8h, v1.8h}, [x4], #32 ld1 {v16.4s, v17.4s, v18.4s}, [x3], #48 4: subs x5, x5, #8 ext v23.16b, v0.16b, v1.16b, #4 // +1 ext v22.16b, v0.16b, v1.16b, #2 // 0 add v0.8h, v0.8h, v23.8h // -1, +1 ext v24.16b, v16.16b, v17.16b, #4 // 0 ext v25.16b, v17.16b, v18.16b, #4 ext v26.16b, v16.16b, v17.16b, #8 // +1 ext v27.16b, v17.16b, v18.16b, #8 mul v2.8h, v22.8h, v6.8h // * 6 mla v2.8h, v0.8h, v4.8h // * 5 -> a .if \bpc == 8 ld1 {v31.8b}, [x1], #8 .else ld1 {v31.8h}, [x1], #16 .endif add v16.4s, v16.4s, v26.4s // -1, +1 add v17.4s, v17.4s, v27.4s .if \bpc == 8 uxtl v31.8h, v31.8b .endif // This is, surprisingly, faster than other variants where the // mul+mla pairs are further apart, on Cortex A53. mul v24.4s, v24.4s, v7.4s // * 6 mla v24.4s, v16.4s, v5.4s // * 5 -> b mul v25.4s, v25.4s, v7.4s // * 6 mla v25.4s, v17.4s, v5.4s // * 5 -> b umlal v24.4s, v2.4h, v31.4h // b + a * src umlal2 v25.4s, v2.8h, v31.8h mov v0.16b, v1.16b rshrn v24.4h, v24.4s, #8 rshrn2 v24.8h, v25.4s, #8 mov v16.16b, v18.16b st1 {v24.8h}, [x0], #16 b.le 5f ld1 {v1.8h}, [x4], #16 ld1 {v17.4s, v18.4s}, [x3], #32 b 4b 5: subs x6, x6, #1 b.le 0f mov x5, x11 add x0, x0, x10, lsl #1 add x1, x1, x2 mov x3, x13 // Rewind x3/x4 to where they started mov x4, x14 b 1b 0: ret endfunc // void dav1d_sgr_weighted1_Xbpc_neon(pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int w, const int h, // const int wt, const int bitdepth_max); function sgr_weighted1_\bpc\()bpc_neon, export=1 .if \bpc == 16 ldr w8, [sp] .endif dup v31.8h, w7 cmp x6, #2 .if \bpc == 16 dup v30.8h, w8 .endif add x9, x0, x1 add x10, x2, x3 add x11, x4, #2*FILTER_OUT_STRIDE mov x7, #(4*FILTER_OUT_STRIDE) lsl x1, x1, #1 lsl x3, x3, #1 add x8, x5, #7 bic x8, x8, #7 // Aligned width .if \bpc == 8 sub x1, x1, x8 sub x3, x3, x8 .else sub x1, x1, x8, lsl #1 sub x3, x3, x8, lsl #1 .endif sub x7, x7, x8, lsl #1 mov x8, x5 b.lt 2f 1: .if \bpc == 8 ld1 {v0.8b}, [x2], #8 ld1 {v4.8b}, [x10], #8 .else ld1 {v0.8h}, [x2], #16 ld1 {v4.8h}, [x10], #16 .endif ld1 {v1.8h}, [x4], #16 ld1 {v5.8h}, [x11], #16 subs x5, x5, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v4.8h, v4.8b, #4 // u .else shl v0.8h, v0.8h, #4 // u shl v4.8h, v4.8h, #4 // u .endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v5.8h, v5.8h, v4.8h // t1 - u ushll v2.4s, v0.4h, #7 // u << 7 ushll2 v3.4s, v0.8h, #7 // u << 7 ushll v6.4s, v4.4h, #7 // u << 7 ushll2 v7.4s, v4.8h, #7 // u << 7 smlal v2.4s, v1.4h, v31.4h // v smlal2 v3.4s, v1.8h, v31.8h // v smlal v6.4s, v5.4h, v31.4h // v smlal2 v7.4s, v5.8h, v31.8h // v .if \bpc == 8 rshrn v2.4h, v2.4s, #11 rshrn2 v2.8h, v3.4s, #11 rshrn v6.4h, v6.4s, #11 rshrn2 v6.8h, v7.4s, #11 sqxtun v2.8b, v2.8h sqxtun v6.8b, v6.8h st1 {v2.8b}, [x0], #8 st1 {v6.8b}, [x9], #8 .else sqrshrun v2.4h, v2.4s, #11 
sqrshrun2 v2.8h, v3.4s, #11 sqrshrun v6.4h, v6.4s, #11 sqrshrun2 v6.8h, v7.4s, #11 umin v2.8h, v2.8h, v30.8h umin v6.8h, v6.8h, v30.8h st1 {v2.8h}, [x0], #16 st1 {v6.8h}, [x9], #16 .endif b.gt 1b sub x6, x6, #2 cmp x6, #1 b.lt 0f mov x5, x8 add x0, x0, x1 add x9, x9, x1 add x2, x2, x3 add x10, x10, x3 add x4, x4, x7 add x11, x11, x7 b.eq 2f b 1b 2: .if \bpc == 8 ld1 {v0.8b}, [x2], #8 .else ld1 {v0.8h}, [x2], #16 .endif ld1 {v1.8h}, [x4], #16 subs x5, x5, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u .else shl v0.8h, v0.8h, #4 // u .endif sub v1.8h, v1.8h, v0.8h // t1 - u ushll v2.4s, v0.4h, #7 // u << 7 ushll2 v3.4s, v0.8h, #7 // u << 7 smlal v2.4s, v1.4h, v31.4h // v smlal2 v3.4s, v1.8h, v31.8h // v .if \bpc == 8 rshrn v2.4h, v2.4s, #11 rshrn2 v2.8h, v3.4s, #11 sqxtun v2.8b, v2.8h st1 {v2.8b}, [x0], #8 .else sqrshrun v2.4h, v2.4s, #11 sqrshrun2 v2.8h, v3.4s, #11 umin v2.8h, v2.8h, v30.8h st1 {v2.8h}, [x0], #16 .endif b.gt 2b 0: ret endfunc // void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *t1, const int16_t *t2, // const int w, const int h, // const int16_t wt[2], const int bitdepth_max); function sgr_weighted2_\bpc\()bpc_neon, export=1 .if \bpc == 8 ldr x8, [sp] .else ldp x8, x9, [sp] .endif cmp x7, #2 add x10, x0, x1 add x11, x2, x3 add x12, x4, #2*FILTER_OUT_STRIDE add x13, x5, #2*FILTER_OUT_STRIDE ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1] .if \bpc == 16 dup v29.8h, w9 .endif mov x8, #4*FILTER_OUT_STRIDE lsl x1, x1, #1 lsl x3, x3, #1 add x9, x6, #7 bic x9, x9, #7 // Aligned width .if \bpc == 8 sub x1, x1, x9 sub x3, x3, x9 .else sub x1, x1, x9, lsl #1 sub x3, x3, x9, lsl #1 .endif sub x8, x8, x9, lsl #1 mov x9, x6 b.lt 2f 1: .if \bpc == 8 ld1 {v0.8b}, [x2], #8 ld1 {v16.8b}, [x11], #8 .else ld1 {v0.8h}, [x2], #16 ld1 {v16.8h}, [x11], #16 .endif ld1 {v1.8h}, [x4], #16 ld1 {v17.8h}, [x12], #16 ld1 {v2.8h}, [x5], #16 ld1 {v18.8h}, [x13], #16 subs x6, x6, #8 .if \bpc == 8 ushll v0.8h, v0.8b, #4 // u ushll v16.8h, v16.8b, #4 // u .else shl v0.8h, v0.8h, #4 // u shl v16.8h, v16.8h, #4 // u .endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v2.8h, v2.8h, v0.8h // t2 - u sub v17.8h, v17.8h, v16.8h // t1 - u sub v18.8h, v18.8h, v16.8h // t2 - u ushll v3.4s, v0.4h, #7 // u << 7 ushll2 v4.4s, v0.8h, #7 // u << 7 ushll v19.4s, v16.4h, #7 // u << 7 ushll2 v20.4s, v16.8h, #7 // u << 7 smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) smlal v19.4s, v17.4h, v30.4h // wt[0] * (t1 - u) smlal v19.4s, v18.4h, v31.4h // wt[1] * (t2 - u) smlal2 v20.4s, v17.8h, v30.8h // wt[0] * (t1 - u) smlal2 v20.4s, v18.8h, v31.8h // wt[1] * (t2 - u) .if \bpc == 8 rshrn v3.4h, v3.4s, #11 rshrn2 v3.8h, v4.4s, #11 rshrn v19.4h, v19.4s, #11 rshrn2 v19.8h, v20.4s, #11 sqxtun v3.8b, v3.8h sqxtun v19.8b, v19.8h st1 {v3.8b}, [x0], #8 st1 {v19.8b}, [x10], #8 .else sqrshrun v3.4h, v3.4s, #11 sqrshrun2 v3.8h, v4.4s, #11 sqrshrun v19.4h, v19.4s, #11 sqrshrun2 v19.8h, v20.4s, #11 umin v3.8h, v3.8h, v29.8h umin v19.8h, v19.8h, v29.8h st1 {v3.8h}, [x0], #16 st1 {v19.8h}, [x10], #16 .endif b.gt 1b subs x7, x7, #2 cmp x7, #1 b.lt 0f mov x6, x9 add x0, x0, x1 add x10, x10, x1 add x2, x2, x3 add x11, x11, x3 add x4, x4, x8 add x12, x12, x8 add x5, x5, x8 add x13, x13, x8 b.eq 2f b 1b 2: .if \bpc == 8 ld1 {v0.8b}, [x2], #8 .else ld1 {v0.8h}, [x2], #16 .endif ld1 {v1.8h}, [x4], #16 ld1 {v2.8h}, [x5], #16 subs x6, x6, #8 
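// Editorial note, not part of the original routine: the block below implements
// the two-source weighted blend that the surrounding comments label "u",
// "t1 - u" and "t2 - u": u = src << 4, v = (u << 7) + wt[0]*(t1 - u) +
// wt[1]*(t2 - u), then a rounding shift by 11 and, for 16 bpc, a clamp to
// bitdepth_max. A hedged scalar sketch for the 16 bpc case, with illustrative
// names only:
//
//   static inline uint16_t sgr_weighted2_px(uint16_t src, int16_t t1, int16_t t2,
//                                           const int16_t wt[2], int bitdepth_max) {
//       const int u = src << 4;
//       const int v = (u << 7) + wt[0] * (t1 - u) + wt[1] * (t2 - u);
//       int d = (v + (1 << 10)) >> 11;              // sqrshrun #11
//       if (d < 0) d = 0;
//       return (uint16_t)(d > bitdepth_max ? bitdepth_max : d);  // umin v29
//   }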
.if \bpc == 8 ushll v0.8h, v0.8b, #4 // u .else shl v0.8h, v0.8h, #4 // u .endif sub v1.8h, v1.8h, v0.8h // t1 - u sub v2.8h, v2.8h, v0.8h // t2 - u ushll v3.4s, v0.4h, #7 // u << 7 ushll2 v4.4s, v0.8h, #7 // u << 7 smlal v3.4s, v1.4h, v30.4h // wt[0] * (t1 - u) smlal v3.4s, v2.4h, v31.4h // wt[1] * (t2 - u) smlal2 v4.4s, v1.8h, v30.8h // wt[0] * (t1 - u) smlal2 v4.4s, v2.8h, v31.8h // wt[1] * (t2 - u) .if \bpc == 8 rshrn v3.4h, v3.4s, #11 rshrn2 v3.8h, v4.4s, #11 sqxtun v3.8b, v3.8h st1 {v3.8b}, [x0], #8 .else sqrshrun v3.4h, v3.4s, #11 sqrshrun2 v3.8h, v4.4s, #11 umin v3.8h, v3.8h, v29.8h st1 {v3.8h}, [x0], #16 .endif b.gt 1b 0: ret endfunc .endm rav1e-0.7.1/src/arm/64/mc.S000064400000000000000000003600311046102023000131740ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2018, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" .macro avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 add \t0\().8h, \t0\().8h, \t2\().8h add \t1\().8h, \t1\().8h, \t3\().8h sqrshrun \dst\().8b, \t0\().8h, #5 sqrshrun2 \dst\().16b, \t1\().8h, #5 .endm .macro w_avg dst, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sub \t0\().8h, \t2\().8h, \t0\().8h sub \t1\().8h, \t3\().8h, \t1\().8h sqdmulh \t0\().8h, \t0\().8h, v30.8h sqdmulh \t1\().8h, \t1\().8h, v30.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro mask dst, t0, t1, t2, t3 ld1 {v30.16b}, [x6], 16 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 mul v30.16b, v30.16b, v31.16b ld1 {\t2\().8h,\t3\().8h}, [x3], 32 shll v28.8h, v30.8b, #8 shll2 v29.8h, v30.16b, #8 sub \t0\().8h, \t2\().8h, \t0\().8h sub \t1\().8h, \t3\().8h, \t1\().8h sqdmulh \t0\().8h, \t0\().8h, v28.8h sqdmulh \t1\().8h, \t1\().8h, v29.8h add \t0\().8h, \t2\().8h, \t0\().8h add \t1\().8h, \t3\().8h, \t1\().8h sqrshrun \dst\().8b, \t0\().8h, #4 sqrshrun2 \dst\().16b, \t1\().8h, #4 .endm .macro bidir_fn type function \type\()_8bpc_neon, export=1 clz w4, w4 .ifc \type, w_avg dup v30.8h, w6 neg v30.8h, v30.8h shl v30.8h, v30.8h, #11 .endif .ifc \type, mask movi v31.16b, #256-2 .endif adr x7, L(\type\()_tbl) sub w4, w4, #24 ldrh w4, [x7, x4, lsl #1] \type v4, v0, v1, v2, v3 sub x7, x7, w4, uxtw br x7 40: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: cmp w5, #4 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 b.eq 0f \type v5, v0, v1, v2, v3 cmp w5, #8 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 b.eq 0f \type v4, v0, v1, v2, v3 st1 {v4.s}[0], [x0], x1 st1 {v4.s}[1], [x7], x1 \type v5, v0, v1, v2, v3 st1 {v4.s}[2], [x0], x1 st1 {v4.s}[3], [x7], x1 st1 {v5.s}[0], [x0], x1 st1 {v5.s}[1], [x7], x1 st1 {v5.s}[2], [x0], x1 st1 {v5.s}[3], [x7], x1 ret 80: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: st1 {v4.d}[0], [x0], x1 \type v5, v0, v1, v2, v3 st1 {v4.d}[1], [x7], x1 st1 {v5.d}[0], [x0], x1 subs w5, w5, #4 st1 {v5.d}[1], [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 8b 16: AARCH64_VALID_JUMP_TARGET \type v5, v0, v1, v2, v3 st1 {v4.16b}, [x0], x1 \type v6, v0, v1, v2, v3 st1 {v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 st1 {v6.16b}, [x0], x1 subs w5, w5, #4 st1 {v7.16b}, [x0], x1 b.le 0f \type v4, v0, v1, v2, v3 b 16b 320: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 32: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 st1 {v4.16b,v5.16b}, [x0], x1 \type v7, v0, v1, v2, v3 subs w5, w5, #2 st1 {v6.16b,v7.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 32b 640: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 64: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 \type v7, v0, v1, v2, v3 \type v16, v0, v1, v2, v3 \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 \type v18, v0, v1, v2, v3 \type v19, v0, v1, v2, v3 subs w5, w5, #2 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 64b 1280: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 128: \type v5, v0, v1, v2, v3 \type v6, v0, v1, v2, v3 \type v7, v0, v1, v2, v3 \type v16, v0, v1, v2, v3 \type v17, v0, v1, v2, v3 st1 {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1 \type v18, v0, v1, v2, v3 \type v19, v0, v1, v2, v3 subs w5, w5, #1 st1 {v16.16b,v17.16b,v18.16b,v19.16b}, 
[x7], x1 b.le 0f \type v4, v0, v1, v2, v3 b 128b 0: ret L(\type\()_tbl): .hword L(\type\()_tbl) - 1280b .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 320b .hword L(\type\()_tbl) - 16b .hword L(\type\()_tbl) - 80b .hword L(\type\()_tbl) - 40b endfunc .endm bidir_fn avg bidir_fn w_avg bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 clz w8, w4 adr x9, L(w_mask_\type\()_tbl) sub w8, w8, #24 ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw mov w10, #6903 dup v0.8h, w10 .if \type == 444 movi v1.16b, #64 .elseif \type == 422 dup v2.8b, w7 movi v3.8b, #129 sub v3.8b, v3.8b, v2.8b .elseif \type == 420 dup v2.8h, w7 movi v3.8h, #1, lsl #8 sub v3.8h, v3.8h, v2.8h .endif add x12, x0, x1 lsl x1, x1, #1 br x9 4: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once) ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once) subs w5, w5, #4 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v18.2d, v19.2d trn2 v25.2d, v18.2d, v19.2d add v24.8h, v24.8h, v25.8h addp v18.8h, v24.8h, v24.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 st1 {v18.s}[0], [x6], #4 .endif st1 {v22.s}[0], [x0], x1 st1 {v22.s}[1], [x12], x1 st1 {v23.s}[0], [x0], x1 st1 {v23.s}[1], [x12], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 subs w5, w5, #2 sub v16.8h, v6.8h, v4.8h sub v17.8h, v7.8h, v5.8h sabd v18.8h, v4.8h, v6.8h sabd v19.8h, v5.8h, v7.8h uqsub v18.8h, v0.8h, v18.8h uqsub v19.8h, v0.8h, v19.8h ushr v18.8h, v18.8h, #8 ushr v19.8h, v19.8h, #8 shl v20.8h, v18.8h, #9 shl v21.8h, v19.8h, #9 sqdmulh v20.8h, v20.8h, v16.8h sqdmulh v21.8h, v21.8h, v17.8h add v20.8h, v20.8h, v4.8h add v21.8h, v21.8h, v5.8h sqrshrun v22.8b, v20.8h, #4 sqrshrun v23.8b, v21.8h, #4 .if \type == 444 uzp1 v18.16b, v18.16b, v19.16b // Same as xtn, xtn2 sub v18.16b, v1.16b, v18.16b st1 {v18.16b}, [x6], #16 .elseif \type == 422 addp v18.8h, v18.8h, v19.8h xtn v18.8b, v18.8h uhsub v18.8b, v3.8b, v18.8b st1 {v18.8b}, [x6], #8 .elseif \type == 420 add v18.8h, v18.8h, v19.8h addp v18.8h, v18.8h, v18.8h sub v18.4h, v3.4h, v18.4h rshrn v18.8b, v18.8h, #2 st1 {v18.s}[0], [x6], #4 .endif st1 {v22.8b}, [x0], x1 st1 {v23.8b}, [x12], x1 b.gt 8b ret 1280: 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw .if \type == 444 add x10, x6, w4, uxtw .elseif \type == 422 add x10, x6, x11, lsr #1 .endif add x9, x3, w4, uxtw #1 add x7, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 ld1 {v6.8h, v7.8h}, [x3], #32 ld1 {v16.8h, v17.8h}, [x7], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sub v6.8h, v6.8h, v4.8h sub v7.8h, v7.8h, v5.8h sub v18.8h, v18.8h, v16.8h sub v19.8h, v19.8h, v17.8h abs v20.8h, v6.8h abs v21.8h, v7.8h abs v22.8h, v18.8h abs v23.8h, v19.8h uqsub v20.8h, v0.8h, v20.8h uqsub v21.8h, v0.8h, v21.8h uqsub v22.8h, v0.8h, v22.8h uqsub v23.8h, v0.8h, v23.8h 
ushr v20.8h, v20.8h, #8 ushr v21.8h, v21.8h, #8 ushr v22.8h, v22.8h, #8 ushr v23.8h, v23.8h, #8 shl v24.8h, v20.8h, #9 shl v25.8h, v21.8h, #9 shl v26.8h, v22.8h, #9 shl v27.8h, v23.8h, #9 sqdmulh v24.8h, v24.8h, v6.8h sqdmulh v25.8h, v25.8h, v7.8h sqdmulh v26.8h, v26.8h, v18.8h sqdmulh v27.8h, v27.8h, v19.8h add v24.8h, v24.8h, v4.8h add v25.8h, v25.8h, v5.8h add v26.8h, v26.8h, v16.8h add v27.8h, v27.8h, v17.8h sqrshrun v24.8b, v24.8h, #4 sqrshrun v25.8b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun v27.8b, v27.8h, #4 .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // Same as xtn, xtn2 uzp1 v21.16b, v22.16b, v23.16b // Ditto sub v20.16b, v1.16b, v20.16b sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b uhsub v21.8b, v3.8b, v21.8b st1 {v20.8b}, [x6], #8 st1 {v21.8b}, [x10], #8 .elseif \type == 420 add v20.8h, v20.8h, v22.8h add v21.8h, v21.8h, v23.8h addp v20.8h, v20.8h, v21.8h sub v20.8h, v3.8h, v20.8h rshrn v20.8b, v20.8h, #2 st1 {v20.8b}, [x6], #8 .endif st1 {v24.8b, v25.8b}, [x0], #16 st1 {v26.8b, v27.8b}, [x12], #16 b.gt 16b subs w5, w5, #2 add x2, x2, w4, uxtw #1 add x3, x3, w4, uxtw #1 add x7, x7, w4, uxtw #1 add x9, x9, w4, uxtw #1 .if \type == 444 add x6, x6, w4, uxtw add x10, x10, w4, uxtw .elseif \type == 422 add x6, x6, x11, lsr #1 add x10, x10, x11, lsr #1 .endif add x0, x0, x1 add x12, x12, x1 b.gt 161b ret L(w_mask_\type\()_tbl): .hword L(w_mask_\type\()_tbl) - 1280b .hword L(w_mask_\type\()_tbl) - 640b .hword L(w_mask_\type\()_tbl) - 320b .hword L(w_mask_\type\()_tbl) - 160b .hword L(w_mask_\type\()_tbl) - 8b .hword L(w_mask_\type\()_tbl) - 4b endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_8bpc_neon, export=1 adr x6, L(blend_tbl) clz w3, w3 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 br x6 4: AARCH64_VALID_JUMP_TARGET ld1 {v2.8b}, [x5], #8 ld1 {v1.d}[0], [x2], #8 ld1 {v0.s}[0], [x0] subs w4, w4, #2 ld1 {v0.s}[1], [x8] sub v3.8b, v4.8b, v2.8b umull v5.8h, v1.8b, v2.8b umlal v5.8h, v0.8b, v3.8b rshrn v6.8b, v5.8h, #6 st1 {v6.s}[0], [x0], x1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v2.16b}, [x5], #16 ld1 {v1.16b}, [x2], #16 ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] sub v3.16b, v4.16b, v2.16b subs w4, w4, #2 umull v5.8h, v1.8b, v2.8b umlal v5.8h, v0.8b, v3.8b umull2 v6.8h, v1.16b, v2.16b umlal2 v6.8h, v0.16b, v3.16b rshrn v7.8b, v5.8h, #6 rshrn2 v7.16b, v6.8h, #6 st1 {v7.d}[0], [x0], x1 st1 {v7.d}[1], [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld1 {v1.16b, v2.16b}, [x5], #32 ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v0.16b}, [x0] subs w4, w4, #2 sub v7.16b, v4.16b, v1.16b sub v20.16b, v4.16b, v2.16b ld1 {v3.16b}, [x8] umull v16.8h, v5.8b, v1.8b umlal v16.8h, v0.8b, v7.8b umull2 v17.8h, v5.16b, v1.16b umlal2 v17.8h, v0.16b, v7.16b umull v21.8h, v6.8b, v2.8b umlal v21.8h, v3.8b, v20.8b umull2 v22.8h, v6.16b, v2.16b umlal2 v22.8h, v3.16b, v20.16b rshrn v18.8b, v16.8h, #6 rshrn2 v18.16b, v17.8h, #6 rshrn v19.8b, v21.8h, #6 rshrn2 v19.16b, v22.8h, #6 st1 {v18.16b}, [x0], x1 st1 {v19.16b}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], #64 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v20.16b, v21.16b}, [x0] subs w4, w4, #2 ld1 {v22.16b, v23.16b}, [x8] sub v5.16b, v4.16b, v0.16b sub v6.16b, v4.16b, v1.16b sub v30.16b, v4.16b, v2.16b 
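// blend_8bpc_neon here, and blend_h/blend_v below, are all the same 6-bit
// lerp against a mask (64 = full weight for the new prediction in x2); a
// scalar sketch of a single pixel:
//
//   static inline uint8_t blend_px(uint8_t a, uint8_t b, int m) {
//       /* a = current dst pixel, b = new prediction, m in 0..64 */
//       return (uint8_t)((b * m + a * (64 - m) + 32) >> 6);
//   }
//
// They differ only in where the mask comes from: blend uses a full per-pixel
// mask from x5, while blend_h/blend_v use the one-dimensional OBMC weights
// from X(obmc_masks) (indexed at obmc_masks + h resp. obmc_masks + w), apply
// one weight per row resp. per column, and only touch part of the block --
// blend_h the top 3/4 of the rows (the `sub w4, w4, w4, lsr #2`), blend_v
// roughly the left 3/4 of the columns, leaving the rest of dst untouched.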
sub v31.16b, v4.16b, v3.16b umull v24.8h, v16.8b, v0.8b umlal v24.8h, v20.8b, v5.8b umull2 v26.8h, v16.16b, v0.16b umlal2 v26.8h, v20.16b, v5.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v21.8b, v6.8b umull2 v7.8h, v17.16b, v1.16b umlal2 v7.8h, v21.16b, v6.16b umull v27.8h, v18.8b, v2.8b umlal v27.8h, v22.8b, v30.8b umull2 v1.8h, v18.16b, v2.16b umlal2 v1.8h, v22.16b, v30.16b umull v29.8h, v19.8b, v3.8b umlal v29.8h, v23.8b, v31.8b umull2 v21.8h, v19.16b, v3.16b umlal2 v21.8h, v23.16b, v31.16b rshrn v24.8b, v24.8h, #6 rshrn2 v24.16b, v26.8h, #6 rshrn v25.8b, v28.8h, #6 rshrn2 v25.16b, v7.8h, #6 rshrn v27.8b, v27.8h, #6 rshrn2 v27.16b, v1.8h, #6 rshrn v28.8b, v29.8h, #6 rshrn2 v28.16b, v21.8h, #6 st1 {v24.16b, v25.16b}, [x0], x1 st1 {v27.16b, v28.16b}, [x8], x1 b.gt 32b ret L(blend_tbl): .hword L(blend_tbl) - 32b .hword L(blend_tbl) - 16b .hword L(blend_tbl) - 8b .hword L(blend_tbl) - 4b endfunc function blend_h_8bpc_neon, export=1 adr x6, L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 clz w7, w3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 ldrh w7, [x6, x7, lsl #1] sub x6, x6, w7, uxtw br x6 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x5], #2 ld1 {v1.s}[0], [x2], #4 subs w4, w4, #2 ld1 {v2.h}[0], [x0] zip1 v0.8b, v0.8b, v0.8b sub v3.8b, v4.8b, v0.8b ld1 {v2.h}[1], [x8] umull v5.8h, v1.8b, v0.8b umlal v5.8h, v2.8b, v3.8b rshrn v5.8b, v5.8h, #6 st1 {v5.h}[0], [x0], x1 st1 {v5.h}[1], [x8], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld2r {v0.8b, v1.8b}, [x5], #2 ld1 {v2.8b}, [x2], #8 subs w4, w4, #2 ext v0.8b, v0.8b, v1.8b, #4 ld1 {v3.s}[0], [x0] sub v5.8b, v4.8b, v0.8b ld1 {v3.s}[1], [x8] umull v6.8h, v2.8b, v0.8b umlal v6.8h, v3.8b, v5.8b rshrn v6.8b, v6.8h, #6 st1 {v6.s}[0], [x0], x1 st1 {v6.s}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] ext v0.16b, v0.16b, v1.16b, #8 sub v5.16b, v4.16b, v0.16b ld1 {v3.d}[1], [x8] subs w4, w4, #2 umull v6.8h, v0.8b, v2.8b umlal v6.8h, v3.8b, v5.8b umull2 v7.8h, v0.16b, v2.16b umlal2 v7.8h, v3.16b, v5.16b rshrn v16.8b, v6.8h, #6 rshrn2 v16.16b, v7.8h, #6 st1 {v16.d}[0], [x0], x1 st1 {v16.d}[1], [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld2r {v0.16b, v1.16b}, [x5], #2 ld1 {v2.16b, v3.16b}, [x2], #32 ld1 {v5.16b}, [x0] sub v7.16b, v4.16b, v0.16b sub v16.16b, v4.16b, v1.16b ld1 {v6.16b}, [x8] subs w4, w4, #2 umull v17.8h, v0.8b, v2.8b umlal v17.8h, v5.8b, v7.8b umull2 v18.8h, v0.16b, v2.16b umlal2 v18.8h, v5.16b, v7.16b umull v19.8h, v1.8b, v3.8b umlal v19.8h, v6.8b, v16.8b umull2 v20.8h, v1.16b, v3.16b umlal2 v20.8h, v6.16b, v16.16b rshrn v21.8b, v17.8h, #6 rshrn2 v21.16b, v18.8h, #6 rshrn v22.8b, v19.8h, #6 rshrn2 v22.16b, v20.8h, #6 st1 {v21.16b}, [x0], x1 st1 {v22.16b}, [x8], x1 b.gt 16b ret 1280: 640: 320: AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw add x7, x2, w3, uxtw 321: ld2r {v0.16b, v1.16b}, [x5], #2 mov w6, w3 sub v20.16b, v4.16b, v0.16b sub v21.16b, v4.16b, v1.16b 32: ld1 {v16.16b, v17.16b}, [x2], #32 ld1 {v2.16b, v3.16b}, [x0] subs w6, w6, #32 umull v23.8h, v0.8b, v16.8b umlal v23.8h, v2.8b, v20.8b ld1 {v18.16b, v19.16b}, [x7], #32 umull2 v27.8h, v0.16b, v16.16b umlal2 v27.8h, v2.16b, v20.16b ld1 {v6.16b, v7.16b}, [x8] umull v24.8h, v0.8b, v17.8b umlal v24.8h, v3.8b, v20.8b umull2 v28.8h, v0.16b, v17.16b umlal2 v28.8h, v3.16b, v20.16b umull v25.8h, v1.8b, v18.8b umlal v25.8h, v6.8b, v21.8b umull2 v5.8h, v1.16b, v18.16b umlal2 v5.8h, v6.16b, v21.16b rshrn v29.8b, v23.8h, #6 rshrn2 v29.16b, 
v27.8h, #6 umull v26.8h, v1.8b, v19.8b umlal v26.8h, v7.8b, v21.8b umull2 v31.8h, v1.16b, v19.16b umlal2 v31.8h, v7.16b, v21.16b rshrn v30.8b, v24.8h, #6 rshrn2 v30.16b, v28.8h, #6 rshrn v23.8b, v25.8h, #6 rshrn2 v23.16b, v5.8h, #6 rshrn v24.8b, v26.8h, #6 st1 {v29.16b, v30.16b}, [x0], #32 rshrn2 v24.16b, v31.8h, #6 st1 {v23.16b, v24.16b}, [x8], #32 b.gt 32b subs w4, w4, #2 add x0, x0, x1 add x8, x8, x1 add x2, x2, w3, uxtw add x7, x7, w3, uxtw b.gt 321b ret L(blend_h_tbl): .hword L(blend_h_tbl) - 1280b .hword L(blend_h_tbl) - 640b .hword L(blend_h_tbl) - 320b .hword L(blend_h_tbl) - 16b .hword L(blend_h_tbl) - 8b .hword L(blend_h_tbl) - 4b .hword L(blend_h_tbl) - 2b endfunc function blend_v_8bpc_neon, export=1 adr x6, L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw br x6 20: AARCH64_VALID_JUMP_TARGET ld1r {v0.8b}, [x5] sub v1.8b, v4.8b, v0.8b 2: ld1 {v2.h}[0], [x2], #2 ld1 {v3.b}[0], [x0] subs w4, w4, #2 ld1 {v2.b}[1], [x2] ld1 {v3.b}[1], [x8] umull v5.8h, v2.8b, v0.8b umlal v5.8h, v3.8b, v1.8b rshrn v5.8b, v5.8h, #6 add x2, x2, #2 st1 {v5.b}[0], [x0], x1 st1 {v5.b}[1], [x8], x1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET ld1r {v0.2s}, [x5] sub x1, x1, #2 sub v1.8b, v4.8b, v0.8b 4: ld1 {v2.8b}, [x2], #8 ld1 {v3.s}[0], [x0] ld1 {v3.s}[1], [x8] subs w4, w4, #2 umull v5.8h, v2.8b, v0.8b umlal v5.8h, v3.8b, v1.8b rshrn v5.8b, v5.8h, #6 st1 {v5.h}[0], [x0], #2 st1 {v5.h}[2], [x8], #2 st1 {v5.b}[2], [x0], x1 st1 {v5.b}[6], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1r {v0.2d}, [x5] sub x1, x1, #4 sub v1.16b, v4.16b, v0.16b 8: ld1 {v2.16b}, [x2], #16 ld1 {v3.d}[0], [x0] ld1 {v3.d}[1], [x8] subs w4, w4, #2 umull v5.8h, v0.8b, v2.8b umlal v5.8h, v3.8b, v1.8b umull2 v6.8h, v0.16b, v2.16b umlal2 v6.8h, v3.16b, v1.16b rshrn v7.8b, v5.8h, #6 rshrn2 v7.16b, v6.8h, #6 st1 {v7.s}[0], [x0], #4 st1 {v7.s}[2], [x8], #4 st1 {v7.h}[2], [x0], x1 st1 {v7.h}[6], [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b}, [x5] sub x1, x1, #8 sub v2.16b, v4.16b, v0.16b 16: ld1 {v5.16b, v6.16b}, [x2], #32 ld1 {v7.16b}, [x0] subs w4, w4, #2 ld1 {v16.16b}, [x8] umull v17.8h, v5.8b, v0.8b umlal v17.8h, v7.8b, v2.8b umull2 v18.8h, v5.16b, v0.16b umlal2 v18.8h, v7.16b, v2.16b umull v20.8h, v6.8b, v0.8b umlal v20.8h, v16.8b, v2.8b umull2 v21.8h, v6.16b, v0.16b umlal2 v21.8h, v16.16b, v2.16b rshrn v19.8b, v17.8h, #6 rshrn2 v19.16b, v18.8h, #6 rshrn v22.8b, v20.8h, #6 rshrn2 v22.16b, v21.8h, #6 st1 {v19.8b}, [x0], #8 st1 {v22.8b}, [x8], #8 st1 {v19.s}[2], [x0], x1 st1 {v22.s}[2], [x8], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v0.16b, v1.16b}, [x5] sub x1, x1, #16 sub v2.16b, v4.16b, v0.16b sub v3.8b, v4.8b, v1.8b 32: ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64 ld1 {v5.16b, v6.16b}, [x0] subs w4, w4, #2 ld1 {v20.16b, v21.16b}, [x8] umull v22.8h, v16.8b, v0.8b umlal v22.8h, v5.8b, v2.8b umull2 v23.8h, v16.16b, v0.16b umlal2 v23.8h, v5.16b, v2.16b umull v28.8h, v17.8b, v1.8b umlal v28.8h, v6.8b, v3.8b umull v30.8h, v18.8b, v0.8b umlal v30.8h, v20.8b, v2.8b umull2 v31.8h, v18.16b, v0.16b umlal2 v31.8h, v20.16b, v2.16b umull v25.8h, v19.8b, v1.8b umlal v25.8h, v21.8b, v3.8b rshrn v24.8b, v22.8h, #6 rshrn2 v24.16b, v23.8h, #6 rshrn v28.8b, v28.8h, #6 rshrn v30.8b, v30.8h, #6 rshrn2 v30.16b, v31.8h, #6 rshrn v27.8b, v25.8h, #6 st1 {v24.16b}, [x0], #16 st1 {v30.16b}, [x8], #16 st1 {v28.8b}, [x0], x1 st1 {v27.8b}, [x8], x1 b.gt 32b ret L(blend_v_tbl): .hword 
L(blend_v_tbl) - 320b .hword L(blend_v_tbl) - 160b .hword L(blend_v_tbl) - 80b .hword L(blend_v_tbl) - 40b .hword L(blend_v_tbl) - 20b endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). function put_neon adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.h}[0], [x2], x3 ld1 {v1.h}[0], [x2], x3 subs w5, w5, #2 st1 {v0.h}[0], [x0], x1 st1 {v1.h}[0], [x0], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 subs w5, w5, #2 st1 {v0.8b}, [x0], x1 st1 {v1.8b}, [x0], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 lsl x3, x3, #1 16: ld1 {v0.16b}, [x2], x3 ld1 {v1.16b}, [x9], x3 subs w5, w5, #2 st1 {v0.16b}, [x0], x1 st1 {v1.16b}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] subs w5, w5, #1 stp x8, x9, [x0, #16] add x2, x2, x3 add x0, x0, x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] ldp x10, x11, [x2, #32] stp x8, x9, [x0, #16] subs w5, w5, #1 ldp x12, x13, [x2, #48] stp x10, x11, [x0, #32] stp x12, x13, [x0, #48] add x2, x2, x3 add x0, x0, x1 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x2, x2, x3 add x0, x0, x1 b.gt 128b ret L(put_tbl): .hword L(put_tbl) - 128b .hword L(put_tbl) - 64b .hword L(put_tbl) - 32b .hword L(put_tbl) - 160b .hword L(put_tbl) - 8b .hword L(put_tbl) - 4b .hword L(put_tbl) - 2b endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. 
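// put_neon above is the unfiltered copy path, and prep_neon below widens the
// source into the 16-bit intermediate format used by the avg/w_avg/mask/
// w_mask kernels earlier in this file:
//
//   tmp[x] = src[x] << 4;          /* 4 intermediate bits for 8 bpc */
//
// The size dispatch is the same idiom everywhere: the .hword tables store
// (table_label - handler), so the ldrh/sub/br sequence is, roughly,
//
//   int       idx    = clz32(w) - 24;             /* w=128 -> 0 ... w=2 -> 6 */
//   uintptr_t target = (uintptr_t)tbl - tbl[idx];
//
// (an illustrative sketch, not literal C), which keeps the tables
// position-independent at two bytes per entry.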
function prep_neon adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x1], x2 ld1 {v1.s}[0], [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.4h, v1.4h}, [x0], #16 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 16: ld1 {v0.16b}, [x1], x2 ld1 {v1.16b}, [x9], x2 subs w4, w4, #2 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET add x8, x0, w3, uxtw 32: ld1 {v0.16b, v1.16b}, [x1], x2 subs w4, w4, #2 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ld1 {v2.16b, v3.16b}, [x1], x2 ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 ushll v16.8h, v2.8b, #4 st1 {v4.8h, v5.8h}, [x0], x7 ushll2 v17.8h, v2.16b, #4 st1 {v6.8h, v7.8h}, [x8], x7 ushll v18.8h, v3.8b, #4 st1 {v16.8h, v17.8h}, [x0], x7 ushll2 v19.8h, v3.16b, #4 st1 {v18.8h, v19.8h}, [x8], x7 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET add x8, x0, #32 mov x6, #64 64: ldp q0, q1, [x1] subs w4, w4, #1 ushll v4.8h, v0.8b, #4 ushll2 v5.8h, v0.16b, #4 ldp q2, q3, [x1, #32] ushll v6.8h, v1.8b, #4 ushll2 v7.8h, v1.16b, #4 add x1, x1, x2 ushll v16.8h, v2.8b, #4 st1 {v4.8h, v5.8h}, [x0], x6 ushll2 v17.8h, v2.16b, #4 ushll v18.8h, v3.8b, #4 st1 {v6.8h, v7.8h}, [x8], x6 ushll2 v19.8h, v3.16b, #4 st1 {v16.8h, v17.8h}, [x0], x6 st1 {v18.8h, v19.8h}, [x8], x6 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET add x8, x0, #64 mov x6, #128 128: ldp q0, q1, [x1] ldp q2, q3, [x1, #32] ushll v16.8h, v0.8b, #4 ushll2 v17.8h, v0.16b, #4 ushll v18.8h, v1.8b, #4 ushll2 v19.8h, v1.16b, #4 ushll v20.8h, v2.8b, #4 ushll2 v21.8h, v2.16b, #4 ldp q4, q5, [x1, #64] st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 ushll v22.8h, v3.8b, #4 ushll2 v23.8h, v3.16b, #4 ushll v24.8h, v4.8b, #4 ushll2 v25.8h, v4.16b, #4 ushll v26.8h, v5.8b, #4 ushll2 v27.8h, v5.16b, #4 ldp q6, q7, [x1, #96] st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 ushll v28.8h, v6.8b, #4 ushll2 v29.8h, v6.16b, #4 ushll v30.8h, v7.8b, #4 ushll2 v31.8h, v7.16b, #4 subs w4, w4, #1 add x1, x1, x2 st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 b.gt 128b ret L(prep_tbl): .hword L(prep_tbl) - 1280b .hword L(prep_tbl) - 640b .hword L(prep_tbl) - 320b .hword L(prep_tbl) - 160b .hword L(prep_tbl) - 8b .hword L(prep_tbl) - 4b endfunc .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd ld1 {\d1\wd}[0], [\s1], \strd .ifnb \d2 ld1 {\d2\wd}[0], [\s0], \strd ld1 {\d3\wd}[0], [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}[0], [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}[0], [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}[0], [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}, [\s0], \strd ld1 {\d1\wd}, [\s1], \strd .ifnb \d2 ld1 {\d2\wd}, [\s0], \strd ld1 {\d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}, [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}, [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}, [\s0], \strd .endif .endm .macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_8b s0, 
s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro interleave_1 wd, r0, r1, r2, r3, r4 trn1 \r0\wd, \r0\wd, \r1\wd trn1 \r1\wd, \r1\wd, \r2\wd .ifnb \r3 trn1 \r2\wd, \r2\wd, \r3\wd trn1 \r3\wd, \r3\wd, \r4\wd .endif .endm .macro interleave_1_h r0, r1, r2, r3, r4 interleave_1 .4h, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_1_s r0, r1, r2, r3, r4 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 .endm .macro interleave_2 wd, r0, r1, r2, r3, r4, r5 trn1 \r0\wd, \r0\wd, \r2\wd trn1 \r1\wd, \r1\wd, \r3\wd trn1 \r2\wd, \r2\wd, \r4\wd trn1 \r3\wd, \r3\wd, \r5\wd .endm .macro interleave_2_s r0, r1, r2, r3, r4, r5 interleave_2 .2s, \r0, \r1, \r2, \r3, \r4, \r5 .endm .macro uxtl_b r0, r1, r2, r3, r4, r5, r6 uxtl \r0\().8h, \r0\().8b uxtl \r1\().8h, \r1\().8b .ifnb \r2 uxtl \r2\().8h, \r2\().8b uxtl \r3\().8h, \r3\().8b .endif .ifnb \r4 uxtl \r4\().8h, \r4\().8b .endif .ifnb \r5 uxtl \r5\().8h, \r5\().8b .endif .ifnb \r6 uxtl \r6\().8h, \r6\().8b .endif .endm .macro mul_mla_4 d, s0, s1, s2, s3, wd mul \d\wd, \s0\wd, v0.h[0] mla \d\wd, \s1\wd, v0.h[1] mla \d\wd, \s2\wd, v0.h[2] mla \d\wd, \s3\wd, v0.h[3] .endm // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. .macro mul_mla_8_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().4h, \s0\().4h, v0.h[0] mla \d0\().4h, \s1\().4h, v0.h[1] mla \d0\().4h, \s2\().4h, v0.h[2] mla \d0\().4h, \s3\().4h, v0.h[3] mla \d0\().4h, \s4\().4h, v0.h[4] mla \d0\().4h, \s5\().4h, v0.h[5] mla \d0\().4h, \s6\().4h, v0.h[6] mla \d0\().4h, \s7\().4h, v0.h[7] .endm .macro mul_mla_8_0 d0, s0, s1, s2, s3, s4, s5, s6, s7 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] .endm .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s1\().8h, v0.h[0] mla \d1\().8h, \s2\().8h, v0.h[1] mla \d1\().8h, \s3\().8h, v0.h[2] mla \d1\().8h, \s4\().8h, v0.h[3] mla \d1\().8h, \s5\().8h, v0.h[4] mla \d1\().8h, \s6\().8h, v0.h[5] mla \d1\().8h, \s7\().8h, v0.h[6] mla \d1\().8h, \s8\().8h, v0.h[7] .endm .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 mul \d0\().8h, \s0\().8h, v0.h[0] mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s7\().8h, v0.h[7] mul \d1\().8h, \s2\().8h, v0.h[0] mla \d1\().8h, \s3\().8h, v0.h[1] mla \d1\().8h, \s4\().8h, v0.h[2] mla \d1\().8h, \s5\().8h, v0.h[3] mla \d1\().8h, \s6\().8h, v0.h[4] mla \d1\().8h, \s7\().8h, v0.h[5] mla \d1\().8h, \s8\().8h, v0.h[6] mla \d1\().8h, \s9\().8h, v0.h[7] .endm .macro sqrshrun_b shift, r0, r1, r2, r3 sqrshrun \r0\().8b, \r0\().8h, #\shift .ifnb \r1 sqrshrun \r1\().8b, \r1\().8h, #\shift .endif .ifnb \r2 sqrshrun \r2\().8b, \r2\().8h, #\shift sqrshrun \r3\().8b, 
\r3\().8h, #\shift .endif .endm .macro srshr_h shift, r0, r1, r2, r3 srshr \r0\().8h, \r0\().8h, #\shift .ifnb \r1 srshr \r1\().8h, \r1\().8h, #\shift .endif .ifnb \r2 srshr \r2\().8h, \r2\().8h, #\shift srshr \r3\().8h, \r3\().8h, #\shift .endif .endm .macro st_h strd, reg, lanes st1 {\reg\().h}[0], [x0], \strd st1 {\reg\().h}[1], [x8], \strd .if \lanes > 2 st1 {\reg\().h}[2], [x0], \strd st1 {\reg\().h}[3], [x8], \strd .endif .endm .macro st_s strd, r0, r1 st1 {\r0\().s}[0], [x0], \strd st1 {\r0\().s}[1], [x8], \strd .ifnb \r1 st1 {\r1\().s}[0], [x0], \strd st1 {\r1\().s}[1], [x8], \strd .endif .endm .macro st_d strd, r0, r1 st1 {\r0\().d}[0], [x0], \strd st1 {\r0\().d}[1], [x8], \strd .ifnb \r1 st1 {\r1\().d}[0], [x0], \strd st1 {\r1\().d}[1], [x8], \strd .endif .endm .macro shift_store_4 type, strd, r0, r1 .ifc \type, put sqrshrun_b 6, \r0, \r1 st_s \strd, \r0, \r1 .else srshr_h 2, \r0, \r1 st_d \strd, \r0, \r1 .endif .endm .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 st1 {\r0\wd}, [x0], \strd st1 {\r1\wd}, [x8], \strd .ifnb \r2 st1 {\r2\wd}, [x0], \strd st1 {\r3\wd}, [x8], \strd .endif .ifnb \r4 st1 {\r4\wd}, [x0], \strd st1 {\r5\wd}, [x8], \strd st1 {\r6\wd}, [x0], \strd st1 {\r7\wd}, [x8], \strd .endif .endm .macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .8b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro shift_store_8 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_b 6, \r0, \r1, \r2, \r3 st_8b \strd, \r0, \r1, \r2, \r3 .else srshr_h 2, \r0, \r1, \r2, \r3 st_16b \strd, \r0, \r1, \r2, \r3 .endif .endm .macro shift_store_16 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun \r0\().8b, \r0\().8h, #6 sqrshrun2 \r0\().16b, \r1\().8h, #6 sqrshrun \r2\().8b, \r2\().8h, #6 sqrshrun2 \r2\().16b, \r3\().8h, #6 st_16b \strd, \r0, \r2 .else srshr_h 2, \r0, \r1, \r2, \r3 st1 {\r0\().8h, \r1\().8h}, [x0], \strd st1 {\r2\().8h, \r3\().8h}, [x8], \strd .endif .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_8bpc_neon, export=1 mov x8, \type_h mov x9, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. 
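// The constants below pack a vertical and a horizontal filter-set index into
// one word: type = ((vert * 15) << 7) | (horiz * 15).  In \type\()_8tap_neon
// further down, mx/my (0..15) are multiplied by 0x4081 = (1<<14)|(1<<7)|1,
// which replicates them into bits [0:6], [7:13] and bit 14 upwards; adding
// the type constant then turns the low two fields into "filter_set*15 +
// subpel" row indices for the 4-tap (w or h <= 4) and 8-tap variants of
// X(mc_subpel_filters) (8 bytes per row, hence the uxtw #3 and the -8 movrel
// offset, since subpel index 0 means no filtering).  The untouched top field
// is still the raw subpel offset, so `tst mx, #(0x7f << 14)` simply asks
// whether there is any horizontal fraction at all.  Roughly:
//
//   unsigned packed   = mx * 0x4081 + REGULAR;    /* or SMOOTH / SHARP */
//   int      idx_8tap = (packed >> 7) & 0x7f;
//   int      idx_4tap =  packed       & 0x7f;
//   const int8_t *row = mc_subpel_filters_base - 8 + 8 * idx;
//
// (a sketch; mc_subpel_filters_base stands in for the movrel'd table address,
// and idx is whichever of the two fields matches the block size).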
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon mov w10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w10 mul \my, \my, w10 add \mx, \mx, w8 // mx, 8tap_h, 4tap_h add \my, \my, w9 // my, 8tap_v, 4tap_v .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz w8, \w tst \mx, #(0x7f << 14) sub w8, w8, #24 movrel x10, X(mc_subpel_filters), -8 b.ne L(\type\()_8tap_h) tst \my, #(0x7f << 14) b.ne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w9 4: tst \my, #(0x7f << 14) add \xmx, x10, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) adr x9, L(\type\()_8tap_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd uxtl v4.8h, v4.8b uxtl v6.8h, v6.8b ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s mul v3.4h, v3.4h, v0.h[0] mla v3.4h, v4.4h, v0.h[1] mla v3.4h, v6.4h, v0.h[2] mla v3.4h, v7.4h, v0.h[3] srshr v3.4h, v3.4h, #2 sqrshrun v3.8b, v3.8h, #4 st1 {v3.h}[0], [\dst], \d_strd st1 {v3.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #1 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8b}, [\src], \s_strd ld1 {v20.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v20.8h, v20.8b ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 mul v16.4h, v16.4h, v0.h[0] mla v16.4h, v17.4h, v0.h[1] mla v16.4h, v18.4h, v0.h[2] mla v16.4h, v19.4h, v0.h[3] mul v20.4h, v20.4h, v0.h[0] mla v20.4h, v21.4h, v0.h[1] mla v20.4h, v22.4h, v0.h[2] mla v20.4h, v23.4h, v0.h[3] srshr v16.4h, v16.4h, #2 srshr v20.4h, v20.4h, #2 .ifc \type, put sqrshrun v16.8b, v16.8h, #4 sqrshrun v20.8b, v20.8h, #4 st1 {v16.s}[0], [\dst], \d_strd st1 {v20.s}[0], [\ds2], \d_strd .else st1 {v16.4h}, [\dst], \d_strd st1 {v20.4h}, [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 8: ld1 {v16.8b, v17.8b}, [\src], \s_strd ld1 {v20.8b, v21.8b}, [\sr2], \s_strd uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b mul v18.8h, v16.8h, v0.h[0] mul v22.8h, v20.8h, v0.h[0] .irpc i, 1234567 ext v19.16b, v16.16b, v17.16b, #(2*\i) ext 
v23.16b, v20.16b, v21.16b, #(2*\i) mla v18.8h, v19.8h, v0.h[\i] mla v22.8h, v23.8h, v0.h[\i] .endr subs \h, \h, #2 srshr v18.8h, v18.8h, #2 srshr v22.8h, v22.8h, #2 .ifc \type, put sqrshrun v18.8b, v18.8h, #4 sqrshrun v22.8b, v22.8h, #4 st1 {v18.8b}, [\dst], \d_strd st1 {v22.8b}, [\ds2], \d_strd .else st1 {v18.8h}, [\dst], \d_strd st1 {v22.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #3 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw .endif 161: ld1 {v16.8b, v17.8b, v18.8b}, [\src], #24 ld1 {v20.8b, v21.8b, v22.8b}, [\sr2], #24 mov \mx, \w uxtl v16.8h, v16.8b uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v20.8h, v20.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b 16: mul v24.8h, v16.8h, v0.h[0] mul v25.8h, v17.8h, v0.h[0] mul v26.8h, v20.8h, v0.h[0] mul v27.8h, v21.8h, v0.h[0] .irpc i, 1234567 ext v28.16b, v16.16b, v17.16b, #(2*\i) ext v29.16b, v17.16b, v18.16b, #(2*\i) ext v30.16b, v20.16b, v21.16b, #(2*\i) ext v31.16b, v21.16b, v22.16b, #(2*\i) mla v24.8h, v28.8h, v0.h[\i] mla v25.8h, v29.8h, v0.h[\i] mla v26.8h, v30.8h, v0.h[\i] mla v27.8h, v31.8h, v0.h[\i] .endr srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 srshr v26.8h, v26.8h, #2 srshr v27.8h, v27.8h, #2 subs \mx, \mx, #16 .ifc \type, put sqrshrun v24.8b, v24.8h, #4 sqrshrun2 v24.16b, v25.8h, #4 sqrshrun v26.8b, v26.8h, #4 sqrshrun2 v26.16b, v27.8h, #4 st1 {v24.16b}, [\dst], #16 st1 {v26.16b}, [\ds2], #16 .else st1 {v24.8h, v25.8h}, [\dst], #32 st1 {v26.8h, v27.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b ld1 {v17.8b, v18.8b}, [\src], #16 ld1 {v21.8b, v22.8b}, [\sr2], #16 uxtl v17.8h, v17.8b uxtl v18.8h, v18.8b uxtl v21.8h, v21.8b uxtl v22.8h, v22.8b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_8tap_h_tbl): .hword L(\type\()_8tap_h_tbl) - 1280b .hword L(\type\()_8tap_h_tbl) - 640b .hword L(\type\()_8tap_h_tbl) - 320b .hword L(\type\()_8tap_h_tbl) - 160b .hword L(\type\()_8tap_h_tbl) - 80b .hword L(\type\()_8tap_h_tbl) - 40b .hword L(\type\()_8tap_h_tbl) - 20b .hword 0 L(\type\()_8tap_v): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 adr x9, L(\type\()_8tap_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_h v1, v2, v3, v4, v5 b.gt 24f uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .4h sqrshrun_b 6, v6 st_h \d_strd, v6, 2 ret 24: // 2x4 v load_h \sr2, \src, \s_strd, v6, v7 interleave_1_h v5, v6, v7 interleave_2_s v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .8h sqrshrun_b 6, v6 st_h \d_strd, v6, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b load_h \src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_h v1, v2, v3, v4, v5 interleave_1_h v5, v6, v7 interleave_2_s 
v1, v2, v3, v4, v5, v6 uxtl_b v1, v2, v3, v4 216: subs \h, \h, #4 load_h \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_h v7, v16, v17, v18, v19 interleave_2_s v5, v6, v7, v16, v17, v18 uxtl_b v5, v6, v7, v16 mul_mla_8_0 v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_h \sr2, \src, \s_strd, v16, v17 interleave_1_h v7, v16, v17 uxtl_b v5, v6, v7, v16 mul_mla_8_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_b 6, v30 st_h \d_strd, v30, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4 mul_mla_4 v6, v1, v2, v3, v4, .8h shift_store_4 \type, \d_strd, v6 b.le 0f load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 uxtl_b v5, v6 mul_mla_4 v7, v3, v4, v5, v6, .8h shift_store_4 \type, \d_strd, v7 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_s \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 interleave_1_s v16, v17, v18 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v16, v17 uxtl_b v18, v19, v20, v21 48: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v23, v24, v25, v26 interleave_1_s v22, v23, v24, v25, v26 uxtl_b v22, v23, v24, v25 mul_mla_8_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 shift_store_4 \type, \d_strd, v1, v2 b.le 0f load_s \sr2, \src, \s_strd, v27, v16 subs \h, \h, #2 interleave_1_s v26, v27, v16 uxtl_b v26, v27 mul_mla_8_0 v1, v20, v21, v22, v23, v24, v25, v26, v27 shift_store_4 \type, \d_strd, v1 b.le 0f load_s \sr2, \src, \s_strd, v17, v18 subs \h, \h, #2 interleave_1_s v16, v17, v18 uxtl_b v16, v17 mul_mla_8_0 v2, v22, v23, v24, v25, v26, v27, v16, v17 shift_store_4 \type, \d_strd, v2 b.le 0f subs \h, \h, #4 load_s \sr2, \src, \s_strd, v19, v20, v21, v22 interleave_1_s v18, v19, v20, v21, v22 uxtl_b v18, v19, v20, v21 mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21 shift_store_4 \type, \d_strd, v1, v2 b.gt 48b 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl_b v1, v2, v3, v4, v5 mul_mla_4 v6, v1, v2, v3, v4, .8h mul_mla_4 v7, v2, v3, v4, v5, .8h shift_store_8 \type, \d_strd, v6, v7 b.le 0f load_8b \sr2, \src, \s_strd, v6, v7 uxtl_b v6, v7 mul_mla_4 v1, v3, v4, v5, v6, .8h mul_mla_4 v2, v4, v5, v6, v7, .8h shift_store_8 \type, \d_strd, v1, v2 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
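// This wide vertical path keeps the last eight input rows live in registers
// (v16-v27) and slides that window down two or four rows per iteration via
// the mov chains below; the arithmetic is a plain 8-tap FIR down each column.
// A scalar sketch (iclip_u8 clamps to 0..255, fv[] are the 8 vertical taps):
//
//   for (int y = 0; y < h; y++)
//       for (int x = 0; x < w; x++) {
//           int sum = 0;
//           for (int k = 0; k < 8; k++)
//               sum += fv[k] * src[(y + k - 3) * src_stride + x];
//           /* put:  dst[y*dst_stride + x] = iclip_u8((sum + 32) >> 6); */
//           /* prep: tmp[y*w + x]          = (int16_t)((sum + 2) >> 2); */
//       }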
640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8b \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 uxtl_b v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v23, v24 uxtl_b v23, v24 mul_mla_8_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v25, v26 uxtl_b v25, v26 mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v27, v16 uxtl_b v27, v16 mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16 shift_store_8 \type, \d_strd, v1, v2 b.le 9f subs \h, \h, #2 load_8b \sr2, \src, \s_strd, v17, v18 uxtl_b v17, v18 mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18 shift_store_8 \type, \d_strd, v3, v4 b.le 9f subs \h, \h, #4 load_8b \sr2, \src, \s_strd, v19, v20, v21, v22 uxtl_b v19, v20, v21, v22 mul_mla_8_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20 mul_mla_8_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.gt 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b cmp \h, #2 load_16b \src, \sr2, \s_strd, v1, v2, v3, v4, v5 uxtl v16.8h, v1.8b uxtl v17.8h, v2.8b uxtl v18.8h, v3.8b uxtl v19.8h, v4.8b uxtl v20.8h, v5.8b uxtl2 v23.8h, v1.16b uxtl2 v24.8h, v2.16b uxtl2 v25.8h, v3.16b uxtl2 v26.8h, v4.16b uxtl2 v27.8h, v5.16b mul_mla_4 v1, v16, v17, v18, v19, .8h mul_mla_4 v16, v17, v18, v19, v20, .8h mul_mla_4 v2, v23, v24, v25, v26, .8h mul_mla_4 v17, v24, v25, v26, v27, .8h shift_store_16 \type, \d_strd, v1, v2, v16, v17 b.le 0f load_16b \sr2, \src, \s_strd, v6, v7 uxtl v21.8h, v6.8b uxtl v22.8h, v7.8b uxtl2 v28.8h, v6.16b uxtl2 v29.8h, v7.16b mul_mla_4 v1, v18, v19, v20, v21, .8h mul_mla_4 v3, v19, v20, v21, v22, .8h mul_mla_4 v2, v25, v26, v27, v28, .8h mul_mla_4 v4, v26, v27, v28, v29, .8h shift_store_16 \type, \d_strd, v1, v2, v3, v4 0: ret L(\type\()_8tap_v_tbl): .hword L(\type\()_8tap_v_tbl) - 1280b .hword L(\type\()_8tap_v_tbl) - 640b .hword L(\type\()_8tap_v_tbl) - 320b .hword L(\type\()_8tap_v_tbl) - 160b .hword L(\type\()_8tap_v_tbl) - 80b .hword L(\type\()_8tap_v_tbl) - 40b .hword L(\type\()_8tap_v_tbl) - 20b .hword 0 L(\type\()_8tap_hv): cmp \h, #4 ubfx w9, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w9 4: add \xmy, x10, \my, uxtw #3 adr x9, L(\type\()_8tap_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 280f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] // 2x2, 2x4 hv sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, 
v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_8tap_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b 2: bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v28.8b}, [\src], \s_strd uxtl v28.8h, v28.8b ext v29.16b, v28.16b, v28.16b, #2 mul v28.4h, v28.4h, v0.4h mul v29.4h, v29.4h, v0.4h addp v28.4h, v28.4h, v29.4h addp v16.4h, v28.4h, v28.4h srshr v16.4h, v16.4h, #2 bl L(\type\()_8tap_filter_2) trn1 v16.2s, v16.2s, v28.2s mov v17.8b, v28.8b bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v28.8b, #4 mov v19.8b, v28.8b bl L(\type\()_8tap_filter_2) ext v20.8b, v19.8b, v28.8b, #4 mov v21.8b, v28.8b 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v28.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqxtun v2.8b, v2.8h subs \h, \h, #2 st1 {v2.h}[0], [\dst], \d_strd st1 {v2.h}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b b 28b 0: ret x15 L(\type\()_8tap_filter_2): ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v30.8h, v30.8b ext v29.16b, v28.16b, v28.16b, #2 ext v31.16b, v30.16b, v30.16b, #2 trn1 v27.2s, v28.2s, v30.2s trn2 v30.2s, v28.2s, v30.2s trn1 v28.2s, v29.2s, v31.2s trn2 v31.2s, v29.2s, v31.2s mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla v27.4h, v30.4h, v0.h[2] mla v27.4h, v31.4h, v0.h[3] srshr v28.4h, v27.4h, #2 ret .endif 40: AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] sub \sr2, \src, #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 bl L(\type\()_8tap_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b 4: bl L(\type\()_8tap_filter_4) // Interleaving the mul/mla chains actually hurts performance // significantly on Cortex A53, thus keeping mul/mla tightly // chained like this. 
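// The hv paths run the horizontal 8-tap first (see _8tap_filter_4/_8 below:
// 16-bit intermediates, rounded with srshr #2) and then apply the vertical
// 8-tap to those rows in 32 bits with smull/smlal, narrowing once at the end
// with sqrshrn #\shift_hv; the filter_fn invocations further down pass
// shift_hv = 10 for put and 6 for prep.  Per output pixel, roughly:
//
//   int     mid[8];                /* srshr #2 horizontal results  */
//   int32_t v = 0;
//   for (int k = 0; k < 8; k++)
//       v += fv[k] * mid[k];
//   /* put  */ dst = iclip_u8((v + (1 << 9)) >> 10);
//   /* prep */ tmp = (int16_t)((v + (1 << 5)) >> 6);
//
// The bilin_* paths further down follow the same two-stage layout with 2-tap
// weights (16 - frac, frac) and correspondingly smaller final shifts.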
smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v28.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v28.4h, v1.h[2] smlal v3.4s, v29.4h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v2.s}[0], [\dst], \d_strd st1 {v3.s}[0], [\ds2], \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b mov v17.8b, v28.8b mov v18.8b, v29.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #1 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v26.8b}, [\src], \s_strd uxtl v26.8h, v26.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] srshr v16.4h, v31.4h, #2 bl L(\type\()_8tap_filter_4) mov v17.8b, v28.8b mov v18.8b, v29.8b bl L(\type\()_8tap_filter_4) mov v19.8b, v28.8b mov v20.8b, v29.8b bl L(\type\()_8tap_filter_4) mov v21.8b, v28.8b mov v22.8b, v29.8b 48: bl L(\type\()_8tap_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal v2.4s, v28.4h, v1.h[7] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v19.4h, v1.h[2] smlal v3.4s, v20.4h, v1.h[3] smlal v3.4s, v21.4h, v1.h[4] smlal v3.4s, v22.4h, v1.h[5] smlal v3.4s, v28.4h, v1.h[6] smlal v3.4s, v29.4h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn v3.4h, v3.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h st1 {v2.s}[0], [\dst], \d_strd st1 {v3.s}[0], [\ds2], \d_strd .else st1 {v2.4h}, [\dst], \d_strd st1 {v3.4h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v28.8b mov v22.8b, v29.8b b 48b 0: ret x15 L(\type\()_8tap_filter_4): ld1 {v26.8b}, [\sr2], \s_strd ld1 {v27.8b}, [\src], \s_strd uxtl v26.8h, v26.8b uxtl v27.8h, v27.8b ext v28.16b, v26.16b, v26.16b, #2 ext v29.16b, v26.16b, v26.16b, #4 ext v30.16b, v26.16b, v26.16b, #6 mul v31.4h, v26.4h, v0.h[0] mla v31.4h, v28.4h, v0.h[1] mla v31.4h, v29.4h, v0.h[2] mla v31.4h, v30.4h, v0.h[3] ext v28.16b, v27.16b, v27.16b, #2 ext v29.16b, v27.16b, v27.16b, #4 ext v30.16b, v27.16b, v27.16b, #6 mul v27.4h, v27.4h, v0.h[0] mla v27.4h, v28.4h, v0.h[1] mla v27.4h, v29.4h, v0.h[2] mla v27.4h, v30.4h, v0.h[3] srshr v28.4h, v31.4h, #2 srshr v29.4h, v27.4h, #2 ret 80: 160: 320: AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] ld1 {v1.s}[0], [\xmy] sub \src, \src, #3 sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 
v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v24.4h, v1.h[2] smlal2 v5.4s, v24.8h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smlal2 v3.4s, v24.8h, v1.h[3] smlal v4.4s, v25.4h, v1.h[3] smlal2 v5.4s, v25.8h, v1.h[3] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b mov v17.16b, v24.16b mov v18.16b, v25.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv 640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #3 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 bl L(\type\()_8tap_filter_8_first) bl L(\type\()_8tap_filter_8) mov v17.16b, v24.16b mov v18.16b, v25.16b bl L(\type\()_8tap_filter_8) mov v19.16b, v24.16b mov v20.16b, v25.16b bl L(\type\()_8tap_filter_8) mov v21.16b, v24.16b mov v22.16b, v25.16b 88: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v24.4h, v1.h[6] smlal2 v5.4s, v24.8h, v1.h[6] smlal v2.4s, v24.4h, v1.h[7] smlal2 v3.4s, v24.8h, v1.h[7] smlal v4.4s, v25.4h, v1.h[7] smlal2 v5.4s, v25.8h, v1.h[7] sqrshrn v2.4h, v2.4s, #\shift_hv sqrshrn2 v2.8h, v3.4s, #\shift_hv sqrshrn v4.4h, v4.4s, #\shift_hv sqrshrn2 v4.8h, v5.4s, #\shift_hv subs \h, \h, #2 .ifc \type, put sqxtun v2.8b, v2.8h sqxtun v4.8b, v4.8h st1 {v2.8b}, [\dst], \d_strd st1 {v4.8b}, [\ds2], \d_strd .else st1 {v2.8h}, [\dst], \d_strd st1 {v4.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v24.16b mov v22.16b, v25.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 168b 0: ret x15 L(\type\()_8tap_filter_8_first): ld1 {v28.8b, v29.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b mul v16.8h, v28.8h, v0.h[0] ext v24.16b, v28.16b, v29.16b, #(2*1) ext v25.16b, v28.16b, v29.16b, #(2*2) ext v26.16b, 
v28.16b, v29.16b, #(2*3) ext v27.16b, v28.16b, v29.16b, #(2*4) mla v16.8h, v24.8h, v0.h[1] mla v16.8h, v25.8h, v0.h[2] mla v16.8h, v26.8h, v0.h[3] mla v16.8h, v27.8h, v0.h[4] ext v24.16b, v28.16b, v29.16b, #(2*5) ext v25.16b, v28.16b, v29.16b, #(2*6) ext v26.16b, v28.16b, v29.16b, #(2*7) mla v16.8h, v24.8h, v0.h[5] mla v16.8h, v25.8h, v0.h[6] mla v16.8h, v26.8h, v0.h[7] srshr v16.8h, v16.8h, #2 ret L(\type\()_8tap_filter_8): ld1 {v28.8b, v29.8b}, [\sr2], \s_strd ld1 {v30.8b, v31.8b}, [\src], \s_strd uxtl v28.8h, v28.8b uxtl v29.8h, v29.8b uxtl v30.8h, v30.8b uxtl v31.8h, v31.8b mul v24.8h, v28.8h, v0.h[0] mul v25.8h, v30.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v28.16b, v29.16b, #(2*\i) ext v27.16b, v30.16b, v31.16b, #(2*\i) mla v24.8h, v26.8h, v0.h[\i] mla v25.8h, v27.8h, v0.h[\i] .endr srshr v24.8h, v24.8h, #2 srshr v25.8h, v25.8h, #2 ret L(\type\()_8tap_hv_tbl): .hword L(\type\()_8tap_hv_tbl) - 1280b .hword L(\type\()_8tap_hv_tbl) - 640b .hword L(\type\()_8tap_hv_tbl) - 320b .hword L(\type\()_8tap_hv_tbl) - 160b .hword L(\type\()_8tap_hv_tbl) - 80b .hword L(\type\()_8tap_hv_tbl) - 40b .hword L(\type\()_8tap_hv_tbl) - 20b .hword 0 endfunc function \type\()_bilin_8bpc_neon, export=1 dup v1.16b, \mx dup v3.16b, \my mov w9, #16 sub w8, w9, \mx sub w9, w9, \my dup v0.16b, w8 dup v2.16b, w9 .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz w8, \w sub w8, w8, #24 cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) adr x9, L(\type\()_bilin_h_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: ld1 {v4.s}[0], [\src], \s_strd ld1 {v6.s}[0], [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.4h, v4.4h, v6.4h trn1 v5.4h, v5.4h, v7.4h subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: ld1 {v4.8b}, [\src], \s_strd ld1 {v6.8b}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #1 ext v7.8b, v6.8b, v6.8b, #1 trn1 v4.2s, v4.2s, v6.2s trn1 v5.2s, v5.2s, v7.2s subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umlal v4.8h, v5.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: ld1 {v4.16b}, [\src], \s_strd ld1 {v6.16b}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #1 ext v7.16b, v6.16b, v6.16b, #1 subs \h, \h, #2 umull v4.8h, v4.8b, v0.8b umull v6.8h, v6.8b, v0.8b umlal v4.8h, v5.8b, v1.8b umlal v6.8h, v7.8b, v1.8b .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v6.8b, v6.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v6.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v6.8h}, [\ds2], \d_strd .endif b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, uxtw sub \s_strd, \s_strd, #8 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw .endif 161: ld1 {v16.d}[1], [\src], #8 ld1 {v20.d}[1], [\sr2], #8 mov \mx, \w 16: ld1 {v18.16b}, [\src], #16 ld1 {v22.16b}, [\sr2], #16 ext v17.16b, v16.16b, v18.16b, #8 ext v19.16b, v16.16b, v18.16b, #9 ext v21.16b, v20.16b, v22.16b, #8 ext v23.16b, v20.16b, v22.16b, #9 umull v16.8h, v17.8b, v0.8b umull2 v17.8h, v17.16b, v0.16b umull v20.8h, v21.8b, v0.8b umull2 v21.8h, v21.16b, v0.16b umlal v16.8h, v19.8b, v1.8b umlal2 v17.8h, v19.16b, v1.16b umlal v20.8h, v23.8b, v1.8b umlal2 v21.8h, v23.16b, v1.16b subs \mx, \mx, #16 .ifc \type, put uqrshrn v16.8b, v16.8h, #4 uqrshrn2 v16.16b, v17.8h, #4 uqrshrn v20.8b, v20.8h, #4 uqrshrn2 v20.16b, v21.8h, #4 st1 {v16.16b}, [\dst], #16 st1 {v20.16b}, [\ds2], #16 .else st1 {v16.8h, v17.8h}, [\dst], #32 st1 {v20.8h, v21.8h}, [\ds2], #32 .endif b.le 9f mov v16.16b, v18.16b mov v20.16b, v22.16b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_bilin_h_tbl): .hword L(\type\()_bilin_h_tbl) - 1280b .hword L(\type\()_bilin_h_tbl) - 640b .hword L(\type\()_bilin_h_tbl) - 320b .hword L(\type\()_bilin_h_tbl) - 160b .hword L(\type\()_bilin_h_tbl) - 80b .hword L(\type\()_bilin_h_tbl) - 40b .hword L(\type\()_bilin_h_tbl) - 20b .hword 0 L(\type\()_bilin_v): cmp \h, #4 adr x9, L(\type\()_bilin_v_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v ld1 {v16.h}[0], [\src], \s_strd b.gt 24f 22: ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst] st1 {v4.h}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... 
v ld1 {v17.h}[0], [\sr2], \s_strd ld1 {v18.h}[0], [\src], \s_strd ld1 {v19.h}[0], [\sr2], \s_strd ld1 {v20.h}[0], [\src], \s_strd sub \h, \h, #4 trn1 v16.4h, v16.4h, v17.4h trn1 v17.4h, v17.4h, v18.4h trn1 v18.4h, v18.4h, v19.4h trn1 v19.4h, v19.4h, v20.4h trn1 v16.2s, v16.2s, v18.2s trn1 v17.2s, v17.2s, v19.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b cmp \h, #2 uqrshrn v4.8b, v4.8h, #4 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd st1 {v4.h}[2], [\dst], \d_strd st1 {v4.h}[3], [\ds2], \d_strd b.lt 0f mov v16.8b, v20.8b b.eq 22b b 24b 0: ret .endif 40: // 4xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.s}[0], [\src], \s_strd 4: ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s umull v4.8h, v16.8b, v2.8b umlal v4.8h, v17.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 4b 0: ret 80: // 8xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8b}, [\src], \s_strd 8: ld1 {v17.8b}, [\sr2], \s_strd ld1 {v18.8b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull v5.8h, v17.8b, v2.8b umlal v4.8h, v17.8b, v3.8b umlal v5.8h, v18.8b, v3.8b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn v5.8b, v5.8h, #4 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 0f mov v16.8b, v18.8b b 8b 0: ret 160: // 16xN, 32xN, ... 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.16b}, [\src], \s_strd 2: ld1 {v17.16b}, [\sr2], \s_strd ld1 {v18.16b}, [\src], \s_strd umull v4.8h, v16.8b, v2.8b umull2 v5.8h, v16.16b, v2.16b umull v6.8h, v17.8b, v2.8b umull2 v7.8h, v17.16b, v2.16b umlal v4.8h, v17.8b, v3.8b umlal2 v5.8h, v17.16b, v3.16b umlal v6.8h, v18.8b, v3.8b umlal2 v7.8h, v18.16b, v3.16b subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #4 uqrshrn2 v4.16b, v5.8h, #4 uqrshrn v6.8b, v6.8h, #4 uqrshrn2 v6.16b, v7.8h, #4 st1 {v4.16b}, [\dst], \d_strd st1 {v6.16b}, [\ds2], \d_strd .else st1 {v4.8h, v5.8h}, [\dst], \d_strd st1 {v6.8h, v7.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #16 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 .ifc \type, put add \dst, \dst, #16 .else add \dst, \dst, #32 .endif b 1b 0: ret L(\type\()_bilin_v_tbl): .hword L(\type\()_bilin_v_tbl) - 1280b .hword L(\type\()_bilin_v_tbl) - 640b .hword L(\type\()_bilin_v_tbl) - 320b .hword L(\type\()_bilin_v_tbl) - 160b .hword L(\type\()_bilin_v_tbl) - 80b .hword L(\type\()_bilin_v_tbl) - 40b .hword L(\type\()_bilin_v_tbl) - 20b .hword 0 L(\type\()_bilin_hv): uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b adr x9, L(\type\()_bilin_hv_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw br x9 20: // 2xN hv AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.s}[0], [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, 
v29.8b, v1.8b 2: ld1 {v28.s}[0], [\sr2], \s_strd ld1 {v30.s}[0], [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.4h, v28.4h, v30.4h trn1 v29.4h, v29.4h, v31.4h umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2s, v16.2s, v17.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h uqrshrn v4.8b, v4.8h, #8 subs \h, \h, #2 st1 {v4.h}[0], [\dst], \d_strd st1 {v4.h}[1], [\ds2], \d_strd b.le 0f trn2 v16.2s, v17.2s, v17.2s b 2b 0: ret .endif 40: // 4xN hv AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 4: ld1 {v28.8b}, [\sr2], \s_strd ld1 {v30.8b}, [\src], \s_strd ext v29.8b, v28.8b, v28.8b, #1 ext v31.8b, v30.8b, v30.8b, #1 trn1 v28.2s, v28.2s, v30.2s trn1 v29.2s, v29.2s, v31.2s umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b trn1 v16.2d, v16.2d, v17.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd .endif b.le 0f trn2 v16.2d, v17.2d, v17.2d b 4b 0: ret 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v28.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 umull v16.8h, v28.8b, v0.8b umlal v16.8h, v29.8b, v1.8b 2: ld1 {v28.16b}, [\sr2], \s_strd ld1 {v30.16b}, [\src], \s_strd ext v29.16b, v28.16b, v28.16b, #1 ext v31.16b, v30.16b, v30.16b, #1 umull v17.8h, v28.8b, v0.8b umlal v17.8h, v29.8b, v1.8b umull v18.8h, v30.8b, v0.8b umlal v18.8h, v31.8b, v1.8b mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v18.8h, v3.8h subs \h, \h, #2 .ifc \type, put uqrshrn v4.8b, v4.8h, #8 uqrshrn v5.8b, v5.8h, #8 st1 {v4.8b}, [\dst], \d_strd st1 {v5.8b}, [\ds2], \d_strd .else urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd .endif b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #8 .ifc \type, put add \dst, \dst, #8 .else add \dst, \dst, #16 .endif b 1b 0: ret L(\type\()_bilin_hv_tbl): .hword L(\type\()_bilin_hv_tbl) - 1280b .hword L(\type\()_bilin_hv_tbl) - 640b .hword L(\type\()_bilin_hv_tbl) - 320b .hword L(\type\()_bilin_hv_tbl) - 160b .hword L(\type\()_bilin_hv_tbl) - 80b .hword L(\type\()_bilin_hv_tbl) - 40b .hword L(\type\()_bilin_hv_tbl) - 20b .hword 0 endfunc .endm filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10 filter_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6 .macro load_filter_row dst, src, inc asr w13, \src, #10 add \src, \src, \inc ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon add w12, w5, #512 ld1 {v16.8b, v17.8b}, [x2], x3 load_filter_row d0, w12, w7 load_filter_row d1, w12, w7 load_filter_row d2, w12, w7 load_filter_row d3, w12, w7 load_filter_row d4, w12, w7 load_filter_row d5, w12, w7 load_filter_row d6, w12, w7 // subtract by 128 to allow using smull eor v16.8b, v16.8b, v22.8b eor v17.8b, v17.8b, v22.8b load_filter_row d7, w12, w7 ext v18.8b, v16.8b, v17.8b, #1 ext 
v19.8b, v16.8b, v17.8b, #2 smull v0.8h, v0.8b, v16.8b smull v1.8h, v1.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #3 ext v20.8b, v16.8b, v17.8b, #4 smull v2.8h, v2.8b, v19.8b smull v3.8h, v3.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #5 ext v19.8b, v16.8b, v17.8b, #6 smull v4.8h, v4.8b, v20.8b smull v5.8h, v5.8b, v18.8b ext v18.8b, v16.8b, v17.8b, #7 smull v6.8h, v6.8b, v19.8b smull v7.8h, v7.8b, v18.8b addp v0.8h, v0.8h, v1.8h addp v2.8h, v2.8h, v3.8h addp v4.8h, v4.8h, v5.8h addp v6.8h, v6.8h, v7.8h addp v0.8h, v0.8h, v2.8h addp v4.8h, v4.8h, v6.8h addp v0.8h, v0.8h, v4.8h add w5, w5, w8 ret endfunc // void dav1d_warp_affine_8x8_8bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my) .macro warp t, shift function warp_affine_8x8\t\()_8bpc_neon, export=1 ldr x4, [x4] sbfx x7, x4, #0, #16 sbfx x8, x4, #16, #16 sbfx x9, x4, #32, #16 sbfx x4, x4, #48, #16 mov w10, #8 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x2, x2, #3 movrel x11, X(mc_warp_filter), 64*8 mov x15, x30 .ifnb \t lsl x1, x1, #1 .endif movi v22.8b, #128 .ifb \t movi v23.8h, #128 .else movi v23.8h, #8, lsl #8 .endif bl warp_filter_horz_neon srshr v24.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v25.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v26.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v27.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v28.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v29.8h, v0.8h, #3 bl warp_filter_horz_neon srshr v30.8h, v0.8h, #3 1: add w14, w6, #512 bl warp_filter_horz_neon srshr v31.8h, v0.8h, #3 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 load_filter_row d2, w14, w9 load_filter_row d3, w14, w9 load_filter_row d4, w14, w9 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
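        // Descriptive note (added; `mid` and `filt` are illustrative names,
        // not from the source): v24-v31 hold the eight horizontally filtered
        // source rows (each already rounding-shifted right by 3 via srshr),
        // and after the transpose above v0-v7 hold tap k of every output
        // column's vertical filter. The accumulation below is, per column x:
        //   acc[x] = mid[0][x]*filt[x][0] + ... + mid[7][x]*filt[x][7]
        // with the low .4h halves gathered into v16 and the high halves into
        // v17 before the narrowing/rounding further down.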
smull v16.4s, v24.4h, v0.4h smlal v16.4s, v25.4h, v1.4h smlal v16.4s, v26.4h, v2.4h smlal v16.4s, v27.4h, v3.4h smlal v16.4s, v28.4h, v4.4h smlal v16.4s, v29.4h, v5.4h smlal v16.4s, v30.4h, v6.4h smlal v16.4s, v31.4h, v7.4h smull2 v17.4s, v24.8h, v0.8h smlal2 v17.4s, v25.8h, v1.8h smlal2 v17.4s, v26.8h, v2.8h smlal2 v17.4s, v27.8h, v3.8h smlal2 v17.4s, v28.8h, v4.8h smlal2 v17.4s, v29.8h, v5.8h smlal2 v17.4s, v30.8h, v6.8h smlal2 v17.4s, v31.8h, v7.8h mov v24.16b, v25.16b mov v25.16b, v26.16b sqrshrn v16.4h, v16.4s, #\shift mov v26.16b, v27.16b sqrshrn2 v16.8h, v17.4s, #\shift mov v27.16b, v28.16b mov v28.16b, v29.16b add v16.8h, v16.8h, v23.8h .ifb \t sqxtun v16.8b, v16.8h .endif mov v29.16b, v30.16b mov v30.16b, v31.16b subs w10, w10, #1 .ifnb \t st1 {v16.8h}, [x0], x1 .else st1 {v16.8b}, [x0], x1 .endif add w6, w6, w4 b.gt 1b ret x15 endfunc .endm warp , 11 warp t, 7 // void dav1d_emu_edge_8bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_8bpc_neon, export=1 ldp x8, x9, [sp] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub x12, x3, #1 // ih - 1 cmp x5, x3 sub x13, x2, #1 // iw - 1 csel x12, x12, x5, ge // min(y, ih - 1) cmp x4, x2 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) csel x13, x13, x4, ge // min(x, iw - 1) bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) madd x8, x12, x9, x8 // ref += iclip() * stride add x8, x8, x13 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add x10, x5, x1 // y + bh neg x5, x5 // -y sub x10, x10, x3 // y + bh - ih sub x12, x1, #1 // bh - 1 cmp x10, x1 bic x5, x5, x5, asr #63 // max(-y, 0) csel x10, x10, x12, lt // min(y + bh - ih, bh-1) cmp x5, x1 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) csel x5, x5, x12, lt // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add x11, x4, x0 // x + bw neg x4, x4 // -x sub x11, x11, x2 // x + bw - iw sub x13, x0, #1 // bw - 1 cmp x11, x0 bic x4, x4, x4, asr #63 // max(-x, 0) csel x11, x11, x13, lt // min(x + bw - iw, bw-1) cmp x4, x0 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub x1, x1, x5 // bh - top_ext madd x6, x5, x7, x6 sub x2, x0, x4 // bw - left_ext sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext sub x2, x2, x11 // center_w = bw - left_ext - right_ext mov x14, x6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left ld1r {v0.16b}, [x8] mov x12, x6 // out = dst mov x3, x4 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif mov x13, x8 add x12, x6, x4 // out = dst + left_ext mov x3, x2 1: ld1 {v0.16b, v1.16b}, [x13], #32 subs x3, x3, #32 st1 {v0.16b, v1.16b}, [x12], #32 b.gt 1b .if \need_right add x3, x8, x2 // in + center_w sub x3, x3, #1 // in + center_w - 1 add x12, x6, x4 // dst + left_ext ld1r {v0.16b}, [x3] add x12, x12, x2 // out = dst + left_ext + center_w mov x3, x11 1: subs x3, x3, #16 st1 {v0.16b}, [x12], #16 b.gt 1b .endif subs x1, x1, #1 // center_h-- add x6, x6, x7 add x8, x8, x9 b.gt 0b .endm cbz x4, 2f // need_left cbz x11, 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cbz x11, 4f // !need_left + need_right 
v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cbz x10, 3f // need_bottom sub x8, x6, x7 // ref = dst - stride mov x4, x0 1: ld1 {v0.16b, v1.16b}, [x8], #32 mov x3, x10 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x10, x6 // dst -= bottom_ext * stride subs x4, x4, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: cbz x5, 3f // need_top msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 1: ld1 {v0.16b, v1.16b}, [x14], #32 mov x3, x5 2: subs x3, x3, #1 st1 {v0.16b, v1.16b}, [x6], x7 b.gt 2b msub x6, x7, x5, x6 // dst -= top_ext * stride subs x0, x0, #32 // bw -= 32 add x6, x6, #32 // dst += 32 b.gt 1b 3: ret endfunc rav1e-0.7.1/src/arm/64/mc16.S000064400000000000000000004312461046102023000133520ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * Copyright © 2020, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" #include "util.S" #define PREP_BIAS 8192 .macro avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sqadd \t0\().8h, \t0\().8h, \t2\().8h sqadd \t1\().8h, \t1\().8h, \t3\().8h smax \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits smax \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t0\().8h, \t0\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sqsub \t1\().8h, \t1\().8h, v28.8h // -2*PREP_BIAS - 1 << intermediate_bits sshl \d0\().8h, \t0\().8h, v29.8h // -(intermediate_bits+1) sshl \d1\().8h, \t1\().8h, v29.8h // -(intermediate_bits+1) .endm .macro w_avg d0, d1, t0, t1, t2, t3 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 ld1 {\t2\().8h,\t3\().8h}, [x3], 32 // This difference requires a 17 bit range, and all bits are // significant for the following multiplication. 
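        // In other words (a, b and w are just shorthand here for the values
        // loaded from x2, the values loaded from x3, and the weight that the
        // caller negated into v27), the sequence below computes
        //   b + (((a - b) * w) >> 4)  ==  (a*w + b*(16 - w)) >> 4
        // i.e. the usual two-multiply weighted average folded into one
        // widening subtract plus one 32-bit multiply; it is that (a - b)
        // difference which needs the 17 bits mentioned above.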
ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v27.4s mul \t0\().4s, \t0\().4s, v27.4s mul \d1\().4s, \d1\().4s, v27.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #4 sshr \t0\().4s, \t0\().4s, #4 sshr \d1\().4s, \d1\().4s, #4 sshr \t1\().4s, \t1\().4s, #4 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max smax \d0\().8h, \d0\().8h, v30.8h // 0 smax \d1\().8h, \d1\().8h, v30.8h // 0 .endm .macro mask d0, d1, t0, t1, t2, t3 ld1 {v27.16b}, [x6], 16 ld1 {\t0\().8h,\t1\().8h}, [x2], 32 neg v27.16b, v27.16b ld1 {\t2\().8h,\t3\().8h}, [x3], 32 sxtl v26.8h, v27.8b sxtl2 v27.8h, v27.16b sxtl v24.4s, v26.4h sxtl2 v25.4s, v26.8h sxtl v26.4s, v27.4h sxtl2 v27.4s, v27.8h ssubl \d0\().4s, \t2\().4h, \t0\().4h ssubl2 \t0\().4s, \t2\().8h, \t0\().8h ssubl \d1\().4s, \t3\().4h, \t1\().4h ssubl2 \t1\().4s, \t3\().8h, \t1\().8h mul \d0\().4s, \d0\().4s, v24.4s mul \t0\().4s, \t0\().4s, v25.4s mul \d1\().4s, \d1\().4s, v26.4s mul \t1\().4s, \t1\().4s, v27.4s sshr \d0\().4s, \d0\().4s, #6 sshr \t0\().4s, \t0\().4s, #6 sshr \d1\().4s, \d1\().4s, #6 sshr \t1\().4s, \t1\().4s, #6 saddw \d0\().4s, \d0\().4s, \t2\().4h saddw2 \t0\().4s, \t0\().4s, \t2\().8h saddw \d1\().4s, \d1\().4s, \t3\().4h saddw2 \t1\().4s, \t1\().4s, \t3\().8h uzp1 \d0\().8h, \d0\().8h, \t0\().8h // Same as xtn, xtn2 uzp1 \d1\().8h, \d1\().8h, \t1\().8h // Ditto srshl \d0\().8h, \d0\().8h, v29.8h // -intermediate_bits srshl \d1\().8h, \d1\().8h, v29.8h // -intermediate_bits add \d0\().8h, \d0\().8h, v28.8h // PREP_BIAS >> intermediate_bits add \d1\().8h, \d1\().8h, v28.8h // PREP_BIAS >> intermediate_bits smin \d0\().8h, \d0\().8h, v31.8h // bitdepth_max smin \d1\().8h, \d1\().8h, v31.8h // bitdepth_max smax \d0\().8h, \d0\().8h, v30.8h // 0 smax \d1\().8h, \d1\().8h, v30.8h // 0 .endm .macro bidir_fn type, bdmax function \type\()_16bpc_neon, export=1 clz w4, w4 .ifnc \type, avg dup v31.8h, \bdmax // bitdepth_max movi v30.8h, #0 .endif clz w7, \bdmax sub w7, w7, #18 // intermediate_bits = clz(bitdepth_max) - 18 .ifc \type, avg mov w9, #1 mov w8, #-2*PREP_BIAS lsl w9, w9, w7 // 1 << intermediate_bits add w7, w7, #1 sub w8, w8, w9 // -2*PREP_BIAS - 1 << intermediate_bits neg w7, w7 // -(intermediate_bits+1) dup v28.8h, w8 // -2*PREP_BIAS - 1 << intermediate_bits dup v29.8h, w7 // -(intermediate_bits+1) .else mov w8, #PREP_BIAS lsr w8, w8, w7 // PREP_BIAS >> intermediate_bits neg w7, w7 // -intermediate_bits dup v28.8h, w8 // PREP_BIAS >> intermediate_bits dup v29.8h, w7 // -intermediate_bits .endif .ifc \type, w_avg dup v27.4s, w6 neg v27.4s, v27.4s .endif adr x7, L(\type\()_tbl) sub w4, w4, #24 \type v4, v5, v0, v1, v2, v3 ldrh w4, [x7, x4, lsl #1] sub x7, x7, w4, uxtw br x7 40: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 4: subs w5, w5, #4 st1 {v4.d}[0], [x0], x1 st1 {v4.d}[1], [x7], x1 st1 {v5.d}[0], [x0], x1 st1 {v5.d}[1], [x7], x1 b.le 
0f \type v4, v5, v0, v1, v2, v3 b 4b 80: AARCH64_VALID_JUMP_TARGET add x7, x0, x1 lsl x1, x1, #1 8: st1 {v4.8h}, [x0], x1 subs w5, w5, #2 st1 {v5.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 8b 16: AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 st1 {v4.8h, v5.8h}, [x0], x1 subs w5, w5, #2 st1 {v6.8h, v7.8h}, [x0], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 16b 32: AARCH64_VALID_JUMP_TARGET \type v6, v7, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 32b 640: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 64: \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 \type v18, v19, v0, v1, v2, v3 subs w5, w5, #1 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 64b 1280: AARCH64_VALID_JUMP_TARGET add x7, x0, #64 mov x8, #128 sub x1, x1, #128 128: \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x8 \type v18, v19, v0, v1, v2, v3 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8 \type v4, v5, v0, v1, v2, v3 \type v6, v7, v0, v1, v2, v3 \type v16, v17, v0, v1, v2, v3 subs w5, w5, #1 st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 \type v18, v19, v0, v1, v2, v3 st1 {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1 b.le 0f \type v4, v5, v0, v1, v2, v3 b 128b 0: ret L(\type\()_tbl): .hword L(\type\()_tbl) - 1280b .hword L(\type\()_tbl) - 640b .hword L(\type\()_tbl) - 32b .hword L(\type\()_tbl) - 16b .hword L(\type\()_tbl) - 80b .hword L(\type\()_tbl) - 40b endfunc .endm bidir_fn avg, w6 bidir_fn w_avg, w7 bidir_fn mask, w7 .macro w_mask_fn type function w_mask_\type\()_16bpc_neon, export=1 ldr w8, [sp] clz w9, w4 adr x10, L(w_mask_\type\()_tbl) dup v31.8h, w8 // bitdepth_max sub w9, w9, #24 clz w8, w8 // clz(bitdepth_max) ldrh w9, [x10, x9, lsl #1] sub x10, x10, w9, uxtw sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov w9, #PREP_BIAS*64 neg w8, w8 // -sh mov w11, #27615 // (64 + 1 - 38)<> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s uxtl v22.4s, v20.4h uxtl2 v23.4s, v20.8h uxtl v24.4s, v21.4h uxtl2 v25.4s, v21.8h mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) mla v5.4s, v17.4s, v23.4s mla v6.4s, v18.4s, v24.4s mla v7.4s, v19.4s, v25.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v6.4s sqxtun2 v5.8h, v7.4s umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 trn1 v24.2d, v20.2d, v21.2d trn2 v25.2d, v20.2d, v21.2d add v24.8h, v24.8h, v25.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v24.8h, v24.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.s}[0], [x6], #4 .endif st1 {v4.d}[0], [x0], x1 st1 {v4.d}[1], [x12], x1 st1 {v5.d}[0], [x0], x1 st1 {v5.d}[1], [x12], 
x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 subs w5, w5, #2 sabd v20.8h, v4.8h, v6.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v7.8h ssubl v16.4s, v6.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v6.8h, v4.8h ssubl v18.4s, v7.4h, v5.4h ssubl2 v19.4s, v7.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v7.4s, v5.8h, #6 // tmp1 << 6 sshll v6.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v6.4s, v6.4s, v30.4s add v7.4s, v7.4s, v30.4s uxtl v22.4s, v20.4h uxtl2 v23.4s, v20.8h uxtl v24.4s, v21.4h uxtl2 v25.4s, v21.8h mla v4.4s, v16.4s, v22.4s // (tmp2-tmp1)*(64-m) mla v5.4s, v17.4s, v23.4s mla v6.4s, v18.4s, v24.4s mla v7.4s, v19.4s, v25.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v6.4s, v6.4s, v29.4s srshl v7.4s, v7.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v6.4s sqxtun2 v5.8h, v7.4s umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m sub v20.16b, v1.16b, v20.16b // m st1 {v20.16b}, [x6], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) xtn v20.8b, v20.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 st1 {v20.8b}, [x6], #8 .elseif \type == 420 add v20.8h, v20.8h, v21.8h // (64 - my1) + (64 - my2) (row wise addition) addp v20.8h, v20.8h, v20.8h // (128 - m) + (128 - n) (column wise addition) sub v20.4h, v3.4h, v20.4h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.s}[0], [x6], #4 .endif st1 {v4.8h}, [x0], x1 st1 {v5.8h}, [x12], x1 b.gt 8b ret 1280: 640: 320: 160: AARCH64_VALID_JUMP_TARGET mov w11, w4 sub x1, x1, w4, uxtw #1 .if \type == 444 add x10, x6, w4, uxtw .elseif \type == 422 add x10, x6, x11, lsr #1 .endif add x9, x3, w4, uxtw #1 add x7, x2, w4, uxtw #1 161: mov w8, w4 16: ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 ld1 {v16.8h, v17.8h}, [x3], #32 // tmp2 ld1 {v6.8h, v7.8h}, [x7], #32 ld1 {v18.8h, v19.8h}, [x9], #32 subs w8, w8, #16 sabd v20.8h, v4.8h, v16.8h // abs(tmp1 - tmp2) sabd v21.8h, v5.8h, v17.8h ssubl v22.4s, v16.4h, v4.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v23.4s, v16.8h, v4.8h ssubl v24.4s, v17.4h, v5.4h ssubl2 v25.4s, v17.8h, v5.8h uqsub v20.8h, v0.8h, v20.8h // 27615 - abs() uqsub v21.8h, v0.8h, v21.8h sshll2 v27.4s, v5.8h, #6 // tmp1 << 6 sshll v26.4s, v5.4h, #6 sshll2 v5.4s, v4.8h, #6 sshll v4.4s, v4.4h, #6 ushr v20.8h, v20.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v21.8h, v21.8h, #10 add v4.4s, v4.4s, v30.4s // += PREP_BIAS*64 add v5.4s, v5.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v16.4s, v20.4h uxtl2 v17.4s, v20.8h uxtl v28.4s, v21.4h mla v4.4s, v22.4s, v16.4s // (tmp2-tmp1)*(64-m) uxtl2 v16.4s, v21.8h mla v5.4s, v23.4s, v17.4s mla v26.4s, v24.4s, v28.4s mla v27.4s, v25.4s, v16.4s srshl v4.4s, v4.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v5.4s, v5.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v4.4h, v4.4s // iclip_pixel sqxtun2 v4.8h, v5.4s sqxtun v5.4h, v26.4s sqxtun2 v5.8h, v27.4s // Start of other half sabd 
v22.8h, v6.8h, v18.8h // abs(tmp1 - tmp2) sabd v23.8h, v7.8h, v19.8h umin v4.8h, v4.8h, v31.8h // iclip_pixel umin v5.8h, v5.8h, v31.8h ssubl v16.4s, v18.4h, v6.4h // tmp2 - tmp1 (requires 17 bit) ssubl2 v17.4s, v18.8h, v6.8h ssubl v18.4s, v19.4h, v7.4h ssubl2 v19.4s, v19.8h, v7.8h uqsub v22.8h, v0.8h, v22.8h // 27615 - abs() uqsub v23.8h, v0.8h, v23.8h sshll v24.4s, v6.4h, #6 // tmp1 << 6 sshll2 v25.4s, v6.8h, #6 sshll v26.4s, v7.4h, #6 sshll2 v27.4s, v7.8h, #6 ushr v22.8h, v22.8h, #10 // 64-m = (27615 - abs()) >> mask_sh ushr v23.8h, v23.8h, #10 add v24.4s, v24.4s, v30.4s // += PREP_BIAS*64 add v25.4s, v25.4s, v30.4s add v26.4s, v26.4s, v30.4s add v27.4s, v27.4s, v30.4s uxtl v6.4s, v22.4h uxtl2 v7.4s, v22.8h uxtl v28.4s, v23.4h mla v24.4s, v16.4s, v6.4s // (tmp2-tmp1)*(64-m) uxtl2 v6.4s, v23.8h mla v25.4s, v17.4s, v7.4s mla v26.4s, v18.4s, v28.4s mla v27.4s, v19.4s, v6.4s srshl v24.4s, v24.4s, v29.4s // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh srshl v25.4s, v25.4s, v29.4s srshl v26.4s, v26.4s, v29.4s srshl v27.4s, v27.4s, v29.4s sqxtun v6.4h, v24.4s // iclip_pixel sqxtun2 v6.8h, v25.4s sqxtun v7.4h, v26.4s sqxtun2 v7.8h, v27.4s umin v6.8h, v6.8h, v31.8h // iclip_pixel umin v7.8h, v7.8h, v31.8h .if \type == 444 uzp1 v20.16b, v20.16b, v21.16b // 64 - m uzp1 v21.16b, v22.16b, v23.16b sub v20.16b, v1.16b, v20.16b // m sub v21.16b, v1.16b, v21.16b st1 {v20.16b}, [x6], #16 st1 {v21.16b}, [x10], #16 .elseif \type == 422 addp v20.8h, v20.8h, v21.8h // (64 - m) + (64 - n) (column wise addition) addp v21.8h, v22.8h, v23.8h xtn v20.8b, v20.8h xtn v21.8b, v21.8h uhsub v20.8b, v3.8b, v20.8b // ((129 - sign) - ((64 - m) + (64 - n)) >> 1 uhsub v21.8b, v3.8b, v21.8b st1 {v20.8b}, [x6], #8 st1 {v21.8b}, [x10], #8 .elseif \type == 420 add v20.8h, v20.8h, v22.8h // (64 - my1) + (64 - my2) (row wise addition) add v21.8h, v21.8h, v23.8h addp v20.8h, v20.8h, v21.8h // (128 - m) + (128 - n) (column wise addition) sub v20.8h, v3.8h, v20.8h // (256 - sign) - ((128 - m) + (128 - n)) rshrn v20.8b, v20.8h, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2 st1 {v20.8b}, [x6], #8 .endif st1 {v4.8h, v5.8h}, [x0], #32 st1 {v6.8h, v7.8h}, [x12], #32 b.gt 16b subs w5, w5, #2 add x2, x2, w4, uxtw #1 add x3, x3, w4, uxtw #1 add x7, x7, w4, uxtw #1 add x9, x9, w4, uxtw #1 .if \type == 444 add x6, x6, w4, uxtw add x10, x10, w4, uxtw .elseif \type == 422 add x6, x6, x11, lsr #1 add x10, x10, x11, lsr #1 .endif add x0, x0, x1 add x12, x12, x1 b.gt 161b ret L(w_mask_\type\()_tbl): .hword L(w_mask_\type\()_tbl) - 1280b .hword L(w_mask_\type\()_tbl) - 640b .hword L(w_mask_\type\()_tbl) - 320b .hword L(w_mask_\type\()_tbl) - 160b .hword L(w_mask_\type\()_tbl) - 8b .hword L(w_mask_\type\()_tbl) - 4b endfunc .endm w_mask_fn 444 w_mask_fn 422 w_mask_fn 420 function blend_16bpc_neon, export=1 adr x6, L(blend_tbl) clz w3, w3 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw add x8, x0, x1 br x6 40: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 4: ld1 {v2.8b}, [x5], #8 ld1 {v1.8h}, [x2], #16 ld1 {v0.d}[0], [x0] neg v2.8b, v2.8b // -m subs w4, w4, #2 ld1 {v0.d}[1], [x8] sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.d}[0], [x0], x1 st1 {v0.d}[1], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 8: ld1 {v4.16b}, [x5], #16 ld1 {v2.8h, v3.8h}, [x2], #32 neg v5.16b, v4.16b // -m ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x8] sxtl v4.8h, v5.8b sxtl2 v5.8h, v5.16b shl v4.8h, 
v4.8h, #9 // -m << 9 shl v5.8h, v5.8h, #9 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h subs w4, w4, #2 sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v5.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET lsl x1, x1, #1 16: ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #2 neg v18.16b, v16.16b // -m neg v19.16b, v17.16b ld1 {v0.8h, v1.8h}, [x0] sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b ld1 {v2.8h, v3.8h}, [x8] shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 shl v18.8h, v18.8h, #9 shl v19.8h, v19.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v17.8h sqrdmulh v6.8h, v6.8h, v18.8h sqrdmulh v7.8h, v7.8h, v19.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x8], x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b, v17.16b}, [x5], #32 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 subs w4, w4, #1 neg v18.16b, v16.16b // -m neg v19.16b, v17.16b sxtl v16.8h, v18.8b sxtl2 v17.8h, v18.16b sxtl v18.8h, v19.8b sxtl2 v19.8h, v19.16b ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 shl v18.8h, v18.8h, #9 shl v19.8h, v19.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v17.8h sqrdmulh v6.8h, v6.8h, v18.8h sqrdmulh v7.8h, v7.8h, v19.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 32b ret L(blend_tbl): .hword L(blend_tbl) - 32b .hword L(blend_tbl) - 160b .hword L(blend_tbl) - 80b .hword L(blend_tbl) - 40b endfunc function blend_h_16bpc_neon, export=1 adr x6, L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 clz w7, w3 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 ldrh w7, [x6, x7, lsl #1] sub x6, x6, w7, uxtw br x6 2: AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.4h}, [x2], #8 ext v2.8b, v2.8b, v3.8b, #6 subs w4, w4, #2 neg v2.8b, v2.8b // -m ld1 {v0.s}[0], [x0] ld1 {v0.s}[1], [x8] sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 sub v1.4h, v0.4h, v1.4h // a - b sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 add v0.4h, v0.4h, v1.4h st1 {v0.s}[0], [x0], x1 st1 {v0.s}[1], [x8], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld2r {v2.8b, v3.8b}, [x5], #2 ld1 {v1.8h}, [x2], #16 ext v2.8b, v2.8b, v3.8b, #4 subs w4, w4, #2 neg v2.8b, v2.8b // -m ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.d}[0], [x0], x1 st1 {v0.d}[1], [x8], x1 b.gt 4b ret 8: AARCH64_VALID_JUMP_TARGET ld2r {v4.8b, v5.8b}, [x5], #2 ld1 {v2.8h, v3.8h}, [x2], #32 neg v4.8b, v4.8b // -m neg v5.8b, v5.8b ld1 {v0.8h}, [x0] subs w4, w4, #2 sxtl v4.8h, v4.8b sxtl v5.8h, v5.8b ld1 {v1.8h}, [x8] shl v4.8h, v4.8h, #9 // -m << 9 shl v5.8h, v5.8h, #9 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v5.8h add v0.8h, 
v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ld2r {v16.8b, v17.8b}, [x5], #2 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 neg v16.8b, v16.8b // -m neg v17.8b, v17.8b ld1 {v0.8h, v1.8h}, [x0] ld1 {v2.8h, v3.8h}, [x8] subs w4, w4, #2 sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.8h, v17.8h, #9 sub v4.8h, v0.8h, v4.8h // a - b sub v5.8h, v1.8h, v5.8h sub v6.8h, v2.8h, v6.8h sub v7.8h, v3.8h, v7.8h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.8h, v5.8h, v16.8h sqrdmulh v6.8h, v6.8h, v17.8h sqrdmulh v7.8h, v7.8h, v17.8h add v0.8h, v0.8h, v4.8h add v1.8h, v1.8h, v5.8h add v2.8h, v2.8h, v6.8h add v3.8h, v3.8h, v7.8h st1 {v0.8h, v1.8h}, [x0], x1 st1 {v2.8h, v3.8h}, [x8], x1 b.gt 16b ret 1280: 640: 320: AARCH64_VALID_JUMP_TARGET sub x1, x1, w3, uxtw #1 add x7, x2, w3, uxtw #1 321: ld2r {v24.8b, v25.8b}, [x5], #2 mov w6, w3 neg v24.8b, v24.8b // -m neg v25.8b, v25.8b sxtl v24.8h, v24.8b sxtl v25.8h, v25.8b shl v24.8h, v24.8h, #9 // -m << 9 shl v25.8h, v25.8h, #9 32: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] subs w6, w6, #32 sub v16.8h, v0.8h, v16.8h // a - b sub v17.8h, v1.8h, v17.8h sub v18.8h, v2.8h, v18.8h sub v19.8h, v3.8h, v19.8h ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8] sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v17.8h, v17.8h, v24.8h sqrdmulh v18.8h, v18.8h, v24.8h sqrdmulh v19.8h, v19.8h, v24.8h sub v20.8h, v4.8h, v20.8h // a - b sub v21.8h, v5.8h, v21.8h sub v22.8h, v6.8h, v22.8h sub v23.8h, v7.8h, v23.8h add v0.8h, v0.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v3.8h, v3.8h, v19.8h sqrdmulh v20.8h, v20.8h, v25.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v21.8h, v21.8h, v25.8h sqrdmulh v22.8h, v22.8h, v25.8h sqrdmulh v23.8h, v23.8h, v25.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h add v7.8h, v7.8h, v23.8h st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x8], #64 b.gt 32b subs w4, w4, #2 add x0, x0, x1 add x8, x8, x1 add x2, x2, w3, uxtw #1 add x7, x7, w3, uxtw #1 b.gt 321b ret L(blend_h_tbl): .hword L(blend_h_tbl) - 1280b .hword L(blend_h_tbl) - 640b .hword L(blend_h_tbl) - 320b .hword L(blend_h_tbl) - 16b .hword L(blend_h_tbl) - 8b .hword L(blend_h_tbl) - 4b .hword L(blend_h_tbl) - 2b endfunc function blend_v_16bpc_neon, export=1 adr x6, L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 ldrh w3, [x6, x3, lsl #1] sub x6, x6, w3, uxtw br x6 20: AARCH64_VALID_JUMP_TARGET ld1r {v2.8b}, [x5] neg v2.8b, v2.8b // -m sxtl v2.8h, v2.8b shl v2.4h, v2.4h, #9 // -m << 9 2: ld1 {v1.s}[0], [x2], #4 ld1 {v0.h}[0], [x0] subs w4, w4, #2 ld1 {v1.h}[1], [x2] ld1 {v0.h}[1], [x8] add x2, x2, #4 sub v1.4h, v0.4h, v1.4h // a - b sqrdmulh v1.4h, v1.4h, v2.4h // ((a-b)*-m + 32) >> 6 add v0.4h, v0.4h, v1.4h st1 {v0.h}[0], [x0], x1 st1 {v0.h}[1], [x8], x1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET ld1r {v2.2s}, [x5] sub x1, x1, #4 neg v2.8b, v2.8b // -m sxtl v2.8h, v2.8b shl v2.8h, v2.8h, #9 // -m << 9 4: ld1 {v1.8h}, [x2], #16 ld1 {v0.d}[0], [x0] ld1 {v0.d}[1], [x8] subs w4, w4, #2 sub v1.8h, v0.8h, v1.8h // a - b sqrdmulh v1.8h, v1.8h, v2.8h // ((a-b)*-m + 32) >> 6 add v0.8h, v0.8h, v1.8h st1 {v0.s}[0], [x0], #4 st1 {v0.s}[2], [x8], #4 st1 {v0.h}[2], [x0], x1 st1 {v0.h}[6], [x8], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET ld1 
{v4.8b}, [x5] sub x1, x1, #8 neg v4.8b, v4.8b // -m sxtl v4.8h, v4.8b shl v4.8h, v4.8h, #9 // -m << 9 8: ld1 {v2.8h, v3.8h}, [x2], #32 ld1 {v0.8h}, [x0] ld1 {v1.8h}, [x8] subs w4, w4, #2 sub v2.8h, v0.8h, v2.8h // a - b sub v3.8h, v1.8h, v3.8h sqrdmulh v2.8h, v2.8h, v4.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v3.8h, v3.8h, v4.8h add v0.8h, v0.8h, v2.8h add v1.8h, v1.8h, v3.8h st1 {v0.d}[0], [x0], #8 st1 {v1.d}[0], [x8], #8 st1 {v0.s}[2], [x0], x1 st1 {v1.s}[2], [x8], x1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET ld1 {v16.16b}, [x5] sub x1, x1, #16 neg v17.16b, v16.16b // -m sxtl v16.8h, v17.8b sxtl2 v17.8h, v17.16b shl v16.8h, v16.8h, #9 // -m << 9 shl v17.4h, v17.4h, #9 16: ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 ld1 {v0.8h, v1.8h}, [x0] subs w4, w4, #2 ld1 {v2.8h, v3.8h}, [x8] sub v4.8h, v0.8h, v4.8h // a - b sub v5.4h, v1.4h, v5.4h sub v6.8h, v2.8h, v6.8h sub v7.4h, v3.4h, v7.4h sqrdmulh v4.8h, v4.8h, v16.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v5.4h, v5.4h, v17.4h sqrdmulh v6.8h, v6.8h, v16.8h sqrdmulh v7.4h, v7.4h, v17.4h add v0.8h, v0.8h, v4.8h add v1.4h, v1.4h, v5.4h add v2.8h, v2.8h, v6.8h add v3.4h, v3.4h, v7.4h st1 {v0.8h}, [x0], #16 st1 {v2.8h}, [x8], #16 st1 {v1.4h}, [x0], x1 st1 {v3.4h}, [x8], x1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET ld1 {v24.16b, v25.16b}, [x5] neg v26.16b, v24.16b // -m neg v27.8b, v25.8b sxtl v24.8h, v26.8b sxtl2 v25.8h, v26.16b sxtl v26.8h, v27.8b shl v24.8h, v24.8h, #9 // -m << 9 shl v25.8h, v25.8h, #9 shl v26.8h, v26.8h, #9 32: ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 ld1 {v0.8h, v1.8h, v2.8h}, [x0] ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 ld1 {v4.8h, v5.8h, v6.8h}, [x8] subs w4, w4, #2 sub v16.8h, v0.8h, v16.8h // a - b sub v17.8h, v1.8h, v17.8h sub v18.8h, v2.8h, v18.8h sub v20.8h, v4.8h, v20.8h sub v21.8h, v5.8h, v21.8h sub v22.8h, v6.8h, v22.8h sqrdmulh v16.8h, v16.8h, v24.8h // ((a-b)*-m + 32) >> 6 sqrdmulh v17.8h, v17.8h, v25.8h sqrdmulh v18.8h, v18.8h, v26.8h sqrdmulh v20.8h, v20.8h, v24.8h sqrdmulh v21.8h, v21.8h, v25.8h sqrdmulh v22.8h, v22.8h, v26.8h add v0.8h, v0.8h, v16.8h add v1.8h, v1.8h, v17.8h add v2.8h, v2.8h, v18.8h add v4.8h, v4.8h, v20.8h add v5.8h, v5.8h, v21.8h add v6.8h, v6.8h, v22.8h st1 {v0.8h, v1.8h, v2.8h}, [x0], x1 st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 b.gt 32b ret L(blend_v_tbl): .hword L(blend_v_tbl) - 320b .hword L(blend_v_tbl) - 160b .hword L(blend_v_tbl) - 80b .hword L(blend_v_tbl) - 40b .hword L(blend_v_tbl) - 20b endfunc // This has got the same signature as the put_8tap functions, // and assumes that x9 is set to (clz(w)-24). 
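// How the width dispatch below (and in the other table-driven functions in
// this file) works: each L(*_tbl) stores 16-bit "table - label" offsets,
// indexed by clz(w)-24, and the prologue resolves the target as
//     adr   x10, L(put_tbl)
//     ldrh  w9,  [x10, x9, lsl #1]   // x9 = clz(w) - 24
//     sub   x10, x10, w9, uxtw
//     br    x10
// Larger widths have a smaller clz, so they map to the entries listed first
// in each table.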
function put_neon adr x10, L(put_tbl) ldrh w9, [x10, x9, lsl #1] sub x10, x10, w9, uxtw br x10 2: AARCH64_VALID_JUMP_TARGET ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 st1 {v0.s}[0], [x0], x1 st1 {v1.s}[0], [x0], x1 b.gt 2b ret 4: AARCH64_VALID_JUMP_TARGET ld1 {v0.4h}, [x2], x3 ld1 {v1.4h}, [x2], x3 subs w5, w5, #2 st1 {v0.4h}, [x0], x1 st1 {v1.4h}, [x0], x1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x8, x0, x1 lsl x1, x1, #1 add x9, x2, x3 lsl x3, x3, #1 8: ld1 {v0.8h}, [x2], x3 ld1 {v1.8h}, [x9], x3 subs w5, w5, #2 st1 {v0.8h}, [x0], x1 st1 {v1.8h}, [x8], x1 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] subs w5, w5, #1 stp x8, x9, [x0, #16] add x2, x2, x3 add x0, x0, x1 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] ldp x10, x11, [x2, #32] stp x8, x9, [x0, #16] subs w5, w5, #1 ldp x12, x13, [x2, #48] stp x10, x11, [x0, #32] stp x12, x13, [x0, #48] add x2, x2, x3 add x0, x0, x1 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x2, x2, x3 add x0, x0, x1 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] ldp q4, q5, [x2, #64] stp q2, q3, [x0, #32] ldp q6, q7, [x2, #96] subs w5, w5, #1 stp q4, q5, [x0, #64] ldp q16, q17, [x2, #128] stp q6, q7, [x0, #96] ldp q18, q19, [x2, #160] stp q16, q17, [x0, #128] ldp q20, q21, [x2, #192] stp q18, q19, [x0, #160] ldp q22, q23, [x2, #224] stp q20, q21, [x0, #192] stp q22, q23, [x0, #224] add x2, x2, x3 add x0, x0, x1 b.gt 128b ret L(put_tbl): .hword L(put_tbl) - 128b .hword L(put_tbl) - 64b .hword L(put_tbl) - 32b .hword L(put_tbl) - 16b .hword L(put_tbl) - 80b .hword L(put_tbl) - 4b .hword L(put_tbl) - 2b endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and // x8 to w*2. 
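// Scalar view of what prep_neon stores for each input pixel (v31 holds
// intermediate_bits, v30 holds PREP_BIAS), matching the sshl/sub pairs in
// every width case below:
//     tmp[x] = (pixel[x] << intermediate_bits) - PREP_BIAS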
function prep_neon adr x10, L(prep_tbl) ldrh w9, [x10, x9, lsl #1] dup v31.8h, w7 // intermediate_bits movi v30.8h, #(PREP_BIAS >> 8), lsl #8 sub x10, x10, w9, uxtw br x10 40: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 4: ld1 {v0.d}[0], [x1], x2 ld1 {v0.d}[1], [x9], x2 subs w4, w4, #2 sshl v0.8h, v0.8h, v31.8h sub v0.8h, v0.8h, v30.8h st1 {v0.8h}, [x0], #16 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET add x9, x1, x2 lsl x2, x2, #1 8: ld1 {v0.8h}, [x1], x2 ld1 {v1.8h}, [x9], x2 subs w4, w4, #2 sshl v0.8h, v0.8h, v31.8h sshl v1.8h, v1.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret 16: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] add x1, x1, x2 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1] add x1, x1, x2 subs w4, w4, #2 sshl v1.8h, v1.8h, v31.8h sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 16b ret 32: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] add x1, x1, x2 sshl v1.8h, v1.8h, v31.8h sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h subs w4, w4, #1 sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 b.gt 32b ret 64: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] sshl v1.8h, v1.8h, v31.8h ldp q4, q5, [x1, #64] sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h ldp q6, q7, [x1, #96] add x1, x1, x2 sshl v4.8h, v4.8h, v31.8h sshl v5.8h, v5.8h, v31.8h sshl v6.8h, v6.8h, v31.8h sshl v7.8h, v7.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h stp q0, q1, [x0] sub v4.8h, v4.8h, v30.8h sub v5.8h, v5.8h, v30.8h stp q2, q3, [x0, #32] sub v6.8h, v6.8h, v30.8h sub v7.8h, v7.8h, v30.8h stp q4, q5, [x0, #64] stp q6, q7, [x0, #96] add x0, x0, x8 b.gt 64b ret 128: AARCH64_VALID_JUMP_TARGET ldp q0, q1, [x1] subs w4, w4, #1 sshl v0.8h, v0.8h, v31.8h ldp q2, q3, [x1, #32] sshl v1.8h, v1.8h, v31.8h ldp q4, q5, [x1, #64] sshl v2.8h, v2.8h, v31.8h sshl v3.8h, v3.8h, v31.8h ldp q6, q7, [x1, #96] sshl v4.8h, v4.8h, v31.8h sshl v5.8h, v5.8h, v31.8h ldp q16, q17, [x1, #128] sshl v6.8h, v6.8h, v31.8h sshl v7.8h, v7.8h, v31.8h ldp q18, q19, [x1, #160] sshl v16.8h, v16.8h, v31.8h sshl v17.8h, v17.8h, v31.8h ldp q20, q21, [x1, #192] sshl v18.8h, v18.8h, v31.8h sshl v19.8h, v19.8h, v31.8h ldp q22, q23, [x1, #224] add x1, x1, x2 sshl v20.8h, v20.8h, v31.8h sshl v21.8h, v21.8h, v31.8h sshl v22.8h, v22.8h, v31.8h sshl v23.8h, v23.8h, v31.8h sub v0.8h, v0.8h, v30.8h sub v1.8h, v1.8h, v30.8h sub v2.8h, v2.8h, v30.8h sub v3.8h, v3.8h, v30.8h stp q0, q1, [x0] sub v4.8h, v4.8h, v30.8h sub v5.8h, v5.8h, v30.8h stp q2, q3, [x0, #32] sub v6.8h, v6.8h, v30.8h sub v7.8h, v7.8h, v30.8h stp q4, q5, [x0, #64] sub v16.8h, v16.8h, v30.8h sub v17.8h, v17.8h, v30.8h stp q6, q7, [x0, #96] sub v18.8h, v18.8h, v30.8h sub v19.8h, v19.8h, v30.8h stp q16, q17, [x0, #128] sub v20.8h, v20.8h, v30.8h sub v21.8h, v21.8h, v30.8h stp q18, q19, [x0, #160] sub v22.8h, v22.8h, v30.8h sub v23.8h, v23.8h, v30.8h stp q20, q21, [x0, #192] stp q22, q23, [x0, #224] add x0, x0, x8 b.gt 128b ret L(prep_tbl): .hword L(prep_tbl) - 128b .hword L(prep_tbl) - 64b .hword L(prep_tbl) - 32b .hword L(prep_tbl) - 16b .hword L(prep_tbl) - 80b .hword L(prep_tbl) - 40b endfunc .macro load_slice s0, s1, 
strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}[0], [\s0], \strd ld1 {\d1\wd}[0], [\s1], \strd .ifnb \d2 ld1 {\d2\wd}[0], [\s0], \strd ld1 {\d3\wd}[0], [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}[0], [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}[0], [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}[0], [\s0], \strd .endif .endm .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6 ld1 {\d0\wd}, [\s0], \strd ld1 {\d1\wd}, [\s1], \strd .ifnb \d2 ld1 {\d2\wd}, [\s0], \strd ld1 {\d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd}, [\s0], \strd .endif .ifnb \d5 ld1 {\d5\wd}, [\s1], \strd .endif .ifnb \d6 ld1 {\d6\wd}, [\s0], \strd .endif .endm .macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5 ld1 {\d0\wd, \d1\wd}, [\s0], \strd .ifnb \d2 ld1 {\d2\wd, \d3\wd}, [\s1], \strd .endif .ifnb \d4 ld1 {\d4\wd, \d5\wd}, [\s0], \strd .endif .endm .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_slice \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6 load_reg \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6 .endm .macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5 load_regpair \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5 .endm .macro interleave_1 wd, r0, r1, r2, r3, r4 trn1 \r0\wd, \r0\wd, \r1\wd trn1 \r1\wd, \r1\wd, \r2\wd .ifnb \r3 trn1 \r2\wd, \r2\wd, \r3\wd trn1 \r3\wd, \r3\wd, \r4\wd .endif .endm .macro interleave_1_s r0, r1, r2, r3, r4 interleave_1 .2s, \r0, \r1, \r2, \r3, \r4 .endm .macro umin_h c, wd, r0, r1, r2, r3 umin \r0\wd, \r0\wd, \c\wd .ifnb \r1 umin \r1\wd, \r1\wd, \c\wd .endif .ifnb \r2 umin \r2\wd, \r2\wd, \c\wd umin \r3\wd, \r3\wd, \c\wd .endif .endm .macro sub_h c, wd, r0, r1, r2, r3 sub \r0\wd, \r0\wd, \c\wd .ifnb \r1 sub \r1\wd, \r1\wd, \c\wd .endif .ifnb \r2 sub \r2\wd, \r2\wd, \c\wd sub \r3\wd, \r3\wd, \c\wd .endif .endm .macro smull_smlal_4 d, s0, s1, s2, s3 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] .endm .macro smull2_smlal2_4 d, s0, s1, s2, s3 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] .endm .macro smull_smlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7 smull \d\().4s, \s0\().4h, v0.h[0] smlal \d\().4s, \s1\().4h, v0.h[1] smlal \d\().4s, \s2\().4h, v0.h[2] smlal \d\().4s, \s3\().4h, v0.h[3] smlal \d\().4s, \s4\().4h, v0.h[4] smlal \d\().4s, \s5\().4h, v0.h[5] smlal \d\().4s, \s6\().4h, v0.h[6] smlal \d\().4s, \s7\().4h, v0.h[7] .endm .macro smull2_smlal2_8 d, s0, s1, s2, s3, s4, s5, s6, s7 smull2 \d\().4s, \s0\().8h, v0.h[0] smlal2 \d\().4s, \s1\().8h, v0.h[1] smlal2 \d\().4s, \s2\().8h, v0.h[2] smlal2 \d\().4s, \s3\().8h, v0.h[3] smlal2 \d\().4s, \s4\().8h, v0.h[4] smlal2 \d\().4s, \s5\().8h, v0.h[5] smlal2 \d\().4s, \s6\().8h, v0.h[6] smlal2 \d\().4s, \s7\().8h, v0.h[7] .endm .macro sqrshrun_h shift, r0, r1, r2, r3 sqrshrun \r0\().4h, \r0\().4s, #\shift .ifnb \r1 sqrshrun2 \r0\().8h, \r1\().4s, #\shift .endif .ifnb \r2 sqrshrun \r2\().4h, \r2\().4s, #\shift sqrshrun2 \r2\().8h, \r3\().4s, #\shift .endif .endm .macro xtn_h r0, r1, r2, r3 uzp1 \r0\().8h, \r0\().8h, \r1\().8h // Same as xtn, xtn2 .ifnb \r2 uzp1 \r2\().8h, \r2\().8h, \r3\().8h // Ditto .endif .endm .macro srshl_s shift, r0, r1, r2, r3 srshl \r0\().4s, \r0\().4s, \shift\().4s srshl \r1\().4s, \r1\().4s, 
\shift\().4s .ifnb \r2 srshl \r2\().4s, \r2\().4s, \shift\().4s srshl \r3\().4s, \r3\().4s, \shift\().4s .endif .endm .macro st_s strd, reg, lanes st1 {\reg\().s}[0], [x0], \strd st1 {\reg\().s}[1], [x9], \strd .if \lanes > 2 st1 {\reg\().s}[2], [x0], \strd st1 {\reg\().s}[3], [x9], \strd .endif .endm .macro st_d strd, r0, r1 st1 {\r0\().d}[0], [x0], \strd st1 {\r0\().d}[1], [x9], \strd .ifnb \r1 st1 {\r1\().d}[0], [x0], \strd st1 {\r1\().d}[1], [x9], \strd .endif .endm .macro shift_store_4 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin_h v31, .8h, \r0, \r2 .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub_h v29, .8h, \r0, \r2 // PREP_BIAS .endif st_d \strd, \r0, \r2 .endm .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7 st1 {\r0\wd}, [x0], \strd st1 {\r1\wd}, [x9], \strd .ifnb \r2 st1 {\r2\wd}, [x0], \strd st1 {\r3\wd}, [x9], \strd .endif .ifnb \r4 st1 {\r4\wd}, [x0], \strd st1 {\r5\wd}, [x9], \strd st1 {\r6\wd}, [x0], \strd st1 {\r7\wd}, [x9], \strd .endif .endm .macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7 st_reg \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 .endm .macro shift_store_8 type, strd, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin_h v31, .8h, \r0, \r2 .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub_h v29, .8h, \r0, \r2 // PREP_BIAS .endif st_8h \strd, \r0, \r2 .endm .macro shift_store_16 type, strd, dst, r0, r1, r2, r3 .ifc \type, put sqrshrun_h 6, \r0, \r1, \r2, \r3 umin \r0\().8h, \r0\().8h, v31.8h umin \r1\().8h, \r2\().8h, v31.8h .else srshl_s v30, \r0, \r1, \r2, \r3 // -(6-intermediate_bits) xtn_h \r0, \r1, \r2, \r3 sub \r0\().8h, \r0\().8h, v29.8h sub \r1\().8h, \r2\().8h, v29.8h .endif st1 {\r0\().8h, \r1\().8h}, [\dst], \strd .endm .macro make_8tap_fn op, type, type_h, type_v function \op\()_8tap_\type\()_16bpc_neon, export=1 mov w9, \type_h mov w10, \type_v b \op\()_8tap_neon endfunc .endm // No spaces in these expressions, due to gas-preprocessor. 
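// How these constants are consumed (a descriptive sketch of the code in the
// put/prep 8tap functions below; the field naming is ours): mx and my are
// first multiplied by 0x4081 = (1<<14)|(1<<7)|1, which places a copy of the
// 4-bit subpel position at bit 0, bit 7 and bit 14, and then one of the
// constants below is added, giving
//   bits 0-6:  (filter set used for w <= 4)*15 + subpel
//   bits 7-13: (8-tap filter set)*15 + subpel
//   bits 14+:  the subpel position itself, tested against 0x7f<<14 to decide
//              whether any horizontal/vertical filtering is needed at all.
// The ubfx/and pair at the start of the h/v paths then keeps one of the two
// low fields, depending on the block width, before indexing
// X(mc_subpel_filters).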
#define REGULAR ((0*15<<7)|3*15) #define SMOOTH ((1*15<<7)|4*15) #define SHARP ((2*15<<7)|3*15) .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2 make_8tap_fn \type, regular, REGULAR, REGULAR make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH make_8tap_fn \type, regular_sharp, REGULAR, SHARP make_8tap_fn \type, smooth, SMOOTH, SMOOTH make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP make_8tap_fn \type, sharp, SHARP, SHARP make_8tap_fn \type, sharp_regular, SHARP, REGULAR make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH function \type\()_8tap_neon .ifc \bdmax, w8 ldr w8, [sp] .endif mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) mul \mx, \mx, w11 mul \my, \my, w11 add \mx, \mx, w9 // mx, 8tap_h, 4tap_h add \my, \my, w10 // my, 8tap_v, 4tap_v .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif dup v31.8h, \bdmax // bitdepth_max clz \bdmax, \bdmax clz w9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 mov w12, #6 tst \mx, #(0x7f << 14) sub w9, w9, #24 add w13, w12, \bdmax // 6 + intermediate_bits sub w12, w12, \bdmax // 6 - intermediate_bits movrel x11, X(mc_subpel_filters), -8 b.ne L(\type\()_8tap_h) tst \my, #(0x7f << 14) b.ne L(\type\()_8tap_v) b \type\()_neon L(\type\()_8tap_h): cmp \w, #4 ubfx w10, \mx, #7, #7 and \mx, \mx, #0x7f b.le 4f mov \mx, w10 4: tst \my, #(0x7f << 14) add \xmx, x11, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) adr x10, L(\type\()_8tap_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.8h, \bdmax // intermediate_bits .else movi v28.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v29.8h, v29.8h // -intermediate_bits .endif br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 2: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 subs \h, \h, #2 trn1 v3.2s, v4.2s, v6.2s trn2 v6.2s, v4.2s, v6.2s trn1 v4.2s, v5.2s, v7.2s trn2 v7.2s, v5.2s, v7.2s smull v3.4s, v3.4h, v0.h[0] smlal v3.4s, v4.4h, v0.h[1] smlal v3.4s, v6.4h, v0.h[2] smlal v3.4s, v7.4h, v0.h[3] srshl v3.4s, v3.4s, v30.4s // -(6-intermediate_bits) sqxtun v3.4h, v3.4s srshl v3.4h, v3.4h, v29.4h // -intermediate_bits umin v3.4h, v3.4h, v31.4h st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] sub \src, \src, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b 4: ld1 {v16.8h}, [\src], \s_strd ld1 {v20.8h}, [\sr2], \s_strd ext v17.16b, v16.16b, v16.16b, #2 ext v18.16b, v16.16b, v16.16b, #4 ext v19.16b, v16.16b, v16.16b, #6 ext v21.16b, v20.16b, v20.16b, #2 ext v22.16b, v20.16b, v20.16b, #4 ext v23.16b, v20.16b, v20.16b, #6 subs \h, \h, #2 smull v16.4s, v16.4h, v0.h[0] smlal v16.4s, v17.4h, v0.h[1] smlal v16.4s, v18.4h, v0.h[2] smlal v16.4s, v19.4h, v0.h[3] smull v20.4s, v20.4h, v0.h[0] smlal v20.4s, v21.4h, v0.h[1] smlal v20.4s, v22.4h, v0.h[2] smlal v20.4s, v23.4h, v0.h[3] srshl v16.4s, v16.4s, v30.4s // -(6-intermediate_bits) srshl v20.4s, v20.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put sqxtun v16.4h, v16.4s sqxtun2 v16.8h, v20.4s 
srshl v16.8h, v16.8h, v29.8h // -intermediate_bits umin v16.8h, v16.8h, v31.8h .else uzp1 v16.8h, v16.8h, v20.8h // Same as xtn, xtn2 sub v16.8h, v16.8h, v28.8h // PREP_BIAS .endif st1 {v16.d}[0], [\dst], \d_strd st1 {v16.d}[1], [\ds2], \d_strd b.gt 4b ret 80: 160: 320: 640: 1280: // 8xN, 16xN, 32xN, ... h AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] sub \src, \src, #6 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b sub \s_strd, \s_strd, \w, uxtw #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw #1 .endif 81: ld1 {v16.8h, v17.8h}, [\src], #32 ld1 {v20.8h, v21.8h}, [\sr2], #32 mov \mx, \w 8: smull v18.4s, v16.4h, v0.h[0] smull2 v19.4s, v16.8h, v0.h[0] smull v22.4s, v20.4h, v0.h[0] smull2 v23.4s, v20.8h, v0.h[0] .irpc i, 1234567 ext v24.16b, v16.16b, v17.16b, #(2*\i) ext v25.16b, v20.16b, v21.16b, #(2*\i) smlal v18.4s, v24.4h, v0.h[\i] smlal2 v19.4s, v24.8h, v0.h[\i] smlal v22.4s, v25.4h, v0.h[\i] smlal2 v23.4s, v25.8h, v0.h[\i] .endr subs \mx, \mx, #8 srshl v18.4s, v18.4s, v30.4s // -(6-intermediate_bits) srshl v19.4s, v19.4s, v30.4s // -(6-intermediate_bits) srshl v22.4s, v22.4s, v30.4s // -(6-intermediate_bits) srshl v23.4s, v23.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put sqxtun v18.4h, v18.4s sqxtun2 v18.8h, v19.4s sqxtun v22.4h, v22.4s sqxtun2 v22.8h, v23.4s srshl v18.8h, v18.8h, v29.8h // -intermediate_bits srshl v22.8h, v22.8h, v29.8h // -intermediate_bits umin v18.8h, v18.8h, v31.8h umin v22.8h, v22.8h, v31.8h .else uzp1 v18.8h, v18.8h, v19.8h // Same as xtn, xtn2 uzp1 v22.8h, v22.8h, v23.8h // Ditto sub v18.8h, v18.8h, v28.8h // PREP_BIAS sub v22.8h, v22.8h, v28.8h // PREP_BIAS .endif st1 {v18.8h}, [\dst], #16 st1 {v22.8h}, [\ds2], #16 b.le 9f mov v16.16b, v17.16b mov v20.16b, v21.16b ld1 {v17.8h}, [\src], #16 ld1 {v21.8h}, [\sr2], #16 b 8b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 81b ret L(\type\()_8tap_h_tbl): .hword L(\type\()_8tap_h_tbl) - 1280b .hword L(\type\()_8tap_h_tbl) - 640b .hword L(\type\()_8tap_h_tbl) - 320b .hword L(\type\()_8tap_h_tbl) - 160b .hword L(\type\()_8tap_h_tbl) - 80b .hword L(\type\()_8tap_h_tbl) - 40b .hword L(\type\()_8tap_h_tbl) - 20b .hword 0 L(\type\()_8tap_v): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 .ifc \type, prep dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif adr x10, L(\type\()_8tap_v_tbl) ldrh w9, [x10, x9, lsl #1] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) .endif sub x10, x10, w9, uxtw br x10 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put b.gt 28f cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b // 2x2 v load_s \src, \sr2, \s_strd, v1, v2, v3, v4, v5 interleave_1_s v1, v2, v3, v4, v5 b.gt 24f smull_smlal_4 v6, v1, v2, v3, v4 sqrshrun_h 6, v6 umin_h v31, .8h, v6 st_s \d_strd, v6, 2 ret 24: // 2x4 v load_s \sr2, \src, \s_strd, v6, v7 interleave_1_s v5, v6, v7 smull_smlal_4 v16, v1, v2, v3, v4 smull_smlal_4 v17, v3, v4, v5, v6 sqrshrun_h 6, v16, v17 umin_h v31, .8h, v16 st_s \d_strd, v16, 4 ret 28: // 2x6, 2x8, 2x12, 2x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 sxtl v0.8h, v0.8b load_s 
\src, \sr2, \s_strd, v1, v2, v3, v4, v5, v6, v7 interleave_1_s v1, v2, v3, v4, v5 interleave_1_s v5, v6, v7 216: subs \h, \h, #4 load_s \sr2, \src, \s_strd, v16, v17, v18, v19 interleave_1_s v7, v16, v17, v18, v19 smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 smull_smlal_8 v25, v3, v4, v5, v6, v7, v16, v17, v18 sqrshrun_h 6, v24, v25 umin_h v31, .8h, v24 st_s \d_strd, v24, 4 b.le 0f cmp \h, #2 mov v1.16b, v5.16b mov v2.16b, v6.16b mov v3.16b, v7.16b mov v4.16b, v16.16b mov v5.16b, v17.16b mov v6.16b, v18.16b mov v7.16b, v19.16b b.eq 26f b 216b 26: load_s \sr2, \src, \s_strd, v16, v17 interleave_1_s v7, v16, v17 smull_smlal_8 v24, v1, v2, v3, v4, v5, v6, v7, v16 sqrshrun_h 6, v24 umin_h v31, .4h, v24 st_s \d_strd, v24, 2 0: ret .endif 40: AARCH64_VALID_JUMP_TARGET b.gt 480f // 4x2, 4x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4 v6, v1, v2, v3, v4 smull_smlal_4 v7, v2, v3, v4, v5 shift_store_4 \type, \d_strd, v6, v7 b.le 0f load_4h \sr2, \src, \s_strd, v6, v7 smull_smlal_4 v1, v3, v4, v5, v6 smull_smlal_4 v2, v4, v5, v6, v7 shift_store_4 \type, \d_strd, v1, v2 0: ret 480: // 4x6, 4x8, 4x12, 4x16 v ld1 {v0.8b}, [\xmy] sub \sr2, \src, \s_strd, lsl #1 add \ds2, \dst, \d_strd sub \src, \sr2, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_4h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 48: subs \h, \h, #4 load_4h \sr2, \src, \s_strd, v23, v24, v25, v26 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 smull_smlal_8 v3, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_4 \type, \d_strd, v1, v2, v3, v4 b.le 0f cmp \h, #2 mov v16.8b, v20.8b mov v17.8b, v21.8b mov v18.8b, v22.8b mov v19.8b, v23.8b mov v20.8b, v24.8b mov v21.8b, v25.8b mov v22.8b, v26.8b b.eq 46f b 48b 46: load_4h \sr2, \src, \s_strd, v23, v24 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v2, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_4 \type, \d_strd, v1, v2 0: ret 80: AARCH64_VALID_JUMP_TARGET b.gt 880f // 8x2, 8x4 v cmp \h, #2 add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b load_8h \src, \sr2, \s_strd, v1, v2, v3, v4, v5 smull_smlal_4 v16, v1, v2, v3, v4 smull2_smlal2_4 v17, v1, v2, v3, v4 smull_smlal_4 v18, v2, v3, v4, v5 smull2_smlal2_4 v19, v2, v3, v4, v5 shift_store_8 \type, \d_strd, v16, v17, v18, v19 b.le 0f load_8h \sr2, \src, \s_strd, v6, v7 smull_smlal_4 v16, v3, v4, v5, v6 smull2_smlal2_4 v17, v3, v4, v5, v6 smull_smlal_4 v18, v4, v5, v6, v7 smull2_smlal2_4 v19, v4, v5, v6, v7 shift_store_8 \type, \d_strd, v16, v17, v18, v19 0: ret 880: // 8x6, 8x8, 8x16, 8x32 v 1680: // 16x8, 16x16, ... 320: // 32x8, 32x16, ... 
640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmy] sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 load_8h \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22 88: subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v23, v24 smull_smlal_8 v1, v16, v17, v18, v19, v20, v21, v22, v23 smull2_smlal2_8 v2, v16, v17, v18, v19, v20, v21, v22, v23 smull_smlal_8 v3, v17, v18, v19, v20, v21, v22, v23, v24 smull2_smlal2_8 v4, v17, v18, v19, v20, v21, v22, v23, v24 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f subs \h, \h, #2 load_8h \sr2, \src, \s_strd, v25, v26 smull_smlal_8 v1, v18, v19, v20, v21, v22, v23, v24, v25 smull2_smlal2_8 v2, v18, v19, v20, v21, v22, v23, v24, v25 smull_smlal_8 v3, v19, v20, v21, v22, v23, v24, v25, v26 smull2_smlal2_8 v4, v19, v20, v21, v22, v23, v24, v25, v26 shift_store_8 \type, \d_strd, v1, v2, v3, v4 b.le 9f mov v16.16b, v20.16b mov v17.16b, v21.16b mov v18.16b, v22.16b mov v19.16b, v23.16b mov v20.16b, v24.16b mov v21.16b, v25.16b mov v22.16b, v26.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: ret 160: AARCH64_VALID_JUMP_TARGET b.gt 1680b // 16x2, 16x4 v add \xmy, \xmy, #2 ld1 {v0.s}[0], [\xmy] sub \src, \src, \s_strd sxtl v0.8h, v0.8b load_16h \src, \src, \s_strd, v16, v17, v18, v19, v20, v21 16: load_16h \src, \src, \s_strd, v22, v23 subs \h, \h, #1 smull_smlal_4 v1, v16, v18, v20, v22 smull2_smlal2_4 v2, v16, v18, v20, v22 smull_smlal_4 v3, v17, v19, v21, v23 smull2_smlal2_4 v4, v17, v19, v21, v23 shift_store_16 \type, \d_strd, x0, v1, v2, v3, v4 b.le 0f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b b 16b 0: ret L(\type\()_8tap_v_tbl): .hword L(\type\()_8tap_v_tbl) - 1280b .hword L(\type\()_8tap_v_tbl) - 640b .hword L(\type\()_8tap_v_tbl) - 320b .hword L(\type\()_8tap_v_tbl) - 160b .hword L(\type\()_8tap_v_tbl) - 80b .hword L(\type\()_8tap_v_tbl) - 40b .hword L(\type\()_8tap_v_tbl) - 20b .hword 0 L(\type\()_8tap_hv): cmp \h, #4 ubfx w10, \my, #7, #7 and \my, \my, #0x7f b.le 4f mov \my, w10 4: add \xmy, x11, \my, uxtw #3 adr x10, L(\type\()_8tap_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.4s, w13 // 6 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v29.4s, v29.4s // -(6+intermediate_bits) .endif br x10 20: AARCH64_VALID_JUMP_TARGET .ifc \type, put add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 280f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] // 2x2, 2x4 hv sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) bl L(\type\()_8tap_filter_2) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost 
of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b 2: bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v24.8b, #4 smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s umin v2.4h, v2.4h, v31.4h subs \h, \h, #2 st1 {v2.s}[0], [\dst], \d_strd st1 {v2.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b b 2b 280: // 2x8, 2x16, 2x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v27.8h}, [\src], \s_strd ext v28.16b, v27.16b, v27.16b, #2 smull v27.4s, v27.4h, v0.4h smull v28.4s, v28.4h, v0.4h addp v27.4s, v27.4s, v28.4s addp v16.4s, v27.4s, v27.4s srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). bl L(\type\()_8tap_filter_2) xtn v16.4h, v16.4s trn1 v16.2s, v16.2s, v24.2s mov v17.8b, v24.8b bl L(\type\()_8tap_filter_2) ext v18.8b, v17.8b, v24.8b, #4 mov v19.8b, v24.8b bl L(\type\()_8tap_filter_2) ext v20.8b, v19.8b, v24.8b, #4 mov v21.8b, v24.8b 28: bl L(\type\()_8tap_filter_2) ext v22.8b, v21.8b, v24.8b, #4 smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s umin v3.4h, v3.4h, v31.4h subs \h, \h, #2 st1 {v3.s}[0], [\dst], \d_strd st1 {v3.s}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b b 28b 0: ret x15 L(\type\()_8tap_filter_2): ld1 {v25.8h}, [\sr2], \s_strd ld1 {v27.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v28.16b, v27.16b, v27.16b, #2 trn1 v24.2s, v25.2s, v27.2s trn2 v27.2s, v25.2s, v27.2s trn1 v25.2s, v26.2s, v28.2s trn2 v28.2s, v26.2s, v28.2s smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v25.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s ret .endif 40: AARCH64_VALID_JUMP_TARGET add \xmx, \xmx, #2 ld1 {v0.s}[0], [\xmx] b.gt 480f add \xmy, \xmy, #2 ld1 {v1.s}[0], [\xmy] sub \sr2, \src, #2 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 // 4x2, 4x4 hv ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). 
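        // A rough scalar C model of the two rounding stages used in this hv
        // path (hedged sketch: `fh`, `fv`, `mid_rows` and the CLIP helper are
        // illustrative names, not defined in this file; the 4-tap case is shown):
        //
        //   int32_t h = 0;
        //   for (int k = 0; k < 4; k++)               // horizontal pass (smull/smlal)
        //       h += src[x + k] * fh[k];
        //   // rounding shift down to the intermediate precision; this is the
        //   // value that fits in int16_t and is narrowed with xtn below
        //   int16_t mid = (h + (1 << (6 - intermediate_bits - 1)))
        //                 >> (6 - intermediate_bits); // srshl by v30
        //
        //   int32_t v = 0;
        //   for (int k = 0; k < 4; k++)               // vertical pass over mid rows
        //       v += mid_rows[k][x] * fv[k];
        //   // put: combined shift, then clip to [0, bitdepth_max]
        //   //      (srshl by v29, sqxtun, umin against v31)
        //   // prep: plain rounding shift by 6 and PREP_BIAS subtraction instead
        //   int out = (v + (1 << (6 + intermediate_bits - 1)))
        //             >> (6 + intermediate_bits);
        //   dst[x] = CLIP(out, 0, bitdepth_max);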
xtn v16.4h, v16.4s bl L(\type\()_8tap_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b 4: bl L(\type\()_8tap_filter_4) smull v2.4s, v16.4h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal v2.4s, v24.4h, v1.h[3] smull v3.4s, v17.4h, v1.h[0] smlal v3.4s, v18.4h, v1.h[1] smlal v3.4s, v24.4h, v1.h[2] smlal v3.4s, v25.4h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s umin v2.8h, v2.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.d}[0], [\dst], \d_strd st1 {v2.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v24.8b mov v18.8b, v25.8b b 4b 480: // 4x8, 4x16, 4x32 hv ld1 {v1.8b}, [\xmy] sub \src, \src, #2 sub \sr2, \src, \s_strd, lsl #1 sub \src, \sr2, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v16.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53). xtn v16.4h, v16.4s bl L(\type\()_8tap_filter_4) mov v17.8b, v24.8b mov v18.8b, v25.8b bl L(\type\()_8tap_filter_4) mov v19.8b, v24.8b mov v20.8b, v25.8b bl L(\type\()_8tap_filter_4) mov v21.8b, v24.8b mov v22.8b, v25.8b 48: bl L(\type\()_8tap_filter_4) smull v3.4s, v16.4h, v1.h[0] smlal v3.4s, v17.4h, v1.h[1] smlal v3.4s, v18.4h, v1.h[2] smlal v3.4s, v19.4h, v1.h[3] smlal v3.4s, v20.4h, v1.h[4] smlal v3.4s, v21.4h, v1.h[5] smlal v3.4s, v22.4h, v1.h[6] smlal v3.4s, v24.4h, v1.h[7] smull v4.4s, v17.4h, v1.h[0] smlal v4.4s, v18.4h, v1.h[1] smlal v4.4s, v19.4h, v1.h[2] smlal v4.4s, v20.4h, v1.h[3] smlal v4.4s, v21.4h, v1.h[4] smlal v4.4s, v22.4h, v1.h[5] smlal v4.4s, v24.4h, v1.h[6] smlal v4.4s, v25.4h, v1.h[7] .ifc \type, put srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) sqxtun v3.4h, v3.4s sqxtun2 v3.8h, v4.4s umin v3.8h, v3.8h, v31.8h .else rshrn v3.4h, v3.4s, #6 rshrn2 v3.8h, v4.4s, #6 sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v3.d}[0], [\dst], \d_strd st1 {v3.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b mov v17.8b, v19.8b mov v18.8b, v20.8b mov v19.8b, v21.8b mov v20.8b, v22.8b mov v21.8b, v24.8b mov v22.8b, v25.8b b 48b 0: ret x15 L(\type\()_8tap_filter_4): ld1 {v24.8h}, [\sr2], \s_strd ld1 {v25.8h}, [\src], \s_strd ext v26.16b, v24.16b, v24.16b, #2 ext v27.16b, v24.16b, v24.16b, #4 ext v28.16b, v24.16b, v24.16b, #6 smull v24.4s, v24.4h, v0.h[0] smlal v24.4s, v26.4h, v0.h[1] smlal v24.4s, v27.4h, v0.h[2] smlal v24.4s, v28.4h, v0.h[3] ext v26.16b, v25.16b, v25.16b, #2 ext v27.16b, v25.16b, v25.16b, #4 ext v28.16b, v25.16b, v25.16b, #6 smull v25.4s, v25.4h, v0.h[0] smlal v25.4s, v26.4h, v0.h[1] smlal v25.4s, v27.4h, v0.h[2] smlal v25.4s, v28.4h, v0.h[3] srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) xtn v24.4h, v24.4s xtn v25.4h, v25.4s ret 80: 160: 320: 
AARCH64_VALID_JUMP_TARGET b.gt 880f add \xmy, \xmy, #2 ld1 {v0.8b}, [\xmx] ld1 {v1.s}[0], [\xmy] sub \src, \src, #6 sub \src, \src, \s_strd sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b 8: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v23.4h, v1.h[2] smlal2 v5.4s, v23.8h, v1.h[2] smlal v2.4s, v23.4h, v1.h[3] smlal2 v3.4s, v23.8h, v1.h[3] smlal v4.4s, v24.4h, v1.h[3] smlal2 v5.4s, v24.8h, v1.h[3] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b mov v17.16b, v23.16b mov v18.16b, v24.16b b 8b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #2 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 164b 880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... 
hv 640: 1280: AARCH64_VALID_JUMP_TARGET ld1 {v0.8b}, [\xmx] ld1 {v1.8b}, [\xmy] sub \src, \src, #6 sub \src, \src, \s_strd sub \src, \src, \s_strd, lsl #1 sxtl v0.8h, v0.8b sxtl v1.8h, v1.8b mov x15, x30 mov \my, \h 168: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 ld1 {v27.8h, v28.8h}, [\src], \s_strd smull v24.4s, v27.4h, v0.h[0] smull2 v25.4s, v27.8h, v0.h[0] .irpc i, 1234567 ext v26.16b, v27.16b, v28.16b, #(2*\i) smlal v24.4s, v26.4h, v0.h[\i] smlal2 v25.4s, v26.8h, v0.h[\i] .endr srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits) srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) // The intermediates from the horizontal pass fit in 16 bit without // any bias; we could just as well keep them as .4s, but narrowing // them to .4h gives a significant speedup on out of order cores // (at the cost of a smaller slowdown on in-order cores such as A53), // and conserves register space (no need to clobber v8-v15). uzp1 v16.8h, v24.8h, v25.8h // Same as xtn, xtn2 bl L(\type\()_8tap_filter_8) mov v17.16b, v23.16b mov v18.16b, v24.16b bl L(\type\()_8tap_filter_8) mov v19.16b, v23.16b mov v20.16b, v24.16b bl L(\type\()_8tap_filter_8) mov v21.16b, v23.16b mov v22.16b, v24.16b 88: smull v2.4s, v16.4h, v1.h[0] smull2 v3.4s, v16.8h, v1.h[0] bl L(\type\()_8tap_filter_8) smull v4.4s, v17.4h, v1.h[0] smull2 v5.4s, v17.8h, v1.h[0] smlal v2.4s, v17.4h, v1.h[1] smlal2 v3.4s, v17.8h, v1.h[1] smlal v4.4s, v18.4h, v1.h[1] smlal2 v5.4s, v18.8h, v1.h[1] smlal v2.4s, v18.4h, v1.h[2] smlal2 v3.4s, v18.8h, v1.h[2] smlal v4.4s, v19.4h, v1.h[2] smlal2 v5.4s, v19.8h, v1.h[2] smlal v2.4s, v19.4h, v1.h[3] smlal2 v3.4s, v19.8h, v1.h[3] smlal v4.4s, v20.4h, v1.h[3] smlal2 v5.4s, v20.8h, v1.h[3] smlal v2.4s, v20.4h, v1.h[4] smlal2 v3.4s, v20.8h, v1.h[4] smlal v4.4s, v21.4h, v1.h[4] smlal2 v5.4s, v21.8h, v1.h[4] smlal v2.4s, v21.4h, v1.h[5] smlal2 v3.4s, v21.8h, v1.h[5] smlal v4.4s, v22.4h, v1.h[5] smlal2 v5.4s, v22.8h, v1.h[5] smlal v2.4s, v22.4h, v1.h[6] smlal2 v3.4s, v22.8h, v1.h[6] smlal v4.4s, v23.4h, v1.h[6] smlal2 v5.4s, v23.8h, v1.h[6] smlal v2.4s, v23.4h, v1.h[7] smlal2 v3.4s, v23.8h, v1.h[7] smlal v4.4s, v24.4h, v1.h[7] smlal2 v5.4s, v24.8h, v1.h[7] .ifc \type, put srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits) srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits) srshl v4.4s, v4.4s, v29.4s // -(6+intermediate_bits) srshl v5.4s, v5.4s, v29.4s // -(6+intermediate_bits) sqxtun v2.4h, v2.4s sqxtun2 v2.8h, v3.4s sqxtun v3.4h, v4.4s sqxtun2 v3.8h, v5.4s umin v2.8h, v2.8h, v31.8h umin v3.8h, v3.8h, v31.8h .else rshrn v2.4h, v2.4s, #6 rshrn2 v2.8h, v3.4s, #6 rshrn v3.4h, v4.4s, #6 rshrn2 v3.8h, v5.4s, #6 sub v2.8h, v2.8h, v29.8h // PREP_BIAS sub v3.8h, v3.8h, v29.8h // PREP_BIAS .endif subs \h, \h, #2 st1 {v2.8h}, [\dst], \d_strd st1 {v3.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b mov v17.16b, v19.16b mov v18.16b, v20.16b mov v19.16b, v21.16b mov v20.16b, v22.16b mov v21.16b, v23.16b mov v22.16b, v24.16b b 88b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #3 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 168b 0: ret x15 L(\type\()_8tap_filter_8): ld1 {v4.8h, v5.8h}, [\sr2], \s_strd ld1 {v6.8h, v7.8h}, [\src], \s_strd smull v25.4s, v4.4h, v0.h[0] smull2 v26.4s, v4.8h, v0.h[0] smull v27.4s, v6.4h, v0.h[0] smull2 v28.4s, v6.8h, v0.h[0] .irpc i, 1234567 ext v23.16b, v4.16b, v5.16b, #(2*\i) ext v24.16b, v6.16b, v7.16b, #(2*\i) 
smlal v25.4s, v23.4h, v0.h[\i] smlal2 v26.4s, v23.8h, v0.h[\i] smlal v27.4s, v24.4h, v0.h[\i] smlal2 v28.4s, v24.8h, v0.h[\i] .endr srshl v25.4s, v25.4s, v30.4s // -(6-intermediate_bits) srshl v26.4s, v26.4s, v30.4s // -(6-intermediate_bits) srshl v27.4s, v27.4s, v30.4s // -(6-intermediate_bits) srshl v28.4s, v28.4s, v30.4s // -(6-intermediate_bits) uzp1 v23.8h, v25.8h, v26.8h // Same as xtn, xtn2 uzp1 v24.8h, v27.8h, v28.8h // Ditto ret L(\type\()_8tap_hv_tbl): .hword L(\type\()_8tap_hv_tbl) - 1280b .hword L(\type\()_8tap_hv_tbl) - 640b .hword L(\type\()_8tap_hv_tbl) - 320b .hword L(\type\()_8tap_hv_tbl) - 160b .hword L(\type\()_8tap_hv_tbl) - 80b .hword L(\type\()_8tap_hv_tbl) - 40b .hword L(\type\()_8tap_hv_tbl) - 20b .hword 0 endfunc function \type\()_bilin_16bpc_neon, export=1 .ifc \bdmax, w8 ldr w8, [sp] .endif dup v1.8h, \mx dup v3.8h, \my mov w10, #16 sub w9, w10, \mx sub w10, w10, \my dup v0.8h, w9 dup v2.8h, w10 .ifc \type, prep uxtw \d_strd, \w lsl \d_strd, \d_strd, #1 .endif clz \bdmax, \bdmax // bitdepth_max clz w9, \w sub \bdmax, \bdmax, #18 // intermediate_bits = clz(bitdepth_max) - 18 mov w11, #4 sub w9, w9, #24 sub w11, w11, \bdmax // 4 - intermediate_bits add w12, \bdmax, #4 // 4 + intermediate_bits cbnz \mx, L(\type\()_bilin_h) cbnz \my, L(\type\()_bilin_v) b \type\()_neon L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) adr x10, L(\type\()_bilin_h_tbl) dup v31.8h, w11 // 4 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.8h, \bdmax // intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v30.8h, v30.8h // -intermediate_bits .endif br x10 20: // 2xN h AARCH64_VALID_JUMP_TARGET .ifc \type, put add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 2: ld1 {v4.4h}, [\src], \s_strd ld1 {v6.4h}, [\sr2], \s_strd ext v5.8b, v4.8b, v4.8b, #2 ext v7.8b, v6.8b, v6.8b, #2 trn1 v4.2s, v4.2s, v6.2s trn1 v5.2s, v5.2s, v7.2s subs \h, \h, #2 mul v4.4h, v4.4h, v0.4h mla v4.4h, v5.4h, v1.4h urshl v4.4h, v4.4h, v31.4h urshl v4.4h, v4.4h, v30.4h st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd b.gt 2b ret .endif 40: // 4xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 4: ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v4.16b, #2 ext v7.16b, v6.16b, v6.16b, #2 trn1 v4.2d, v4.2d, v6.2d trn1 v5.2d, v5.2d, v7.2d subs \h, \h, #2 mul v4.8h, v4.8h, v0.8h mla v4.8h, v5.8h, v1.8h urshl v4.8h, v4.8h, v31.8h .ifc \type, put urshl v4.8h, v4.8h, v30.8h .else sub v4.8h, v4.8h, v29.8h .endif st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.gt 4b ret 80: // 8xN h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \d_strd, \d_strd, #1 lsl \s_strd, \s_strd, #1 8: ldr h5, [\src, #16] ldr h7, [\sr2, #16] ld1 {v4.8h}, [\src], \s_strd ld1 {v6.8h}, [\sr2], \s_strd ext v5.16b, v4.16b, v5.16b, #2 ext v7.16b, v6.16b, v7.16b, #2 subs \h, \h, #2 mul v4.8h, v4.8h, v0.8h mla v4.8h, v5.8h, v1.8h mul v6.8h, v6.8h, v0.8h mla v6.8h, v7.8h, v1.8h urshl v4.8h, v4.8h, v31.8h urshl v6.8h, v6.8h, v31.8h .ifc \type, put urshl v4.8h, v4.8h, v30.8h urshl v6.8h, v6.8h, v30.8h .else sub v4.8h, v4.8h, v29.8h sub v6.8h, v6.8h, v29.8h .endif st1 {v4.8h}, [\dst], \d_strd st1 {v6.8h}, [\ds2], \d_strd b.gt 8b ret 160: 320: 640: 1280: // 16xN, 32xN, ... 
h AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 sub \s_strd, \s_strd, \w, uxtw #1 sub \s_strd, \s_strd, #16 .ifc \type, put lsl \d_strd, \d_strd, #1 sub \d_strd, \d_strd, \w, uxtw #1 .endif 161: ld1 {v16.8h}, [\src], #16 ld1 {v21.8h}, [\sr2], #16 mov \mx, \w 16: ld1 {v17.8h, v18.8h}, [\src], #32 ld1 {v22.8h, v23.8h}, [\sr2], #32 ext v19.16b, v16.16b, v17.16b, #2 ext v20.16b, v17.16b, v18.16b, #2 ext v24.16b, v21.16b, v22.16b, #2 ext v25.16b, v22.16b, v23.16b, #2 mul v16.8h, v16.8h, v0.8h mla v16.8h, v19.8h, v1.8h mul v17.8h, v17.8h, v0.8h mla v17.8h, v20.8h, v1.8h mul v21.8h, v21.8h, v0.8h mla v21.8h, v24.8h, v1.8h mul v22.8h, v22.8h, v0.8h mla v22.8h, v25.8h, v1.8h urshl v16.8h, v16.8h, v31.8h urshl v17.8h, v17.8h, v31.8h urshl v21.8h, v21.8h, v31.8h urshl v22.8h, v22.8h, v31.8h subs \mx, \mx, #16 .ifc \type, put urshl v16.8h, v16.8h, v30.8h urshl v17.8h, v17.8h, v30.8h urshl v21.8h, v21.8h, v30.8h urshl v22.8h, v22.8h, v30.8h .else sub v16.8h, v16.8h, v29.8h sub v17.8h, v17.8h, v29.8h sub v21.8h, v21.8h, v29.8h sub v22.8h, v22.8h, v29.8h .endif st1 {v16.8h, v17.8h}, [\dst], #32 st1 {v21.8h, v22.8h}, [\ds2], #32 b.le 9f mov v16.16b, v18.16b mov v21.16b, v23.16b b 16b 9: add \dst, \dst, \d_strd add \ds2, \ds2, \d_strd add \src, \src, \s_strd add \sr2, \sr2, \s_strd subs \h, \h, #2 b.gt 161b ret L(\type\()_bilin_h_tbl): .hword L(\type\()_bilin_h_tbl) - 1280b .hword L(\type\()_bilin_h_tbl) - 640b .hword L(\type\()_bilin_h_tbl) - 320b .hword L(\type\()_bilin_h_tbl) - 160b .hword L(\type\()_bilin_h_tbl) - 80b .hword L(\type\()_bilin_h_tbl) - 40b .hword L(\type\()_bilin_h_tbl) - 20b .hword 0 L(\type\()_bilin_v): cmp \h, #4 adr x10, L(\type\()_bilin_v_tbl) .ifc \type, prep dup v31.8h, w11 // 4 - intermediate_bits .endif ldrh w9, [x10, x9, lsl #1] .ifc \type, prep movi v29.8h, #(PREP_BIAS >> 8), lsl #8 neg v31.8h, v31.8h // -(4-intermediate_bits) .endif sub x10, x10, w9, uxtw br x10 20: // 2xN v AARCH64_VALID_JUMP_TARGET .ifc \type, put cmp \h, #2 add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 // 2x2 v ld1 {v16.s}[0], [\src], \s_strd b.gt 24f 22: ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s mul v4.4h, v16.4h, v2.4h mla v4.4h, v17.4h, v3.4h urshr v4.8h, v4.8h, #4 st1 {v4.s}[0], [\dst] st1 {v4.s}[1], [\ds2] ret 24: // 2x4, 2x6, 2x8, ... 
v ld1 {v17.s}[0], [\sr2], \s_strd ld1 {v18.s}[0], [\src], \s_strd ld1 {v19.s}[0], [\sr2], \s_strd ld1 {v20.s}[0], [\src], \s_strd sub \h, \h, #4 trn1 v16.2s, v16.2s, v17.2s trn1 v17.2s, v17.2s, v18.2s trn1 v18.2s, v18.2s, v19.2s trn1 v19.2s, v19.2s, v20.2s trn1 v16.2d, v16.2d, v18.2d trn1 v17.2d, v17.2d, v19.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h cmp \h, #2 urshr v4.8h, v4.8h, #4 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd st1 {v4.s}[2], [\dst], \d_strd st1 {v4.s}[3], [\ds2], \d_strd b.lt 0f mov v16.8b, v20.8b b.eq 22b b 24b 0: ret .endif 40: // 4xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.4h}, [\src], \s_strd 4: ld1 {v17.4h}, [\sr2], \s_strd ld1 {v18.4h}, [\src], \s_strd trn1 v16.2d, v16.2d, v17.2d trn1 v17.2d, v17.2d, v18.2d mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 .else urshl v4.8h, v4.8h, v31.8h sub v4.8h, v4.8h, v29.8h .endif st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f mov v16.8b, v18.8b b 4b 0: ret 80: // 8xN v AARCH64_VALID_JUMP_TARGET add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8h}, [\src], \s_strd 8: ld1 {v17.8h}, [\sr2], \s_strd ld1 {v18.8h}, [\src], \s_strd mul v4.8h, v16.8h, v2.8h mla v4.8h, v17.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v18.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 .else urshl v4.8h, v4.8h, v31.8h urshl v5.8h, v5.8h, v31.8h sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h .endif st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd b.le 0f mov v16.16b, v18.16b b 8b 0: ret 160: // 16xN, 32xN, ... 
320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \ds2, \dst, \d_strd add \sr2, \src, \s_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v16.8h, v17.8h}, [\src], \s_strd 2: ld1 {v18.8h, v19.8h}, [\sr2], \s_strd ld1 {v20.8h, v21.8h}, [\src], \s_strd mul v4.8h, v16.8h, v2.8h mla v4.8h, v18.8h, v3.8h mul v5.8h, v17.8h, v2.8h mla v5.8h, v19.8h, v3.8h mul v6.8h, v18.8h, v2.8h mla v6.8h, v20.8h, v3.8h mul v7.8h, v19.8h, v2.8h mla v7.8h, v21.8h, v3.8h subs \h, \h, #2 .ifc \type, put urshr v4.8h, v4.8h, #4 urshr v5.8h, v5.8h, #4 urshr v6.8h, v6.8h, #4 urshr v7.8h, v7.8h, #4 .else urshl v4.8h, v4.8h, v31.8h urshl v5.8h, v5.8h, v31.8h urshl v6.8h, v6.8h, v31.8h urshl v7.8h, v7.8h, v31.8h sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h sub v6.8h, v6.8h, v29.8h sub v7.8h, v7.8h, v29.8h .endif st1 {v4.8h, v5.8h}, [\dst], \d_strd st1 {v6.8h, v7.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v20.16b mov v17.16b, v21.16b b 2b 9: subs \w, \w, #16 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #32 add \dst, \dst, #32 b 1b 0: ret L(\type\()_bilin_v_tbl): .hword L(\type\()_bilin_v_tbl) - 1280b .hword L(\type\()_bilin_v_tbl) - 640b .hword L(\type\()_bilin_v_tbl) - 320b .hword L(\type\()_bilin_v_tbl) - 160b .hword L(\type\()_bilin_v_tbl) - 80b .hword L(\type\()_bilin_v_tbl) - 40b .hword L(\type\()_bilin_v_tbl) - 20b .hword 0 L(\type\()_bilin_hv): adr x10, L(\type\()_bilin_hv_tbl) dup v31.8h, w11 // 4 - intermediate_bits ldrh w9, [x10, x9, lsl #1] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.4s, w12 // 4 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif sub x10, x10, w9, uxtw .ifc \type, put neg v30.4s, v30.4s // -(4+intermediate_bits) .endif br x10 20: // 2xN hv AARCH64_VALID_JUMP_TARGET .ifc \type, put add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v20.4h}, [\src], \s_strd ext v21.8b, v20.8b, v20.8b, #2 mul v16.4h, v20.4h, v0.4h mla v16.4h, v21.4h, v1.4h urshl v16.4h, v16.4h, v31.4h 2: ld1 {v22.4h}, [\sr2], \s_strd ld1 {v24.4h}, [\src], \s_strd ext v23.8b, v22.8b, v22.8b, #2 ext v25.8b, v24.8b, v24.8b, #2 trn1 v22.2s, v22.2s, v24.2s trn1 v23.2s, v23.2s, v25.2s mul v17.4h, v22.4h, v0.4h mla v17.4h, v23.4h, v1.4h urshl v17.4h, v17.4h, v31.4h trn1 v16.2s, v16.2s, v17.2s umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h urshl v4.4s, v4.4s, v30.4s xtn v4.4h, v4.4s subs \h, \h, #2 st1 {v4.s}[0], [\dst], \d_strd st1 {v4.s}[1], [\ds2], \d_strd b.le 0f trn2 v16.2s, v17.2s, v17.2s b 2b 0: ret .endif 40: // 4xN hv AARCH64_VALID_JUMP_TARGET add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ld1 {v20.8h}, [\src], \s_strd ext v21.16b, v20.16b, v20.16b, #2 mul v16.4h, v20.4h, v0.4h mla v16.4h, v21.4h, v1.4h urshl v16.4h, v16.4h, v31.4h 4: ld1 {v22.8h}, [\sr2], \s_strd ld1 {v24.8h}, [\src], \s_strd ext v23.16b, v22.16b, v22.16b, #2 ext v25.16b, v24.16b, v24.16b, #2 trn1 v22.2d, v22.2d, v24.2d trn1 v23.2d, v23.2d, v25.2d mul v17.8h, v22.8h, v0.8h mla v17.8h, v23.8h, v1.8h urshl v17.8h, v17.8h, v31.8h trn1 v16.2d, v16.2d, v17.2d umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h umull2 v5.4s, v16.8h, v2.8h umlal2 v5.4s, v17.8h, v3.8h .ifc \type, put urshl v4.4s, v4.4s, v30.4s urshl v5.4s, v5.4s, v30.4s uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 sub v4.8h, v4.8h, v29.8h 
.endif subs \h, \h, #2 st1 {v4.d}[0], [\dst], \d_strd st1 {v4.d}[1], [\ds2], \d_strd b.le 0f trn2 v16.2d, v17.2d, v17.2d b 4b 0: ret 80: // 8xN, 16xN, ... hv 160: 320: 640: 1280: AARCH64_VALID_JUMP_TARGET mov \my, \h 1: add \sr2, \src, \s_strd add \ds2, \dst, \d_strd lsl \s_strd, \s_strd, #1 lsl \d_strd, \d_strd, #1 ldr h21, [\src, #16] ld1 {v20.8h}, [\src], \s_strd ext v21.16b, v20.16b, v21.16b, #2 mul v16.8h, v20.8h, v0.8h mla v16.8h, v21.8h, v1.8h urshl v16.8h, v16.8h, v31.8h 2: ldr h23, [\sr2, #16] ld1 {v22.8h}, [\sr2], \s_strd ldr h25, [\src, #16] ld1 {v24.8h}, [\src], \s_strd ext v23.16b, v22.16b, v23.16b, #2 ext v25.16b, v24.16b, v25.16b, #2 mul v17.8h, v22.8h, v0.8h mla v17.8h, v23.8h, v1.8h mul v18.8h, v24.8h, v0.8h mla v18.8h, v25.8h, v1.8h urshl v17.8h, v17.8h, v31.8h urshl v18.8h, v18.8h, v31.8h umull v4.4s, v16.4h, v2.4h umlal v4.4s, v17.4h, v3.4h umull2 v5.4s, v16.8h, v2.8h umlal2 v5.4s, v17.8h, v3.8h umull v6.4s, v17.4h, v2.4h umlal v6.4s, v18.4h, v3.4h umull2 v7.4s, v17.8h, v2.8h umlal2 v7.4s, v18.8h, v3.8h .ifc \type, put urshl v4.4s, v4.4s, v30.4s urshl v5.4s, v5.4s, v30.4s urshl v6.4s, v6.4s, v30.4s urshl v7.4s, v7.4s, v30.4s uzp1 v4.8h, v4.8h, v5.8h // Same as xtn, xtn2 uzp1 v5.8h, v6.8h, v7.8h // Ditto .else rshrn v4.4h, v4.4s, #4 rshrn2 v4.8h, v5.4s, #4 rshrn v5.4h, v6.4s, #4 rshrn2 v5.8h, v7.4s, #4 sub v4.8h, v4.8h, v29.8h sub v5.8h, v5.8h, v29.8h .endif subs \h, \h, #2 st1 {v4.8h}, [\dst], \d_strd st1 {v5.8h}, [\ds2], \d_strd b.le 9f mov v16.16b, v18.16b b 2b 9: subs \w, \w, #8 b.le 0f asr \s_strd, \s_strd, #1 asr \d_strd, \d_strd, #1 msub \src, \s_strd, \xmy, \src msub \dst, \d_strd, \xmy, \dst sub \src, \src, \s_strd, lsl #1 mov \h, \my add \src, \src, #16 add \dst, \dst, #16 b 1b 0: ret L(\type\()_bilin_hv_tbl): .hword L(\type\()_bilin_hv_tbl) - 1280b .hword L(\type\()_bilin_hv_tbl) - 640b .hword L(\type\()_bilin_hv_tbl) - 320b .hword L(\type\()_bilin_hv_tbl) - 160b .hword L(\type\()_bilin_hv_tbl) - 80b .hword L(\type\()_bilin_hv_tbl) - 40b .hword L(\type\()_bilin_hv_tbl) - 20b .hword 0 endfunc .endm filter_fn put, x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10 filter_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10 .macro load_filter_row dst, src, inc asr w13, \src, #10 add \src, \src, \inc ldr \dst, [x11, w13, sxtw #3] .endm function warp_filter_horz_neon add w12, w5, #512 ld1 {v16.8h, v17.8h}, [x2], x3 load_filter_row d0, w12, w7 load_filter_row d1, w12, w7 load_filter_row d2, w12, w7 sxtl v0.8h, v0.8b load_filter_row d3, w12, w7 sxtl v1.8h, v1.8b load_filter_row d4, w12, w7 sxtl v2.8h, v2.8b load_filter_row d5, w12, w7 sxtl v3.8h, v3.8b load_filter_row d6, w12, w7 sxtl v4.8h, v4.8b load_filter_row d7, w12, w7 sxtl v5.8h, v5.8b ext v18.16b, v16.16b, v17.16b, #2*1 smull v8.4s, v16.4h, v0.4h smull2 v9.4s, v16.8h, v0.8h sxtl v6.8h, v6.8b ext v19.16b, v16.16b, v17.16b, #2*2 smull v10.4s, v18.4h, v1.4h smull2 v11.4s, v18.8h, v1.8h sxtl v7.8h, v7.8b ext v20.16b, v16.16b, v17.16b, #2*3 smull v0.4s, v19.4h, v2.4h smull2 v1.4s, v19.8h, v2.8h ext v21.16b, v16.16b, v17.16b, #2*4 addp v8.4s, v8.4s, v9.4s smull v2.4s, v20.4h, v3.4h smull2 v3.4s, v20.8h, v3.8h ext v22.16b, v16.16b, v17.16b, #2*5 addp v9.4s, v10.4s, v11.4s smull v10.4s, v21.4h, v4.4h smull2 v11.4s, v21.8h, v4.8h ext v23.16b, v16.16b, v17.16b, #2*6 addp v0.4s, v0.4s, v1.4s smull v18.4s, v22.4h, v5.4h smull2 v19.4s, v22.8h, v5.8h ext v16.16b, v16.16b, v17.16b, #2*7 addp v1.4s, v2.4s, v3.4s addp v2.4s, v10.4s, v11.4s smull v20.4s, v23.4h, v6.4h smull2 v21.4s, v23.8h, v6.8h addp v3.4s, v18.4s, 
v19.4s smull v22.4s, v16.4h, v7.4h smull2 v23.4s, v16.8h, v7.8h addp v4.4s, v20.4s, v21.4s addp v5.4s, v22.4s, v23.4s addp v8.4s, v8.4s, v9.4s addp v0.4s, v0.4s, v1.4s addp v2.4s, v2.4s, v3.4s addp v4.4s, v4.4s, v5.4s addp v16.4s, v8.4s, v0.4s addp v17.4s, v2.4s, v4.4s add w5, w5, w8 srshl v16.4s, v16.4s, v14.4s // -(7 - intermediate_bits) srshl v17.4s, v17.4s, v14.4s // -(7 - intermediate_bits) ret endfunc // void dav1d_warp_affine_8x8_16bpc_neon( // pixel *dst, const ptrdiff_t dst_stride, // const pixel *src, const ptrdiff_t src_stride, // const int16_t *const abcd, int mx, int my, // const int bitdepth_max) .macro warp t function warp_affine_8x8\t\()_16bpc_neon, export=1 stp d8, d9, [sp, #-0x40]! stp d10, d11, [sp, #0x10] stp d12, d13, [sp, #0x20] stp d14, d15, [sp, #0x30] .ifb \t dup v15.8h, w7 // bitdepth_max .else movi v15.8h, #(PREP_BIAS >> 8), lsl #8 .endif clz w7, w7 // intermediate_bits = clz(bitdepth_max) - 18 .ifb \t sub w8, w7, #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7 .endif sub w7, w7, #25 // -(7 - intermediate_bits) .ifb \t neg w8, w8 // -(7 + intermediate_bits) .endif dup v14.4s, w7 // -(7 - intermediate_bits) .ifb \t dup v13.4s, w8 // -(7 + intermediate_bits) .endif ldr x4, [x4] sbfx x7, x4, #0, #16 sbfx x8, x4, #16, #16 sbfx x9, x4, #32, #16 sbfx x4, x4, #48, #16 mov w10, #8 sub x2, x2, x3, lsl #1 sub x2, x2, x3 sub x2, x2, #6 movrel x11, X(mc_warp_filter), 64*8 mov x15, x30 .ifnb \t lsl x1, x1, #1 .endif bl warp_filter_horz_neon uzp1 v24.8h, v16.8h, v17.8h // Same as xtn, xtn2 bl warp_filter_horz_neon uzp1 v25.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v26.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v27.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v28.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v29.8h, v16.8h, v17.8h // Ditto bl warp_filter_horz_neon uzp1 v30.8h, v16.8h, v17.8h // Ditto 1: add w14, w6, #512 bl warp_filter_horz_neon uzp1 v31.8h, v16.8h, v17.8h // Same as xtn, xtn2 load_filter_row d0, w14, w9 load_filter_row d1, w14, w9 load_filter_row d2, w14, w9 load_filter_row d3, w14, w9 load_filter_row d4, w14, w9 load_filter_row d5, w14, w9 load_filter_row d6, w14, w9 load_filter_row d7, w14, w9 transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl // This ordering of smull/smlal/smull2/smlal2 is highly // beneficial for Cortex A53 here. 
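        // A scalar C model of the vertical warp pass below (hedged sketch:
        // `mid` stands for the eight horizontally filtered rows held in
        // v24-v31, `fv[x]` for the per-column 8-tap vertical filter loaded
        // above; names are illustrative, not defined in this file):
        //
        //   int32_t sum = 0;
        //   for (int k = 0; k < 8; k++)               // smull/smlal chain
        //       sum += mid[k][x] * fv[x][k];
        //   // put (\t empty): srshl by v13, sqxtun, umin against bitdepth_max
        //   int px = (sum + (1 << (7 + intermediate_bits - 1)))
        //            >> (7 + intermediate_bits);
        //   dst[x] = CLIP(px, 0, bitdepth_max);
        //   // prep: rshrn #7 and PREP_BIAS subtraction instead, i.e.
        //   // dst[x] = (int16_t)(((sum + 64) >> 7) - PREP_BIAS);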
smull v16.4s, v24.4h, v0.4h smlal v16.4s, v25.4h, v1.4h smlal v16.4s, v26.4h, v2.4h smlal v16.4s, v27.4h, v3.4h smlal v16.4s, v28.4h, v4.4h smlal v16.4s, v29.4h, v5.4h smlal v16.4s, v30.4h, v6.4h smlal v16.4s, v31.4h, v7.4h smull2 v17.4s, v24.8h, v0.8h smlal2 v17.4s, v25.8h, v1.8h smlal2 v17.4s, v26.8h, v2.8h smlal2 v17.4s, v27.8h, v3.8h smlal2 v17.4s, v28.8h, v4.8h smlal2 v17.4s, v29.8h, v5.8h smlal2 v17.4s, v30.8h, v6.8h smlal2 v17.4s, v31.8h, v7.8h mov v24.16b, v25.16b mov v25.16b, v26.16b .ifb \t srshl v16.4s, v16.4s, v13.4s // -(7 + intermediate_bits) srshl v17.4s, v17.4s, v13.4s // -(7 + intermediate_bits) .else rshrn v16.4h, v16.4s, #7 rshrn2 v16.8h, v17.4s, #7 .endif mov v26.16b, v27.16b .ifb \t sqxtun v16.4h, v16.4s sqxtun2 v16.8h, v17.4s .else sub v16.8h, v16.8h, v15.8h // PREP_BIAS .endif mov v27.16b, v28.16b mov v28.16b, v29.16b .ifb \t umin v16.8h, v16.8h, v15.8h // bitdepth_max .endif mov v29.16b, v30.16b mov v30.16b, v31.16b subs w10, w10, #1 st1 {v16.8h}, [x0], x1 add w6, w6, w4 b.gt 1b ldp d14, d15, [sp, #0x30] ldp d12, d13, [sp, #0x20] ldp d10, d11, [sp, #0x10] ldp d8, d9, [sp], 0x40 ret x15 endfunc .endm warp warp t // void dav1d_emu_edge_16bpc_neon( // const intptr_t bw, const intptr_t bh, // const intptr_t iw, const intptr_t ih, // const intptr_t x, const intptr_t y, // pixel *dst, const ptrdiff_t dst_stride, // const pixel *ref, const ptrdiff_t ref_stride) function emu_edge_16bpc_neon, export=1 ldp x8, x9, [sp] // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) // ref += iclip(x, 0, iw - 1) sub x12, x3, #1 // ih - 1 cmp x5, x3 sub x13, x2, #1 // iw - 1 csel x12, x12, x5, ge // min(y, ih - 1) cmp x4, x2 bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) csel x13, x13, x4, ge // min(x, iw - 1) bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) madd x8, x12, x9, x8 // ref += iclip() * stride add x8, x8, x13, lsl #1 // ref += iclip() // bottom_ext = iclip(y + bh - ih, 0, bh - 1) // top_ext = iclip(-y, 0, bh - 1) add x10, x5, x1 // y + bh neg x5, x5 // -y sub x10, x10, x3 // y + bh - ih sub x12, x1, #1 // bh - 1 cmp x10, x1 bic x5, x5, x5, asr #63 // max(-y, 0) csel x10, x10, x12, lt // min(y + bh - ih, bh-1) cmp x5, x1 bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) csel x5, x5, x12, lt // min(max(-y, 0), bh-1) // right_ext = iclip(x + bw - iw, 0, bw - 1) // left_ext = iclip(-x, 0, bw - 1) add x11, x4, x0 // x + bw neg x4, x4 // -x sub x11, x11, x2 // x + bw - iw sub x13, x0, #1 // bw - 1 cmp x11, x0 bic x4, x4, x4, asr #63 // max(-x, 0) csel x11, x11, x13, lt // min(x + bw - iw, bw-1) cmp x4, x0 bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) // center_h = bh - top_ext - bottom_ext // dst += top_ext * PXSTRIDE(dst_stride) // center_w = bw - left_ext - right_ext sub x1, x1, x5 // bh - top_ext madd x6, x5, x7, x6 sub x2, x0, x4 // bw - left_ext sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext sub x2, x2, x11 // center_w = bw - left_ext - right_ext mov x14, x6 // backup of dst .macro v_loop need_left, need_right 0: .if \need_left ld1r {v0.8h}, [x8] mov x12, x6 // out = dst mov x3, x4 mov v1.16b, v0.16b 1: subs x3, x3, #16 st1 {v0.8h, v1.8h}, [x12], #32 b.gt 1b .endif mov x13, x8 add x12, x6, x4, lsl #1 // out = dst + left_ext mov x3, x2 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 subs x3, x3, #32 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 b.gt 1b .if \need_right add x3, x8, x2, lsl #1 // in + center_w sub x3, x3, #2 // in + center_w - 1 add x12, x6, x4, lsl #1 // dst + left_ext ld1r 
{v0.8h}, [x3] add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w mov x3, x11 mov v1.16b, v0.16b 1: subs x3, x3, #16 st1 {v0.8h, v1.8h}, [x12], #32 b.gt 1b .endif subs x1, x1, #1 // center_h-- add x6, x6, x7 add x8, x8, x9 b.gt 0b .endm cbz x4, 2f // need_left cbz x11, 3f // need_left + need_right v_loop 1, 1 b 5f 2: // !need_left cbz x11, 4f // !need_left + need_right v_loop 0, 1 b 5f 3: // need_left + !need_right v_loop 1, 0 b 5f 4: // !need_left + !need_right v_loop 0, 0 5: cbz x10, 3f // need_bottom sub x8, x6, x7 // ref = dst - stride mov x4, x0 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 mov x3, x10 2: subs x3, x3, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 b.gt 2b msub x6, x7, x10, x6 // dst -= bottom_ext * stride subs x4, x4, #32 // bw -= 32 add x6, x6, #64 // dst += 32 b.gt 1b 3: cbz x5, 3f // need_top msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride 1: ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 mov x3, x5 2: subs x3, x3, #1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 b.gt 2b msub x6, x7, x5, x6 // dst -= top_ext * stride subs x0, x0, #32 // bw -= 32 add x6, x6, #64 // dst += 32 b.gt 1b 3: ret endfunc rav1e-0.7.1/src/arm/64/msac.S000064400000000000000000000470531046102023000135260ustar 00000000000000/* * Copyright © 2019, VideoLAN and dav1d authors * Copyright © 2019, Martin Storsjo * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "src/arm/asm.S" #include "util.S" #define BUF_POS 0 #define BUF_END 8 #define DIF 16 #define RNG 24 #define CNT 28 #define ALLOW_UPDATE_CDF 32 const coeffs .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 endconst const bits .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 endconst .macro ld1_n d0, d1, src, sz, n .if \n <= 8 ld1 {\d0\sz}, [\src] .else ld1 {\d0\sz, \d1\sz}, [\src] .endif .endm .macro st1_n s0, s1, dst, sz, n .if \n <= 8 st1 {\s0\sz}, [\dst] .else st1 {\s0\sz, \s1\sz}, [\dst] .endif .endm .macro ushr_n d0, d1, s0, s1, shift, sz, n ushr \d0\sz, \s0\sz, \shift .if \n == 16 ushr \d1\sz, \s1\sz, \shift .endif .endm .macro add_n d0, d1, s0, s1, s2, s3, sz, n add \d0\sz, \s0\sz, \s2\sz .if \n == 16 add \d1\sz, \s1\sz, \s3\sz .endif .endm .macro sub_n d0, d1, s0, s1, s2, s3, sz, n sub \d0\sz, \s0\sz, \s2\sz .if \n == 16 sub \d1\sz, \s1\sz, \s3\sz .endif .endm .macro and_n d0, d1, s0, s1, s2, s3, sz, n and \d0\sz, \s0\sz, \s2\sz .if \n == 16 and \d1\sz, \s1\sz, \s3\sz .endif .endm .macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n cmhs \d0\sz, \s0\sz, \s2\sz .if \n == 16 cmhs \d1\sz, \s1\sz, \s3\sz .endif .endm .macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n urhadd \d0\sz, \s0\sz, \s2\sz .if \n == 16 urhadd \d1\sz, \s1\sz, \s3\sz .endif .endm .macro sshl_n d0, d1, s0, s1, s2, s3, sz, n sshl \d0\sz, \s0\sz, \s2\sz .if \n == 16 sshl \d1\sz, \s1\sz, \s3\sz .endif .endm .macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n sqdmulh \d0\sz, \s0\sz, \s2\sz .if \n == 16 sqdmulh \d1\sz, \s1\sz, \s3\sz .endif .endm .macro str_n idx0, idx1, dstreg, dstoff, n str \idx0, [\dstreg, \dstoff] .if \n == 16 str \idx1, [\dstreg, \dstoff + 16] .endif .endm // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, // size_t n_symbols); function msac_decode_symbol_adapt4_neon, export=1 .macro decode_update sz, szb, n sub sp, sp, #48 add x8, x0, #RNG ld1_n v0, v1, x1, \sz, \n // cdf ld1r {v4\sz}, [x8] // rng movrel x9, coeffs, 30 movi v31\sz, #0x7f, lsl #8 // 0x7f00 sub x9, x9, x2, lsl #1 mvni v30\sz, #0x3f // 0xffc0 and v7\szb, v4\szb, v31\szb // rng & 0x7f00 str h4, [sp, #14] // store original u = s->rng and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0 ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret) sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add x8, x0, #DIF + 6 add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) movrel x8, bits str_n q4, q5, sp, #16, \n // store v values to allow indexed access ld1_n v16, v17, x8, .8h, \n cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask .if \n == 16 add v6.8h, v6.8h, v7.8h .endif addv h6, v6.8h // Aggregate mask bits ldr w4, [x0, #ALLOW_UPDATE_CDF] umov w3, v6.h[0] rbit w3, w3 clz w15, w3 // ret cbz w4, L(renorm) // update_cdf ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols] movi v5\szb, #0xff .if \n == 16 mov w4, #-5 .else mvn w14, w2 mov w4, #-4 cmn w14, #3 // set C if n_symbols <= 2 .endif urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? 
-1 : 32768 .if \n == 16 sub w4, w4, w3, lsr #4 // -((count >> 4) + 5) .else lsr w14, w3, #4 // count >> 4 sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) .endif sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6\sz, w4 // -rate sub w3, w3, w3, lsr #5 // count - (count == 32) sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0) sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate add w3, w3, #1 // count + (count < 32) add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate st1_n v0, v1, x1, \sz, \n strh w3, [x1, x2, lsl #1] .endm decode_update .4h, .8b, 4 L(renorm): add x8, sp, #16 add x8, x8, w15, uxtw #1 ldrh w3, [x8] // v ldurh w4, [x8, #-2] // u ldr w6, [x0, #CNT] ldr x7, [x0, #DIF] sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 mvn x7, x7 // ~dif add x7, x7, x3, lsl #48 // ~dif + (v << 48) L(renorm2): lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d lsl x7, x7, x5 // (~dif + (v << 48)) << d str w4, [x0, #RNG] mvn x7, x7 // ~dif b.hs 9f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 cmp x5, x4 b.gt 2f ldr x3, [x3] // next_bits add w8, w6, #23 // shift_bits = cnt + 23 add w6, w6, #16 // cnt += 16 rev x3, x3 // next_bits = bswap(next_bits) sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 and w8, w8, #24 // shift_bits &= 24 lsr x3, x3, x8 // next_bits >>= shift_bits sub w8, w8, w6 // shift_bits -= 16 + cnt str x5, [x0, #BUF_POS] lsl x3, x3, x8 // next_bits <<= shift_bits mov w4, #48 sub w6, w4, w8 // cnt = cnt + 64 - shift_bits eor x7, x7, x3 // dif ^= next_bits b 9f 2: // refill_eob mov w14, #40 sub w5, w14, w6 // c = 40 - cnt 3: cmp x3, x4 b.ge 4f ldrb w8, [x3], #1 lsl x8, x8, x5 eor x7, x7, x8 subs w5, w5, #8 b.ge 3b 4: // refill_eob_end str x3, [x0, #BUF_POS] sub w6, w14, w5 // cnt = 40 - c 9: str w6, [x0, #CNT] str x7, [x0, #DIF] mov w0, w15 add sp, sp, #48 ret endfunc function msac_decode_symbol_adapt8_neon, export=1 decode_update .8h, .16b, 8 b L(renorm) endfunc function msac_decode_symbol_adapt16_neon, export=1 decode_update .8h, .16b, 16 b L(renorm) endfunc function msac_decode_hi_tok_neon, export=1 ld1 {v0.4h}, [x1] // cdf add x16, x0, #RNG movi v31.4h, #0x7f, lsl #8 // 0x7f00 movrel x17, coeffs, 30-2*3 mvni v30.4h, #0x3f // 0xffc0 ldrh w9, [x1, #6] // count = cdf[n_symbols] ld1r {v3.4h}, [x16] // rng movrel x16, bits ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) add x17, x0, #DIF + 6 ld1 {v16.8h}, [x16] mov w13, #-24 and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 ldr w10, [x0, #ALLOW_UPDATE_CDF] ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) sub sp, sp, #48 ldr w6, [x0, #CNT] ldr x7, [x0, #DIF] 1: and v7.8b, v3.8b, v31.8b // rng & 0x7f00 sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) str h3, [sp, #14] // store original u = s->rng cmhs v2.8h, v1.8h, v4.8h // c >= v str q4, [sp, #16] // store v values to allow indexed access and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask addv h6, v6.8h // Aggregate mask bits umov w3, v6.h[0] add w13, w13, #5 rbit w3, w3 add x8, sp, #16 clz w15, w3 // ret cbz w10, 2f // update_cdf movi v5.8b, #0xff mov w4, #-5 urhadd v4.4h, v5.4h, v2.4h // i >= val ? 
-1 : 32768 sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) dup v6.4h, w4 // -rate sub w9, w9, w9, lsr #5 // count - (count == 32) sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 1 : 0) sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate add w9, w9, #1 // count + (count < 32) add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate st1 {v0.4h}, [x1] and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 strh w9, [x1, #6] 2: add x8, x8, w15, uxtw #1 ldrh w3, [x8] // v ldurh w4, [x8, #-2] // u sub w4, w4, w3 // rng = u - v clz w5, w4 // clz(rng) eor w5, w5, #16 // d = clz(rng) ^ 16 mvn x7, x7 // ~dif add x7, x7, x3, lsl #48 // ~dif + (v << 48) lsl w4, w4, w5 // rng << d subs w6, w6, w5 // cnt -= d lsl x7, x7, x5 // (~dif + (v << 48)) << d str w4, [x0, #RNG] dup v3.4h, w4 mvn x7, x7 // ~dif b.hs 9f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END add x5, x3, #8 cmp x5, x4 b.gt 2f ldr x3, [x3] // next_bits add w8, w6, #23 // shift_bits = cnt + 23 add w6, w6, #16 // cnt += 16 rev x3, x3 // next_bits = bswap(next_bits) sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 and w8, w8, #24 // shift_bits &= 24 lsr x3, x3, x8 // next_bits >>= shift_bits sub w8, w8, w6 // shift_bits -= 16 + cnt str x5, [x0, #BUF_POS] lsl x3, x3, x8 // next_bits <<= shift_bits mov w4, #48 sub w6, w4, w8 // cnt = cnt + 64 - shift_bits eor x7, x7, x3 // dif ^= next_bits b 9f 2: // refill_eob mov w14, #40 sub w5, w14, w6 // c = 40 - cnt 3: cmp x3, x4 b.ge 4f ldrb w8, [x3], #1 lsl x8, x8, x5 eor x7, x7, x8 subs w5, w5, #8 b.ge 3b 4: // refill_eob_end str x3, [x0, #BUF_POS] sub w6, w14, w5 // cnt = 40 - c 9: lsl w15, w15, #1 sub w15, w15, #5 lsr x12, x7, #48 adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 dup v1.8h, w12 b.cc 1b // loop if !carry add w13, w13, #30 str w6, [x0, #CNT] add sp, sp, #48 str x7, [x0, #DIF] lsr w0, w13, #1 ret endfunc function msac_decode_bool_equi_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT sub sp, sp, #48 ldr x7, [x0, #DIF] bic w4, w5, #0xff // r &= 0xff00 add w4, w4, #8 subs x8, x7, x4, lsl #47 // dif - vw lsr w4, w4, #1 // v sub w5, w5, w4 // r - v cset w15, lo csel w4, w5, w4, hs // if (ret) v = r - v; csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc function msac_decode_bool_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT sub sp, sp, #48 ldr x7, [x0, #DIF] lsr w4, w5, #8 // r >> 8 bic w1, w1, #0x3f // f &= ~63 mul w4, w4, w1 lsr w4, w4, #7 add w4, w4, #4 // v subs x8, x7, x4, lsl #48 // dif - vw sub w5, w5, w4 // r - v cset w15, lo csel w4, w5, w4, hs // if (ret) v = r - v; csel x7, x8, x7, hs // if (ret) dif = dif - vw; clz w5, w4 // clz(rng) mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 b L(renorm2) endfunc function msac_decode_bool_adapt_neon, export=1 ldr w9, [x1] // cdf[0-1] ldp w5, w6, [x0, #RNG] // + CNT sub sp, sp, #48 ldr x7, [x0, #DIF] lsr w4, w5, #8 // r >> 8 and w2, w9, #0xffc0 // f &= ~63 mul w4, w4, w2 lsr w4, w4, #7 add w4, w4, #4 // v subs x8, x7, x4, lsl #48 // dif - vw sub w5, w5, w4 // r - v cset w15, lo csel w4, w5, w4, hs // if (ret) v = r - v; csel x7, x8, x7, hs // if (ret) dif = dif - vw; ldr w10, [x0, #ALLOW_UPDATE_CDF] clz w5, w4 // clz(rng) mvn x7, x7 // ~dif eor w5, w5, #16 // d = clz(rng) ^ 16 cbz w10, L(renorm2) lsr w2, w9, #16 // count = cdf[1] and w9, w9, #0xffff // cdf[0] sub w3, w2, w2, lsr #5 // count - (count >= 32) lsr w2, w2, #4 // count >> 4 add w10, w3, #1 // count + (count < 32) add w2, w2, 
#4 // rate = (count >> 4) | 4 sub w9, w9, w15 // cdf[0] -= bit sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769} asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate sub w9, w9, w11 // cdf[0] strh w9, [x1] strh w10, [x1, #2] b L(renorm2) endfunc rav1e-0.7.1/src/arm/64/sad.S000064400000000000000000000417341046102023000133520ustar 00000000000000/* * Copyright (c) 2016, Alliance for Open Media. All rights reserved * Copyright (c) 2020-2023, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "src/arm/asm.S" #include "util.S" .macro sad_rect width, height function sad\width\()x\height\()_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, \height .if \width >= 16 mov v1.16b, v0.16b .endif b L(sad_w\width\()) endfunc .endm function sad4x4_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #4 L(sad_w4): ldr s2, [x0] ldr s3, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.8h, v2.8b, v3.8b bne L(sad_w4) uaddlp v0.2s, v0.4h uaddlp v0.1d, v0.2s fmov w0, s0 ret endfunc sad_rect 4, 8 sad_rect 4, 16 .macro horizontal_long_add_16x8 ushll v2.4s, v1.4h, #0 uaddw2 v1.4s, v2.4s, v1.8h uaddw v1.4s, v1.4s, v0.4h uaddw2 v0.4s, v1.4s, v0.8h uaddlp v0.2d, v0.4s ext v1.16b, v0.16b, v0.16b, #8 add v0.2s, v1.2s, v0.2s fmov w0, s0 ret .endm .macro horizontal_add_16x8 uaddlp v0.4s, v0.8h uaddlp v0.2d, v0.4s ext v1.16b, v0.16b, v0.16b, #8 add v0.2s, v1.2s, v0.2s fmov w0, s0 ret .endm function sad64x64_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #64 mov v1.16b, v0.16b L(sad_w64): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.8h, v2.8b, v3.8b uabal2 v1.8h, v2.16b, v3.16b uabal v0.8h, v4.8b, v5.8b uabal2 v1.8h, v4.16b, v5.16b uabal v0.8h, v6.8b, v7.8b uabal2 v1.8h, v6.16b, v7.16b uabal v0.8h, v16.8b, v17.8b uabal2 v1.8h, v16.16b, v17.16b bne L(sad_w64) horizontal_long_add_16x8 endfunc sad_rect 64, 16 sad_rect 64, 32 sad_rect 64, 128 function sad128x128_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #128 mov v1.16b, v0.16b L(sad_w128): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] ldp q18, q20, [x0, #64] ldp q19, q21, [x2, #64] ldp q22, q24, [x0, #96] ldp q23, q25, [x2, #96] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabdl v26.8h, v2.8b, v3.8b uabal2 v26.8h, v2.16b, v3.16b uabal v26.8h, v4.8b, v5.8b uabal2 v26.8h, v4.16b, v5.16b uabal v26.8h, v6.8b, v7.8b uabal2 v26.8h, v6.16b, v7.16b uabal v26.8h, v16.8b, v17.8b uabal2 v26.8h, v16.16b, v17.16b uabal v26.8h, v18.8b, v19.8b uabal2 v26.8h, v18.16b, v19.16b uabal v26.8h, v20.8b, v21.8b uabal2 v26.8h, v20.16b, v21.16b uabal v26.8h, v22.8b, v23.8b uabal2 v26.8h, v22.16b, v23.16b uabal v26.8h, v24.8b, v25.8b uabal2 v26.8h, v24.16b, v25.16b uaddw v1.4s, v1.4s, v26.4h uaddw2 v0.4s, v0.4s, v26.8h bne L(sad_w128) add v0.4s, v0.4s, v1.4s uaddlp v0.2d, v0.4s dup d3, v0.d[1] add v0.2s, v0.2s, v3.2s umov w0, v0.s[0] ret endfunc sad_rect 128, 64 function sad32x32_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #32 mov v1.16b, v0.16b 
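// Reference scalar model of what this loop (and the sibling L(sad_w*) loops)
// computes, as a hedged sketch -- the function shape is illustrative, not a
// drop-in replacement for the exported symbols:
//
//   uint32_t sad = 0;
//   for (int y = 0; y < h; y++)
//       for (int x = 0; x < w; x++)
//           sad += abs((int)src[y * src_stride + x] -
//                      (int)dst[y * dst_stride + x]);
//   return sad;
//
// The NEON loops accumulate the absolute differences in 16-bit lanes (v0/v1)
// and only reduce to a 32-bit scalar at the end; the larger block sizes widen
// earlier or use the long-add reduction to avoid overflowing the halfword
// accumulators.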
L(sad_w32): ldp q2, q4, [x0] ldp q3, q5, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v1.8h, v2.8b, v3.8b uabal2 v0.8h, v2.16b, v3.16b uabal v1.8h, v4.8b, v5.8b uabal2 v0.8h, v4.16b, v5.16b bne L(sad_w32) add v0.8h, v0.8h, v1.8h horizontal_add_16x8 endfunc sad_rect 32, 8 sad_rect 32, 16 sad_rect 32, 64 function sad16x16_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #16 mov v1.16b, v0.16b L(sad_w16): ldr q2, [x0] ldr q3, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.8h, v2.8b, v3.8b uabal2 v1.8h, v2.16b, v3.16b bne L(sad_w16) add v0.8h, v0.8h, v1.8h horizontal_add_16x8 endfunc sad_rect 16, 4 sad_rect 16, 8 sad_rect 16, 32 sad_rect 16, 64 function sad8x8_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #8 L(sad_w8): ldr d2, [x0] ldr d3, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.8h, v2.8b, v3.8b bne L(sad_w8) horizontal_add_16x8 endfunc sad_rect 8, 4 sad_rect 8, 16 sad_rect 8, 32 .macro sad_hbd_rect width, height function sad\width\()x\height\()_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, \height .if \width >= 8 mov v1.16b, v0.16b .endif .if \width >= 64 && \height >= 64 b L(sad_hbd_large_w\width\()) .else b L(sad_hbd_w\width\()) .endif endfunc .endm function sad4x4_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #4 L(sad_hbd_w4): ldr d2, [x0] ldr d3, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.4s, v2.4h, v3.4h bne L(sad_hbd_w4) addv s0, v0.4s fmov w0, s0 ret endfunc sad_hbd_rect 4, 8 sad_hbd_rect 4, 16 .macro horizontal_add_32x4 uaddlp v0.2d, v0.4s ext v1.16b, v0.16b, v0.16b, #8 add v0.2s, v1.2s, v0.2s fmov w0, s0 ret .endm function sad64x32_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #32 mov v1.16b, v0.16b L(sad_hbd_w64): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] ldp q18, q20, [x0, #64] ldp q19, q21, [x2, #64] ldp q22, q24, [x0, #96] ldp q23, q25, [x2, #96] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.4s, v2.4h, v3.4h uabal2 v1.4s, v2.8h, v3.8h uabal v0.4s, v4.4h, v5.4h uabal2 v1.4s, v4.8h, v5.8h uabal v0.4s, v6.4h, v7.4h uabal2 v1.4s, v6.8h, v7.8h uabal v0.4s, v16.4h, v17.4h uabal2 v1.4s, v16.8h, v17.8h uabal v0.4s, v18.4h, v19.4h uabal2 v1.4s, v18.8h, v19.8h uabal v0.4s, v20.4h, v21.4h uabal2 v1.4s, v20.8h, v21.8h uabal v0.4s, v22.4h, v23.4h uabal2 v1.4s, v22.8h, v23.8h uabal v0.4s, v24.4h, v25.4h uabal2 v1.4s, v24.8h, v25.8h bne L(sad_hbd_w64) horizontal_long_add_16x8 endfunc function sad64x64_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #64 mov v1.16b, v0.16b L(sad_hbd_large_w64): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] ldp q18, q20, [x0, #64] ldp q19, q21, [x2, #64] ldp q22, q24, [x0, #96] ldp q23, q25, [x2, #96] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabdl v26.4s, v2.4h, v3.4h uabal2 v26.4s, v2.8h, v3.8h uabal v26.4s, v4.4h, v5.4h uabal2 v26.4s, v4.8h, v5.8h uabal v26.4s, v6.4h, v7.4h uabal2 v26.4s, v6.8h, v7.8h uabal v26.4s, v16.4h, v17.4h uabal2 v26.4s, v16.8h, v17.8h uabal v26.4s, v18.4h, v19.4h uabal2 v26.4s, v18.8h, v19.8h uabal v26.4s, v20.4h, v21.4h uabal2 v26.4s, v20.8h, v21.8h uabal v26.4s, v22.4h, v23.4h uabal2 v26.4s, v22.8h, v23.8h uabal v26.4s, v24.4h, v25.4h uabal2 v26.4s, v24.8h, v25.8h uaddw v1.2d, v1.2d, v26.2s uaddw2 v0.2d, v0.2d, v26.4s bne L(sad_hbd_large_w64) add v0.2d, v0.2d, v1.2d dup d3, v0.d[1] add v0.2d, v0.2d, v3.2d umov x0, v0.d[0] ret endfunc sad_hbd_rect 64, 16 sad_hbd_rect 64, 128 function 
sad128x128_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #128 mov v1.16b, v0.16b L(sad_hbd_large_w128): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] ldp q18, q20, [x0, #64] ldp q19, q21, [x2, #64] ldp q22, q24, [x0, #96] ldp q23, q25, [x2, #96] uabdl v26.4s, v2.4h, v3.4h uabal2 v26.4s, v2.8h, v3.8h uabal v26.4s, v4.4h, v5.4h uabal2 v26.4s, v4.8h, v5.8h uabal v26.4s, v6.4h, v7.4h uabal2 v26.4s, v6.8h, v7.8h uabal v26.4s, v16.4h, v17.4h uabal2 v26.4s, v16.8h, v17.8h uabal v26.4s, v18.4h, v19.4h uabal2 v26.4s, v18.8h, v19.8h uabal v26.4s, v20.4h, v21.4h uabal2 v26.4s, v20.8h, v21.8h uabal v26.4s, v22.4h, v23.4h uabal2 v26.4s, v22.8h, v23.8h uabal v26.4s, v24.4h, v25.4h uabal2 v26.4s, v24.8h, v25.8h uaddw v1.2d, v1.2d, v26.2s uaddw2 v0.2d, v0.2d, v26.4s ldp q2, q4, [x0, #128] ldp q3, q5, [x2, #128] ldp q6, q16, [x0, #160] ldp q7, q17, [x2, #160] ldp q18, q20, [x0, #192] ldp q19, q21, [x2, #192] ldp q22, q24, [x0, #224] ldp q23, q25, [x2, #224] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabdl v26.4s, v2.4h, v3.4h uabal2 v26.4s, v2.8h, v3.8h uabal v26.4s, v4.4h, v5.4h uabal2 v26.4s, v4.8h, v5.8h uabal v26.4s, v6.4h, v7.4h uabal2 v26.4s, v6.8h, v7.8h uabal v26.4s, v16.4h, v17.4h uabal2 v26.4s, v16.8h, v17.8h uabal v26.4s, v18.4h, v19.4h uabal2 v26.4s, v18.8h, v19.8h uabal v26.4s, v20.4h, v21.4h uabal2 v26.4s, v20.8h, v21.8h uabal v26.4s, v22.4h, v23.4h uabal2 v26.4s, v22.8h, v23.8h uabal v26.4s, v24.4h, v25.4h uabal2 v26.4s, v24.8h, v25.8h uaddw v1.2d, v1.2d, v26.2s uaddw2 v0.2d, v0.2d, v26.4s bne L(sad_hbd_large_w128) add v0.2d, v0.2d, v1.2d dup d3, v0.d[1] add v0.2d, v0.2d, v3.2d umov x0, v0.d[0] ret endfunc sad_hbd_rect 128, 64 function sad32x32_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #32 mov v1.16b, v0.16b L(sad_hbd_w32): ldp q2, q4, [x0] ldp q3, q5, [x2] ldp q6, q16, [x0, #32] ldp q7, q17, [x2, #32] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.4s, v2.4h, v3.4h uabal2 v1.4s, v2.8h, v3.8h uabal v0.4s, v4.4h, v5.4h uabal2 v1.4s, v4.8h, v5.8h uabal v0.4s, v6.4h, v7.4h uabal2 v1.4s, v6.8h, v7.8h uabal v0.4s, v16.4h, v17.4h uabal2 v1.4s, v16.8h, v17.8h bne L(sad_hbd_w32) add v0.4s, v0.4s, v1.4s horizontal_add_32x4 endfunc sad_hbd_rect 32, 8 sad_hbd_rect 32, 16 sad_hbd_rect 32, 64 function sad16x16_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #16 mov v1.16b, v0.16b L(sad_hbd_w16): ldp q2, q4, [x0] ldp q3, q5, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.4s, v2.4h, v3.4h uabal2 v1.4s, v2.8h, v3.8h uabal v0.4s, v4.4h, v5.4h uabal2 v1.4s, v4.8h, v5.8h bne L(sad_hbd_w16) add v0.4s, v0.4s, v1.4s horizontal_add_32x4 endfunc sad_hbd_rect 16, 4 sad_hbd_rect 16, 8 sad_hbd_rect 16, 32 sad_hbd_rect 16, 64 function sad8x8_hbd_neon, export=1 movi v0.4s, #0 sxtw x1, w1 sxtw x3, w3 mov w4, #8 mov v1.16b, v0.16b L(sad_hbd_w8): ldr q2, [x0] ldr q3, [x2] add x0, x0, x1 add x2, x2, x3 subs w4, w4, #1 uabal v0.4s, v2.4h, v3.4h uabal2 v1.4s, v2.8h, v3.8h bne L(sad_hbd_w8) add v0.4s, v0.4s, v1.4s horizontal_add_32x4 endfunc sad_hbd_rect 8, 4 sad_hbd_rect 8, 16 sad_hbd_rect 8, 32 rav1e-0.7.1/src/arm/64/satd.S000064400000000000000000001076231046102023000135360ustar 00000000000000/* Copyright (c) 2022-2023, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include "src/arm/asm.S" #include "util.S" .macro butterfly r0, r1, r2, r3, t=8h add \r0\().\t, \r2\().\t, \r3\().\t sub \r1\().\t, \r2\().\t, \r3\().\t .endm .macro butterflyw r0, r1, r2, r3, r4, r5 sxtl \r0\().4s, \r4\().4h sxtl2 \r2\().4s, \r4\().8h ssubw \r1\().4s, \r0\().4s, \r5\().4h ssubw2 \r3\().4s, \r2\().4s, \r5\().8h saddw \r0\().4s, \r0\().4s, \r5\().4h saddw2 \r2\().4s, \r2\().4s, \r5\().8h .endm .macro interleave r0, r1, r2, r3 zip1 \r0\().8h, \r2\().8h, \r3\().8h zip2 \r1\().8h, \r2\().8h, \r3\().8h .endm .macro interleave_pairs r0, r1, r2, r3 zip1 \r0\().4s, \r2\().4s, \r3\().4s zip2 \r1\().4s, \r2\().4s, \r3\().4s .endm .macro interleave_quads r0, r1, r2, r3 zip1 \r0\().2d, \r2\().2d, \r3\().2d zip2 \r1\().2d, \r2\().2d, \r3\().2d .endm .macro normalize_4 add w0, w0, 2 lsr w0, w0, 2 .endm .macro normalize_8 add w0, w0, 4 lsr w0, w0, 3 .endm // x0: src: *const u8, // x1: src_stride: isize, // x2: dst: *const u8, // x3: dst_stride: isize, function satd4x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 ldr s0, [src] ldr s1, [dst] // subtract; cast to 16-bit usubl v0.8h, v0.8b, v1.8b ldr s1, [src, src_stride] ldr s2, [dst, dst_stride] usubl v1.8h, v1.8b, v2.8b // stride * 2 lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr s2, [src, x8] ldr s3, [dst, x9] usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr s3, [src, x8] ldr s4, [dst, x9] usubl v3.8h, v3.8b, v4.8b // pack rows 0-2, 1-3 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // Horizontal transform // v0 0 1 2 3 8 9 10 11 // v1 4 5 6 7 12 13 14 15 butterfly v2, v3, v0, v1 // v2 [0+4][1+5][2+6][3+7] [8+12][9+13][10+14][11+15] // v3 [0-4][1-5][2-6][3-7] [8-12][9-13][10-14][11-15] interleave v0, v1, v2, v3 // v0 [ 0+4][ 0-4][ 1+5][ 1-5] [2 + 6][2 - 6][3 + 7][3 - 7] // v1 [8+12][8-12][9+13][9-13] [10+14][10-14][11+15][11-15] butterfly v2, v3, v0, v1 // v2 [0+4+8+12][0-4+8-12][1+5+9+13][1-5+9-13] [2+6+10+14][2-6+10-14][3+7+11+15][3-7+11-15] // v3 [0+4-8-12][0-4-8+12][1+5-9-13][1-5-9+13] [2+6-10-14][2-6-10+14][3+7-11-15][3-7-11+15] interleave_pairs v0, v1, v2, v3 // Vertical transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterfly v2, v3, v0, v1 // sum up transform abs v2.8h, v2.8h abs v3.8h, v3.8h add v0.8h, v2.8h, v3.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro DOUBLE_HADAMARD_4X4 hbd=0 // Horizontal transform butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave_pairs v0, v1, v2, v3 interleave_pairs v4, v5, v6, v7 // Vertical transform butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 .if \hbd == 0 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 .else butterflyw v2, v3, v16, v17, v0, v1 butterflyw v6, v7, v18, v19, v4, v5 .endif .endm .macro SUM_DOUBLE_HADAMARD_4X4 abs v2.8h, v2.8h abs v3.8h, v3.8h abs v6.8h, v6.8h abs v7.8h, v7.8h add v0.8h, v2.8h, v3.8h add v1.8h, v6.8h, v7.8h add v0.8h, v0.8h, v1.8h addv h0, v0.8h fmov w0, s0 normalize_4 .endm function 
satd8x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 // load 8 pixel row ldr d0, [src] ldr d1, [dst] usubl v0.8h, v0.8b, v1.8b ldr d1, [src, src_stride] ldr d2, [dst, dst_stride] usubl v1.8h, v1.8b, v2.8b lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr d2, [src, x8] ldr d3, [dst, x9] usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr d3, [src, x8] ldr d4, [dst, x9] usubl v3.8h, v3.8b, v4.8b // extract top 64 bits out of register // (4 x 16 bits = 64) ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 // pack rows 0-2, 1-3 (set 1) mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // pack rows 0-2, 1-3 (set 2) mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] // v2-3 temp registers for first 4x4 block// // 6-7 for second block DOUBLE_HADAMARD_4X4 SUM_DOUBLE_HADAMARD_4X4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro load_row n0, n1, src, dst, src_stride, dst_stride, should_add=1 ldr s\n0, [\src] ldr s\n1, [\dst] usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .if \should_add != 0 add \src, \src, \src_stride add \dst, \dst, \dst_stride .endif .endm .macro load_row2 n0, n1, src, dst, src_stride, dst_stride ldr s\n0, [\src, \src_stride] ldr s\n1, [\dst, \dst_stride] usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .endm function satd4x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 // 0 * stride load_row 0, 1, src, dst, src_stride, dst_stride, 0 // 1 * stride load_row2 1, 2, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 // pattern repeats load_row 2, 3, src, dst, src_stride, dst_stride, 0 load_row2 3, 4, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 4, 5, src, dst, src_stride, dst_stride, 0 load_row2 5, 6, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 6, 7, src, dst, src_stride, dst_stride, 0 load_row2 7, 8, src, dst, src_stride, dst_stride // pack rows mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v6.d[0] mov v5.d[1], v7.d[0] DOUBLE_HADAMARD_4X4 SUM_DOUBLE_HADAMARD_4X4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc function satd16x4_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define ROW1 v0 #define ROW2 v1 #define TMP1 v2 #define TMP2 v3 #define ROW3 v4 #define ROW4 v5 #define TMP3 v6 #define TMP4 v7 #define ROW5 v16 #define ROW6 v17 #define TMP5 v20 #define TMP6 v21 #define ROW7 v18 #define ROW8 v19 #define TMP7 v22 #define TMP8 v23 // load 16 pixel row ldr q0, [src] ldr q1, [dst] usubl2 v16.8h, v0.16b, v1.16b usubl v0.8h, v0.8b, v1.8b ldr q1, [src, src_stride] ldr q2, [dst, dst_stride] usubl2 v17.8h, v1.16b, v2.16b usubl v1.8h, v1.8b, v2.8b lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr q2, [src, x8] ldr q3, [dst, x9] usubl2 v6.8h, v2.16b, v3.16b usubl v2.8h, v2.8b, v3.8b // stride * 3 add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr q3, [src, x8] ldr q4, [dst, x9] usubl2 v7.8h, v3.16b, v4.16b usubl v3.8h, v3.8b, v4.8b // swap high/low 64 bits ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] ext v18.16b, v16.16b, v16.16b, 8 ext v19.16b, v17.16b, v17.16b, 8 mov v16.d[1], v6.d[0] mov v17.d[1], v7.d[0] // 2-3 free mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] // 6-7 free mov v18.d[1], v6.d[1] mov v19.d[1], 
v7.d[1] // 0,1 2,3 // 4,5 6,7 // 16,17 20,21 // 18,19 22,23 // quadruple 4x4 hadamard butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave ROW1, ROW2, TMP1, TMP2 interleave ROW3, ROW4, TMP3, TMP4 interleave ROW5, ROW6, TMP5, TMP6 interleave ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave_pairs ROW1, ROW2, TMP1, TMP2 interleave_pairs ROW3, ROW4, TMP3, TMP4 interleave_pairs ROW5, ROW6, TMP5, TMP6 interleave_pairs ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 interleave ROW1, ROW2, TMP1, TMP2 interleave ROW3, ROW4, TMP3, TMP4 interleave ROW5, ROW6, TMP5, TMP6 interleave ROW7, ROW8, TMP7, TMP8 butterfly TMP1, TMP2, ROW1, ROW2 butterfly TMP3, TMP4, ROW3, ROW4 butterfly TMP5, TMP6, ROW5, ROW6 butterfly TMP7, TMP8, ROW7, ROW8 // absolute value of transform coefficients abs TMP1.8h, TMP1.8h abs TMP2.8h, TMP2.8h abs TMP3.8h, TMP3.8h abs TMP4.8h, TMP4.8h abs TMP5.8h, TMP5.8h abs TMP6.8h, TMP6.8h abs TMP7.8h, TMP7.8h abs TMP8.8h, TMP8.8h // stage 1 sum add TMP1.8h, TMP1.8h, TMP5.8h add TMP2.8h, TMP2.8h, TMP6.8h add TMP3.8h, TMP3.8h, TMP7.8h add TMP4.8h, TMP4.8h, TMP8.8h // stage 2 sum add TMP1.8h, TMP1.8h, TMP3.8h add TMP2.8h, TMP2.8h, TMP4.8h add v0.8h, TMP1.8h, TMP2.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef ROW1 #undef TMP1 #undef ROW2 #undef TMP2 #undef ROW3 #undef TMP3 #undef ROW4 #undef TMP4 #undef ROW5 #undef TMP5 #undef ROW6 #undef TMP6 #undef ROW7 #undef TMP7 #undef ROW8 #undef TMP8 endfunc function satd4x16_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 load_row 0, 1, src, dst, src_stride, dst_stride, 0 load_row2 1, 2, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 2, 3, src, dst, src_stride, dst_stride, 0 load_row2 3, 4, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 4, 5, src, dst, src_stride, dst_stride, 0 load_row2 5, 6, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 6, 7, src, dst, src_stride, dst_stride, 0 load_row2 7, 16, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 16, 17, src, dst, src_stride, dst_stride, 0 load_row2 17, 18, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 18, 19, src, dst, src_stride, dst_stride, 0 load_row2 19, 20, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 20, 21, src, dst, src_stride, dst_stride, 0 load_row2 21, 22, src, dst, src_stride, dst_stride add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 load_row 22, 23, src, dst, src_stride, dst_stride, 0 load_row2 23, 24, src, dst, src_stride, dst_stride // pack rows mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v6.d[0] mov v5.d[1], v7.d[0] mov v16.d[1], v18.d[0] mov v17.d[1], v19.d[0] mov v20.d[1], v22.d[0] mov v21.d[1], v23.d[0] butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 interleave v16, v17, v18, v19 
interleave v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave_pairs v0, v1, v2, v3 interleave_pairs v4, v5, v6, v7 interleave_pairs v16, v17, v18, v19 interleave_pairs v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 interleave v0, v1, v2, v3 interleave v4, v5, v6, v7 interleave v16, v17, v18, v19 interleave v20, v21, v22, v23 butterfly v2, v3, v0, v1 butterfly v6, v7, v4, v5 butterfly v18, v19, v16, v17 butterfly v22, v23, v20, v21 abs v2.8h, v2.8h abs v3.8h, v3.8h abs v6.8h, v6.8h abs v7.8h, v7.8h abs v18.8h, v18.8h abs v19.8h, v19.8h abs v22.8h, v22.8h abs v23.8h, v23.8h add v2.8h, v2.8h, v3.8h add v6.8h, v6.8h, v7.8h add v18.8h, v18.8h, v19.8h add v22.8h, v22.8h, v23.8h add v2.8h, v2.8h, v6.8h add v18.8h, v18.8h, v22.8h add v0.8h, v2.8h, v18.8h addv h0, v0.8h fmov w0, s0 normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride endfunc .macro load_rows n0, n1, n2, src, dst, src_stride, dst_stride, n3=0, n4=0 .if \n3 == 0 ldr d\n0, [\src] ldr d\n1, [\dst] .else ldr q\n0, [\src] ldr q\n1, [\dst] usubl2 v\n3\().8h, v\n0\().16b, v\n1\().16b .endif usubl v\n0\().8h, v\n0\().8b, v\n1\().8b .if \n4 == 0 ldr d\n1, [\src, \src_stride] ldr d\n2, [\dst, \dst_stride] .else ldr q\n1, [\src, \src_stride] ldr q\n2, [\dst, \dst_stride] usubl2 v\n4\().8h, v\n1\().16b, v\n2\().16b .endif usubl v\n1\().8h, v\n1\().8b, v\n2\().8b add \src, \src, \src_stride, lsl 1 add \dst, \dst, \dst_stride, lsl 1 .endm .macro HADAMARD_8X8_H \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // Horizontal transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 interleave v\a0, v\a1, v\b0, v\b1 interleave v\a2, v\a3, v\b2, v\b3 interleave v\a4, v\a5, v\b4, v\b5 interleave v\a6, v\a7, v\b6, v\b7 butterfly v\b0, v\b2, v\a0, v\a2 butterfly v\b1, v\b3, v\a1, v\a3 butterfly v\b4, v\b6, v\a4, v\a6 butterfly v\b5, v\b7, v\a5, v\a7 interleave_pairs v\a0, v\a2, v\b0, v\b2 interleave_pairs v\a1, v\a3, v\b1, v\b3 interleave_pairs v\a4, v\a6, v\b4, v\b6 interleave_pairs v\a5, v\a7, v\b5, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 interleave_quads v\a0, v\a4, v\b0, v\b4 interleave_quads v\a1, v\a5, v\b1, v\b5 interleave_quads v\a2, v\a6, v\b2, v\b6 interleave_quads v\a3, v\a7, v\b3, v\b7 .endm .macro HADAMARD_8X8_V \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // Vertical transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a2, v\b0, v\b2 butterfly v\a1, v\a3, v\b1, v\b3 butterfly v\a4, v\a6, v\b4, v\b6 butterfly v\a5, v\a7, v\b5, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 .endm .macro SUM_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 // absolute value of transform coefficients abs v\b0\().8h, v\b0\().8h abs v\b1\().8h, v\b1\().8h abs v\b2\().8h, v\b2\().8h abs v\b3\().8h, v\b3\().8h abs v\b4\().8h, v\b4\().8h abs v\b5\().8h, v\b5\().8h abs v\b6\().8h, v\b6\().8h abs v\b7\().8h, v\b7\().8h // stage 1 sum sxtl v\a0\().4s, v\b0\().4h sxtl v\a1\().4s, v\b1\().4h sxtl v\a2\().4s, v\b2\().4h sxtl v\a3\().4s, v\b3\().4h saddw2 v\a0\().4s, v\a0\().4s, v\b0\().8h 
saddw2 v\a1\().4s, v\a1\().4s, v\b1\().8h saddw2 v\a2\().4s, v\a2\().4s, v\b2\().8h saddw2 v\a3\().4s, v\a3\().4s, v\b3\().8h saddw v\a0\().4s, v\a0\().4s, v\b4\().4h saddw2 v\a1\().4s, v\a1\().4s, v\b4\().8h saddw v\a2\().4s, v\a2\().4s, v\b5\().4h saddw2 v\a3\().4s, v\a3\().4s, v\b5\().8h saddw v\a0\().4s, v\a0\().4s, v\b6\().4h saddw2 v\a1\().4s, v\a1\().4s, v\b6\().8h saddw v\a2\().4s, v\a2\().4s, v\b7\().4h saddw2 v\a3\().4s, v\a3\().4s, v\b7\().8h // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s // stage 3 sum add v0.4s, v\a0\().4s, v\a2\().4s addv s0, v0.4s .endm .macro SATD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 HADAMARD_8X8_H \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 HADAMARD_8X8_V \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 SUM_HADAMARD_8X8 \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 .endm function satd8x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define height w13 mov height, 8 mov total, wzr // 0, 1; 2, 3 // 4, 5; 6, 7 // 16, 17; 20, 21 // 18, 19; 22, 23 L(satd_w8): load_rows 0, 1, 2, src, dst, src_stride, dst_stride load_rows 4, 5, 6, src, dst, src_stride, dst_stride load_rows 16, 17, 20, src, dst, src_stride, dst_stride load_rows 18, 19, 22, src, dst, src_stride, dst_stride SATD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23 fmov subtotal, s0 add total, subtotal, total subs height, height, #8 bne L(satd_w8) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef subtotal #undef total #undef height endfunc .macro DOUBLE_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // Horizontal transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a1, v\c0, v\c1 butterfly v\a2, v\a3, v\c2, v\c3 butterfly v\a4, v\a5, v\c4, v\c5 butterfly v\a6, v\a7, v\c6, v\c7 interleave v\c0, v\c1, v\b0, v\b1 interleave v\c2, v\c3, v\b2, v\b3 interleave v\c4, v\c5, v\b4, v\b5 interleave v\c6, v\c7, v\b6, v\b7 interleave v\b0, v\b1, v\a0, v\a1 interleave v\b2, v\b3, v\a2, v\a3 interleave v\b4, v\b5, v\a4, v\a5 interleave v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a2, v\c0, v\c2 butterfly v\a1, v\a3, v\c1, v\c3 butterfly v\a4, v\a6, v\c4, v\c6 butterfly v\a5, v\a7, v\c5, v\c7 butterfly v\c0, v\c2, v\b0, v\b2 butterfly v\c1, v\c3, v\b1, v\b3 butterfly v\c4, v\c6, v\b4, v\b6 butterfly v\c5, v\c7, v\b5, v\b7 interleave_pairs v\b0, v\b2, v\a0, v\a2 interleave_pairs v\b1, v\b3, v\a1, v\a3 interleave_pairs v\b4, v\b6, v\a4, v\a6 interleave_pairs v\b5, v\b7, v\a5, v\a7 interleave_pairs v\a0, v\a2, v\c0, v\c2 interleave_pairs v\a1, v\a3, v\c1, v\c3 interleave_pairs v\a4, v\a6, v\c4, v\c6 interleave_pairs v\a5, v\a7, v\c5, v\c7 butterfly v\c0, v\c4, v\b0, v\b4 butterfly v\c1, v\c5, v\b1, v\b5 butterfly v\c2, v\c6, v\b2, v\b6 butterfly v\c3, v\c7, v\b3, v\b7 butterfly v\b0, v\b4, v\a0, v\a4 butterfly v\b1, v\b5, v\a1, v\a5 butterfly v\b2, v\b6, v\a2, v\a6 butterfly v\b3, v\b7, v\a3, v\a7 interleave_quads v\a0, v\a4, v\c0, v\c4 interleave_quads v\a1, v\a5, v\c1, v\c5 interleave_quads v\a2, v\a6, v\c2, v\c6 interleave_quads v\a3, v\a7, v\c3, v\c7 interleave_quads v\c0, v\c4, v\b0, v\b4 interleave_quads v\c1, v\c5, v\b1, v\b5 interleave_quads v\c2, v\c6, v\b2, 
v\b6 interleave_quads v\c3, v\c7, v\b3, v\b7 // Vertical transform butterfly v\b0, v\b1, v\a0, v\a1 butterfly v\b2, v\b3, v\a2, v\a3 butterfly v\b4, v\b5, v\a4, v\a5 butterfly v\b6, v\b7, v\a6, v\a7 butterfly v\a0, v\a1, v\c0, v\c1 butterfly v\a2, v\a3, v\c2, v\c3 butterfly v\a4, v\a5, v\c4, v\c5 butterfly v\a6, v\a7, v\c6, v\c7 butterfly v\c0, v\c2, v\b0, v\b2 butterfly v\c1, v\c3, v\b1, v\b3 butterfly v\c4, v\c6, v\b4, v\b6 butterfly v\c5, v\c7, v\b5, v\b7 butterfly v\b0, v\b2, v\a0, v\a2 butterfly v\b1, v\b3, v\a1, v\a3 butterfly v\b4, v\b6, v\a4, v\a6 butterfly v\b5, v\b7, v\a5, v\a7 butterfly v\a0, v\a4, v\c0, v\c4 butterfly v\a1, v\a5, v\c1, v\c5 butterfly v\a2, v\a6, v\c2, v\c6 butterfly v\a3, v\a7, v\c3, v\c7 butterfly v\c0, v\c4, v\b0, v\b4 butterfly v\c1, v\c5, v\b1, v\b5 butterfly v\c2, v\c6, v\b2, v\b6 butterfly v\c3, v\c7, v\b3, v\b7 .endm .macro SUM_DOUBLE_HADAMARD_8X8 \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // absolute value of transform coefficients abs v\a0\().8h, v\a0\().8h abs v\a1\().8h, v\a1\().8h abs v\a2\().8h, v\a2\().8h abs v\a3\().8h, v\a3\().8h abs v\a4\().8h, v\a4\().8h abs v\a5\().8h, v\a5\().8h abs v\a6\().8h, v\a6\().8h abs v\a7\().8h, v\a7\().8h abs v\c0\().8h, v\c0\().8h abs v\c1\().8h, v\c1\().8h abs v\c2\().8h, v\c2\().8h abs v\c3\().8h, v\c3\().8h abs v\c4\().8h, v\c4\().8h abs v\c5\().8h, v\c5\().8h abs v\c6\().8h, v\c6\().8h abs v\c7\().8h, v\c7\().8h // stage 1 sum sxtl v\b0\().4s, v\a0\().4h sxtl v\b1\().4s, v\a1\().4h sxtl v\b2\().4s, v\a2\().4h sxtl v\b3\().4s, v\a3\().4h sxtl v\b4\().4s, v\a4\().4h sxtl v\b5\().4s, v\a5\().4h sxtl v\b6\().4s, v\a6\().4h sxtl v\b7\().4s, v\a7\().4h saddw2 v\b0\().4s, v\b0\().4s, v\a0\().8h saddw2 v\b1\().4s, v\b1\().4s, v\a1\().8h saddw2 v\b2\().4s, v\b2\().4s, v\a2\().8h saddw2 v\b3\().4s, v\b3\().4s, v\a3\().8h saddw2 v\b4\().4s, v\b4\().4s, v\a4\().8h saddw2 v\b5\().4s, v\b5\().4s, v\a5\().8h saddw2 v\b6\().4s, v\b6\().4s, v\a6\().8h saddw2 v\b7\().4s, v\b7\().4s, v\a7\().8h saddw v\b0\().4s, v\b0\().4s, v\c0\().4h saddw2 v\b1\().4s, v\b1\().4s, v\c0\().8h saddw v\b2\().4s, v\b2\().4s, v\c1\().4h saddw2 v\b3\().4s, v\b3\().4s, v\c1\().8h saddw v\b4\().4s, v\b4\().4s, v\c2\().4h saddw2 v\b5\().4s, v\b5\().4s, v\c2\().8h saddw v\b6\().4s, v\b6\().4s, v\c3\().4h saddw2 v\b7\().4s, v\b7\().4s, v\c3\().8h saddw v\b0\().4s, v\b0\().4s, v\c4\().4h saddw2 v\b1\().4s, v\b1\().4s, v\c4\().8h saddw v\b2\().4s, v\b2\().4s, v\c5\().4h saddw2 v\b3\().4s, v\b3\().4s, v\c5\().8h saddw v\b4\().4s, v\b4\().4s, v\c6\().4h saddw2 v\b5\().4s, v\b5\().4s, v\c6\().8h saddw v\b6\().4s, v\b6\().4s, v\c7\().4h saddw2 v\b7\().4s, v\b7\().4s, v\c7\().8h // stage 2 sum add v\b0\().4s, v\b0\().4s, v\b1\().4s add v\b2\().4s, v\b2\().4s, v\b3\().4s add v\b4\().4s, v\b4\().4s, v\b5\().4s add v\b6\().4s, v\b6\().4s, v\b7\().4s // stage 3 sum add v\b0\().4s, v\b0\().4s, v\b2\().4s add v\b4\().4s, v\b4\().4s, v\b6\().4s // stage 4 sum add v0.4s, v\b0\().4s, v\b4\().4s addv s0, v0.4s .endm function satd16x8_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define w_ext x11 #define w_bak w11 #define width w12 #define height w13 mov height, 8 mov width, 16 sxtw w_ext, width mov total, wzr // 0, 1; 2, 3; 24, 25 // 4, 5; 6, 7; 26, 27 // 16, 17; 20, 21; 28, 29 // 18, 19; 22, 23; 30, 31 L(satd_w16up): load_rows 0, 1, 2, src, dst, src_stride, dst_stride, 24, 25 load_rows 4, 5, 6, src, dst, src_stride, dst_stride, 26, 27 load_rows 16, 17, 20, 
src, dst, src_stride, dst_stride, 28, 29 load_rows 18, 19, 22, src, dst, src_stride, dst_stride, 30, 31 DOUBLE_HADAMARD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 SUM_DOUBLE_HADAMARD_8X8 \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 fmov subtotal, s0 add total, subtotal, total sub src, src, src_stride, lsl 3 sub dst, dst, dst_stride, lsl 3 add src, src, #16 add dst, dst, #16 subs width, width, #16 bne L(satd_w16up) sub src, src, w_ext sub dst, dst, w_ext add src, src, src_stride, lsl 3 add dst, dst, dst_stride, lsl 3 subs height, height, #8 mov width, w_bak bne L(satd_w16up) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef w_ext #undef w_bak #undef subtotal #undef total #undef height #undef width endfunc .macro satd_x8up width, height function satd\width\()x\height\()_neon, export=1 mov w13, \height .if \width == 8 mov w10, wzr b L(satd_w8) .else mov w12, \width sxtw x11, w12 mov w10, wzr b L(satd_w16up) .endif endfunc .endm satd_x8up 8, 16 satd_x8up 8, 32 satd_x8up 16, 16 satd_x8up 16, 32 satd_x8up 16, 64 satd_x8up 32, 8 satd_x8up 32, 16 satd_x8up 32, 32 satd_x8up 32, 64 satd_x8up 64, 16 satd_x8up 64, 32 satd_x8up 64, 64 satd_x8up 64, 128 satd_x8up 128, 64 satd_x8up 128, 128 .macro load_rows_hbd n0, n1, n2, src, dst, src_stride, dst_stride ldr q\n0, [\src] ldr q\n1, [\dst] sub v\n0\().8h, v\n0\().8h, v\n1\().8h ldr q\n1, [\src, \src_stride] ldr q\n2, [\dst, \dst_stride] sub v\n1\().8h, v\n1\().8h, v\n2\().8h add \src, \src, \src_stride, lsl 1 add \dst, \dst, \dst_stride, lsl 1 .endm .macro HADAMARD_8X8_V_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 // Vertical transform butterflyw v\b0, v\b1, v\c0, v\c1, v\a0, v\a1 butterflyw v\b2, v\b3, v\c2, v\c3, v\a2, v\a3 butterflyw v\b4, v\b5, v\c4, v\c5, v\a4, v\a5 butterflyw v\b6, v\b7, v\c6, v\c7, v\a6, v\a7 butterfly v\a0, v\a2, v\b0, v\b2, 4s butterfly v\a1, v\a3, v\b1, v\b3, 4s butterfly v\a4, v\a6, v\b4, v\b6, 4s butterfly v\a5, v\a7, v\b5, v\b7, 4s butterfly v\b0, v\b2, v\c0, v\c2, 4s butterfly v\b1, v\b3, v\c1, v\c3, 4s butterfly v\b4, v\b6, v\c4, v\c6, 4s butterfly v\b5, v\b7, v\c5, v\c7, 4s butterfly v\c0, v\c4, v\a0, v\a4, 4s butterfly v\c1, v\c5, v\a1, v\a5, 4s butterfly v\c2, v\c6, v\a2, v\a6, 4s butterfly v\c3, v\c7, v\a3, v\a7, 4s butterfly v\a0, v\a4, v\b0, v\b4, 4s butterfly v\a1, v\a5, v\b1, v\b5, 4s butterfly v\a2, v\a6, v\b2, v\b6, 4s butterfly v\a3, v\a7, v\b3, v\b7, 4s .endm .macro SUM_HADAMARD_8X8_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ c0 c1 c2 c3 c4 c5 c6 c7 // absolute value of transform coefficients abs v\a0\().4s, v\a0\().4s abs v\a1\().4s, v\a1\().4s abs v\a2\().4s, v\a2\().4s abs v\a3\().4s, v\a3\().4s abs v\a4\().4s, v\a4\().4s abs v\a5\().4s, v\a5\().4s abs v\a6\().4s, v\a6\().4s abs v\a7\().4s, v\a7\().4s abs v\c0\().4s, v\c0\().4s abs v\c1\().4s, v\c1\().4s abs v\c2\().4s, v\c2\().4s abs v\c3\().4s, v\c3\().4s abs v\c4\().4s, v\c4\().4s abs v\c5\().4s, v\c5\().4s abs v\c6\().4s, v\c6\().4s abs v\c7\().4s, v\c7\().4s // stage 1 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s add v\a4\().4s, v\a4\().4s, v\a5\().4s add v\a6\().4s, v\a6\().4s, v\a7\().4s add v\c0\().4s, v\c0\().4s, v\c1\().4s add v\c2\().4s, v\c2\().4s, v\c3\().4s add v\c4\().4s, v\c4\().4s, v\c5\().4s add v\c6\().4s, v\c6\().4s, v\c7\().4s // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a2\().4s add v\a4\().4s, v\a4\().4s, v\a6\().4s add 
v\c0\().4s, v\c0\().4s, v\c2\().4s add v\c4\().4s, v\c4\().4s, v\c6\().4s // stage 3 sum add v\a0\().4s, v\a0\().4s, v\a4\().4s add v\c0\().4s, v\c0\().4s, v\c4\().4s // stage 4 sum add v0.4s, v\a0\().4s, v\c0\().4s addv s0, v0.4s .endm .macro SATD_8X8_HBD \ a0 a1 a2 a3 a4 a5 a6 a7 \ b0 b1 b2 b3 b4 b5 b6 b7 \ c0 c1 c2 c3 c4 c5 c6 c7 HADAMARD_8X8_H \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7 HADAMARD_8X8_V_HBD \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7, \ \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 SUM_HADAMARD_8X8_HBD \ \a0, \a1, \a2, \a3, \a4, \a5, \a6, \a7, \ \c0, \c1, \c2, \c3, \c4, \c5, \c6, \c7 .endm function satd8x8_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define w_ext x11 #define w_bak w11 #define width w12 #define height w13 mov height, 8 mov width, 8 sxtw w_ext, width mov total, wzr // 0, 1; 2, 3; 24, 25 // 4, 5; 6, 7; 26, 27 // 16, 17; 20, 21; 28, 29 // 18, 19; 22, 23; 30, 31 L(satd_w8up_hbd): load_rows_hbd 0, 1, 2, src, dst, src_stride, dst_stride load_rows_hbd 4, 5, 6, src, dst, src_stride, dst_stride load_rows_hbd 16, 17, 20, src, dst, src_stride, dst_stride load_rows_hbd 18, 19, 22, src, dst, src_stride, dst_stride SATD_8X8_HBD \ 0, 1, 4, 5, 16, 17, 18, 19, \ 2, 3, 6, 7, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 fmov subtotal, s0 add total, subtotal, total sub src, src, src_stride, lsl 3 sub dst, dst, dst_stride, lsl 3 add src, src, #16 add dst, dst, #16 subs width, width, #8 bne L(satd_w8up_hbd) sub src, src, w_ext, lsl 1 sub dst, dst, w_ext, lsl 1 add src, src, src_stride, lsl 3 add dst, dst, dst_stride, lsl 3 subs height, height, #8 mov width, w_bak bne L(satd_w8up_hbd) mov w0, total normalize_8 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef w_ext #undef w_bak #undef subtotal #undef total #undef height #undef width endfunc .macro satd_x8up_hbd width, height function satd\width\()x\height\()_hbd_neon, export=1 mov w13, \height mov w12, \width sxtw x11, w12 mov w10, wzr b L(satd_w8up_hbd) endfunc .endm satd_x8up_hbd 8, 16 satd_x8up_hbd 8, 32 satd_x8up_hbd 16, 8 satd_x8up_hbd 16, 16 satd_x8up_hbd 16, 32 satd_x8up_hbd 16, 64 satd_x8up_hbd 32, 8 satd_x8up_hbd 32, 16 satd_x8up_hbd 32, 32 satd_x8up_hbd 32, 64 satd_x8up_hbd 64, 16 satd_x8up_hbd 64, 32 satd_x8up_hbd 64, 64 satd_x8up_hbd 64, 128 satd_x8up_hbd 128, 64 satd_x8up_hbd 128, 128 // x0: src: *const u16, // x1: src_stride: isize, // x2: dst: *const u16, // x3: dst_stride: isize, function satd4x4_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define height w13 mov height, 4 mov total, wzr L(satd_w4_hbd): ldr d0, [src] ldr d1, [dst] sub v0.8h, v0.8h, v1.8h ldr d1, [src, src_stride] ldr d2, [dst, dst_stride] sub v1.8h, v1.8h, v2.8h add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 ldr d2, [src] ldr d3, [dst] sub v2.8h, v2.8h, v3.8h ldr d3, [src, src_stride] ldr d4, [dst, src_stride] sub v3.8h, v3.8h, v4.8h add src, src, src_stride, lsl 1 add dst, dst, dst_stride, lsl 1 // pack rows 0-2, 1-3 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] // Horizontal transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterfly v2, v3, v0, v1 interleave_pairs v0, v1, v2, v3 // Vertical transform butterfly v2, v3, v0, v1 interleave v0, v1, v2, v3 butterflyw v2, v3, v4, v5, v0, v1 // absolute value of transform coefficients abs v2.4s, v2.4s abs v3.4s, 
v3.4s abs v4.4s, v4.4s abs v5.4s, v5.4s // stage 1 sum add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s // stage 2 sum add v0.4s, v2.4s, v4.4s addv s0, v0.4s fmov subtotal, s0 add total, subtotal, total subs height, height, #4 bne L(satd_w4_hbd) mov w0, total normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef subtotal #undef total #undef height endfunc function satd4x8_hbd_neon, export=1 mov w13, 8 mov w10, wzr b L(satd_w4_hbd) endfunc function satd4x16_hbd_neon, export=1 mov w13, 16 mov w10, wzr b L(satd_w4_hbd) endfunc .macro SUM_DOUBLE_HADAMARD_4X4_HBD \ a0 a1 a2 a3 c0 c1 c2 c3 // absolute value of transform coefficients abs v\a0\().4s, v\a0\().4s abs v\a1\().4s, v\a1\().4s abs v\a2\().4s, v\a2\().4s abs v\a3\().4s, v\a3\().4s abs v\c0\().4s, v\c0\().4s abs v\c1\().4s, v\c1\().4s abs v\c2\().4s, v\c2\().4s abs v\c3\().4s, v\c3\().4s // stage 1 sum add v\a0\().4s, v\a0\().4s, v\a1\().4s add v\a2\().4s, v\a2\().4s, v\a3\().4s add v\c0\().4s, v\c0\().4s, v\c1\().4s add v\c2\().4s, v\c2\().4s, v\c3\().4s // stage 2 sum add v\a0\().4s, v\a0\().4s, v\a2\().4s add v\c0\().4s, v\c0\().4s, v\c2\().4s // stage 3 sum add v0.4s, v\a0\().4s, v\c0\().4s addv s0, v0.4s .endm function satd8x4_hbd_neon, export=1 #define src x0 #define src_stride x1 #define dst x2 #define dst_stride x3 #define subtotal w9 #define total w10 #define width w12 mov width, 8 mov total, wzr L(satd_h4_hbd): ldr q0, [src] ldr q1, [dst] sub v0.8h, v0.8h, v1.8h ldr q1, [src, src_stride] ldr q2, [dst, dst_stride] sub v1.8h, v1.8h, v2.8h lsl x8, src_stride, 1 lsl x9, dst_stride, 1 ldr q2, [src, x8] ldr q3, [dst, x9] sub v2.8h, v2.8h, v3.8h add x8, src_stride, src_stride, lsl 1 add x9, dst_stride, dst_stride, lsl 1 ldr q3, [src, x8] ldr q4, [dst, x9] sub v3.8h, v3.8h, v4.8h ext v4.16b, v0.16b, v0.16b, 8 ext v5.16b, v1.16b, v1.16b, 8 mov v0.d[1], v2.d[0] mov v1.d[1], v3.d[0] mov v4.d[1], v2.d[1] mov v5.d[1], v3.d[1] DOUBLE_HADAMARD_4X4 hbd=1 SUM_DOUBLE_HADAMARD_4X4_HBD 2, 3, 16, 17, 6, 7, 18, 19 fmov subtotal, s0 add total, subtotal, total add src, src, #16 add dst, dst, #16 subs width, width, #8 bne L(satd_h4_hbd) mov w0, total normalize_4 ret #undef src #undef src_stride #undef dst #undef dst_stride #undef subtotal #undef total #undef width endfunc function satd16x4_hbd_neon, export=1 mov w12, 16 mov w10, wzr b L(satd_h4_hbd) endfunc rav1e-0.7.1/src/arm/64/sse.S000064400000000000000000000325461046102023000133760ustar 00000000000000/* Copyright (c) 2023, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "src/arm/asm.S" #include "util.S" .macro LOAD_X4 t=q ldr \t\()0, [x0] ldr \t\()4, [x2] ldr \t\()1, [x0, x1] ldr \t\()5, [x2, x3] ldr \t\()2, [x0, x11] ldr \t\()6, [x2, x12] ldr \t\()3, [x0, x8] ldr \t\()7, [x2, x9] ldr \t\()16, [x4] add x0, x0, x1, lsl 2 add x2, x2, x3, lsl 2 add x4, x4, x5 subs w10, w10, #4 .ifc \t,q mov v18.d[0], v16.d[1] .else mov v0.\t[1], v1.\t[0] mov v4.\t[1], v5.\t[0] mov v2.\t[1], v3.\t[0] mov v6.\t[1], v7.\t[0] .endif .endm .macro INIT width, height .if \width <= 16 add x11, x1, x1 add x12, x3, x3 add x8, x1, x1, lsl 1 add x9, x3, x3, lsl 1 .elseif \width >= 64 mov w8, #(\width) sxtw x9, w8 .endif movi v17.4s, #0 mov w10, #(\height) .endm // x0: src: *const u8, // x1: src_stride: isize, // x2: dst: *const u8, // x3: dst_stride: isize, // x4: scale: *const u32, // x5: scale_stride: isize, function weighted_sse_4x4_neon, export=1 INIT 4, 4 L(wsse_w4): LOAD_X4 t=s uabd v0.8b, v0.8b, v4.8b // diff pixel values uabd v1.8b, v2.8b, v6.8b umull v0.8h, v0.8b, v0.8b // square umull v1.8h, v1.8b, v1.8b uaddl v2.4s, v0.4h, v1.4h // accumulate uaddl2 v3.4s, v0.8h, v1.8h add v0.4s, v2.4s, v3.4s addv s0, v0.4s umull v0.2d, v0.2s, v16.2s // apply scale urshr d0, d0, #8 add v17.2d, v17.2d, v0.2d bne L(wsse_w4) fmov x0, d17 ret endfunc .macro RET_SUM mov v0.d[0], v17.d[1] add d0, d0, d17 fmov x0, d0 ret .endm function weighted_sse_8x8_neon, export=1 INIT 8, 8 L(wsse_w8): LOAD_X4 t=d uabd v0.16b, v0.16b, v4.16b // diff pixel values uabd v1.16b, v2.16b, v6.16b umull2 v2.8h, v0.16b, v0.16b // square umull2 v3.8h, v1.16b, v1.16b umull v0.8h, v0.8b, v0.8b umull v1.8h, v1.8b, v1.8b uaddlp v2.4s, v2.8h // accumulate uaddlp v3.4s, v3.8h uaddlp v0.4s, v0.8h uaddlp v1.4s, v1.8h uaddlp v2.2d, v2.4s uadalp v2.2d, v3.4s uadalp v2.2d, v0.4s uadalp v2.2d, v1.4s xtn v0.2s, v2.2d umull v0.2d, v0.2s, v16.2s // apply scale urshr v0.2d, v0.2d, #8 add v17.2d, v17.2d, v0.2d bne L(wsse_w8) RET_SUM endfunc function weighted_sse_16x16_neon, export=1 INIT 16, 16 L(wsse_w16): LOAD_X4 uabd v0.16b, v0.16b, v4.16b // diff pixel values uabd v1.16b, v1.16b, v5.16b uabd v2.16b, v2.16b, v6.16b uabd v3.16b, v3.16b, v7.16b umull2 v4.8h, v0.16b, v0.16b // square umull2 v5.8h, v1.16b, v1.16b umull2 v6.8h, v2.16b, v2.16b umull2 v7.8h, v3.16b, v3.16b umull v0.8h, v0.8b, v0.8b umull v1.8h, v1.8b, v1.8b umull v2.8h, v2.8b, v2.8b umull v3.8h, v3.8b, v3.8b uaddlp v4.4s, v4.8h // accumulate uaddlp v5.4s, v5.8h uaddlp v6.4s, v6.8h uaddlp v7.4s, v7.8h uaddlp v0.4s, v0.8h uaddlp v1.4s, v1.8h uaddlp v2.4s, v2.8h uaddlp v3.4s, v3.8h uaddlp v4.2d, v4.4s uadalp v4.2d, v5.4s uadalp v4.2d, v6.4s uadalp v4.2d, v7.4s xtn v4.2s, v4.2d uaddlp v0.2d, v0.4s uadalp v0.2d, v1.4s uadalp v0.2d, v2.4s uadalp v0.2d, v3.4s xtn v0.2s, v0.2d umull v4.2d, v4.2s, v18.2s // apply scale umull v0.2d, v0.2s, v16.2s urshr v4.2d, v4.2d, #8 urshr v0.2d, v0.2d, #8 add v17.2d, v17.2d, v4.2d add v17.2d, v17.2d, v0.2d bne L(wsse_w16) RET_SUM endfunc .macro LOAD_32X4 vert=1, hbd=0 ldp q0, q22, [x0] ldp q4, q26, [x2] add x0, x0, x1 add x2, x2, x3 ldp q1, q23, [x0] ldp q5, q27, [x2] add x0, x0, x1 add x2, x2, x3 ldp q2, q24, [x0] ldp q6, q28, [x2] add x0, x0, x1 add x2, x2, x3 ldp q3, q25, [x0] ldp q7, q29, [x2] add x0, x0, x1 add x2, x2, x3 .if \vert == 1 .if \hbd == 0 ldp q16, q19, [x4] .else ldr q16, [x4] .endif add x4, x4, x5 subs w10, w10, #4 .else sub x0, x0, x1, lsl 2 sub x2, x2, x3, lsl 2 add x0, x0, #32 add x2, x2, #32 .if \hbd == 0 ldp q16, q19, [x4] add x4, x4, #32 .else ldr q16, [x4] add x4, x4, #16 .endif .endif mov v18.d[0], v16.d[1] .if 
\hbd == 0 mov v20.d[0], v19.d[1] .endif .endm .macro WEIGHTED_SSE_32X4 uabd v0.16b, v0.16b, v4.16b // diff pixel values uabd v1.16b, v1.16b, v5.16b uabd v2.16b, v2.16b, v6.16b uabd v3.16b, v3.16b, v7.16b uabd v22.16b, v22.16b, v26.16b uabd v23.16b, v23.16b, v27.16b uabd v24.16b, v24.16b, v28.16b uabd v25.16b, v25.16b, v29.16b umull2 v4.8h, v0.16b, v0.16b // square umull2 v5.8h, v1.16b, v1.16b umull2 v6.8h, v2.16b, v2.16b umull2 v7.8h, v3.16b, v3.16b umull2 v26.8h, v22.16b, v22.16b umull2 v27.8h, v23.16b, v23.16b umull2 v28.8h, v24.16b, v24.16b umull2 v29.8h, v25.16b, v25.16b umull v0.8h, v0.8b, v0.8b umull v1.8h, v1.8b, v1.8b umull v2.8h, v2.8b, v2.8b umull v3.8h, v3.8b, v3.8b umull v22.8h, v22.8b, v22.8b umull v23.8h, v23.8b, v23.8b umull v24.8h, v24.8b, v24.8b umull v25.8h, v25.8b, v25.8b uaddlp v4.4s, v4.8h // accumulate uadalp v4.4s, v5.8h uadalp v4.4s, v6.8h uadalp v4.4s, v7.8h uaddlp v26.4s, v26.8h uadalp v26.4s, v27.8h uadalp v26.4s, v28.8h uadalp v26.4s, v29.8h uaddlp v0.4s, v0.8h uadalp v0.4s, v1.8h uadalp v0.4s, v2.8h uadalp v0.4s, v3.8h uaddlp v22.4s, v22.8h uadalp v22.4s, v23.8h uadalp v22.4s, v24.8h uadalp v22.4s, v25.8h uaddlp v4.2d, v4.4s uaddlp v26.2d, v26.4s uaddlp v0.2d, v0.4s uaddlp v22.2d, v22.4s xtn v4.2s, v4.2d xtn v26.2s, v26.2d xtn v0.2s, v0.2d xtn v22.2s, v22.2d umull v4.2d, v4.2s, v18.2s // apply scale umull v26.2d, v26.2s, v20.2s umull v0.2d, v0.2s, v16.2s umull v22.2d, v22.2s, v19.2s urshr v4.2d, v4.2d, #8 urshr v26.2d, v26.2d, #8 urshr v0.2d, v0.2d, #8 urshr v22.2d, v22.2d, #8 add v4.2d, v4.2d, v26.2d add v0.2d, v0.2d, v22.2d add v17.2d, v17.2d, v4.2d add v17.2d, v17.2d, v0.2d .endm function weighted_sse_32x32_neon, export=1 INIT 32, 32 L(wsse_w32): LOAD_32X4 WEIGHTED_SSE_32X4 bne L(wsse_w32) RET_SUM endfunc function weighted_sse_64x64_neon, export=1 INIT 64, 64 L(wsse_w32up): LOAD_32X4 vert=0 WEIGHTED_SSE_32X4 subs w8, w8, #32 bne L(wsse_w32up) mov w8, w9 sub x0, x0, x9 sub x2, x2, x9 add x0, x0, x1, lsl 2 add x2, x2, x3, lsl 2 sub x4, x4, x9 add x4, x4, x5 subs w10, w10, #4 bne L(wsse_w32up) RET_SUM endfunc .macro weighted_sse width, height function weighted_sse_\width\()x\height\()_neon, export=1 INIT \width, \height .if \width <= 32 b L(wsse_w\width) .else b L(wsse_w32up) .endif endfunc .endm weighted_sse 4, 8 weighted_sse 4, 16 weighted_sse 8, 4 weighted_sse 8, 16 weighted_sse 8, 32 weighted_sse 16, 4 weighted_sse 16, 8 weighted_sse 16, 32 weighted_sse 16, 64 weighted_sse 32, 8 weighted_sse 32, 16 weighted_sse 32, 64 weighted_sse 64, 16 weighted_sse 64, 32 weighted_sse 64, 128 weighted_sse 128, 64 weighted_sse 128, 128 .macro LOAD_X4_HBD t=q ldr \t\()0, [x0] ldr \t\()4, [x2] ldr \t\()1, [x0, x1] ldr \t\()5, [x2, x3] ldr \t\()2, [x0, x11] ldr \t\()6, [x2, x12] ldr \t\()3, [x0, x8] ldr \t\()7, [x2, x9] .ifc \t,q ldr d16, [x4] .else ldr s16, [x4] .endif add x0, x0, x1, lsl 2 add x2, x2, x3, lsl 2 add x4, x4, x5 subs w10, w10, #4 .endm .macro INIT_HBD width, height .if \width <= 8 add x11, x1, x1 add x12, x3, x3 add x8, x1, x1, lsl 1 add x9, x3, x3, lsl 1 .elseif \width >= 32 mov w8, #(\width) sxtw x9, w8 .endif movi v17.4s, #0 mov w10, #(\height) .endm // x0: src: *const u16, // x1: src_stride: isize, // x2: dst: *const u16, // x3: dst_stride: isize, // x4: scale: *const u32, // x5: scale_stride: isize, function weighted_sse_4x4_hbd_neon, export=1 INIT_HBD 4, 4 L(wsse_hbd_w4): LOAD_X4_HBD t=d uabd v0.8h, v0.8h, v4.8h // diff pixel values uabd v1.8h, v1.8h, v5.8h uabd v2.8h, v2.8h, v6.8h uabd v3.8h, v3.8h, v7.8h umull v0.4s, v0.4h, v0.4h // square umull v1.4s, 
v1.4h, v1.4h umull v2.4s, v2.4h, v2.4h umull v3.4s, v3.4h, v3.4h add v0.4s, v0.4s, v1.4s // accumulate add v2.4s, v2.4s, v3.4s add v0.4s, v0.4s, v2.4s addv s0, v0.4s umull v0.2d, v0.2s, v16.2s // apply scale urshr d0, d0, #8 add v17.2d, v17.2d, v0.2d bne L(wsse_hbd_w4) fmov x0, d17 ret endfunc function weighted_sse_8x8_hbd_neon, export=1 INIT_HBD 8, 8 L(wsse_hbd_w8): LOAD_X4_HBD uabd v4.8h, v0.8h, v4.8h // diff pixel values uabd v5.8h, v1.8h, v5.8h uabd v6.8h, v2.8h, v6.8h uabd v7.8h, v3.8h, v7.8h umull v0.4s, v4.4h, v4.4h // square umull v1.4s, v5.4h, v5.4h umull v2.4s, v6.4h, v6.4h umull v3.4s, v7.4h, v7.4h umull2 v4.4s, v4.8h, v4.8h umull2 v5.4s, v5.8h, v5.8h umull2 v6.4s, v6.8h, v6.8h umull2 v7.4s, v7.8h, v7.8h add v0.4s, v0.4s, v1.4s // accumulate add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s add v6.4s, v6.4s, v7.4s add v0.4s, v0.4s, v2.4s add v4.4s, v4.4s, v6.4s addv s0, v0.4s addv s4, v4.4s mov v0.s[1], v4.s[0] umull v0.2d, v0.2s, v16.2s // apply scale urshr v0.2d, v0.2d, #8 add v17.2d, v17.2d, v0.2d bne L(wsse_hbd_w8) RET_SUM endfunc .macro WEIGHTED_SSE_16X4_HBD uabd v0.8h, v0.8h, v4.8h // diff pixel values uabd v1.8h, v1.8h, v5.8h uabd v2.8h, v2.8h, v6.8h uabd v3.8h, v3.8h, v7.8h uabd v22.8h, v22.8h, v26.8h uabd v23.8h, v23.8h, v27.8h uabd v24.8h, v24.8h, v28.8h uabd v25.8h, v25.8h, v29.8h umull2 v4.4s, v0.8h, v0.8h // square umull2 v5.4s, v1.8h, v1.8h umull2 v6.4s, v2.8h, v2.8h umull2 v7.4s, v3.8h, v3.8h umull2 v26.4s, v22.8h, v22.8h umull2 v27.4s, v23.8h, v23.8h umull2 v28.4s, v24.8h, v24.8h umull2 v29.4s, v25.8h, v25.8h umull v0.4s, v0.4h, v0.4h umull v1.4s, v1.4h, v1.4h umull v2.4s, v2.4h, v2.4h umull v3.4s, v3.4h, v3.4h umull v22.4s, v22.4h, v22.4h umull v23.4s, v23.4h, v23.4h umull v24.4s, v24.4h, v24.4h umull v25.4s, v25.4h, v25.4h add v0.4s, v0.4s, v1.4s // accumulate add v2.4s, v2.4s, v3.4s add v4.4s, v4.4s, v5.4s add v6.4s, v6.4s, v7.4s add v22.4s, v22.4s, v23.4s add v24.4s, v24.4s, v25.4s add v26.4s, v26.4s, v27.4s add v28.4s, v28.4s, v29.4s add v0.4s, v0.4s, v2.4s add v4.4s, v4.4s, v6.4s add v22.4s, v22.4s, v24.4s add v26.4s, v26.4s, v28.4s addv s0, v0.4s addv s4, v4.4s addv s22, v22.4s addv s26, v26.4s mov v0.s[1], v4.s[0] mov v22.s[1], v26.s[0] umull v0.2d, v0.2s, v16.2s // apply scale umull v22.2d, v22.2s, v18.2s urshr v0.2d, v0.2d, #8 urshr v22.2d, v22.2d, #8 add v0.2d, v0.2d, v22.2d add v17.2d, v17.2d, v0.2d .endm function weighted_sse_16x16_hbd_neon, export=1 INIT_HBD 16, 16 L(wsse_hbd_w16): LOAD_32X4 vert=1, hbd=1 WEIGHTED_SSE_16X4_HBD bne L(wsse_hbd_w16) RET_SUM endfunc function weighted_sse_32x32_hbd_neon, export=1 INIT_HBD 32, 32 L(wsse_hbd_w32up): LOAD_32X4 vert=0, hbd=1 WEIGHTED_SSE_16X4_HBD subs w8, w8, #16 bne L(wsse_hbd_w32up) mov w8, w9 sub x0, x0, x9, lsl 1 sub x2, x2, x9, lsl 1 add x0, x0, x1, lsl 2 add x2, x2, x3, lsl 2 sub x4, x4, x9 add x4, x4, x5 subs w10, w10, #4 bne L(wsse_hbd_w32up) RET_SUM endfunc .macro weighted_sse_hbd width, height function weighted_sse_\width\()x\height\()_hbd_neon, export=1 INIT_HBD \width, \height .if \width <= 16 b L(wsse_hbd_w\width) .else b L(wsse_hbd_w32up) .endif endfunc .endm weighted_sse_hbd 4, 8 weighted_sse_hbd 4, 16 weighted_sse_hbd 8, 4 weighted_sse_hbd 8, 16 weighted_sse_hbd 8, 32 weighted_sse_hbd 16, 4 weighted_sse_hbd 16, 8 weighted_sse_hbd 16, 32 weighted_sse_hbd 16, 64 weighted_sse_hbd 32, 8 weighted_sse_hbd 32, 16 weighted_sse_hbd 32, 64 weighted_sse_hbd 64, 16 weighted_sse_hbd 64, 32 weighted_sse_hbd 64, 64 weighted_sse_hbd 64, 128 weighted_sse_hbd 128, 64 weighted_sse_hbd 128, 128 
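/* Reference note (not part of the build): each weighted_sse_WxH kernel above
 * computes a sum of squared pixel differences over a small chunk, multiplies
 * it by a per-chunk u32 weight loaded from the `scale` buffer (x4/x5 above),
 * rounds with `urshr #8` ((x + 128) >> 8), and accumulates the result into a
 * 64-bit total returned in x0. A minimal scalar sketch of that arithmetic in
 * Rust, with hypothetical names, applying a single scale to one chunk (the
 * NEON code walks the whole block and the scale row itself):
 *
 *   fn weighted_sse_chunk(
 *     src: &[u8], src_stride: usize,
 *     dst: &[u8], dst_stride: usize,
 *     w: usize, h: usize, scale: u32,
 *   ) -> u64 {
 *     let mut sse: u64 = 0;
 *     for y in 0..h {
 *       for x in 0..w {
 *         // "diff pixel values" + "square" steps of the NEON code
 *         let d = i64::from(src[y * src_stride + x])
 *           - i64::from(dst[y * dst_stride + x]);
 *         sse += (d * d) as u64;
 *       }
 *     }
 *     // "apply scale": umull by the weight, then urshr #8 rounds to nearest
 *     (sse * u64::from(scale) + 128) >> 8
 *   }
 */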
rav1e-0.7.1/src/arm/64/util.S000064400000000000000000000235431046102023000135560ustar 00000000000000/****************************************************************************** * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2015 Martin Storsjo * Copyright © 2015 Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #ifndef DAV1D_SRC_ARM_64_UTIL_S #define DAV1D_SRC_ARM_64_UTIL_S #include "config.h" #include "src/arm/asm.S" .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 adrp \rd, \val@PAGE add \rd, \rd, \val@PAGEOFF sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset)@PAGE add \rd, \rd, \val+(\offset)@PAGEOFF .endif #elif defined(PIC) && defined(_WIN32) .if \offset < 0 adrp \rd, \val add \rd, \rd, :lo12:\val sub \rd, \rd, -(\offset) .else adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) #else ldr \rd, =\val+\offset #endif .endm .macro sub_sp space #ifdef _WIN32 .if \space > 8192 // Here, we'd need to touch two (or more) pages while decrementing // the stack pointer. 
.error "sub_sp_align doesn't support values over 8K at the moment" .elseif \space > 4096 sub x16, sp, #4096 ldr xzr, [x16] sub sp, x16, #(\space - 4096) .else sub sp, sp, #\space .endif #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 .endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif #endif .endm .macro transpose_8x8b_xtl r0, r1, r2, r3, r4, r5, r6, r7, xtl // a0 b0 a1 b1 a2 b2 a3 b3 a4 b4 a5 b5 a6 b6 a7 b7 zip1 \r0\().16b, \r0\().16b, \r1\().16b // c0 d0 c1 d1 c2 d2 d3 d3 c4 d4 c5 d5 c6 d6 d7 d7 zip1 \r2\().16b, \r2\().16b, \r3\().16b // e0 f0 e1 f1 e2 f2 e3 f3 e4 f4 e5 f5 e6 f6 e7 f7 zip1 \r4\().16b, \r4\().16b, \r5\().16b // g0 h0 g1 h1 g2 h2 h3 h3 g4 h4 g5 h5 g6 h6 h7 h7 zip1 \r6\().16b, \r6\().16b, \r7\().16b // a0 b0 c0 d0 a2 b2 c2 d2 a4 b4 c4 d4 a6 b6 c6 d6 trn1 \r1\().8h, \r0\().8h, \r2\().8h // a1 b1 c1 d1 a3 b3 c3 d3 a5 b5 c5 d5 a7 b7 c7 d7 trn2 \r3\().8h, \r0\().8h, \r2\().8h // e0 f0 g0 h0 e2 f2 g2 h2 e4 f4 g4 h4 e6 f6 g6 h6 trn1 \r5\().8h, \r4\().8h, \r6\().8h // e1 f1 g1 h1 e3 f3 g3 h3 e5 f5 g5 h5 e7 f7 g7 h7 trn2 \r7\().8h, \r4\().8h, \r6\().8h // a0 b0 c0 d0 e0 f0 g0 h0 a4 b4 c4 d4 e4 f4 g4 h4 trn1 \r0\().4s, \r1\().4s, \r5\().4s // a2 b2 c2 d2 e2 f2 g2 h2 a6 b6 c6 d6 e6 f6 g6 h6 trn2 \r2\().4s, \r1\().4s, \r5\().4s // a1 b1 c1 d1 e1 f1 g1 h1 a5 b5 c5 d5 e5 f5 g5 h5 trn1 \r1\().4s, \r3\().4s, \r7\().4s // a3 b3 c3 d3 e3 f3 g3 h3 a7 b7 c7 d7 e7 f7 g7 h7 trn2 \r3\().4s, \r3\().4s, \r7\().4s \xtl\()2 \r4\().8h, \r0\().16b \xtl \r0\().8h, \r0\().8b \xtl\()2 \r6\().8h, \r2\().16b \xtl \r2\().8h, \r2\().8b \xtl\()2 \r5\().8h, \r1\().16b \xtl \r1\().8h, \r1\().8b \xtl\()2 \r7\().8h, \r3\().16b \xtl \r3\().8h, \r3\().8b .endm .macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().8h, \r0\().8h, \r1\().8h trn2 \t9\().8h, \r0\().8h, \r1\().8h trn1 \r1\().8h, \r2\().8h, \r3\().8h trn2 \r3\().8h, \r2\().8h, \r3\().8h trn1 \r0\().8h, \r4\().8h, \r5\().8h trn2 \r5\().8h, \r4\().8h, \r5\().8h trn1 \r2\().8h, \r6\().8h, \r7\().8h trn2 \r7\().8h, \r6\().8h, \r7\().8h trn1 \r4\().4s, \r0\().4s, \r2\().4s trn2 \r2\().4s, \r0\().4s, \r2\().4s trn1 \r6\().4s, \r5\().4s, \r7\().4s trn2 \r7\().4s, \r5\().4s, \r7\().4s trn1 \r5\().4s, \t9\().4s, \r3\().4s trn2 \t9\().4s, \t9\().4s, \r3\().4s trn1 \r3\().4s, \t8\().4s, \r1\().4s trn2 \t8\().4s, \t8\().4s, \r1\().4s trn1 \r0\().2d, \r3\().2d, \r4\().2d trn2 \r4\().2d, \r3\().2d, \r4\().2d trn1 \r1\().2d, \r5\().2d, \r6\().2d trn2 \r5\().2d, \r5\().2d, \r6\().2d trn2 \r6\().2d, \t8\().2d, \r2\().2d trn1 \r2\().2d, \t8\().2d, \r2\().2d trn1 \r3\().2d, \t9\().2d, \r7\().2d trn2 \r7\().2d, \t9\().2d, \r7\().2d .endm .macro transpose_8x16b r0, r1, r2, r3, r4, r5, r6, r7, t8, t9 trn1 \t8\().16b, \r0\().16b, \r1\().16b trn2 \t9\().16b, \r0\().16b, \r1\().16b trn1 \r1\().16b, \r2\().16b, \r3\().16b trn2 \r3\().16b, \r2\().16b, \r3\().16b trn1 \r0\().16b, \r4\().16b, \r5\().16b trn2 \r5\().16b, \r4\().16b, \r5\().16b trn1 \r2\().16b, \r6\().16b, \r7\().16b trn2 \r7\().16b, \r6\().16b, \r7\().16b trn1 \r4\().8h, \r0\().8h, \r2\().8h trn2 \r2\().8h, \r0\().8h, \r2\().8h trn1 \r6\().8h, \r5\().8h, \r7\().8h trn2 \r7\().8h, \r5\().8h, \r7\().8h trn1 \r5\().8h, \t9\().8h, \r3\().8h trn2 \t9\().8h, \t9\().8h, \r3\().8h trn1 \r3\().8h, \t8\().8h, \r1\().8h trn2 \t8\().8h, \t8\().8h, \r1\().8h trn1 \r0\().4s, \r3\().4s, \r4\().4s trn2 \r4\().4s, \r3\().4s, \r4\().4s trn1 \r1\().4s, \r5\().4s, \r6\().4s trn2 \r5\().4s, \r5\().4s, \r6\().4s trn2 \r6\().4s, \t8\().4s, \r2\().4s trn1 \r2\().4s, \t8\().4s, \r2\().4s trn1 
\r3\().4s, \t9\().4s, \r7\().4s trn2 \r7\().4s, \t9\().4s, \r7\().4s .endm .macro transpose_4x16b r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().16b, \r0\().16b, \r1\().16b trn2 \t5\().16b, \r0\().16b, \r1\().16b trn1 \t6\().16b, \r2\().16b, \r3\().16b trn2 \t7\().16b, \r2\().16b, \r3\().16b trn1 \r0\().8h, \t4\().8h, \t6\().8h trn2 \r2\().8h, \t4\().8h, \t6\().8h trn1 \r1\().8h, \t5\().8h, \t7\().8h trn2 \r3\().8h, \t5\().8h, \t7\().8h .endm .macro transpose_4x4h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4h, \r0\().4h, \r1\().4h trn2 \t5\().4h, \r0\().4h, \r1\().4h trn1 \t6\().4h, \r2\().4h, \r3\().4h trn2 \t7\().4h, \r2\().4h, \r3\().4h trn1 \r0\().2s, \t4\().2s, \t6\().2s trn2 \r2\().2s, \t4\().2s, \t6\().2s trn1 \r1\().2s, \t5\().2s, \t7\().2s trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm .macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().4s, \r0\().4s, \r1\().4s trn2 \t5\().4s, \r0\().4s, \r1\().4s trn1 \t6\().4s, \r2\().4s, \r3\().4s trn2 \t7\().4s, \r2\().4s, \r3\().4s trn1 \r0\().2d, \t4\().2d, \t6\().2d trn2 \r2\().2d, \t4\().2d, \t6\().2d trn1 \r1\().2d, \t5\().2d, \t7\().2d trn2 \r3\().2d, \t5\().2d, \t7\().2d .endm .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h trn1 \t6\().8h, \r2\().8h, \r3\().8h trn2 \t7\().8h, \r2\().8h, \r3\().8h trn1 \r0\().4s, \t4\().4s, \t6\().4s trn2 \r2\().4s, \t4\().4s, \t6\().4s trn1 \r1\().4s, \t5\().4s, \t7\().4s trn2 \r3\().4s, \t5\().4s, \t7\().4s .endm #endif /* DAV1D_SRC_ARM_64_UTIL_S */ rav1e-0.7.1/src/arm/asm.S000064400000000000000000000212161046102023000131230ustar 00000000000000/* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Janne Grunau * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef DAV1D_SRC_ARM_ASM_S #define DAV1D_SRC_ARM_ASM_S #include "config.h" #if ARCH_AARCH64 #define x18 do_not_use_x18 #define w18 do_not_use_w18 /* Support macros for * - Armv8.3-A Pointer Authentication and * - Armv8.5-A Branch Target Identification * features which require emitting a .note.gnu.property section with the * appropriate architecture-dependent feature bits set. * * |AARCH64_SIGN_LINK_REGISTER| and |AARCH64_VALIDATE_LINK_REGISTER| expand to * PACIxSP and AUTIxSP, respectively. 
|AARCH64_SIGN_LINK_REGISTER| should be * used immediately before saving the LR register (x30) to the stack. * |AARCH64_VALIDATE_LINK_REGISTER| should be used immediately after restoring * it. Note |AARCH64_SIGN_LINK_REGISTER|'s modifications to LR must be undone * with |AARCH64_VALIDATE_LINK_REGISTER| before RET. The SP register must also * have the same value at the two points. For example: * * .global f * f: * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * |AARCH64_VALID_CALL_TARGET| expands to BTI 'c'. Either it, or * |AARCH64_SIGN_LINK_REGISTER|, must be used at every point that may be an * indirect call target. In particular, all symbols exported from a file must * begin with one of these macros. For example, a leaf function that does not * save LR can instead use |AARCH64_VALID_CALL_TARGET|: * * .globl return_zero * return_zero: * AARCH64_VALID_CALL_TARGET * mov x0, #0 * ret * * A non-leaf function which does not immediately save LR may need both macros * because |AARCH64_SIGN_LINK_REGISTER| appears late. For example, the function * may jump to an alternate implementation before setting up the stack: * * .globl with_early_jump * with_early_jump: * AARCH64_VALID_CALL_TARGET * cmp x0, #128 * b.lt .Lwith_early_jump_128 * AARCH64_SIGN_LINK_REGISTER * stp x29, x30, [sp, #-96]! * mov x29, sp * ... * ldp x29, x30, [sp], #96 * AARCH64_VALIDATE_LINK_REGISTER * ret * * .Lwith_early_jump_128: * ... * ret * * These annotations are only required with indirect calls. Private symbols that * are only the target of direct calls do not require annotations. Also note * that |AARCH64_VALID_CALL_TARGET| is only valid for indirect calls (BLR), not * indirect jumps (BR). Indirect jumps in assembly are supported through * |AARCH64_VALID_JUMP_TARGET|. Landing Pads which shall serve for jumps and * calls can be created using |AARCH64_VALID_JUMP_CALL_TARGET|. * * Although not necessary, it is safe to use these macros in 32-bit ARM * assembly. This may be used to simplify dual 32-bit and 64-bit files. * * References: * - "ELF for the Arm® 64-bit Architecture" * https: *github.com/ARM-software/abi-aa/blob/master/aaelf64/aaelf64.rst * - "Providing protection for complex software" * https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software */ #if defined(__ARM_FEATURE_BTI_DEFAULT) && (__ARM_FEATURE_BTI_DEFAULT == 1) #define GNU_PROPERTY_AARCH64_BTI (1 << 0) // Has Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET hint #38 // BTI 'jc' #define AARCH64_VALID_CALL_TARGET hint #34 // BTI 'c' #define AARCH64_VALID_JUMP_TARGET hint #36 // BTI 'j' #else #define GNU_PROPERTY_AARCH64_BTI 0 // No Branch Target Identification #define AARCH64_VALID_JUMP_CALL_TARGET #define AARCH64_VALID_CALL_TARGET #define AARCH64_VALID_JUMP_TARGET #endif #if defined(__ARM_FEATURE_PAC_DEFAULT) #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 0)) != 0) // authentication using key A #define AARCH64_SIGN_LINK_REGISTER paciasp #define AARCH64_VALIDATE_LINK_REGISTER autiasp #elif ((__ARM_FEATURE_PAC_DEFAULT & (1 << 1)) != 0) // authentication using key B #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else #error Pointer authentication defines no valid key! #endif #if ((__ARM_FEATURE_PAC_DEFAULT & (1 << 2)) != 0) // authentication of leaf functions #error Authentication of leaf functions is enabled but not supported in dav1d! 
#endif #define GNU_PROPERTY_AARCH64_PAC (1 << 1) #elif defined(__APPLE__) && defined(__arm64e__) #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER pacibsp #define AARCH64_VALIDATE_LINK_REGISTER autibsp #else /* __ARM_FEATURE_PAC_DEFAULT */ #define GNU_PROPERTY_AARCH64_PAC 0 #define AARCH64_SIGN_LINK_REGISTER #define AARCH64_VALIDATE_LINK_REGISTER #endif /* !__ARM_FEATURE_PAC_DEFAULT */ #if (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) .pushsection .note.gnu.property, "a" .balign 8 .long 4 .long 0x10 .long 0x5 .asciz "GNU" .long 0xc0000000 /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ .long 4 .long (GNU_PROPERTY_AARCH64_BTI | GNU_PROPERTY_AARCH64_PAC) .long 0 .popsection #endif /* (GNU_PROPERTY_AARCH64_BTI != 0 || GNU_PROPERTY_AARCH64_PAC != 0) && defined(__ELF__) */ #endif /* ARCH_AARCH64 */ #if ARCH_ARM .syntax unified #ifdef __ELF__ .arch armv7-a .fpu neon .eabi_attribute 10, 0 // suppress Tag_FP_arch .eabi_attribute 12, 0 // suppress Tag_Advanced_SIMD_arch .section .note.GNU-stack,"",%progbits // Mark stack as non-executable #endif /* __ELF__ */ #ifdef _WIN32 #define CONFIG_THUMB 1 #else #define CONFIG_THUMB 0 #endif #if CONFIG_THUMB .thumb #define A @ #define T #else #define A #define T @ #endif /* CONFIG_THUMB */ #endif /* ARCH_ARM */ #if !defined(PIC) #if defined(__PIC__) #define PIC __PIC__ #elif defined(__pic__) #define PIC __pic__ #endif #endif #ifndef PRIVATE_PREFIX #define PRIVATE_PREFIX dav1d_ #endif #define PASTE(a,b) a ## b #define CONCAT(a,b) PASTE(a,b) #ifdef PREFIX #define EXTERN CONCAT(_,PRIVATE_PREFIX) #else #define EXTERN PRIVATE_PREFIX #endif .macro function name, export=0, align=2 .macro endfunc #ifdef __ELF__ .size \name, . - \name #endif #if HAVE_AS_FUNC .endfunc #endif .purgem endfunc .endm .text .align \align .if \export .global EXTERN\name #ifdef __ELF__ .type EXTERN\name, %function .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name #endif EXTERN\name: .else #ifdef __ELF__ .type \name, %function #endif #if HAVE_AS_FUNC .func \name #endif .endif \name: #if ARCH_AARCH64 .if \export AARCH64_VALID_CALL_TARGET .endif #endif .endm .macro const name, export=0, align=2 .macro endconst #ifdef __ELF__ .size \name, . - \name #endif .purgem endconst .endm #if defined(_WIN32) .section .rdata #elif !defined(__MACH__) .section .rodata #else .const_data #endif .align \align .if \export .global EXTERN\name #ifdef __ELF__ .hidden EXTERN\name #elif defined(__MACH__) .private_extern EXTERN\name #endif EXTERN\name: .endif \name: .endm #ifdef __APPLE__ #define L(x) L ## x #else #define L(x) .L ## x #endif #define X(x) CONCAT(EXTERN, x) #endif /* DAV1D_SRC_ARM_ASM_S */ rav1e-0.7.1/src/arm/tables.S000064400000000000000000000431061046102023000136170ustar 00000000000000/* * Copyright (c) 2019-2022, The rav1e contributors. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License * was not distributed with this source code in the LICENSE file, you can * obtain it at www.aomedia.org/license/software. If the Alliance for Open * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ /* * Copyright © 2018, VideoLAN and dav1d authors * Copyright © 2018, Two Orioles, LLC * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "src/arm/asm.S" const mc_subpel_filters, export=1, align=3 .byte 0, 1, -3, 63, 4, -1, 0, 0 /* REGULAR */ .byte 0, 1, -5, 61, 9, -2, 0, 0 .byte 0, 1, -6, 58, 14, -4, 1, 0 .byte 0, 1, -7, 55, 19, -5, 1, 0 .byte 0, 1, -7, 51, 24, -6, 1, 0 .byte 0, 1, -8, 47, 29, -6, 1, 0 .byte 0, 1, -7, 42, 33, -6, 1, 0 .byte 0, 1, -7, 38, 38, -7, 1, 0 .byte 0, 1, -6, 33, 42, -7, 1, 0 .byte 0, 1, -6, 29, 47, -8, 1, 0 .byte 0, 1, -6, 24, 51, -7, 1, 0 .byte 0, 1, -5, 19, 55, -7, 1, 0 .byte 0, 1, -4, 14, 58, -6, 1, 0 .byte 0, 0, -2, 9, 61, -5, 1, 0 .byte 0, 0, -1, 4, 63, -3, 1, 0 .byte 0, 1, 14, 31, 17, 1, 0, 0 /* SMOOTH */ .byte 0, 0, 13, 31, 18, 2, 0, 0 .byte 0, 0, 11, 31, 20, 2, 0, 0 .byte 0, 0, 10, 30, 21, 3, 0, 0 .byte 0, 0, 9, 29, 22, 4, 0, 0 .byte 0, 0, 8, 28, 23, 5, 0, 0 .byte 0, -1, 8, 27, 24, 6, 0, 0 .byte 0, -1, 7, 26, 26, 7, -1, 0 .byte 0, 0, 6, 24, 27, 8, -1, 0 .byte 0, 0, 5, 23, 28, 8, 0, 0 .byte 0, 0, 4, 22, 29, 9, 0, 0 .byte 0, 0, 3, 21, 30, 10, 0, 0 .byte 0, 0, 2, 20, 31, 11, 0, 0 .byte 0, 0, 2, 18, 31, 13, 0, 0 .byte 0, 0, 1, 17, 31, 14, 1, 0 .byte -1, 1, -3, 63, 4, -1, 1, 0 /* SHARP */ .byte -1, 3, -6, 62, 8, -3, 2, -1 .byte -1, 4, -9, 60, 13, -5, 3, -1 .byte -2, 5, -11, 58, 19, -7, 3, -1 .byte -2, 5, -11, 54, 24, -9, 4, -1 .byte -2, 5, -12, 50, 30, -10, 4, -1 .byte -2, 5, -12, 45, 35, -11, 5, -1 .byte -2, 6, -12, 40, 40, -12, 6, -2 .byte -1, 5, -11, 35, 45, -12, 5, -2 .byte -1, 4, -10, 30, 50, -12, 5, -2 .byte -1, 4, -9, 24, 54, -11, 5, -2 .byte -1, 3, -7, 19, 58, -11, 5, -2 .byte -1, 3, -5, 13, 60, -9, 4, -1 .byte -1, 2, -3, 8, 62, -6, 3, -1 .byte 0, 1, -1, 4, 63, -3, 1, -1 .byte 0, 0, -2, 63, 4, -1, 0, 0 /* REGULAR 4 */ .byte 0, 0, -4, 61, 9, -2, 0, 0 .byte 0, 0, -5, 58, 14, -3, 0, 0 .byte 0, 0, -6, 55, 19, -4, 0, 0 .byte 0, 0, -6, 51, 24, -5, 0, 0 .byte 0, 0, -7, 47, 29, -5, 0, 0 .byte 0, 0, -6, 42, 33, -5, 0, 0 .byte 0, 0, -6, 38, 38, -6, 0, 0 .byte 0, 0, -5, 33, 42, -6, 0, 0 .byte 0, 0, -5, 29, 47, -7, 0, 0 .byte 0, 0, -5, 24, 51, -6, 0, 0 .byte 0, 0, -4, 19, 55, -6, 0, 0 .byte 0, 0, -3, 14, 58, -5, 0, 0 .byte 0, 0, -2, 9, 61, -4, 0, 0 .byte 0, 0, -1, 4, 63, -2, 0, 0 .byte 0, 0, 15, 31, 17, 1, 0, 0 /* SMOOTH 4 */ .byte 0, 0, 13, 31, 18, 2, 0, 0 .byte 0, 0, 11, 31, 20, 2, 0, 0 .byte 0, 0, 10, 30, 21, 3, 0, 0 .byte 0, 0, 9, 29, 
22, 4, 0, 0 .byte 0, 0, 8, 28, 23, 5, 0, 0 .byte 0, 0, 7, 27, 24, 6, 0, 0 .byte 0, 0, 6, 26, 26, 6, 0, 0 .byte 0, 0, 6, 24, 27, 7, 0, 0 .byte 0, 0, 5, 23, 28, 8, 0, 0 .byte 0, 0, 4, 22, 29, 9, 0, 0 .byte 0, 0, 3, 21, 30, 10, 0, 0 .byte 0, 0, 2, 20, 31, 11, 0, 0 .byte 0, 0, 2, 18, 31, 13, 0, 0 .byte 0, 0, 1, 17, 31, 15, 0, 0 /* Bilin scaled being very rarely used, add a new table entry * and use the put/prep_8tap_scaled code, thus acting as a * scaled bilinear filter. */ .byte 0, 0, 0, 60, 4, 0, 0, 0 .byte 0, 0, 0, 56, 8, 0, 0, 0 .byte 0, 0, 0, 52, 12, 0, 0, 0 .byte 0, 0, 0, 48, 16, 0, 0, 0 .byte 0, 0, 0, 44, 20, 0, 0, 0 .byte 0, 0, 0, 40, 24, 0, 0, 0 .byte 0, 0, 0, 36, 28, 0, 0, 0 .byte 0, 0, 0, 32, 32, 0, 0, 0 .byte 0, 0, 0, 28, 36, 0, 0, 0 .byte 0, 0, 0, 24, 40, 0, 0, 0 .byte 0, 0, 0, 20, 44, 0, 0, 0 .byte 0, 0, 0, 16, 48, 0, 0, 0 .byte 0, 0, 0, 12, 52, 0, 0, 0 .byte 0, 0, 0, 8, 56, 0, 0, 0 .byte 0, 0, 0, 4, 60, 0, 0, 0 endconst const filter_intra_taps, export=1, align=4 .byte -6, 10, -5, 2, -3, 1, -3, 1 /* 0 */ .byte -4, 6, -3, 2, -3, 2, -3, 1 .byte 0, 0, 10, 0, 1, 10, 1, 2 .byte 0, 0, 6, 0, 2, 6, 2, 2 .byte 0, 12, 0, 9, 0, 7, 10, 5 .byte 0, 2, 0, 2, 0, 2, 6, 3 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 12, 0, 9, 0, 7, 0, 5, 0 .byte -10, 16, -6, 0, -4, 0, -2, 0 /* 1 */ .byte -10, 16, -6, 0, -4, 0, -2, 0 .byte 0, 0, 16, 0, 0, 16, 0, 0 .byte 0, 0, 16, 0, 0, 16, 0, 0 .byte 0, 10, 0, 6, 0, 4, 16, 2 .byte 0, 0, 0, 0, 0, 0, 16, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 10, 0, 6, 0, 4, 0, 2, 0 .byte -8, 8, -8, 0, -8, 0, -8, 0 /* 2 */ .byte -4, 4, -4, 0, -4, 0, -4, 0 .byte 0, 0, 8, 0, 0, 8, 0, 0 .byte 0, 0, 4, 0, 0, 4, 0, 0 .byte 0, 16, 0, 16, 0, 16, 8, 16 .byte 0, 0, 0, 0, 0, 0, 4, 0 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 16, 0, 16, 0, 16, 0, 16, 0 .byte -2, 8, -1, 3, -1, 2, 0, 1 /* 3 */ .byte -1, 4, -1, 3, -1, 2, -1, 2 .byte 0, 0, 8, 0, 3, 8, 2, 3 .byte 0, 0, 4, 0, 3, 4, 2, 3 .byte 0, 10, 0, 6, 0, 4, 8, 2 .byte 0, 3, 0, 4, 0, 4, 4, 3 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 10, 0, 6, 0, 4, 0, 3, 0 .byte -12, 14, -10, 0, -9, 0, -8, 0 /* 4 */ .byte -10, 12, -9, 1, -8, 0, -7, 0 .byte 0, 0, 14, 0, 0, 14, 0, 0 .byte 0, 0, 12, 0, 0, 12, 0, 1 .byte 0, 14, 0, 12, 0, 11, 14, 10 .byte 0, 0, 0, 0, 0, 1, 12, 1 .byte 0, 0, 0, 0, 0, 0, 0, 0 .byte 14, 0, 12, 0, 11, 0, 9, 0 endconst const sgr_x_by_x, export=1, align=4 .byte 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17 .byte 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9 .byte 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6 .byte 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 .byte 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3 .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 .byte 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 .byte 0 endconst const mc_warp_filter, export=1, align=3 /* [-1, 0) */ .byte 0, 0, 127, 1, 0, 0, 0, 0, 0, - 1, 127, 2, 0, 0, 0, 0 .byte 1, - 3, 127, 4, - 1, 0, 0, 0, 1, - 4, 126, 6, - 2, 1, 0, 0 .byte 1, - 5, 126, 8, - 3, 1, 0, 0, 1, - 6, 125, 11, - 4, 1, 0, 0 .byte 1, - 7, 124, 13, - 4, 1, 0, 0, 2, - 8, 123, 15, - 5, 1, 0, 0 .byte 2, - 9, 122, 18, - 6, 1, 
0, 0, 2, -10, 121, 20, - 6, 1, 0, 0 .byte 2, -11, 120, 22, - 7, 2, 0, 0, 2, -12, 119, 25, - 8, 2, 0, 0 .byte 3, -13, 117, 27, - 8, 2, 0, 0, 3, -13, 116, 29, - 9, 2, 0, 0 .byte 3, -14, 114, 32, -10, 3, 0, 0, 3, -15, 113, 35, -10, 2, 0, 0 .byte 3, -15, 111, 37, -11, 3, 0, 0, 3, -16, 109, 40, -11, 3, 0, 0 .byte 3, -16, 108, 42, -12, 3, 0, 0, 4, -17, 106, 45, -13, 3, 0, 0 .byte 4, -17, 104, 47, -13, 3, 0, 0, 4, -17, 102, 50, -14, 3, 0, 0 .byte 4, -17, 100, 52, -14, 3, 0, 0, 4, -18, 98, 55, -15, 4, 0, 0 .byte 4, -18, 96, 58, -15, 3, 0, 0, 4, -18, 94, 60, -16, 4, 0, 0 .byte 4, -18, 91, 63, -16, 4, 0, 0, 4, -18, 89, 65, -16, 4, 0, 0 .byte 4, -18, 87, 68, -17, 4, 0, 0, 4, -18, 85, 70, -17, 4, 0, 0 .byte 4, -18, 82, 73, -17, 4, 0, 0, 4, -18, 80, 75, -17, 4, 0, 0 .byte 4, -18, 78, 78, -18, 4, 0, 0, 4, -17, 75, 80, -18, 4, 0, 0 .byte 4, -17, 73, 82, -18, 4, 0, 0, 4, -17, 70, 85, -18, 4, 0, 0 .byte 4, -17, 68, 87, -18, 4, 0, 0, 4, -16, 65, 89, -18, 4, 0, 0 .byte 4, -16, 63, 91, -18, 4, 0, 0, 4, -16, 60, 94, -18, 4, 0, 0 .byte 3, -15, 58, 96, -18, 4, 0, 0, 4, -15, 55, 98, -18, 4, 0, 0 .byte 3, -14, 52, 100, -17, 4, 0, 0, 3, -14, 50, 102, -17, 4, 0, 0 .byte 3, -13, 47, 104, -17, 4, 0, 0, 3, -13, 45, 106, -17, 4, 0, 0 .byte 3, -12, 42, 108, -16, 3, 0, 0, 3, -11, 40, 109, -16, 3, 0, 0 .byte 3, -11, 37, 111, -15, 3, 0, 0, 2, -10, 35, 113, -15, 3, 0, 0 .byte 3, -10, 32, 114, -14, 3, 0, 0, 2, - 9, 29, 116, -13, 3, 0, 0 .byte 2, - 8, 27, 117, -13, 3, 0, 0, 2, - 8, 25, 119, -12, 2, 0, 0 .byte 2, - 7, 22, 120, -11, 2, 0, 0, 1, - 6, 20, 121, -10, 2, 0, 0 .byte 1, - 6, 18, 122, - 9, 2, 0, 0, 1, - 5, 15, 123, - 8, 2, 0, 0 .byte 1, - 4, 13, 124, - 7, 1, 0, 0, 1, - 4, 11, 125, - 6, 1, 0, 0 .byte 1, - 3, 8, 126, - 5, 1, 0, 0, 1, - 2, 6, 126, - 4, 1, 0, 0 .byte 0, - 1, 4, 127, - 3, 1, 0, 0, 0, 0, 2, 127, - 1, 0, 0, 0 /* [0, 1) */ .byte 0, 0, 0, 127, 1, 0, 0, 0, 0, 0, -1, 127, 2, 0, 0, 0 .byte 0, 1, -3, 127, 4, -2, 1, 0, 0, 1, -5, 127, 6, -2, 1, 0 .byte 0, 2, -6, 126, 8, -3, 1, 0, -1, 2, -7, 126, 11, -4, 2, -1 .byte -1, 3, -8, 125, 13, -5, 2, -1, -1, 3, -10, 124, 16, -6, 3, -1 .byte -1, 4, -11, 123, 18, -7, 3, -1, -1, 4, -12, 122, 20, -7, 3, -1 .byte -1, 4, -13, 121, 23, -8, 3, -1, -2, 5, -14, 120, 25, -9, 4, -1 .byte -1, 5, -15, 119, 27, -10, 4, -1, -1, 5, -16, 118, 30, -11, 4, -1 .byte -2, 6, -17, 116, 33, -12, 5, -1, -2, 6, -17, 114, 35, -12, 5, -1 .byte -2, 6, -18, 113, 38, -13, 5, -1, -2, 7, -19, 111, 41, -14, 6, -2 .byte -2, 7, -19, 110, 43, -15, 6, -2, -2, 7, -20, 108, 46, -15, 6, -2 .byte -2, 7, -20, 106, 49, -16, 6, -2, -2, 7, -21, 104, 51, -16, 7, -2 .byte -2, 7, -21, 102, 54, -17, 7, -2, -2, 8, -21, 100, 56, -18, 7, -2 .byte -2, 8, -22, 98, 59, -18, 7, -2, -2, 8, -22, 96, 62, -19, 7, -2 .byte -2, 8, -22, 94, 64, -19, 7, -2, -2, 8, -22, 91, 67, -20, 8, -2 .byte -2, 8, -22, 89, 69, -20, 8, -2, -2, 8, -22, 87, 72, -21, 8, -2 .byte -2, 8, -21, 84, 74, -21, 8, -2, -2, 8, -22, 82, 77, -21, 8, -2 .byte -2, 8, -21, 79, 79, -21, 8, -2, -2, 8, -21, 77, 82, -22, 8, -2 .byte -2, 8, -21, 74, 84, -21, 8, -2, -2, 8, -21, 72, 87, -22, 8, -2 .byte -2, 8, -20, 69, 89, -22, 8, -2, -2, 8, -20, 67, 91, -22, 8, -2 .byte -2, 7, -19, 64, 94, -22, 8, -2, -2, 7, -19, 62, 96, -22, 8, -2 .byte -2, 7, -18, 59, 98, -22, 8, -2, -2, 7, -18, 56, 100, -21, 8, -2 .byte -2, 7, -17, 54, 102, -21, 7, -2, -2, 7, -16, 51, 104, -21, 7, -2 .byte -2, 6, -16, 49, 106, -20, 7, -2, -2, 6, -15, 46, 108, -20, 7, -2 .byte -2, 6, -15, 43, 110, -19, 7, -2, -2, 6, -14, 41, 111, -19, 7, -2 .byte -1, 5, -13, 38, 113, -18, 6, -2, -1, 5, -12, 35, 114, -17, 6, -2 
.byte -1, 5, -12, 33, 116, -17, 6, -2, -1, 4, -11, 30, 118, -16, 5, -1 .byte -1, 4, -10, 27, 119, -15, 5, -1, -1, 4, -9, 25, 120, -14, 5, -2 .byte -1, 3, -8, 23, 121, -13, 4, -1, -1, 3, -7, 20, 122, -12, 4, -1 .byte -1, 3, -7, 18, 123, -11, 4, -1, -1, 3, -6, 16, 124, -10, 3, -1 .byte -1, 2, -5, 13, 125, -8, 3, -1, -1, 2, -4, 11, 126, -7, 2, -1 .byte 0, 1, -3, 8, 126, -6, 2, 0, 0, 1, -2, 6, 127, -5, 1, 0 .byte 0, 1, -2, 4, 127, -3, 1, 0, 0, 0, 0, 2, 127, -1, 0, 0 /* [1, 2) */ .byte 0, 0, 0, 1, 127, 0, 0, 0, 0, 0, 0, - 1, 127, 2, 0, 0 .byte 0, 0, 1, - 3, 127, 4, - 1, 0, 0, 0, 1, - 4, 126, 6, - 2, 1 .byte 0, 0, 1, - 5, 126, 8, - 3, 1, 0, 0, 1, - 6, 125, 11, - 4, 1 .byte 0, 0, 1, - 7, 124, 13, - 4, 1, 0, 0, 2, - 8, 123, 15, - 5, 1 .byte 0, 0, 2, - 9, 122, 18, - 6, 1, 0, 0, 2, -10, 121, 20, - 6, 1 .byte 0, 0, 2, -11, 120, 22, - 7, 2, 0, 0, 2, -12, 119, 25, - 8, 2 .byte 0, 0, 3, -13, 117, 27, - 8, 2, 0, 0, 3, -13, 116, 29, - 9, 2 .byte 0, 0, 3, -14, 114, 32, -10, 3, 0, 0, 3, -15, 113, 35, -10, 2 .byte 0, 0, 3, -15, 111, 37, -11, 3, 0, 0, 3, -16, 109, 40, -11, 3 .byte 0, 0, 3, -16, 108, 42, -12, 3, 0, 0, 4, -17, 106, 45, -13, 3 .byte 0, 0, 4, -17, 104, 47, -13, 3, 0, 0, 4, -17, 102, 50, -14, 3 .byte 0, 0, 4, -17, 100, 52, -14, 3, 0, 0, 4, -18, 98, 55, -15, 4 .byte 0, 0, 4, -18, 96, 58, -15, 3, 0, 0, 4, -18, 94, 60, -16, 4 .byte 0, 0, 4, -18, 91, 63, -16, 4, 0, 0, 4, -18, 89, 65, -16, 4 .byte 0, 0, 4, -18, 87, 68, -17, 4, 0, 0, 4, -18, 85, 70, -17, 4 .byte 0, 0, 4, -18, 82, 73, -17, 4, 0, 0, 4, -18, 80, 75, -17, 4 .byte 0, 0, 4, -18, 78, 78, -18, 4, 0, 0, 4, -17, 75, 80, -18, 4 .byte 0, 0, 4, -17, 73, 82, -18, 4, 0, 0, 4, -17, 70, 85, -18, 4 .byte 0, 0, 4, -17, 68, 87, -18, 4, 0, 0, 4, -16, 65, 89, -18, 4 .byte 0, 0, 4, -16, 63, 91, -18, 4, 0, 0, 4, -16, 60, 94, -18, 4 .byte 0, 0, 3, -15, 58, 96, -18, 4, 0, 0, 4, -15, 55, 98, -18, 4 .byte 0, 0, 3, -14, 52, 100, -17, 4, 0, 0, 3, -14, 50, 102, -17, 4 .byte 0, 0, 3, -13, 47, 104, -17, 4, 0, 0, 3, -13, 45, 106, -17, 4 .byte 0, 0, 3, -12, 42, 108, -16, 3, 0, 0, 3, -11, 40, 109, -16, 3 .byte 0, 0, 3, -11, 37, 111, -15, 3, 0, 0, 2, -10, 35, 113, -15, 3 .byte 0, 0, 3, -10, 32, 114, -14, 3, 0, 0, 2, - 9, 29, 116, -13, 3 .byte 0, 0, 2, - 8, 27, 117, -13, 3, 0, 0, 2, - 8, 25, 119, -12, 2 .byte 0, 0, 2, - 7, 22, 120, -11, 2, 0, 0, 1, - 6, 20, 121, -10, 2 .byte 0, 0, 1, - 6, 18, 122, - 9, 2, 0, 0, 1, - 5, 15, 123, - 8, 2 .byte 0, 0, 1, - 4, 13, 124, - 7, 1, 0, 0, 1, - 4, 11, 125, - 6, 1 .byte 0, 0, 1, - 3, 8, 126, - 5, 1, 0, 0, 1, - 2, 6, 126, - 4, 1 .byte 0, 0, 0, - 1, 4, 127, - 3, 1, 0, 0, 0, 0, 2, 127, - 1, 0 /* dummy (replicate row index 191) */ .byte 0, 0, 0, 0, 2, 127, - 1, 0 endconst const sm_weights, export=1 .byte 0, 0 /* Unused, because we always offset by bs, which is at least 2. 
*/ .byte 255, 128 /* bs = 2 */ .byte 255, 149, 85, 64 /* bs = 4 */ .byte 255, 197, 146, 105, 73, 50, 37, 32 /* bs = 8 */ .byte 255, 225, 196, 170, 145, 123, 102, 84 /* bs = 16 */ .byte 68, 54, 43, 33, 26, 20, 17, 16 .byte 255, 240, 225, 210, 196, 182, 169, 157 /* bs =32 */ .byte 145, 133, 122, 111, 101, 92, 83, 74 .byte 66, 59, 52, 45, 39, 34, 29, 25 .byte 21, 17, 14, 12, 10, 9, 8, 8 .byte 255, 248, 240, 233, 225, 218, 210, 203 /* bs = 64 */ .byte 196, 189, 182, 176, 169, 163, 156, 150 .byte 144, 138, 133, 127, 121, 116, 111, 106 .byte 101, 96, 91, 86, 82, 77, 73, 69 .byte 65, 61, 57, 54, 50, 47, 44, 41 .byte 38, 35, 32, 29, 27, 25, 22, 20 .byte 18, 16, 15, 13, 12, 10, 9, 8 .byte 7, 6, 6, 5, 5, 4, 4, 4 endconst const obmc_masks, export=1, align=4 .byte 0, 0 /* Unused */ .byte 19, 0 /* 2 */ .byte 25, 14, 5, 0 /* 4 */ .byte 28, 22, 16, 11, 7, 3, 0, 0 /* 8 */ .byte 30, 27, 24, 21, 18, 15, 12, 10 .byte 8, 6, 4, 3, 0, 0, 0, 0 /* 16 */ .byte 31, 29, 28, 26, 24, 23, 21, 20 .byte 19, 17, 16, 14, 13, 12, 11, 9 .byte 8, 7, 6, 5, 4, 4, 3, 2 .byte 0, 0, 0, 0, 0, 0, 0, 0 /* 32 */ endconst rav1e-0.7.1/src/asm/aarch64/cdef.rs000064400000000000000000000302401046102023000147140ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cdef::*; use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::tiling::PlaneRegionMut; use crate::util::*; type CdefPaddingFn = unsafe extern fn( tmp: *mut u16, src: *const u8, src_stride: isize, left: *const [u8; 2], top: *const u8, bottom: *const u8, h: i32, edges: isize, ); type CdefPaddingHBDFn = unsafe extern fn( tmp: *mut u16, src: *const u16, src_stride: isize, left: *const [u16; 2], top: *const u16, bottom: *const u16, h: i32, edges: isize, ); type CdefFilterFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, ); type CdefFilterHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, bitdepth_max: i32, ); #[inline(always)] const fn decimate_index(xdec: usize, ydec: usize) -> usize { ((ydec << 1) | xdec) & 3 } pub(crate) unsafe fn cdef_filter_block( dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut| { rust::cdef_filter_block( dst, src, src_stride, pri_strength, sec_strength, dir, damping, bit_depth, xdec, ydec, edges, cpu, ); }; #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; match T::type_enum() { PixelType::U8 => { match ( CDEF_PAD_FNS[cpu.as_index()][decimate_index(xdec, ydec)], CDEF_FILTER_FNS[cpu.as_index()][decimate_index(xdec, ydec)], ) { (Some(pad), Some(func)) => { let h = if ydec == 1 { 4 } else { 8 }; let tmpstride = if xdec == 1 { 8 } else { 16 } as isize; const 
MAXTMPSTRIDE: isize = 16; const TMPSIZE: usize = (12 * MAXTMPSTRIDE + 8) as usize; let mut tmp: Aligned<[u16; TMPSIZE]> = Aligned::new([CDEF_VERY_LARGE; TMPSIZE]); let top = src.offset(-2 * src_stride); let bottom = src.offset(h as isize * src_stride); let mut left: Aligned<[[u8; 2]; 8]> = Aligned::new([[0; 2]; 8]); // Rather than modify the dav1d assembly, just swallow our // pride and copy relevant portions of src into a left // array. The array is a monolithic, packed [x=2][y=8], // though it is aligned to start on a 16-byte boundary. if edges & CDEF_HAVE_LEFT != 0 { let mut wptr = src.offset(-2) as *const u8; for i in 0..h { left.data[i as usize][0] = *wptr; left.data[i as usize][1] = *wptr.add(1); wptr = wptr.offset(src_stride); } } // dav1d's implicit indexing as of this version: tmp array // working pointer points to the upper-left of the current // coded block, not the upper-left of the tmp array storage, // with an adjustment to ensure UL of the block is 16-byte // aligned. src, similarly, points to upper left of src // block being coded. top points to coded block minus two // rows (that is, src.x, src.y-2). It does _not_ point to // src.x-2, src.y-2. (pad)( tmp.data.as_mut_ptr().offset(2 * tmpstride + 8) as *mut u16, src as *const u8, T::to_asm_stride(src_stride as usize), left.data.as_ptr() as *const [u8; 2], top as *const u8, bottom as *const u8, 8 >> ydec, edges.into(), ); (func)( dst.data_ptr_mut() as *mut u8, T::to_asm_stride(dst.plane_cfg.stride), tmp.data.as_ptr().offset(2 * tmpstride + 8) as *const u16, pri_strength, sec_strength, dir as i32, damping, 8 >> ydec, edges.into(), ); } _ => call_rust(dst), } } PixelType::U16 => { match ( CDEF_PAD_HBD_FNS[cpu.as_index()][decimate_index(xdec, ydec)], CDEF_FILTER_HBD_FNS[cpu.as_index()][decimate_index(xdec, ydec)], ) { // almost exactly as above, but the call convention isn't // quite what we'd need to roll 8 bit and HBD together in one // clause using Rust macros. See comments above for // indexing/addressing notes. 
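// A concrete reading of the indexing described above, which applies to this
// high-bit-depth arm as well: both the padding and the filter routine are
// handed tmp + 2 * tmpstride + 8, i.e. the block's upper-left sits two
// padding rows into the scratch buffer plus 8 u16 entries (16 bytes), which
// preserves the 16-byte alignment noted in the 8-bit path.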
(Some(pad), Some(func)) => { let h = if ydec == 1 { 4 } else { 8 }; let tmpstride = if xdec == 1 { 8 } else { 16 } as isize; const MAXTMPSTRIDE: isize = 16; const TMPSIZE: usize = (12 * MAXTMPSTRIDE + 8) as usize; let mut tmp: Aligned<[u16; TMPSIZE]> = Aligned::new([CDEF_VERY_LARGE; TMPSIZE]); let top = src.offset(-2 * src_stride); let bottom = src.offset(h as isize * src_stride); let mut left: Aligned<[[u16; 2]; 8]> = Aligned::new([[0; 2]; 8]); if edges & CDEF_HAVE_LEFT != 0 { let mut wptr = src.offset(-2) as *const u16; for i in 0..h { left.data[i as usize][0] = *wptr; left.data[i as usize][1] = *wptr.add(1); wptr = wptr.offset(src_stride); } } (pad)( tmp.data.as_mut_ptr().offset(2 * tmpstride + 8) as *mut u16, src as *const u16, T::to_asm_stride(src_stride as usize), left.data.as_ptr() as *const [u16; 2], top as *const u16, bottom as *const u16, 8 >> ydec, edges.into(), ); (func)( dst.data_ptr_mut() as *mut u16, T::to_asm_stride(dst.plane_cfg.stride), tmp.data.as_ptr().offset(2 * tmpstride + 8) as *const u16, pri_strength, sec_strength, dir as i32, damping, 8 >> ydec, edges.into(), (1 << bit_depth) - 1, ); } _ => call_rust(dst), } } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } extern { fn rav1e_cdef_filter4_8bpc_neon( dst: *mut u8, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, ); fn rav1e_cdef_padding4_8bpc_neon( tmp: *mut u16, src: *const u8, src_stride: isize, left: *const [u8; 2], top: *const u8, bottom: *const u8, h: i32, edges: isize, ); fn rav1e_cdef_filter8_8bpc_neon( dst: *mut u8, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, ); fn rav1e_cdef_padding8_8bpc_neon( tmp: *mut u16, src: *const u8, src_stride: isize, left: *const [u8; 2], top: *const u8, bottom: *const u8, h: i32, edges: isize, ); fn rav1e_cdef_filter4_16bpc_neon( dst: *mut u16, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, bd: i32, ); fn rav1e_cdef_padding4_16bpc_neon( tmp: *mut u16, src: *const u16, src_stride: isize, left: *const [u16; 2], top: *const u16, bottom: *const u16, h: i32, edges: isize, ); fn rav1e_cdef_filter8_16bpc_neon( dst: *mut u16, dst_stride: isize, tmp: *const u16, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, h: i32, edges: isize, bd: i32, ); fn rav1e_cdef_padding8_16bpc_neon( tmp: *mut u16, src: *const u16, src_stride: isize, left: *const [u16; 2], top: *const u16, bottom: *const u16, h: i32, edges: isize, ); } static CDEF_PAD_FNS_NEON: [Option; 4] = { let mut out: [Option; 4] = [None; 4]; out[decimate_index(1, 1)] = Some(rav1e_cdef_padding4_8bpc_neon); out[decimate_index(1, 0)] = Some(rav1e_cdef_padding4_8bpc_neon); out[decimate_index(0, 0)] = Some(rav1e_cdef_padding8_8bpc_neon); out }; static CDEF_FILTER_FNS_NEON: [Option; 4] = { let mut out: [Option; 4] = [None; 4]; out[decimate_index(1, 1)] = Some(rav1e_cdef_filter4_8bpc_neon); out[decimate_index(1, 0)] = Some(rav1e_cdef_filter4_8bpc_neon); out[decimate_index(0, 0)] = Some(rav1e_cdef_filter8_8bpc_neon); out }; static CDEF_PAD_HBD_FNS_NEON: [Option; 4] = { let mut out: [Option; 4] = [None; 4]; out[decimate_index(1, 1)] = Some(rav1e_cdef_padding4_16bpc_neon); out[decimate_index(1, 0)] = Some(rav1e_cdef_padding4_16bpc_neon); out[decimate_index(0, 0)] 
= Some(rav1e_cdef_padding8_16bpc_neon); out }; static CDEF_FILTER_HBD_FNS_NEON: [Option; 4] = { let mut out: [Option; 4] = [None; 4]; out[decimate_index(1, 1)] = Some(rav1e_cdef_filter4_16bpc_neon); out[decimate_index(1, 0)] = Some(rav1e_cdef_filter4_16bpc_neon); out[decimate_index(0, 0)] = Some(rav1e_cdef_filter8_16bpc_neon); out }; cpu_function_lookup_table!( CDEF_PAD_FNS: [[Option; 4]], default: [None; 4], [NEON] ); cpu_function_lookup_table!( CDEF_FILTER_FNS: [[Option; 4]], default: [None; 4], [NEON] ); cpu_function_lookup_table!( CDEF_PAD_HBD_FNS: [[Option; 4]], default: [None; 4], [NEON] ); cpu_function_lookup_table!( CDEF_FILTER_HBD_FNS: [[Option; 4]], default: [None; 4], [NEON] ); type CdefDirLBDFn = unsafe extern fn(tmp: *const u8, tmp_stride: isize, var: *mut u32) -> i32; type CdefDirHBDFn = unsafe extern fn( tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, ) -> i32; #[inline(always)] #[allow(clippy::let_and_return)] pub(crate) fn cdef_find_dir( img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize, cpu: CpuFeatureLevel, ) -> i32 { let call_rust = |var: &mut u32| rust::cdef_find_dir::(img, var, coeff_shift, cpu); #[cfg(feature = "check_asm")] let (ref_dir, ref_var) = { let mut var: u32 = 0; let dir = call_rust(&mut var); (dir, var) }; let dir = match T::type_enum() { PixelType::U8 => { if let Some(func) = CDEF_DIR_LBD_FNS[cpu.as_index()] { unsafe { (func)( img.as_ptr() as *const _, T::to_asm_stride(img.plane.cfg.stride), var as *mut u32, ) } } else { call_rust(var) } } PixelType::U16 if coeff_shift > 0 => { if let Some(func) = CDEF_DIR_HBD_FNS[cpu.as_index()] { unsafe { (func)( img.as_ptr() as *const _, T::to_asm_stride(img.plane.cfg.stride), var as *mut u32, (1 << (coeff_shift + 8)) - 1, ) } } else { call_rust(var) } } _ => call_rust(var), }; #[cfg(feature = "check_asm")] { assert_eq!(dir, ref_dir); assert_eq!(*var, ref_var); } dir } extern { fn rav1e_cdef_find_dir_8bpc_neon( tmp: *const u8, tmp_stride: isize, var: *mut u32, ) -> i32; } extern { fn rav1e_cdef_find_dir_16bpc_neon( tmp: *const u16, tmp_stride: isize, var: *mut u32, max_bitdepth: i32, ) -> i32; } cpu_function_lookup_table!( CDEF_DIR_LBD_FNS: [Option], default: None, [(NEON, Some(rav1e_cdef_find_dir_8bpc_neon))] ); cpu_function_lookup_table!( CDEF_DIR_HBD_FNS: [Option], default: None, [(NEON, Some(rav1e_cdef_find_dir_16bpc_neon))] ); rav1e-0.7.1/src/asm/aarch64/dist/cdef_dist.rs000064400000000000000000000133531046102023000167100ustar 00000000000000// Copyright (c) 2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
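// Overview of the wrappers in this file: each NEON kernel reports three
// values through `ret_ptr`, which the wrapper reads as source variance,
// destination variance and the SSE between the two blocks, and then combines
// with `apply_ssim_boost(sse, svar, dvar, bit_depth)`; when no kernel exists
// for the current CPU level or block size, the `rust::` fallback is used.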
use crate::activity::apply_ssim_boost; use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::tiling::PlaneRegion; use crate::util::Pixel; use crate::util::PixelType; type CdefDistKernelFn = unsafe extern fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ret_ptr: *mut u32, ); type CdefDistKernelHBDFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ret_ptr: *mut u32, ); extern { fn rav1e_cdef_dist_kernel_4x4_neon( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_4x8_neon( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_8x4_neon( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_8x8_neon( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_4x4_hbd_neon( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_4x8_hbd_neon( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_8x4_hbd_neon( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ret_ptr: *mut u32, ); fn rav1e_cdef_dist_kernel_8x8_hbd_neon( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ret_ptr: *mut u32, ); } /// # Panics /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[allow(clippy::let_and_return)] pub fn cdef_dist_kernel( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { debug_assert!(src.plane_cfg.xdec == 0); debug_assert!(src.plane_cfg.ydec == 0); debug_assert!(dst.plane_cfg.xdec == 0); debug_assert!(dst.plane_cfg.ydec == 0); // Limit kernel to 8x8 debug_assert!(w <= 8); debug_assert!(h <= 8); let call_rust = || -> u32 { rust::cdef_dist_kernel(dst, src, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let (svar, dvar, sse) = match T::type_enum() { PixelType::U8 => { if let Some(func) = CDEF_DIST_KERNEL_FNS[cpu.as_index()][kernel_fn_index(w, h)] { let mut ret_buf = [0u32; 3]; // SAFETY: Calls Assembly code. unsafe { func( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ret_buf.as_mut_ptr(), ) } (ret_buf[0], ret_buf[1], ret_buf[2]) } else { return call_rust(); } } PixelType::U16 => { if let Some(func) = CDEF_DIST_KERNEL_HBD_FNS[cpu.as_index()][kernel_fn_index(w, h)] { let mut ret_buf = [0u32; 3]; // SAFETY: Calls Assembly code. unsafe { func( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ret_buf.as_mut_ptr(), ) } (ret_buf[0], ret_buf[1], ret_buf[2]) } else { return call_rust(); } } }; let dist = apply_ssim_boost(sse, svar, dvar, bit_depth); #[cfg(feature = "check_asm")] assert_eq!( dist, ref_dist, "CDEF Distortion {}x{}: Assembly doesn't match reference code.", w, h ); dist } /// Store functions in a 8x8 grid. Most will be empty. 
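// Index arithmetic for that grid: kernel_fn_index(w, h) below packs the pair
// as ((w - 1) << 3) | (h - 1), so 4x4 -> 27, 4x8 -> 31, 8x4 -> 59 and
// 8x8 -> 63; only those four slots of the 64-entry table are ever populated
// and the remaining entries stay None.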
const CDEF_DIST_KERNEL_FNS_LENGTH: usize = 8 * 8; const fn kernel_fn_index(w: usize, h: usize) -> usize { ((w - 1) << 3) | (h - 1) } static CDEF_DIST_KERNEL_FNS_NEON: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = { let mut out: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = [None; CDEF_DIST_KERNEL_FNS_LENGTH]; out[kernel_fn_index(4, 4)] = Some(rav1e_cdef_dist_kernel_4x4_neon); out[kernel_fn_index(4, 8)] = Some(rav1e_cdef_dist_kernel_4x8_neon); out[kernel_fn_index(8, 4)] = Some(rav1e_cdef_dist_kernel_8x4_neon); out[kernel_fn_index(8, 8)] = Some(rav1e_cdef_dist_kernel_8x8_neon); out }; cpu_function_lookup_table!( CDEF_DIST_KERNEL_FNS: [[Option; CDEF_DIST_KERNEL_FNS_LENGTH]], default: [None; CDEF_DIST_KERNEL_FNS_LENGTH], [NEON] ); static CDEF_DIST_KERNEL_HBD_FNS_NEON: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = { let mut out: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = [None; CDEF_DIST_KERNEL_FNS_LENGTH]; out[kernel_fn_index(4, 4)] = Some(rav1e_cdef_dist_kernel_4x4_hbd_neon); out[kernel_fn_index(4, 8)] = Some(rav1e_cdef_dist_kernel_4x8_hbd_neon); out[kernel_fn_index(8, 4)] = Some(rav1e_cdef_dist_kernel_8x4_hbd_neon); out[kernel_fn_index(8, 8)] = Some(rav1e_cdef_dist_kernel_8x8_hbd_neon); out }; cpu_function_lookup_table!( CDEF_DIST_KERNEL_HBD_FNS: [[Option; CDEF_DIST_KERNEL_FNS_LENGTH]], default: [None; CDEF_DIST_KERNEL_FNS_LENGTH], [NEON] ); rav1e-0.7.1/src/asm/aarch64/dist/mod.rs000064400000000000000000000421441046102023000155430ustar 00000000000000// Copyright (c) 2020-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub use self::cdef_dist::*; pub use self::sse::*; use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::partition::BlockSize; use crate::tiling::*; use crate::util::*; mod cdef_dist; mod sse; type SadFn = unsafe extern fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ) -> u32; type SadHbdFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32; type SatdFn = SadFn; type SatdHbdFn = SadHbdFn; macro_rules! 
declare_asm_dist_fn { ($(($name: ident, $T: ident)),+) => ( $( extern { fn $name ( src: *const $T, src_stride: isize, dst: *const $T, dst_stride: isize ) -> u32; } )+ ) } declare_asm_dist_fn![ (rav1e_sad4x4_neon, u8), (rav1e_sad4x8_neon, u8), (rav1e_sad4x16_neon, u8), (rav1e_sad8x4_neon, u8), (rav1e_sad8x8_neon, u8), (rav1e_sad8x16_neon, u8), (rav1e_sad8x32_neon, u8), (rav1e_sad16x4_neon, u8), (rav1e_sad16x8_neon, u8), (rav1e_sad16x16_neon, u8), (rav1e_sad16x32_neon, u8), (rav1e_sad16x64_neon, u8), (rav1e_sad32x8_neon, u8), (rav1e_sad32x16_neon, u8), (rav1e_sad32x32_neon, u8), (rav1e_sad32x64_neon, u8), (rav1e_sad64x16_neon, u8), (rav1e_sad64x32_neon, u8), (rav1e_sad64x64_neon, u8), (rav1e_sad64x128_neon, u8), (rav1e_sad128x64_neon, u8), (rav1e_sad128x128_neon, u8), /* SAD HBD */ (rav1e_sad4x4_hbd_neon, u16), (rav1e_sad4x8_hbd_neon, u16), (rav1e_sad4x16_hbd_neon, u16), (rav1e_sad8x4_hbd_neon, u16), (rav1e_sad8x8_hbd_neon, u16), (rav1e_sad8x16_hbd_neon, u16), (rav1e_sad8x32_hbd_neon, u16), (rav1e_sad16x4_hbd_neon, u16), (rav1e_sad16x8_hbd_neon, u16), (rav1e_sad16x16_hbd_neon, u16), (rav1e_sad16x32_hbd_neon, u16), (rav1e_sad16x64_hbd_neon, u16), (rav1e_sad32x8_hbd_neon, u16), (rav1e_sad32x16_hbd_neon, u16), (rav1e_sad32x32_hbd_neon, u16), (rav1e_sad32x64_hbd_neon, u16), (rav1e_sad64x16_hbd_neon, u16), (rav1e_sad64x32_hbd_neon, u16), (rav1e_sad64x64_hbd_neon, u16), (rav1e_sad64x128_hbd_neon, u16), (rav1e_sad128x64_hbd_neon, u16), (rav1e_sad128x128_hbd_neon, u16), /* SATD */ (rav1e_satd4x4_neon, u8), (rav1e_satd4x8_neon, u8), (rav1e_satd4x16_neon, u8), (rav1e_satd8x4_neon, u8), (rav1e_satd8x8_neon, u8), (rav1e_satd8x16_neon, u8), (rav1e_satd8x32_neon, u8), (rav1e_satd16x4_neon, u8), (rav1e_satd16x8_neon, u8), (rav1e_satd16x16_neon, u8), (rav1e_satd16x32_neon, u8), (rav1e_satd16x64_neon, u8), (rav1e_satd32x8_neon, u8), (rav1e_satd32x16_neon, u8), (rav1e_satd32x32_neon, u8), (rav1e_satd32x64_neon, u8), (rav1e_satd64x16_neon, u8), (rav1e_satd64x32_neon, u8), (rav1e_satd64x64_neon, u8), (rav1e_satd64x128_neon, u8), (rav1e_satd128x64_neon, u8), (rav1e_satd128x128_neon, u8), /* SATD HBD */ (rav1e_satd4x4_hbd_neon, u16), (rav1e_satd4x8_hbd_neon, u16), (rav1e_satd4x16_hbd_neon, u16), (rav1e_satd8x4_hbd_neon, u16), (rav1e_satd8x8_hbd_neon, u16), (rav1e_satd8x16_hbd_neon, u16), (rav1e_satd8x32_hbd_neon, u16), (rav1e_satd16x4_hbd_neon, u16), (rav1e_satd16x8_hbd_neon, u16), (rav1e_satd16x16_hbd_neon, u16), (rav1e_satd16x32_hbd_neon, u16), (rav1e_satd16x64_hbd_neon, u16), (rav1e_satd32x8_hbd_neon, u16), (rav1e_satd32x16_hbd_neon, u16), (rav1e_satd32x32_hbd_neon, u16), (rav1e_satd32x64_hbd_neon, u16), (rav1e_satd64x16_hbd_neon, u16), (rav1e_satd64x32_hbd_neon, u16), (rav1e_satd64x64_hbd_neon, u16), (rav1e_satd64x128_hbd_neon, u16), (rav1e_satd128x64_hbd_neon, u16), (rav1e_satd128x128_hbd_neon, u16) ]; // BlockSize::BLOCK_SIZES.next_power_of_two(); const DIST_FNS_LENGTH: usize = 32; #[inline] const fn to_index(bsize: BlockSize) -> usize { bsize as usize & (DIST_FNS_LENGTH - 1) } #[inline(always)] #[allow(clippy::let_and_return)] pub fn get_sad( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { rust::get_sad(src, dst, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SAD_FNS[cpu.as_index()][to_index(bsize)] 
{ Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SAD_HBD_FNS[cpu.as_index()][to_index(bsize)] { Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!(dist, ref_dist); dist } #[inline(always)] #[allow(clippy::let_and_return)] pub fn get_satd( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { rust::get_satd(src, dst, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SATD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SATD_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!(dist, ref_dist); dist } static SAD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad4x4_neon); out[BLOCK_4X8 as usize] = Some(rav1e_sad4x8_neon); out[BLOCK_4X16 as usize] = Some(rav1e_sad4x16_neon); out[BLOCK_8X4 as usize] = Some(rav1e_sad8x4_neon); out[BLOCK_8X8 as usize] = Some(rav1e_sad8x8_neon); out[BLOCK_8X16 as usize] = Some(rav1e_sad8x16_neon); out[BLOCK_8X32 as usize] = Some(rav1e_sad8x32_neon); out[BLOCK_16X4 as usize] = Some(rav1e_sad16x4_neon); out[BLOCK_16X8 as usize] = Some(rav1e_sad16x8_neon); out[BLOCK_16X16 as usize] = Some(rav1e_sad16x16_neon); out[BLOCK_16X32 as usize] = Some(rav1e_sad16x32_neon); out[BLOCK_16X64 as usize] = Some(rav1e_sad16x64_neon); out[BLOCK_32X8 as usize] = Some(rav1e_sad32x8_neon); out[BLOCK_32X16 as usize] = Some(rav1e_sad32x16_neon); out[BLOCK_32X32 as usize] = Some(rav1e_sad32x32_neon); out[BLOCK_32X64 as usize] = Some(rav1e_sad32x64_neon); out[BLOCK_64X16 as usize] = Some(rav1e_sad64x16_neon); out[BLOCK_64X32 as usize] = Some(rav1e_sad64x32_neon); out[BLOCK_64X64 as usize] = Some(rav1e_sad64x64_neon); out[BLOCK_64X128 as usize] = Some(rav1e_sad64x128_neon); out[BLOCK_128X64 as usize] = Some(rav1e_sad128x64_neon); out[BLOCK_128X128 as usize] = Some(rav1e_sad128x128_neon); out }; static SAD_HBD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad4x4_hbd_neon); out[BLOCK_4X8 as usize] = Some(rav1e_sad4x8_hbd_neon); out[BLOCK_4X16 as usize] = Some(rav1e_sad4x16_hbd_neon); out[BLOCK_8X4 as usize] = Some(rav1e_sad8x4_hbd_neon); out[BLOCK_8X8 as usize] = Some(rav1e_sad8x8_hbd_neon); out[BLOCK_8X16 as usize] = Some(rav1e_sad8x16_hbd_neon); out[BLOCK_8X32 as usize] = 
Some(rav1e_sad8x32_hbd_neon); out[BLOCK_16X4 as usize] = Some(rav1e_sad16x4_hbd_neon); out[BLOCK_16X8 as usize] = Some(rav1e_sad16x8_hbd_neon); out[BLOCK_16X16 as usize] = Some(rav1e_sad16x16_hbd_neon); out[BLOCK_16X32 as usize] = Some(rav1e_sad16x32_hbd_neon); out[BLOCK_16X64 as usize] = Some(rav1e_sad16x64_hbd_neon); out[BLOCK_32X8 as usize] = Some(rav1e_sad32x8_hbd_neon); out[BLOCK_32X16 as usize] = Some(rav1e_sad32x16_hbd_neon); out[BLOCK_32X32 as usize] = Some(rav1e_sad32x32_hbd_neon); out[BLOCK_32X64 as usize] = Some(rav1e_sad32x64_hbd_neon); out[BLOCK_64X16 as usize] = Some(rav1e_sad64x16_hbd_neon); out[BLOCK_64X32 as usize] = Some(rav1e_sad64x32_hbd_neon); out[BLOCK_64X64 as usize] = Some(rav1e_sad64x64_hbd_neon); out[BLOCK_64X128 as usize] = Some(rav1e_sad64x128_hbd_neon); out[BLOCK_128X64 as usize] = Some(rav1e_sad128x64_hbd_neon); out[BLOCK_128X128 as usize] = Some(rav1e_sad128x128_hbd_neon); out }; static SATD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_satd4x4_neon); out[BLOCK_4X8 as usize] = Some(rav1e_satd4x8_neon); out[BLOCK_4X16 as usize] = Some(rav1e_satd4x16_neon); out[BLOCK_8X4 as usize] = Some(rav1e_satd8x4_neon); out[BLOCK_16X4 as usize] = Some(rav1e_satd16x4_neon); out[BLOCK_8X8 as usize] = Some(rav1e_satd8x8_neon); out[BLOCK_8X16 as usize] = Some(rav1e_satd8x16_neon); out[BLOCK_8X32 as usize] = Some(rav1e_satd8x32_neon); out[BLOCK_16X8 as usize] = Some(rav1e_satd16x8_neon); out[BLOCK_16X16 as usize] = Some(rav1e_satd16x16_neon); out[BLOCK_16X32 as usize] = Some(rav1e_satd16x32_neon); out[BLOCK_16X64 as usize] = Some(rav1e_satd16x64_neon); out[BLOCK_32X8 as usize] = Some(rav1e_satd32x8_neon); out[BLOCK_32X16 as usize] = Some(rav1e_satd32x16_neon); out[BLOCK_32X32 as usize] = Some(rav1e_satd32x32_neon); out[BLOCK_32X64 as usize] = Some(rav1e_satd32x64_neon); out[BLOCK_64X16 as usize] = Some(rav1e_satd64x16_neon); out[BLOCK_64X32 as usize] = Some(rav1e_satd64x32_neon); out[BLOCK_64X64 as usize] = Some(rav1e_satd64x64_neon); out[BLOCK_64X128 as usize] = Some(rav1e_satd64x128_neon); out[BLOCK_128X64 as usize] = Some(rav1e_satd128x64_neon); out[BLOCK_128X128 as usize] = Some(rav1e_satd128x128_neon); out }; static SATD_HBD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_satd4x4_hbd_neon); out[BLOCK_4X8 as usize] = Some(rav1e_satd4x8_hbd_neon); out[BLOCK_4X16 as usize] = Some(rav1e_satd4x16_hbd_neon); out[BLOCK_8X4 as usize] = Some(rav1e_satd8x4_hbd_neon); out[BLOCK_16X4 as usize] = Some(rav1e_satd16x4_hbd_neon); out[BLOCK_8X8 as usize] = Some(rav1e_satd8x8_hbd_neon); out[BLOCK_8X16 as usize] = Some(rav1e_satd8x16_hbd_neon); out[BLOCK_8X32 as usize] = Some(rav1e_satd8x32_hbd_neon); out[BLOCK_16X8 as usize] = Some(rav1e_satd16x8_hbd_neon); out[BLOCK_16X16 as usize] = Some(rav1e_satd16x16_hbd_neon); out[BLOCK_16X32 as usize] = Some(rav1e_satd16x32_hbd_neon); out[BLOCK_16X64 as usize] = Some(rav1e_satd16x64_hbd_neon); out[BLOCK_32X8 as usize] = Some(rav1e_satd32x8_hbd_neon); out[BLOCK_32X16 as usize] = Some(rav1e_satd32x16_hbd_neon); out[BLOCK_32X32 as usize] = Some(rav1e_satd32x32_hbd_neon); out[BLOCK_32X64 as usize] = Some(rav1e_satd32x64_hbd_neon); out[BLOCK_64X16 as usize] = Some(rav1e_satd64x16_hbd_neon); out[BLOCK_64X32 as usize] = Some(rav1e_satd64x32_hbd_neon); out[BLOCK_64X64 as usize] = Some(rav1e_satd64x64_hbd_neon); out[BLOCK_64X128 as 
usize] = Some(rav1e_satd64x128_hbd_neon); out[BLOCK_128X64 as usize] = Some(rav1e_satd128x64_hbd_neon); out[BLOCK_128X128 as usize] = Some(rav1e_satd128x128_hbd_neon); out }; cpu_function_lookup_table!( SAD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); cpu_function_lookup_table!( SAD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); cpu_function_lookup_table!( SATD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); cpu_function_lookup_table!( SATD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); #[cfg(test)] mod test { use super::*; use crate::frame::{AsRegion, Plane}; use rand::random; use std::str::FromStr; macro_rules! test_dist_fns { ($(($W:expr, $H:expr)),*, $DIST_TY:ident, $BD:expr, $OPT:ident, $OPTLIT:tt) => { $( paste::item! { #[test] fn []() { if $BD > 8 { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u16; $W * $H], $W); // dynamic allocation: test let mut dst = Plane::from_slice(&vec![0u16; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::() as u16 * $BD / 8; *d = random::() as u16 * $BD / 8; } let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } else { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u8; $W * $H], $W); // dynamic allocation: test let mut dst = Plane::from_slice(&vec![0u8; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::(); *d = random::(); } let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } } } )* } } test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), sad, 8, neon, "neon" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), sad, 10, neon, "neon" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), sad, 12, neon, "neon" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), satd, 8, neon, "neon" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), satd, 10, neon, "neon" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), satd, 12, neon, "neon" ); } 
rav1e-0.7.1/src/asm/aarch64/dist/sse.rs000064400000000000000000000215071046102023000155560ustar 00000000000000// Copyright (c) 2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::asm::aarch64::dist::to_index; use crate::asm::aarch64::dist::DIST_FNS_LENGTH; use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::encoder::IMPORTANCE_BLOCK_SIZE; use crate::partition::BlockSize; use crate::rdo::DistortionScale; use crate::tiling::PlaneRegion; use crate::util::*; type WeightedSseFn = unsafe extern fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, scale: *const u32, scale_stride: isize, ) -> u64; type WeightedSseHBDFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, scale: *const u32, scale_stride: isize, ) -> u64; macro_rules! declare_asm_sse_fn { ($($name: ident),+) => ( $( extern { fn $name ( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, scale: *const u32, scale_stride: isize ) -> u64; } )+ ) } macro_rules! declare_asm_hbd_sse_fn { ($($name: ident),+) => ( $( extern { fn $name ( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, scale: *const u32, scale_stride: isize ) -> u64; } )+ ) } declare_asm_sse_fn![ rav1e_weighted_sse_4x4_neon, rav1e_weighted_sse_4x8_neon, rav1e_weighted_sse_4x16_neon, rav1e_weighted_sse_8x4_neon, rav1e_weighted_sse_8x8_neon, rav1e_weighted_sse_8x16_neon, rav1e_weighted_sse_8x32_neon, rav1e_weighted_sse_16x4_neon, rav1e_weighted_sse_16x8_neon, rav1e_weighted_sse_16x16_neon, rav1e_weighted_sse_16x32_neon, rav1e_weighted_sse_16x64_neon, rav1e_weighted_sse_32x8_neon, rav1e_weighted_sse_32x16_neon, rav1e_weighted_sse_32x32_neon, rav1e_weighted_sse_32x64_neon, rav1e_weighted_sse_64x16_neon, rav1e_weighted_sse_64x32_neon, rav1e_weighted_sse_64x64_neon, rav1e_weighted_sse_64x128_neon, rav1e_weighted_sse_128x64_neon, rav1e_weighted_sse_128x128_neon ]; declare_asm_hbd_sse_fn![ rav1e_weighted_sse_4x4_hbd_neon, rav1e_weighted_sse_4x8_hbd_neon, rav1e_weighted_sse_4x16_hbd_neon, rav1e_weighted_sse_8x4_hbd_neon, rav1e_weighted_sse_8x8_hbd_neon, rav1e_weighted_sse_8x16_hbd_neon, rav1e_weighted_sse_8x32_hbd_neon, rav1e_weighted_sse_16x4_hbd_neon, rav1e_weighted_sse_16x8_hbd_neon, rav1e_weighted_sse_16x16_hbd_neon, rav1e_weighted_sse_16x32_hbd_neon, rav1e_weighted_sse_16x64_hbd_neon, rav1e_weighted_sse_32x8_hbd_neon, rav1e_weighted_sse_32x16_hbd_neon, rav1e_weighted_sse_32x32_hbd_neon, rav1e_weighted_sse_32x64_hbd_neon, rav1e_weighted_sse_64x16_hbd_neon, rav1e_weighted_sse_64x32_hbd_neon, rav1e_weighted_sse_64x64_hbd_neon, rav1e_weighted_sse_64x128_hbd_neon, rav1e_weighted_sse_128x64_hbd_neon, rav1e_weighted_sse_128x128_hbd_neon ]; /// # Panics /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[inline(always)] #[allow(clippy::let_and_return)] pub fn get_weighted_sse( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, scale: &[u32], scale_stride: usize, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u64 { // Assembly breaks if imp block size changes. 
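// More precisely: the kernels are written against a weight granularity of
// IMPORTANCE_BLOCK_SIZE >> 1 == 4 pixels (one `scale` entry per 4x4 chunk),
// so the assert below turns any future change of that constant into an
// immediate failure instead of silently mis-weighting the SSE.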
assert_eq!(IMPORTANCE_BLOCK_SIZE >> 1, 4); let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u64 { rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let den = DistortionScale::new(1, 1 << rust::GET_WEIGHTED_SSE_SHIFT).0 as u64; let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SSE_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { ((func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), scale.as_ptr(), (scale_stride * std::mem::size_of::()) as isize, ) + (den >> 1)) / den }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SSE_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { ((func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), scale.as_ptr(), (scale_stride * std::mem::size_of::()) as isize, ) + (den >> 1)) / den }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!( dist, ref_dist, "Weighted SSE {:?}: Assembly doesn't match reference code.", bsize_opt ); dist } static SSE_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_weighted_sse_4x4_neon); out[BLOCK_4X8 as usize] = Some(rav1e_weighted_sse_4x8_neon); out[BLOCK_4X16 as usize] = Some(rav1e_weighted_sse_4x16_neon); out[BLOCK_8X4 as usize] = Some(rav1e_weighted_sse_8x4_neon); out[BLOCK_8X8 as usize] = Some(rav1e_weighted_sse_8x8_neon); out[BLOCK_8X16 as usize] = Some(rav1e_weighted_sse_8x16_neon); out[BLOCK_8X32 as usize] = Some(rav1e_weighted_sse_8x32_neon); out[BLOCK_16X4 as usize] = Some(rav1e_weighted_sse_16x4_neon); out[BLOCK_16X8 as usize] = Some(rav1e_weighted_sse_16x8_neon); out[BLOCK_16X16 as usize] = Some(rav1e_weighted_sse_16x16_neon); out[BLOCK_16X32 as usize] = Some(rav1e_weighted_sse_16x32_neon); out[BLOCK_16X64 as usize] = Some(rav1e_weighted_sse_16x64_neon); out[BLOCK_32X8 as usize] = Some(rav1e_weighted_sse_32x8_neon); out[BLOCK_32X16 as usize] = Some(rav1e_weighted_sse_32x16_neon); out[BLOCK_32X32 as usize] = Some(rav1e_weighted_sse_32x32_neon); out[BLOCK_32X64 as usize] = Some(rav1e_weighted_sse_32x64_neon); out[BLOCK_64X16 as usize] = Some(rav1e_weighted_sse_64x16_neon); out[BLOCK_64X32 as usize] = Some(rav1e_weighted_sse_64x32_neon); out[BLOCK_64X64 as usize] = Some(rav1e_weighted_sse_64x64_neon); out[BLOCK_64X128 as usize] = Some(rav1e_weighted_sse_64x128_neon); out[BLOCK_128X64 as usize] = Some(rav1e_weighted_sse_128x64_neon); out[BLOCK_128X128 as usize] = Some(rav1e_weighted_sse_128x128_neon); out }; static SSE_HBD_FNS_NEON: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_weighted_sse_4x4_hbd_neon); out[BLOCK_4X8 as usize] = Some(rav1e_weighted_sse_4x8_hbd_neon); out[BLOCK_4X16 as usize] = Some(rav1e_weighted_sse_4x16_hbd_neon); out[BLOCK_8X4 as usize] = Some(rav1e_weighted_sse_8x4_hbd_neon); out[BLOCK_8X8 as usize] = Some(rav1e_weighted_sse_8x8_hbd_neon); out[BLOCK_8X16 as usize] = Some(rav1e_weighted_sse_8x16_hbd_neon); out[BLOCK_8X32 as usize] = Some(rav1e_weighted_sse_8x32_hbd_neon); out[BLOCK_16X4 as usize] = 
Some(rav1e_weighted_sse_16x4_hbd_neon); out[BLOCK_16X8 as usize] = Some(rav1e_weighted_sse_16x8_hbd_neon); out[BLOCK_16X16 as usize] = Some(rav1e_weighted_sse_16x16_hbd_neon); out[BLOCK_16X32 as usize] = Some(rav1e_weighted_sse_16x32_hbd_neon); out[BLOCK_16X64 as usize] = Some(rav1e_weighted_sse_16x64_hbd_neon); out[BLOCK_32X8 as usize] = Some(rav1e_weighted_sse_32x8_hbd_neon); out[BLOCK_32X16 as usize] = Some(rav1e_weighted_sse_32x16_hbd_neon); out[BLOCK_32X32 as usize] = Some(rav1e_weighted_sse_32x32_hbd_neon); out[BLOCK_32X64 as usize] = Some(rav1e_weighted_sse_32x64_hbd_neon); out[BLOCK_64X16 as usize] = Some(rav1e_weighted_sse_64x16_hbd_neon); out[BLOCK_64X32 as usize] = Some(rav1e_weighted_sse_64x32_hbd_neon); out[BLOCK_64X64 as usize] = Some(rav1e_weighted_sse_64x64_hbd_neon); out[BLOCK_64X128 as usize] = Some(rav1e_weighted_sse_64x128_hbd_neon); out[BLOCK_128X64 as usize] = Some(rav1e_weighted_sse_128x64_hbd_neon); out[BLOCK_128X128 as usize] = Some(rav1e_weighted_sse_128x128_hbd_neon); out }; cpu_function_lookup_table!( SSE_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); cpu_function_lookup_table!( SSE_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [NEON] ); rav1e-0.7.1/src/asm/aarch64/mc.rs000064400000000000000000000331441046102023000144200ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
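// Dispatch overview for this file: the put/prep/avg wrappers pick a NEON
// routine from 16-entry tables indexed by get_2d_mode_idx(mode_x, mode_y)
// (mode_x + 4 * mode_y, masked to 4 bits), and fall back to the rust::
// implementations whenever a slot is None or the pixel type / bit depth has
// no matching entry.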
use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::mc::FilterMode::*; use crate::mc::*; use crate::tiling::*; use crate::util::*; type PutFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PutHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); type PrepFn = unsafe extern fn( tmp: *mut i16, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PrepHBDFn = unsafe extern fn( tmp: *mut i16, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); type AvgFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, tmp1: *const i16, tmp2: *const i16, width: i32, height: i32, ); type AvgHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, tmp1: *const i16, tmp2: *const i16, width: i32, height: i32, bitdepth_max: i32, ); // gets an index that can be mapped to a function for a pair of filter modes #[inline] const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize { (mode_x as usize + 4 * (mode_y as usize)) & 15 } #[inline(always)] pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { rust::put_8tap( dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, cpu, ); }; #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; unsafe { // SAFETY: The assembly only supports even heights and valid uncropped // widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height + 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => { match PUT_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(dst), } } PixelType::U16 if bit_depth > 8 => { match PUT_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(dst), } } _ => call_rust(dst), } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } #[inline(always)] pub fn prep_8tap( tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2 <= width || width <= 128)); let call_rust = |tmp: &mut [i16]| { 
rust::prep_8tap( tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, cpu, ); }; #[cfg(feature = "check_asm")] let ref_tmp = { let mut copy = vec![0; width * height]; copy[..].copy_from_slice(&tmp[..width * height]); call_rust(&mut copy); copy }; unsafe { // SAFETY: The assembly only supports even heights and valid uncropped // widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check length of tmp assert!(tmp.len() >= width * height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height + 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => { match PREP_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( tmp.as_mut_ptr(), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(tmp), } } PixelType::U16 if bit_depth > 8 => { match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( tmp.as_mut_ptr() as *mut _, src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(tmp), } } _ => call_rust(tmp), } } #[cfg(feature = "check_asm")] { assert_eq!(&tmp[..width * height], &ref_tmp[..]); } } pub fn mc_avg( dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, height: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2 <= width || width <= 128)); let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { rust::mc_avg(dst, tmp1, tmp2, width, height, bit_depth, cpu); }; #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; unsafe { // SAFETY: The assembly only supports even heights and valid uncropped // widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check length of tmp1 & tmp2 assert!(tmp1.len() >= width * height); assert!(tmp2.len() >= width * height); match T::type_enum() { PixelType::U8 => match AVG_FNS[cpu.as_index()] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), tmp1.as_ptr(), tmp2.as_ptr(), width as i32, height as i32, ), None => call_rust(dst), }, PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), tmp1.as_ptr(), tmp2.as_ptr(), width as i32, height as i32, (1 << bit_depth) - 1, ), None => call_rust(dst), }, _ => call_rust(dst), } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } macro_rules! 
decl_mc_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern { $( fn $func_name( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, ); )* } static PUT_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mc_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular_8bpc_neon), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth_8bpc_neon), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp_8bpc_neon), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular_8bpc_neon), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth_8bpc_neon), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp_8bpc_neon), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular_8bpc_neon), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth_8bpc_neon), (SHARP, SHARP, rav1e_put_8tap_sharp_8bpc_neon), (BILINEAR, BILINEAR, rav1e_put_bilin_8bpc_neon) ); cpu_function_lookup_table!( PUT_FNS: [[Option; 16]], default: [None; 16], [NEON] ); macro_rules! decl_mc_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern { $( fn $func_name( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PUT_HBD_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mc_hbd_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular_16bpc_neon), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth_16bpc_neon), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp_16bpc_neon), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular_16bpc_neon), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth_16bpc_neon), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp_16bpc_neon), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular_16bpc_neon), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth_16bpc_neon), (SHARP, SHARP, rav1e_put_8tap_sharp_16bpc_neon), (BILINEAR, BILINEAR, rav1e_put_bilin_16bpc_neon) ); cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], [NEON] ); macro_rules! decl_mct_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern { $( fn $func_name( tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32, ); )* } static PREP_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mct_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular_8bpc_neon), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth_8bpc_neon), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp_8bpc_neon), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular_8bpc_neon), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth_8bpc_neon), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp_8bpc_neon), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular_8bpc_neon), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth_8bpc_neon), (SHARP, SHARP, rav1e_prep_8tap_sharp_8bpc_neon), (BILINEAR, BILINEAR, rav1e_prep_bilin_8bpc_neon) ); cpu_function_lookup_table!( PREP_FNS: [[Option; 16]], default: [None; 16], [NEON] ); macro_rules! 
decl_mct_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { extern { $( fn $func_name( tmp: *mut i16, src: *const u16, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PREP_HBD_FNS_NEON: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some($func_name); )* out }; } } decl_mct_hbd_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular_16bpc_neon), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth_16bpc_neon), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp_16bpc_neon), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular_16bpc_neon), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth_16bpc_neon), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp_16bpc_neon), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular_16bpc_neon), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth_16bpc_neon), (SHARP, SHARP, rav1e_prep_8tap_sharp_16bpc_neon), (BILINEAR, BILINEAR, rav1e_prep_bilin_16bpc_neon) ); cpu_function_lookup_table!( PREP_HBD_FNS: [[Option; 16]], default: [None; 16], [NEON] ); extern { fn rav1e_avg_8bpc_neon( dst: *mut u8, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, ); } cpu_function_lookup_table!( AVG_FNS: [Option], default: None, [(NEON, Some(rav1e_avg_8bpc_neon))] ); extern { fn rav1e_avg_16bpc_neon( dst: *mut u16, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, bitdepth_max: i32, ); } cpu_function_lookup_table!( AVG_HBD_FNS: [Option], default: None, [(NEON, Some(rav1e_avg_16bpc_neon))] ); rav1e-0.7.1/src/asm/aarch64/mod.rs000064400000000000000000000011331046102023000145710ustar 00000000000000// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod cdef; pub mod dist; pub mod mc; pub mod predict; pub mod transform; rav1e-0.7.1/src/asm/aarch64/predict.rs000064400000000000000000000555741046102023000154660ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::context::MAX_TX_SIZE; use crate::cpu_features::CpuFeatureLevel; use crate::partition::{BlockSize, IntraEdge}; use crate::predict::rust::{ dr_intra_derivative, select_ief_strength, select_ief_upsample, }; use crate::predict::{ rust, IntraEdgeFilterParameters, PredictionMode, PredictionVariant, }; use crate::tiling::{PlaneRegion, PlaneRegionMut}; use crate::transform::TxSize; use crate::{Pixel, PixelType}; use libc; use libc::{c_int, ptrdiff_t}; use std::mem::MaybeUninit; use PixelType::{U16, U8}; macro_rules! 
decl_cfl_ac_fn { ($($f:ident),+) => { extern { $( fn $f( ac: *mut MaybeUninit, src: *const u8, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); )* } }; } decl_cfl_ac_fn! { rav1e_ipred_cfl_ac_420_8bpc_neon, rav1e_ipred_cfl_ac_422_8bpc_neon, rav1e_ipred_cfl_ac_444_8bpc_neon } macro_rules! decl_cfl_ac_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( ac: *mut MaybeUninit, src: *const u16, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); )* } }; } decl_cfl_ac_hbd_fn! { rav1e_ipred_cfl_ac_420_16bpc_neon, rav1e_ipred_cfl_ac_422_16bpc_neon, rav1e_ipred_cfl_ac_444_16bpc_neon } macro_rules! decl_angular_ipred_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, ); )* } }; } decl_angular_ipred_fn! { rav1e_ipred_dc_8bpc_neon, rav1e_ipred_dc_128_8bpc_neon, rav1e_ipred_dc_left_8bpc_neon, rav1e_ipred_dc_top_8bpc_neon, rav1e_ipred_v_8bpc_neon, rav1e_ipred_h_8bpc_neon, rav1e_ipred_smooth_8bpc_neon, rav1e_ipred_smooth_v_8bpc_neon, rav1e_ipred_smooth_h_8bpc_neon, rav1e_ipred_paeth_8bpc_neon } macro_rules! decl_angular_ipred_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, angle: libc::c_int, max_width: libc::c_int, max_height: libc::c_int, bit_depth_max: libc::c_int, ); )* } }; } decl_angular_ipred_hbd_fn! { rav1e_ipred_dc_16bpc_neon, rav1e_ipred_dc_128_16bpc_neon, rav1e_ipred_dc_left_16bpc_neon, rav1e_ipred_dc_top_16bpc_neon, rav1e_ipred_v_16bpc_neon, rav1e_ipred_h_16bpc_neon, rav1e_ipred_smooth_16bpc_neon, rav1e_ipred_smooth_v_16bpc_neon, rav1e_ipred_smooth_h_16bpc_neon, rav1e_ipred_paeth_16bpc_neon } macro_rules! decl_cfl_pred_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, ); )* } }; } decl_cfl_pred_fn! { rav1e_ipred_cfl_8bpc_neon, rav1e_ipred_cfl_128_8bpc_neon, rav1e_ipred_cfl_left_8bpc_neon, rav1e_ipred_cfl_top_8bpc_neon } macro_rules! decl_cfl_pred_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, bit_depth_max: libc::c_int, ); )* } }; } decl_cfl_pred_hbd_fn! 
{ rav1e_ipred_cfl_16bpc_neon, rav1e_ipred_cfl_128_16bpc_neon, rav1e_ipred_cfl_left_16bpc_neon, rav1e_ipred_cfl_top_16bpc_neon } extern { fn rav1e_ipred_z1_upsample_edge_8bpc_neon( out: *mut u8, hsz: c_int, _in: *const u8, end: c_int, ); fn rav1e_ipred_z1_upsample_edge_16bpc_neon( out: *mut u16, hsz: c_int, _in: *const u16, end: c_int, bit_depth_max: c_int, ); fn rav1e_ipred_z1_filter_edge_8bpc_neon( out: *mut u8, sz: c_int, _in: *const u8, end: c_int, strength: c_int, ); fn rav1e_ipred_z1_filter_edge_16bpc_neon( out: *mut u16, sz: c_int, _in: *const u16, end: c_int, strength: c_int, ); fn rav1e_ipred_z1_fill1_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, top: *const u8, width: c_int, height: c_int, dx: c_int, max_base_x: c_int, ); fn rav1e_ipred_z1_fill1_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, top: *const u16, width: c_int, height: c_int, dx: c_int, max_base_x: c_int, ); fn rav1e_ipred_z1_fill2_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, top: *const u8, width: c_int, height: c_int, dx: c_int, max_base_x: c_int, ); fn rav1e_ipred_z1_fill2_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, top: *const u16, width: c_int, height: c_int, dx: c_int, max_base_x: c_int, ); fn rav1e_ipred_z2_upsample_edge_8bpc_neon( out: *mut u8, sz: c_int, _in: *const u8, ); fn rav1e_ipred_z2_upsample_edge_16bpc_neon( out: *mut u16, sz: c_int, _in: *const u16, bit_depth_max: c_int, ); fn rav1e_ipred_z2_fill1_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, top: *const u8, left: *const u8, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z2_fill1_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, top: *const u16, left: *const u16, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z2_fill2_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, top: *const u8, left: *const u8, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z2_fill2_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, top: *const u16, left: *const u16, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z2_fill3_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, top: *const u8, left: *const u8, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z2_fill3_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, top: *const u16, left: *const u16, width: c_int, height: c_int, dx: c_int, dy: c_int, ); fn rav1e_ipred_z3_fill1_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, left: *const u8, width: c_int, height: c_int, dy: c_int, max_base_y: c_int, ); fn rav1e_ipred_z3_fill1_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, left: *const u16, width: c_int, height: c_int, dy: c_int, max_base_y: c_int, ); fn rav1e_ipred_z3_fill2_8bpc_neon( dst: *mut u8, stride: ptrdiff_t, left: *const u8, width: c_int, height: c_int, dy: c_int, max_base_y: c_int, ); fn rav1e_ipred_z3_fill2_16bpc_neon( dst: *mut u16, stride: ptrdiff_t, left: *const u16, width: c_int, height: c_int, dy: c_int, max_base_y: c_int, ); fn rav1e_ipred_reverse_8bpc_neon(dst: *mut u8, src: *const u8, n: c_int); fn rav1e_ipred_reverse_16bpc_neon(dst: *mut u16, src: *const u16, n: c_int); } #[inline] unsafe fn ipred_z1_upsample_edge( o: *mut T, sz: c_int, i: *const T, n: c_int, m: c_int, ) { match T::type_enum() { U8 => rav1e_ipred_z1_upsample_edge_8bpc_neon(o as _, sz, i as _, n), U16 => rav1e_ipred_z1_upsample_edge_16bpc_neon(o as _, sz, i as _, n, m), } } #[inline] unsafe fn ipred_z2_upsample_edge( o: *mut T, sz: c_int, i: *const T, m: c_int, ) { match T::type_enum() { U8 => rav1e_ipred_z2_upsample_edge_8bpc_neon(o as _, sz, i as _), U16 => 
rav1e_ipred_z2_upsample_edge_16bpc_neon(o as _, sz, i as _, m), } } #[inline] unsafe fn ipred_z1_filter_edge( o: *mut T, sz: c_int, i: *const T, n: c_int, s: c_int, ) { match T::type_enum() { U8 => rav1e_ipred_z1_filter_edge_8bpc_neon(o as _, sz, i as _, n, s), U16 => rav1e_ipred_z1_filter_edge_16bpc_neon(o as _, sz, i as _, n, s), } } #[inline] unsafe fn ipred_reverse(o: *mut T, i: *const T, n: c_int) { match T::type_enum() { U8 => rav1e_ipred_reverse_8bpc_neon(o as _, i as _, n), U16 => rav1e_ipred_reverse_16bpc_neon(o as _, i as _, n), } } #[rustfmt::skip] struct Fill( [unsafe extern fn(*mut u8, ptrdiff_t, *const u8, c_int, c_int, c_int, c_int); 2], [unsafe extern fn(*mut u16, ptrdiff_t, *const u16, c_int, c_int, c_int, c_int); 2], ); impl Fill { unsafe fn ipred_fill( self, dst: *mut T, stride: ptrdiff_t, src: *const T, w: c_int, h: c_int, d: c_int, max_base: c_int, upsample: bool, ) { let u = upsample as usize; match T::type_enum() { U8 => self.0[u](dst as _, stride, src as _, w, h, d, max_base), U16 => self.1[u](dst as _, stride, src as _, w, h, d, max_base), } } } const Z1: Fill = Fill( [rav1e_ipred_z1_fill1_8bpc_neon, rav1e_ipred_z1_fill2_8bpc_neon], [rav1e_ipred_z1_fill1_16bpc_neon, rav1e_ipred_z1_fill2_16bpc_neon], ); const Z3: Fill = Fill( [rav1e_ipred_z3_fill1_8bpc_neon, rav1e_ipred_z3_fill2_8bpc_neon], [rav1e_ipred_z3_fill1_16bpc_neon, rav1e_ipred_z3_fill2_16bpc_neon], ); #[rustfmt::skip] struct Fill2( [unsafe extern fn(*mut u8, ptrdiff_t, *const u8, *const u8, c_int, c_int, c_int, c_int); 3], [unsafe extern fn(*mut u16, ptrdiff_t, *const u16, *const u16, c_int, c_int, c_int, c_int); 3], ); impl Fill2 { unsafe fn ipred_fill( self, dst: *mut T, stride: ptrdiff_t, top: *const T, left: *const T, w: c_int, h: c_int, dx: c_int, dy: c_int, upsample_above: bool, upsample_left: bool, ) { let u = if upsample_left { 2 } else { upsample_above as usize }; match T::type_enum() { U8 => self.0[u](dst as _, stride, top as _, left as _, w, h, dx, dy), U16 => self.1[u](dst as _, stride, top as _, left as _, w, h, dx, dy), } } } const Z2: Fill2 = Fill2( [ rav1e_ipred_z2_fill1_8bpc_neon, rav1e_ipred_z2_fill2_8bpc_neon, rav1e_ipred_z2_fill3_8bpc_neon, ], [ rav1e_ipred_z2_fill1_16bpc_neon, rav1e_ipred_z2_fill2_16bpc_neon, rav1e_ipred_z2_fill3_16bpc_neon, ], ); unsafe fn ipred_z1( dst: *mut T, stride: ptrdiff_t, src: *const T, angle: isize, w: c_int, h: c_int, bd_max: c_int, edge_filter: bool, smooth_filter: bool, ) { let mut dx = dr_intra_derivative(angle as _) as c_int; let mut out = [MaybeUninit::::uninit(); MAX_TX_SIZE * 4 + 15 * 2 + 16]; let out = out.as_mut_ptr() as *mut T; let upsample_above = edge_filter && select_ief_upsample(w as _, h as _, smooth_filter, 90 - angle); let max_base_x = if upsample_above { ipred_z1_upsample_edge(out, w + h, src, w + w.min(h), bd_max); dx <<= 1; 2 * (w + h) - 2 } else { let strength = select_ief_strength(w as _, h as _, smooth_filter, 90 - angle) as c_int; if strength != 0 { ipred_z1_filter_edge(out, w + h, src, w + w.min(h), strength); w + h - 1 } else { let n = w + w.min(h); out.copy_from_nonoverlapping(src.add(1), n as usize); n - 1 } }; let base_inc = 1 + upsample_above as c_int; let pad_pixels = w + 15; let fill_pixel = out.add(max_base_x as usize).read(); let base = out.add(max_base_x as usize + 1); for i in 0..(pad_pixels * base_inc) as usize { base.add(i).write(fill_pixel); } Z1.ipred_fill(dst, stride, out, w, h, dx, max_base_x, upsample_above); } unsafe fn ipred_z2( dst: *mut T, stride: ptrdiff_t, src: *const T, angle: isize, w: c_int, h: c_int, 
max_w: c_int, max_h: c_int, bd_max: c_int, edge_filter: bool, smooth_filter: bool, ) { assert!(angle > 90 && angle < 180); let mut dx = dr_intra_derivative((180 - angle) as _) as c_int; let mut dy = dr_intra_derivative((angle - 90) as _) as c_int; let us_left = edge_filter && select_ief_upsample(w as _, h as _, smooth_filter, 180 - angle); let us_above = edge_filter && select_ief_upsample(w as _, h as _, smooth_filter, angle - 90); let mut out = [MaybeUninit::::uninit(); 3 * (MAX_TX_SIZE * 4 + 1)]; let out = out.as_mut_ptr() as *mut T; let left = out.add(2 * (64 + 1)); let top = out.add(64 + 1); let flipped = out; if us_above { ipred_z2_upsample_edge(top, w, src, bd_max); dx <<= 1; } else { let strength = select_ief_strength(w as _, h as _, smooth_filter, angle - 90) as c_int; if edge_filter && strength != 0 { ipred_z1_filter_edge(top.add(1), max_w.min(w), src, w, strength); if max_w < w { top.add((1 + max_w) as _).copy_from_nonoverlapping( src.add((1 + max_w) as _), (w - max_w) as _, ); } } else { top.add(1).copy_from_nonoverlapping(src.add(1), w as _); } } if us_left { flipped.write(src.read()); ipred_reverse(flipped.add(1), src, h); ipred_z2_upsample_edge(left, h, flipped, bd_max); dy <<= 1; } else { let strength = select_ief_strength(w as _, h as _, smooth_filter, 180 - angle) as c_int; if edge_filter && strength != 0 { flipped.write(src.read()); ipred_reverse(flipped.add(1), src, h); ipred_z1_filter_edge(left.add(1), max_h.min(h), flipped, h, strength); if max_h < h { left.add((1 + max_h) as _).copy_from_nonoverlapping( flipped.add((1 + max_h) as _), (h - max_h) as _, ); } } else { ipred_reverse(left.add(1), src, h); } } let top_left = src.read(); top.write(top_left); left.write(top_left); assert!(!(us_above && us_left)); Z2.ipred_fill(dst, stride, top, left, w, h, dx, dy, us_above, us_left); } unsafe fn ipred_z3( dst: *mut T, stride: ptrdiff_t, src: *const T, angle: isize, w: c_int, h: c_int, bd_max: c_int, edge_filter: bool, smooth_filter: bool, ) { assert!(angle > 180); let mut dy = dr_intra_derivative(270 - angle as usize) as c_int; let mut tmp = [MaybeUninit::::uninit(); MAX_TX_SIZE * 4 + 16]; let mut out = [MaybeUninit::::uninit(); MAX_TX_SIZE * 8 + 15 * 2]; let out = out.as_mut_ptr() as *mut T; let tmp = tmp.as_mut_ptr() as *mut T; let upsample_left = edge_filter && select_ief_upsample(w as _, h as _, smooth_filter, angle - 180); let max_base_y = if upsample_left { tmp.write(src.read()); ipred_reverse(tmp.add(1), src, h + w.max(h)); ipred_z1_upsample_edge(out, w + h, tmp, h + w.min(h), bd_max); dy <<= 1; 2 * (w + h) - 2 } else { let strength = select_ief_strength(w as _, h as _, smooth_filter, angle - 180) as c_int; if strength != 0 { tmp.write(src.read()); ipred_reverse(tmp.add(1), src, h + w.max(h)); ipred_z1_filter_edge(out, w + h, tmp, h + w.min(h), strength); w + h - 1 } else { let n = w + w.min(h); ipred_reverse(out, src, n); n - 1 } }; let base_inc = 1 + upsample_left as c_int; let pad_pixels = (h + 15).max(64 - max_base_y - 1); let fill_pixel = out.add(max_base_y as usize).read(); let base = out.add(max_base_y as usize + 1); for i in 0..(pad_pixels * base_inc) as usize { base.add(i).write(fill_pixel); } Z3.ipred_fill(dst, stride, out, w, h, dy, max_base_y, upsample_left); } #[inline(always)] pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, ac: &[i16], angle: isize, ief_params: Option, edge_buf: &IntraEdge, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut 
PlaneRegionMut<'_, T>| { rust::dispatch_predict_intra( mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf, cpu, ); }; if cpu < CpuFeatureLevel::NEON { return call_rust(dst); } unsafe { let dst_ptr = dst.data_ptr_mut() as *mut _; let dst_u16 = dst.data_ptr_mut() as *mut u16; let stride = T::to_asm_stride(dst.plane_cfg.stride) as libc::ptrdiff_t; let edge_ptr = edge_buf.top_left_ptr() as *const _; let edge_u16 = edge_buf.top_left_ptr() as *const u16; let w = tx_size.width() as libc::c_int; let h = tx_size.height() as libc::c_int; let angle = angle as libc::c_int; let bd_max = (1 << bit_depth) - 1; match T::type_enum() { PixelType::U8 => match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_8bpc_neon, PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_neon, PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_neon, PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_neon, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_8bpc_neon(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_8bpc_neon(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_8bpc_neon(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_8bpc_neon( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_8bpc_neon( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_8bpc_neon(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_neon, PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_neon, PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_neon, PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_neon, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), }, PixelType::U16 if bit_depth > 8 => match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_neon, PredictionVariant::LEFT => rav1e_ipred_dc_left_16bpc_neon, PredictionVariant::TOP => rav1e_ipred_dc_top_16bpc_neon, PredictionVariant::BOTH => rav1e_ipred_dc_16bpc_neon, })(dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::H_PRED | PredictionMode::V_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let edge_filter = ief_params.is_some(); let smooth_filter = ief_params .map(IntraEdgeFilterParameters::use_smooth_filter) .unwrap_or_default(); if (90..=180).contains(&angle) { // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); return ipred_z2( dst.data_ptr_mut(), stride, edge_buf.top_left_ptr(), angle as isize, w, h, dx, dy, bd_max, 
edge_filter, smooth_filter, ); } (if angle < 90 { ipred_z1 } else { ipred_z3 })( dst.data_ptr_mut(), stride, edge_buf.top_left_ptr(), angle as isize, w, h, bd_max, edge_filter, smooth_filter, ); } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_16bpc_neon( dst_u16, stride, edge_u16, w, h, angle, 0, 0, bd_max, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_16bpc_neon, PredictionVariant::LEFT => rav1e_ipred_cfl_left_16bpc_neon, PredictionVariant::TOP => rav1e_ipred_cfl_top_16bpc_neon, PredictionVariant::BOTH => rav1e_ipred_cfl_16bpc_neon, })(dst_u16, stride, edge_u16, w, h, ac_ptr, angle, bd_max); } _ => call_rust(dst), }, _ => call_rust(dst), } } } /// It MUST initialize all `ac` elements. #[inline(always)] pub(crate) fn pred_cfl_ac( ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel, ) { debug_assert_eq!(ac.len(), bsize.area()); if cpu < CpuFeatureLevel::NEON { return rust::pred_cfl_ac::( ac, luma, bsize, w_pad, h_pad, cpu, ); } let stride = T::to_asm_stride(luma.plane_cfg.stride) as libc::ptrdiff_t; let w = bsize.width() as libc::c_int; let h = bsize.height() as libc::c_int; let w_pad = w_pad as libc::c_int; let h_pad = h_pad as libc::c_int; // SAFETY: Calls Assembly code. unsafe { let ac_ptr = ac.as_mut_ptr(); match T::type_enum() { PixelType::U8 => { let luma_ptr = luma.data_ptr() as *const u8; (match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_8bpc_neon, (1, 0) => rav1e_ipred_cfl_ac_422_8bpc_neon, _ => rav1e_ipred_cfl_ac_420_8bpc_neon, })(ac_ptr, luma_ptr, stride, w_pad, h_pad, w, h) } PixelType::U16 => { let luma_ptr = luma.data_ptr() as *const u16; (match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_16bpc_neon, (1, 0) => rav1e_ipred_cfl_ac_422_16bpc_neon, _ => rav1e_ipred_cfl_ac_420_16bpc_neon, })(ac_ptr, luma_ptr, stride, w_pad, h_pad, w, h) } } } } rav1e-0.7.1/src/asm/aarch64/transform/forward.rs000064400000000000000000000335651046102023000175070ustar 00000000000000// Copyright (c) 2019-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
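// Unlike the modules above, which bind external assembly, the forward
// transform below is written with NEON intrinsics. `I32X8` wraps an
// `int32x4x2_t` so that eight 32-bit coefficients (two 128-bit halves) are
// handled as one unit, and the shared 1-D transform kernels are instantiated
// over it through the `TxOperations` trait. `forward_transform` only takes
// this path when the CPU feature level includes NEON and defers to the
// `rust` implementation otherwise.
//
// Illustration of the lane layout (a sketch, not code from this file):
//
//   eight coefficients:  c0 c1 c2 c3 c4 c5 c6 c7
//   I32X8.vec().0     =  [c0 c1 c2 c3]   (int32x4_t)
//   I32X8.vec().1     =  [c4 c5 c6 c7]   (int32x4_t)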
use crate::asm::shared::transform::forward::*; use crate::cpu_features::CpuFeatureLevel; use crate::transform::forward::rust; use crate::transform::forward_shared::*; use crate::transform::*; use crate::util::*; use std::mem::MaybeUninit; use debug_unreachable::debug_unreachable; use core::arch::aarch64::*; #[derive(Clone, Copy)] #[repr(transparent)] struct I32X8(int32x4x2_t); impl I32X8 { #[inline] const unsafe fn vec(self) -> int32x4x2_t { self.0 } #[inline] const unsafe fn raw(a: int32x4x2_t) -> I32X8 { Self(a) } #[inline] const unsafe fn new(a: int32x4_t, b: int32x4_t) -> I32X8 { Self(int32x4x2_t(a, b)) } } type TxfmFunc = unsafe fn(&mut [I32X8]); impl_1d_tx!(allow(unused_attributes), unsafe); impl TxOperations for I32X8 { #[inline] unsafe fn zero() -> Self { let zero = vdupq_n_s32(0); I32X8::new(zero, zero) } #[inline] unsafe fn tx_mul(self, mul: i32) -> Self { I32X8::new( vrshrq_n_s32(vmulq_n_s32(self.vec().0, mul), SHIFT), vrshrq_n_s32(vmulq_n_s32(self.vec().1, mul), SHIFT), ) } #[inline] unsafe fn rshift1(self) -> Self { I32X8::new( vhsubq_s32( self.vec().0, vreinterpretq_s32_u32(vcltzq_s32(self.vec().0)), ), vhsubq_s32( self.vec().1, vreinterpretq_s32_u32(vcltzq_s32(self.vec().1)), ), ) } #[inline] unsafe fn add(self, b: Self) -> Self { I32X8::new( vaddq_s32(self.vec().0, b.vec().0), vaddq_s32(self.vec().1, b.vec().1), ) } #[inline] unsafe fn sub(self, b: Self) -> Self { I32X8::new( vsubq_s32(self.vec().0, b.vec().0), vsubq_s32(self.vec().1, b.vec().1), ) } #[inline] unsafe fn add_avg(self, b: Self) -> Self { I32X8::new( vhaddq_s32(self.vec().0, b.vec().0), vhaddq_s32(self.vec().1, b.vec().1), ) } #[inline] unsafe fn sub_avg(self, b: Self) -> Self { I32X8::new( vhsubq_s32(self.vec().0, b.vec().0), vhsubq_s32(self.vec().1, b.vec().1), ) } } #[inline] unsafe fn vreinterpretq_u32_s32_x2(a: int32x4x2_t) -> uint32x4x2_t { uint32x4x2_t(vreinterpretq_u32_s32(a.0), vreinterpretq_u32_s32(a.1)) } #[inline] unsafe fn vtrnq_s64_to_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t { let a = vreinterpretq_s64_s32(a); let b = vreinterpretq_s64_s32(b); int32x4x2_t( vreinterpretq_s32_s64(vtrn1q_s64(a, b)), vreinterpretq_s32_s64(vtrn2q_s64(a, b)), ) } #[inline] unsafe fn transpose_8x8_neon( input: &[I32X8; 8], into: &mut [MaybeUninit; 8], ) { let stage1 = ( vtrnq_s32(input[0].vec().0, input[1].vec().0), vtrnq_s32(input[2].vec().0, input[3].vec().0), vtrnq_s32(input[4].vec().0, input[5].vec().0), vtrnq_s32(input[6].vec().0, input[7].vec().0), vtrnq_s32(input[0].vec().1, input[1].vec().1), vtrnq_s32(input[2].vec().1, input[3].vec().1), vtrnq_s32(input[4].vec().1, input[5].vec().1), vtrnq_s32(input[6].vec().1, input[7].vec().1), ); let stage2 = ( vtrnq_s64_to_s32(stage1.0 .0, stage1.1 .0), vtrnq_s64_to_s32(stage1.0 .1, stage1.1 .1), vtrnq_s64_to_s32(stage1.2 .0, stage1.3 .0), vtrnq_s64_to_s32(stage1.2 .1, stage1.3 .1), vtrnq_s64_to_s32(stage1.4 .0, stage1.5 .0), vtrnq_s64_to_s32(stage1.4 .1, stage1.5 .1), vtrnq_s64_to_s32(stage1.6 .0, stage1.7 .0), vtrnq_s64_to_s32(stage1.6 .1, stage1.7 .1), ); into[0].write(I32X8::new(stage2.0 .0, stage2.2 .0)); into[1].write(I32X8::new(stage2.1 .0, stage2.3 .0)); into[2].write(I32X8::new(stage2.0 .1, stage2.2 .1)); into[3].write(I32X8::new(stage2.1 .1, stage2.3 .1)); into[4].write(I32X8::new(stage2.4 .0, stage2.6 .0)); into[5].write(I32X8::new(stage2.5 .0, stage2.7 .0)); into[6].write(I32X8::new(stage2.4 .1, stage2.6 .1)); into[7].write(I32X8::new(stage2.5 .1, stage2.7 .1)); } #[inline] unsafe fn transpose_8x4_neon( input: &[I32X8; 8], into: &mut [MaybeUninit; 4], ) { 
let stage1 = ( vtrnq_s32(input[0].vec().0, input[1].vec().0), vtrnq_s32(input[2].vec().0, input[3].vec().0), vtrnq_s32(input[4].vec().0, input[5].vec().0), vtrnq_s32(input[6].vec().0, input[7].vec().0), ); let stage2 = ( vtrnq_s64_to_s32(stage1.0 .0, stage1.1 .0), vtrnq_s64_to_s32(stage1.0 .1, stage1.1 .1), vtrnq_s64_to_s32(stage1.2 .0, stage1.3 .0), vtrnq_s64_to_s32(stage1.2 .1, stage1.3 .1), ); into[0].write(I32X8::new(stage2.0 .0, stage2.2 .0)); into[1].write(I32X8::new(stage2.1 .0, stage2.3 .0)); into[2].write(I32X8::new(stage2.0 .1, stage2.2 .1)); into[3].write(I32X8::new(stage2.1 .1, stage2.3 .1)); } #[inline] unsafe fn transpose_4x8_neon( input: &[I32X8; 4], into: &mut [MaybeUninit; 8], ) { let stage1 = ( vtrnq_s32(input[0].vec().0, input[1].vec().0), vtrnq_s32(input[2].vec().0, input[3].vec().0), vtrnq_s32(input[0].vec().1, input[1].vec().1), vtrnq_s32(input[2].vec().1, input[3].vec().1), ); let stage2 = ( vtrnq_s64_to_s32(stage1.0 .0, stage1.1 .0), vtrnq_s64_to_s32(stage1.0 .1, stage1.1 .1), vtrnq_s64_to_s32(stage1.2 .0, stage1.3 .0), vtrnq_s64_to_s32(stage1.2 .1, stage1.3 .1), ); into[0].write(I32X8::raw(stage2.0)); into[1].write(I32X8::raw(stage2.1)); into[2].write(I32X8::new(stage2.0 .1, stage2.0 .0)); into[3].write(I32X8::new(stage2.1 .1, stage2.1 .0)); into[4].write(I32X8::raw(stage2.2)); into[5].write(I32X8::raw(stage2.3)); into[6].write(I32X8::new(stage2.2 .1, stage2.2 .0)); into[7].write(I32X8::new(stage2.3 .1, stage2.3 .0)); } #[inline] unsafe fn transpose_4x4_neon( input: &[I32X8; 4], into: &mut [MaybeUninit; 4], ) { let stage1 = ( vtrnq_s32(input[0].vec().0, input[1].vec().0), vtrnq_s32(input[2].vec().0, input[3].vec().0), ); let stage2 = ( vtrnq_s64_to_s32(stage1.0 .0, stage1.1 .0), vtrnq_s64_to_s32(stage1.0 .1, stage1.1 .1), ); into[0].write(I32X8::raw(stage2.0)); into[1].write(I32X8::raw(stage2.1)); into[2].write(I32X8::new(stage2.0 .1, stage2.0 .0)); into[3].write(I32X8::new(stage2.1 .1, stage2.1 .0)); } #[inline] unsafe fn shift_left_neon(a: I32X8, shift: u8) -> I32X8 { let shift = vdupq_n_s32(shift.into()); I32X8::new(vrshlq_s32(a.vec().0, shift), vrshlq_s32(a.vec().1, shift)) } #[inline] unsafe fn shift_right_neon(a: I32X8) -> I32X8 { I32X8::new(vrshrq_n_s32(a.vec().0, SHIFT), vrshrq_n_s32(a.vec().1, SHIFT)) } #[inline] unsafe fn round_shift_array_neon(arr: &mut [I32X8], bit: i8) { if arr.len() % 4 != 0 { debug_unreachable!(); } if bit == 0 { return; } if bit > 0 { if bit == 1 { for s in arr.chunks_exact_mut(4) { for chunk in s { *chunk = shift_right_neon::<1>(*chunk) } } } else if bit == 2 { for s in arr.chunks_exact_mut(4) { for chunk in s { *chunk = shift_right_neon::<2>(*chunk) } } } else { debug_unreachable!(); } } else { let shift = (-bit) as u8; for s in arr.chunks_exact_mut(4) { for chunk in s { *chunk = shift_left_neon(*chunk, shift); } } } } #[allow(clippy::identity_op, clippy::erasing_op)] unsafe fn forward_transform_neon( input: &[i16], output: &mut [MaybeUninit], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, ) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to // accurately perform rectangular transforms. When the transform is // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. 
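  // For example, a TX_8X16 block has txfm_size_col == 8 (its width) and
  // txfm_size_row == 16 (its height): the column pass runs 8 transforms of
  // length 16 and the row pass runs 16 transforms of length 8.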
let txfm_size_col = tx_size.width(); let txfm_size_row = tx_size.height(); let col_class = SizeClass1D::from_length(txfm_size_col); let row_class = SizeClass1D::from_length(txfm_size_row); let mut buf = Aligned::new([MaybeUninit::::uninit(); 64 * 64 / 8]); let buf = &mut buf.data[..txfm_size_col * (txfm_size_row / 8).max(1)]; let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); let txfm_func_col = get_func(cfg.txfm_type_col); let txfm_func_row = get_func(cfg.txfm_type_row); // Columns for cg in (0..txfm_size_col).step_by(8) { let shift = cfg.shift[0] as u8; #[inline] unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 { // TODO: load 64-bits for x4 wide columns let a = vld1q_s16(input_ptr); shift_left_neon( I32X8::new(vmovl_s16(vget_low_s16(a)), vmovl_high_s16(a)), shift, ) } // Avoid zero initialization let col_coeffs = &mut [MaybeUninit::::uninit(); 64][..txfm_size_row]; if cfg.ud_flip { // flip upside down for (in_slice, out_reg) in input[cg..].chunks(stride).zip(col_coeffs.iter_mut().rev()) { *out_reg = MaybeUninit::new(load_columns(in_slice.as_ptr(), shift)); } } else { for (in_slice, out_reg) in input[cg..].chunks(stride).zip(col_coeffs.iter_mut()) { *out_reg = MaybeUninit::new(load_columns(in_slice.as_ptr(), shift)); } } let col_coeffs = slice_assume_init_mut(col_coeffs); txfm_func_col(col_coeffs); round_shift_array_neon(col_coeffs, -cfg.shift[1]); // Transpose the array. Select the appropriate method to do so. match (row_class, col_class) { (SizeClass1D::X8UP, SizeClass1D::X8UP) => { for rg in (0..txfm_size_row).step_by(8) { let buf = &mut buf[(rg / 8 * txfm_size_col) + cg..]; let buf = cast_mut::<8, _>(buf); let input = &col_coeffs[rg..]; let input = cast::<8, _>(input); transpose_8x8_neon(input, buf); } } (SizeClass1D::X8UP, SizeClass1D::X4) => { for rg in (0..txfm_size_row).step_by(8) { let buf = &mut buf[(rg / 8 * txfm_size_col) + cg..]; let buf = cast_mut::<4, _>(buf); let input = &col_coeffs[rg..]; let input = cast::<8, _>(input); transpose_8x4_neon(input, buf); } } (SizeClass1D::X4, SizeClass1D::X8UP) => { // Don't need to loop over rows let buf = &mut buf[cg..]; let buf = cast_mut::<8, _>(buf); let input = cast::<4, _>(col_coeffs); transpose_4x8_neon(input, buf); } (SizeClass1D::X4, SizeClass1D::X4) => { // Don't need to loop over rows let buf = cast_mut::<4, _>(buf); let input = cast::<4, _>(col_coeffs); transpose_4x4_neon(input, buf); } } } let buf = slice_assume_init_mut(buf); // Rows for rg in (0..txfm_size_row).step_by(8) { let row_coeffs = &mut buf[rg / 8 * txfm_size_col..][..txfm_size_col]; if cfg.lr_flip { row_coeffs.reverse(); } txfm_func_row(row_coeffs); round_shift_array_neon(row_coeffs, -cfg.shift[2]); // Write out the coefficients using the correct method for transforms of // this size. match row_class { SizeClass1D::X8UP => { // Store output in at most 32x32 chunks. See rust code for details. 
// Output is grouped into 32x32 chunks so a stride of at most 32 is // used for each chunk let output_stride = txfm_size_row.min(32); // Split the first 32 rows from the last 32 rows and offset by rg % 32 let output = &mut output[(rg & 31) + (rg >= 32) as usize * output_stride * txfm_size_col.min(32)..]; for cg in (0..txfm_size_col).step_by(32) { // Offset by zero or half of output let output = &mut output[txfm_size_row * cg..]; for c in 0..txfm_size_col.min(32) { match T::Pixel::type_enum() { PixelType::U8 => { let vec = row_coeffs[c + cg].vec(); vst1_u16_x2( output[c * output_stride..].as_mut_ptr() as *mut _, uint16x4x2_t( vreinterpret_u16_s16(vmovn_s32(vec.0)), vreinterpret_u16_s16(vmovn_s32(vec.1)), ), ); } PixelType::U16 => { vst1q_u32_x2( output[c * output_stride..].as_mut_ptr() as *mut _, vreinterpretq_u32_s32_x2(row_coeffs[c + cg].vec()), ); } } } } } SizeClass1D::X4 => { // Write out coefficients in normal order - it isn't possible to have // more than 32 rows. for c in 0..txfm_size_col { match T::Pixel::type_enum() { PixelType::U8 => { vst1_s16( output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _, vmovn_s32(row_coeffs[c].vec().0), ); } PixelType::U16 => { vst1q_s32( output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _, row_coeffs[c].vec().0, ); } } } } } } } /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` #[inline] pub fn forward_transform( input: &[i16], output: &mut [MaybeUninit], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); if cpu >= CpuFeatureLevel::NEON { // SAFETY: Calls Assembly code. unsafe { forward_transform_neon(input, output, stride, tx_size, tx_type, bd); } } else { rust::forward_transform(input, output, stride, tx_size, tx_type, bd, cpu); } } rav1e-0.7.1/src/asm/aarch64/transform/inverse.rs000064400000000000000000000240161046102023000175050ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; use crate::tiling::PlaneRegionMut; use crate::transform::inverse::*; use crate::transform::*; use crate::{Pixel, PixelType}; use crate::asm::shared::transform::inverse::*; pub fn inverse_transform_add( input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { match T::type_enum() { PixelType::U8 => { if let Some(func) = INV_TXFM_FNS[cpu.as_index()][tx_size][tx_type] { return call_inverse_func( func, input, output, eob, tx_size.width(), tx_size.height(), bd, ); } } PixelType::U16 if bd == 10 => { if let Some(func) = INV_TXFM_HBD_FNS[cpu.as_index()][tx_size][tx_type] { return call_inverse_hbd_func( func, input, output, eob, tx_size.width(), tx_size.height(), bd, ); } } PixelType::U16 => {} }; rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu); } macro_rules! 
decl_itx_fns { // Takes a 2d list of tx types for W and H ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr, $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { // For each tx type, declare an function for the current WxH $( $( extern { // Note: type1 and type2 are flipped fn []( dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32 ); } )* )* // Create a lookup table for the tx types declared above const []: [Option; TX_TYPES_PLUS_LL] = { let mut out: [Option; TX_TYPES_PLUS_LL] = [None; TX_TYPES_PLUS_LL]; $( $( out[$ENUM as usize] = Some([]); )* )* out }; } }; } macro_rules! decl_itx_hbd_fns { // Takes a 2d list of tx types for W and H ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr, $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { // For each tx type, declare an function for the current WxH $( $( extern { // Note: type1 and type2 are flipped fn []( dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32, bitdepth_max: i32, ); } )* )* // Create a lookup table for the tx types declared above const []: [Option; TX_TYPES_PLUS_LL] = { let mut out: [Option; TX_TYPES_PLUS_LL] = [None; TX_TYPES_PLUS_LL]; $( $( out[$ENUM as usize] = Some([]); )* )* out }; } }; } macro_rules! create_wxh_tables { // Create a lookup table for each cpu feature ([$([$(($W:expr, $H:expr)),*]),*], $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { const []: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = { let mut out: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]; // For each dimension, add an entry to the table $( $( out[TxSize::[] as usize] = []; )* )* out }; } }; // Loop through cpu features ($DIMS:tt, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( create_wxh_tables!($DIMS, $OPT_LOWER, $OPT_UPPER); )* }; } macro_rules! create_wxh_hbd_tables { // Create a lookup table for each cpu feature ([$([$(($W:expr, $H:expr)),*]),*], $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { const []: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = { let mut out: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]; // For each dimension, add an entry to the table $( $( out[TxSize::[] as usize] = []; )* )* out }; } }; // Loop through cpu features ($DIMS:tt, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( create_wxh_hbd_tables!($DIMS, $OPT_LOWER, $OPT_UPPER); )* }; } macro_rules! impl_itx_fns { ($TYPES:tt, $W:expr, $H:expr, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( decl_itx_fns!($TYPES, $W, $H, $OPT_LOWER, $OPT_UPPER); )* }; // Loop over a list of dimensions ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*], $OPT:tt) => { $( impl_itx_fns!($TYPES_VALID, $W, $H, $OPT); )* }; ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $OPT:tt) => { // Make 2d list of tx types for each set of dimensions. Each set of // dimensions uses a superset of the previous set of tx types. impl_itx_fns!([$TYPES64], $DIMS64, $OPT); impl_itx_fns!([$TYPES64, $TYPES32], $DIMS32, $OPT); impl_itx_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16, $OPT); impl_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT ); impl_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT ); // Pool all of the dimensions together to create a table for each cpu // feature level. 
create_wxh_tables!( [$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $OPT ); }; } impl_itx_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16 and 4x4) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)], [(neon, NEON)] ); cpu_function_lookup_table!( INV_TXFM_FNS: [[[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]], default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], [NEON] ); macro_rules! impl_itx_hbd_fns { ($TYPES:tt, $W:expr, $H:expr, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( decl_itx_hbd_fns!($TYPES, $W, $H, $OPT_LOWER, $OPT_UPPER); )* }; // Loop over a list of dimensions ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*], $OPT:tt) => { $( impl_itx_hbd_fns!($TYPES_VALID, $W, $H, $OPT); )* }; ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $OPT:tt) => { // Make 2d list of tx types for each set of dimensions. Each set of // dimensions uses a superset of the previous set of tx types. impl_itx_hbd_fns!([$TYPES64], $DIMS64, $OPT); impl_itx_hbd_fns!([$TYPES64, $TYPES32], $DIMS32, $OPT); impl_itx_hbd_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16, $OPT); impl_itx_hbd_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT ); impl_itx_hbd_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT ); // Pool all of the dimensions together to create a table for each cpu // feature level. create_wxh_hbd_tables!( [$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $OPT ); }; } impl_itx_hbd_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16 and 4x4) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)], [(neon, NEON)] ); cpu_function_lookup_table!( INV_TXFM_HBD_FNS: [[[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]], default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], [NEON] ); rav1e-0.7.1/src/asm/aarch64/transform/mod.rs000064400000000000000000000010611046102023000166040ustar 00000000000000// Copyright (c) 2019-2023, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod forward; pub mod inverse; rav1e-0.7.1/src/asm/mod.rs000064400000000000000000000012021046102023000133360ustar 00000000000000// Copyright (c) 2019, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #[cfg(nasm_x86_64)] pub mod x86; #[cfg(asm_neon)] pub mod aarch64; #[cfg(any(nasm_x86_64, asm_neon))] pub mod shared; rav1e-0.7.1/src/asm/shared/dist/cdef_dist.rs000064400000000000000000000105551046102023000167270ustar 00000000000000// Copyright (c) 2022-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #[cfg(test)] pub mod test { use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::frame::*; use crate::tiling::Area; use crate::util::Pixel; use rand::{thread_rng, Rng}; fn random_planes(bd: usize) -> (Plane, Plane) { let mut rng = thread_rng(); // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); for rows in input_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(rng.gen_range(0u16..(1 << bd))); } } for rows in rec_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(rng.gen_range(0u16..(1 << bd))); } } (input_plane, rec_plane) } /// Create planes with the max values for pixels. fn max_planes(bd: usize) -> (Plane, Plane) { // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); for rows in input_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from((1 << bd) - 1); } } for rows in rec_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from((1 << bd) - 1); } } (input_plane, rec_plane) } /// Create planes with the max difference between the two values. 
fn max_diff_planes(bd: usize) -> (Plane, Plane) { // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); for rows in input_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(0); } } for rows in rec_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from((1 << bd) - 1); } } (input_plane, rec_plane) } #[test] fn cdef_dist_simd_random() { cdef_diff_tester(8, random_planes::); } #[test] fn cdef_dist_simd_random_hbd() { cdef_diff_tester(10, random_planes::); cdef_diff_tester(12, random_planes::); } #[test] fn cdef_dist_simd_large() { cdef_diff_tester(8, max_planes::); } #[test] fn cdef_dist_simd_large_hbd() { cdef_diff_tester(10, max_planes::); cdef_diff_tester(12, max_planes::); } #[test] fn cdef_dist_simd_large_diff() { cdef_diff_tester(8, max_diff_planes::); } #[test] fn cdef_dist_simd_large_diff_hbd() { cdef_diff_tester(10, max_diff_planes::); cdef_diff_tester(12, max_diff_planes::); } fn cdef_diff_tester( bd: usize, gen_planes: fn(bd: usize) -> (Plane, Plane), ) { let (src_plane, dst_plane) = gen_planes(bd); let mut fail = false; for w in 1..=8 { for h in 1..=8 { // Test alignment by choosing starting location based on width. let area = Area::StartingAt { x: if w <= 4 { 4 } else { 8 }, y: 40 }; let src_region = src_plane.region(area); let dst_region = dst_plane.region(area); let rust = rust::cdef_dist_kernel( &src_region, &dst_region, w, h, bd, CpuFeatureLevel::default(), ); let simd = cdef_dist_kernel( &src_region, &dst_region, w, h, bd, CpuFeatureLevel::default(), ); if simd != rust { eprintln!( "CDEF Distortion {}x{}: Assembly doesn't match reference code \ \t {} (asm) != {} (ref)", w, h, simd, rust ); fail = true; } } if fail { panic!(); } } } } rav1e-0.7.1/src/asm/shared/dist/mod.rs000064400000000000000000000010571046102023000155570ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod cdef_dist; pub mod sse; rav1e-0.7.1/src/asm/shared/dist/sse.rs000064400000000000000000000121221046102023000155650ustar 00000000000000// Copyright (c) 2020-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
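// Test-only module: it compares the dispatched `get_weighted_sse` against the
// `rust` reference implementation for every block size, using random planes,
// maximum-difference planes and several distortion-scale fills, so that any
// mismatch in the SIMD kernels is caught at the runtime-detected CPU feature
// level. (Conceptually, a distortion scale is applied to the sum of squared
// errors over each sub-block; see `dist::rust` for the exact definition.)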
#[cfg(test)] pub mod test { use crate::config::CpuFeatureLevel; use crate::dist::rust; use crate::dist::*; use crate::frame::*; use crate::partition::BlockSize; use crate::rdo::DistortionScale; use crate::tiling::Area; use crate::util::*; use rand::{thread_rng, Rng}; fn random_planes(bd: usize) -> (Plane, Plane) { let mut rng = thread_rng(); // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); for rows in input_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(rng.gen_range(0u16..(1 << bd))); } } for rows in rec_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(rng.gen_range(0u16..(1 << bd))); } } (input_plane, rec_plane) } // Create planes with the max difference between the two values. fn max_diff_planes(bd: usize) -> (Plane, Plane) { // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); for rows in input_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from(0); } } for rows in rec_plane.as_region_mut().rows_iter_mut() { for c in rows { *c = T::cast_from((1 << bd) - 1); } } (input_plane, rec_plane) } /// Fill data for scaling of one (i.e. no scaling between blocks) fn scaling_one(scales: &mut [u32]) { for a in scales.iter_mut() { *a = DistortionScale::default().0; } } /// Fill data for scaling of one fn scaling_random(scales: &mut [u32]) { let mut rng = thread_rng(); for a in scales.iter_mut() { *a = rng .gen_range(DistortionScale::from(0.5).0..DistortionScale::from(1.5).0); } } /// Fill the max value for scaling /// TODO: Pair with max difference test fn scaling_large(scales: &mut [u32]) { for a in scales.iter_mut() { *a = DistortionScale::from(f64::MAX).0; } } #[test] fn weighted_sse_simd_no_scaling() { weighted_sse_simd_tester(8, scaling_one, random_planes::); } #[test] fn weighted_sse_simd_random() { weighted_sse_simd_tester(8, scaling_random, random_planes::); } #[test] fn weighted_sse_simd_large() { weighted_sse_simd_tester(8, scaling_large, max_diff_planes::); } #[test] fn weighted_sse_hbd_simd_no_scaling() { weighted_sse_simd_tester(12, scaling_one, random_planes::); } #[test] fn weighted_sse_hbd_simd_random() { weighted_sse_simd_tester(12, scaling_random, random_planes::); } #[test] fn weighted_sse_hbd_simd_large() { weighted_sse_simd_tester(12, scaling_large, max_diff_planes::); } fn weighted_sse_simd_tester( bd: usize, fill_scales: fn(scales: &mut [u32]), gen_planes: fn(bd: usize) -> (Plane, Plane), ) { use BlockSize::*; let blocks = vec![ BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, ]; const SCALE_STRIDE: usize = 256; let mut scaling_storage = Aligned::new([0u32; 256 * SCALE_STRIDE]); let scaling = &mut scaling_storage.data; fill_scales(scaling); let (input_plane, rec_plane) = gen_planes(bd); for block in blocks { // Start at block width to test alignment. 
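      // Using the block width as the x offset means narrower blocks start at
      // less-aligned addresses, so both the aligned and unaligned load paths
      // of the assembly are exercised.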
let area = Area::StartingAt { x: block.width() as isize, y: 40 }; let input_region = input_plane.region(area); let rec_region = rec_plane.region(area); let rust = rust::get_weighted_sse( &input_region, &rec_region, scaling, SCALE_STRIDE, block.width(), block.height(), bd, CpuFeatureLevel::default(), ); let simd = get_weighted_sse( &input_region, &rec_region, scaling, SCALE_STRIDE, block.width(), block.height(), bd, CpuFeatureLevel::default(), ); assert!( simd == rust, "Weighted SSE {}: Assembly doesn't match reference code. {} (asm) != {} (ref)", block, simd, rust, ); } } } rav1e-0.7.1/src/asm/shared/mod.rs000064400000000000000000000011011046102023000146020ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod dist; pub mod predict; pub mod transform; rav1e-0.7.1/src/asm/shared/predict.rs000064400000000000000000000172121046102023000154670ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
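// Editor's sketch (not part of the upstream crate): the test module below
// runs every SIMD intra predictor against the Rust reference through
// `dispatch_predict_intra` across modes, angles, and edge-filter parameters.
// As context for the simplest mode exercised there (DC_PRED with the BOTH
// variant), this hypothetical helper fills a block with the rounded average
// of the reference samples above and to the left. It only sketches the
// general idea of DC prediction; rav1e's real predictor also handles the
// TOP/LEFT/NONE variants, chroma-from-luma, and high bit depths.
#[allow(dead_code)]
fn dc_predict_sketch(
  out: &mut [u16], stride: usize, w: usize, h: usize, above: &[u16],
  left: &[u16],
) {
  // DC value: rounded mean of the w top samples and h left samples.
  let sum: u32 =
    above[..w].iter().chain(left[..h].iter()).map(|&v| u32::from(v)).sum();
  let n = (w + h) as u32;
  let dc = ((sum + n / 2) / n) as u16;
  // Fill the whole w x h block with that single value.
  for row in out.chunks_mut(stride).take(h) {
    for px in row[..w].iter_mut() {
      *px = dc;
    }
  }
}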
#[cfg(test)] mod test { use interpolate_name::interpolate_test; use rand::random; use std::mem::MaybeUninit; use crate::context::MAX_TX_SIZE; use crate::cpu_features::CpuFeatureLevel; use crate::frame::{AsRegion, Plane}; use crate::partition::{BlockSize, IntraEdge}; use crate::predict::dispatch_predict_intra; use crate::predict::pred_cfl_ac; use crate::predict::rust; use crate::predict::{ IntraEdgeFilterParameters, PredictionMode, PredictionVariant, }; use crate::transform::TxSize; use crate::util::{slice_assume_init_mut, Aligned}; use crate::Pixel; #[test] fn pred_matches() { for cpu in &CpuFeatureLevel::all()[..=CpuFeatureLevel::default().as_index()] { pred_matches_inner::(*cpu, 8); pred_matches_inner::(*cpu, 10); pred_matches_inner::(*cpu, 12); } } fn pred_matches_inner(cpu: CpuFeatureLevel, bit_depth: usize) { let tx_size = TxSize::TX_4X4; let ac: Aligned<[i16; 32 * 32]> = Aligned::from_fn(|i| i as i16 - 16 * 32); let edge_buf = Aligned::from_fn(|i| { T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE)) }); let edge_buf = IntraEdge::mock(&edge_buf); let ief_params_all = [ None, Some(IntraEdgeFilterParameters::default()), Some(IntraEdgeFilterParameters { above_mode: Some(PredictionMode::SMOOTH_PRED), left_mode: Some(PredictionMode::SMOOTH_PRED), ..Default::default() }), ]; for (mode, variant) in [ (PredictionMode::DC_PRED, PredictionVariant::BOTH), (PredictionMode::DC_PRED, PredictionVariant::TOP), (PredictionMode::DC_PRED, PredictionVariant::LEFT), (PredictionMode::DC_PRED, PredictionVariant::NONE), (PredictionMode::V_PRED, PredictionVariant::BOTH), (PredictionMode::H_PRED, PredictionVariant::BOTH), (PredictionMode::D45_PRED, PredictionVariant::BOTH), (PredictionMode::D135_PRED, PredictionVariant::BOTH), (PredictionMode::D113_PRED, PredictionVariant::BOTH), (PredictionMode::D157_PRED, PredictionVariant::BOTH), (PredictionMode::D203_PRED, PredictionVariant::BOTH), (PredictionMode::D67_PRED, PredictionVariant::BOTH), (PredictionMode::SMOOTH_PRED, PredictionVariant::BOTH), (PredictionMode::SMOOTH_V_PRED, PredictionVariant::BOTH), (PredictionMode::SMOOTH_H_PRED, PredictionVariant::BOTH), (PredictionMode::PAETH_PRED, PredictionVariant::BOTH), (PredictionMode::UV_CFL_PRED, PredictionVariant::BOTH), (PredictionMode::UV_CFL_PRED, PredictionVariant::TOP), (PredictionMode::UV_CFL_PRED, PredictionVariant::LEFT), (PredictionMode::UV_CFL_PRED, PredictionVariant::NONE), ] .iter() { let angles = match mode { PredictionMode::V_PRED => [81, 84, 87, 90, 93, 96, 99].iter(), PredictionMode::H_PRED => [171, 174, 177, 180, 183, 186, 189].iter(), PredictionMode::D45_PRED => [36, 39, 42, 45, 48, 51, 54].iter(), PredictionMode::D135_PRED => { [126, 129, 132, 135, 138, 141, 144].iter() } PredictionMode::D113_PRED => { [104, 107, 110, 113, 116, 119, 122].iter() } PredictionMode::D157_PRED => { [148, 151, 154, 157, 160, 163, 166].iter() } PredictionMode::D203_PRED => { [194, 197, 200, 203, 206, 209, 212].iter() } PredictionMode::D67_PRED => [58, 61, 64, 67, 70, 73, 76].iter(), PredictionMode::UV_CFL_PRED => [ -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ] .iter(), _ => [0].iter(), }; for angle in angles { for ief_params in match mode { PredictionMode::V_PRED if *angle == 90 => [None].iter(), PredictionMode::H_PRED if *angle == 180 => [None].iter(), PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | 
PredictionMode::D67_PRED => ief_params_all.iter(), _ => [None].iter(), } { let expected = { let mut plane = Plane::from_slice(&[T::zero(); 4 * 4], 4); rust::dispatch_predict_intra( *mode, *variant, &mut plane.as_region_mut(), tx_size, bit_depth, &ac.data, *angle, *ief_params, &edge_buf, cpu, ); let mut data = [T::zero(); 4 * 4]; for (v, d) in data.iter_mut().zip(plane.data[..].iter()) { *v = *d; } data }; let mut output = Plane::from_slice(&[T::zero(); 4 * 4], 4); dispatch_predict_intra( *mode, *variant, &mut output.as_region_mut(), tx_size, bit_depth, &ac.data, *angle, *ief_params, &edge_buf, cpu, ); assert_eq!( expected, &output.data[..], "mode={:?} variant={:?} angle={} ief_params.use_smooth_filter={:?} bit_depth={} cpu={:?}", *mode, *variant, *angle, ief_params.map(|p| p.use_smooth_filter()), bit_depth, cpu ); } } } } #[interpolate_test(444, 0, 0)] #[interpolate_test(422, 1, 0)] #[interpolate_test(420, 1, 1)] fn pred_cfl_ac_matches(xdec: usize, ydec: usize) { pred_cfl_ac_matches_inner::(xdec, ydec, 8); pred_cfl_ac_matches_inner::(xdec, ydec, 10); pred_cfl_ac_matches_inner::(xdec, ydec, 12); } fn pred_cfl_ac_matches_inner( xdec: usize, ydec: usize, bit_depth: usize, ) { let h_pad = 0; let w_pad = 0; let plane_bsize = BlockSize::BLOCK_16X16; let mut plane = Plane::from_slice(&[T::zero(); 32 * 32], 32); for p in plane.data_origin_mut() { *p = T::cast_from(random::() >> (16 - bit_depth)); } let luma = &plane.as_region(); let mut ac_ref = Aligned::new([MaybeUninit::new(0x3333i16); 32 * 32]); let ac_ref = &mut ac_ref.data[..plane_bsize.area()]; let cpu = CpuFeatureLevel::RUST; (match (xdec, ydec) { (0, 0) => rust::pred_cfl_ac::, (1, 0) => rust::pred_cfl_ac::, (_, _) => rust::pred_cfl_ac::, })(ac_ref, luma, plane_bsize, w_pad, h_pad, cpu); for &cpu in &CpuFeatureLevel::all()[..=CpuFeatureLevel::default().as_index()] { let mut ac = Aligned::new([MaybeUninit::new(0x7FFFi16); 32 * 32]); let ac = &mut ac.data[..plane_bsize.area()]; (match (xdec, ydec) { (0, 0) => pred_cfl_ac::, (1, 0) => pred_cfl_ac::, (_, _) => pred_cfl_ac::, })(ac, luma, plane_bsize, w_pad, h_pad, cpu); unsafe { let ac_ref = slice_assume_init_mut(ac_ref); let ac = slice_assume_init_mut(ac); assert_eq!(&ac_ref, &ac); } } } } rav1e-0.7.1/src/asm/shared/transform/forward.rs000064400000000000000000000064641046102023000175230ustar 00000000000000// Copyright (c) 2019-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. /// For classifying the number of rows and columns in a transform. Used to /// select the operations to perform for different vector lengths. 
#[derive(Debug, Clone, Copy)] pub enum SizeClass1D { X4, X8UP, } impl SizeClass1D { #[inline] pub fn from_length(len: usize) -> Self { assert!(len.is_power_of_two()); use SizeClass1D::*; match len { 4 => X4, _ => X8UP, } } } pub fn cast(x: &[T]) -> &[T; N] { // SAFETY: we perform a bounds check with [..N], // so casting to *const [T; N] is valid because the bounds // check guarantees that x has N elements unsafe { &*(&x[..N] as *const [T] as *const [T; N]) } } pub fn cast_mut(x: &mut [T]) -> &mut [T; N] { // SAFETY: we perform a bounds check with [..N], // so casting to *mut [T; N] is valid because the bounds // check guarantees that x has N elements unsafe { &mut *(&mut x[..N] as *mut [T] as *mut [T; N]) } } #[cfg(test)] mod test { use crate::cpu_features::*; use crate::transform::{forward_transform, get_valid_txfm_types, TxSize}; use crate::util::slice_assume_init_mut; use rand::Rng; use std::mem::MaybeUninit; // Ensure that the simd results match the rust code #[test] fn test_forward_transform() { for &cpu in &CpuFeatureLevel::all()[1..=CpuFeatureLevel::default().as_index()] { println!("Testing {:?}", cpu); test_forward_transform_simd(cpu); } } fn test_forward_transform_simd(cpu: CpuFeatureLevel) { let mut rng = rand::thread_rng(); let tx_sizes = { use TxSize::*; [ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, ] }; for &tx_size in &tx_sizes { let area = tx_size.area(); let input: Vec = (0..area).map(|_| rng.gen_range(-255..256)).collect(); for &tx_type in get_valid_txfm_types(tx_size) { let mut output_ref = vec![MaybeUninit::new(0i16); area]; let mut output_simd = vec![MaybeUninit::new(0i16); area]; println!("Testing combination {:?}, {:?}", tx_size, tx_type); forward_transform( &input[..], &mut output_ref[..], tx_size.width(), tx_size, tx_type, 8, CpuFeatureLevel::RUST, ); let output_ref = unsafe { slice_assume_init_mut(&mut output_ref[..]) }; forward_transform( &input[..], &mut output_simd[..], tx_size.width(), tx_size, tx_type, 8, cpu, ); let output_simd = unsafe { slice_assume_init_mut(&mut output_simd[..]) }; assert_eq!(output_ref, output_simd) } } } } rav1e-0.7.1/src/asm/shared/transform/inverse.rs000064400000000000000000000207261046102023000175270ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::tiling::PlaneRegionMut; use crate::util::*; use std::mem::MaybeUninit; // Note: Input coeffs are mutable since the assembly uses them as a scratchpad pub type InvTxfmFunc = unsafe extern fn(*mut u8, libc::ptrdiff_t, *mut i16, i32); pub type InvTxfmHBDFunc = unsafe extern fn(*mut u16, libc::ptrdiff_t, *mut i16, i32, i32); pub fn call_inverse_func( func: InvTxfmFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, width: usize, height: usize, bd: usize, ) { debug_assert!(bd == 8); // Only use at most 32 columns and 32 rows of input coefficients. 
let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; let mut copied = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); // Convert input to 16-bits. // TODO: Remove by changing inverse assembly to not overwrite its input for (a, b) in copied.data.iter_mut().zip(input) { a.write(*b); } // perform the inverse transform // SAFETY: Calls Assembly code. unsafe { func( output.data_ptr_mut() as *mut _, output.plane_cfg.stride as isize, copied.data.as_mut_ptr() as *mut _, eob as i32 - 1, ); } } pub fn call_inverse_hbd_func( func: InvTxfmHBDFunc, input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, width: usize, height: usize, bd: usize, ) { // Only use at most 32 columns and 32 rows of input coefficients. let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; let mut copied = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); // Convert input to 16-bits. // TODO: Remove by changing inverse assembly to not overwrite its input for (a, b) in copied.data.iter_mut().zip(input) { a.write(*b); } // perform the inverse transform // SAFETY: Calls Assembly code. unsafe { func( output.data_ptr_mut() as *mut _, T::to_asm_stride(output.plane_cfg.stride), copied.data.as_mut_ptr() as *mut _, eob as i32 - 1, (1 << bd) - 1, ); } } #[cfg(test)] pub mod test { use super::*; use crate::context::av1_get_coded_tx_size; use crate::cpu_features::CpuFeatureLevel; use crate::frame::{AsRegion, Plane}; use crate::scan_order::av1_scan_orders; use crate::transform::TxSize::*; use crate::transform::*; use rand::{random, thread_rng, Rng}; use std::mem::MaybeUninit; pub fn pick_eob( coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize, ) -> u16 { /* From dav1d * copy the topleft coefficients such that the return value (being the * coefficient scantable index for the eob token) guarantees that only * the topleft $sub out of $sz (where $sz >= $sub) coefficients in both * dimensions are non-zero. This leads to braching to specific optimized * simd versions (e.g. 
dc-only) so that we get full asm coverage in this * test */ let coeff_h = av1_get_coded_tx_size(tx_size).height(); let sub_high: usize = if sub_h > 0 { sub_h * 8 - 1 } else { 0 }; let sub_low: usize = if sub_h > 1 { sub_high - 8 } else { 0 }; let mut eob = 0u16; let mut exit = 0; let scan = av1_scan_orders[tx_size][tx_type].scan; for (i, &pos) in scan.iter().enumerate() { exit = i as u16; let rc = pos as usize; let rcx = rc % coeff_h; let rcy = rc / coeff_h; if rcx > sub_high || rcy > sub_high { break; } else if eob == 0 && (rcx > sub_low || rcy > sub_low) { eob = i as u16; } } if eob != 0 { eob += thread_rng().gen_range(0..(exit - eob).min(1)); } for &pos in scan.iter().skip(usize::from(eob)) { coeffs[pos as usize] = T::cast_from(0); } eob + 1 } pub fn test_transform( tx_size: TxSize, tx_type: TxType, bit_depth: usize, cpu: CpuFeatureLevel, ) { let sub_h_iterations: usize = match tx_size.height().max(tx_size.width()) { 4 => 2, 8 => 2, 16 => 3, 32 | 64 => 4, _ => unreachable!(), }; for sub_h in 0..sub_h_iterations { let mut src_storage = [T::zero(); 64 * 64]; let src = &mut src_storage[..tx_size.area()]; let mut dst = Plane::from_slice( &[T::zero(); 64 * 64][..tx_size.area()], tx_size.width(), ); let mut res = Aligned::<[MaybeUninit; 64 * 64]>::uninit_array(); let res = &mut res.data[..tx_size.area()]; let mut freq = Aligned::<[MaybeUninit; 64 * 64]>::uninit_array(); let freq = &mut freq.data[..tx_size.area()]; for ((r, s), d) in res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut()) { *s = T::cast_from(random::() >> (16 - bit_depth)); *d = T::cast_from(random::() >> (16 - bit_depth)); r.write(i16::cast_from(*s) - i16::cast_from(*d)); } // SAFETY: The loop just initialized res, and all three slices have the same length let res = unsafe { slice_assume_init_mut(res) }; forward_transform( res, freq, tx_size.width(), tx_size, tx_type, bit_depth, CpuFeatureLevel::RUST, ); // SAFETY: forward_transform initialized freq let freq = unsafe { slice_assume_init_mut(freq) }; let eob: u16 = pick_eob(freq, tx_size, tx_type, sub_h); let mut rust_dst = dst.clone(); inverse_transform_add( freq, &mut dst.as_region_mut(), eob, tx_size, tx_type, bit_depth, cpu, ); inverse_transform_add( freq, &mut rust_dst.as_region_mut(), eob, tx_size, tx_type, bit_depth, CpuFeatureLevel::RUST, ); assert_eq!(rust_dst.data_origin(), dst.data_origin()); } } macro_rules! test_itx_fns { ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr) => { paste::item! 
{ $( $( #[test] fn []() { for &cpu in &CpuFeatureLevel::all()[..=CpuFeatureLevel::default().as_index()] { test_transform::([], $ENUM, 8, cpu); test_transform::([], $ENUM, 10, cpu); test_transform::([], $ENUM, 12, cpu); } } )* )* } }; ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*]) => { $( test_itx_fns!($TYPES_VALID, $W, $H); )* }; ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt) => { test_itx_fns!([$TYPES64], $DIMS64); test_itx_fns!([$TYPES64, $TYPES32], $DIMS32); test_itx_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16); test_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84 ); test_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4 ); }; } test_itx_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16 and 4x4) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)] ); } rav1e-0.7.1/src/asm/shared/transform/mod.rs000064400000000000000000000024731046102023000166320ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod forward; pub mod inverse; use crate::transform::*; use std::ops::Index; impl Index for [T; TxSize::TX_SIZES_ALL] { type Output = T; #[inline] fn index(&self, tx_size: TxSize) -> &Self::Output { // SAFETY: values of TxSize are < TX_SIZES_ALL unsafe { self.get_unchecked(tx_size as usize) } } } impl Index for [T; TX_TYPES] { type Output = T; #[inline] fn index(&self, tx_type: TxType) -> &Self::Output { // SAFETY: Wraps WHT_WHT to DCT_DCT unsafe { self.get_unchecked((tx_type as usize) & 15) } } } impl Index for [T; TX_TYPES_PLUS_LL] { type Output = T; #[inline] fn index(&self, tx_type: TxType) -> &Self::Output { // SAFETY: values of TxType are < TX_TYPES_PLUS_LL unsafe { self.get_unchecked(tx_type as usize) } } } rav1e-0.7.1/src/asm/x86/cdef.rs000064400000000000000000000364421046102023000141230ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. 
If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cdef::*; use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::tiling::PlaneRegionMut; use crate::util::*; type CdefFilterFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, tmp: *const u16, tmp_stride: isize, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, ); type CdefFilterHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, tmp: *const u16, tmp_stride: isize, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, bitdepth_max: i32, ); #[inline(always)] const fn decimate_index(xdec: usize, ydec: usize) -> usize { ((ydec << 1) | xdec) & 3 } pub(crate) unsafe fn cdef_filter_block( dst: &mut PlaneRegionMut<'_, T>, src: *const T, src_stride: isize, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, bit_depth: usize, xdec: usize, ydec: usize, edges: u8, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut| { rust::cdef_filter_block( dst, src, src_stride, pri_strength, sec_strength, dir, damping, bit_depth, xdec, ydec, edges, cpu, ); }; // TODO: handle padding in the fast path if edges != CDEF_HAVE_ALL && matches!(T::type_enum(), PixelType::U16) { call_rust(dst); } else { #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; match T::type_enum() { PixelType::U8 => { match CDEF_FILTER_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { Some(func) => { // current cdef_filter_block asm does 16->8 for historical // reasons. Copy into tmp space for now (also handling // padding) until asm is updated const TMPSTRIDE: isize = std::mem::align_of::>() as isize; /* 256 or 512-bit alignment, greater than 2 * (8>>xdec) + 2 */ const TMPSIZE: usize = ((2 + 8 + 2) * TMPSTRIDE + TMPSTRIDE) as usize; let mut tmp: Aligned<[u16; TMPSIZE]> = Aligned::new([CDEF_VERY_LARGE; TMPSIZE]); rust::pad_into_tmp16( tmp.data.as_mut_ptr().offset(TMPSTRIDE - 2), // points to // *padding* upper left; the -2 is to make sure the // block area is SIMD-aligned, not the padding TMPSTRIDE, src, // points to *block* upper left src_stride, 8 >> xdec, 8 >> ydec, edges, ); (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), tmp.data.as_ptr().offset(3 * TMPSTRIDE), TMPSTRIDE, pri_strength, sec_strength, dir as i32, damping, ); } None => call_rust(dst), } } PixelType::U16 => { match CDEF_FILTER_HBD_FNS[cpu.as_index()][decimate_index(xdec, ydec)] { Some(func) => { (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src as *const _, src_stride, pri_strength, sec_strength, dir as i32, damping, (1 << bit_depth) - 1, ); } None => call_rust(dst), } } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } } extern { fn rav1e_cdef_filter_4x4_avx2( dst: *mut u8, dst_stride: isize, tmp: *const u16, tmp_stride: isize, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, ); fn rav1e_cdef_filter_4x8_avx2( dst: *mut u8, dst_stride: isize, tmp: *const u16, tmp_stride: isize, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, ); fn rav1e_cdef_filter_8x8_avx2( dst: *mut u8, dst_stride: isize, tmp: *const u16, tmp_stride: isize, pri_strength: i32, sec_strength: i32, dir: i32, damping: i32, ); } static 
CDEF_FILTER_FNS_AVX2: [Option; 4] = { let mut out: [Option; 4] = [None; 4]; out[decimate_index(1, 1)] = Some(rav1e_cdef_filter_4x4_avx2); out[decimate_index(1, 0)] = Some(rav1e_cdef_filter_4x8_avx2); out[decimate_index(0, 0)] = Some(rav1e_cdef_filter_8x8_avx2); out }; cpu_function_lookup_table!( CDEF_FILTER_FNS: [[Option; 4]], default: [None; 4], [AVX2] ); cpu_function_lookup_table!( CDEF_FILTER_HBD_FNS: [[Option; 4]], default: [None; 4], [] ); type CdefDirLBDFn = unsafe extern fn(tmp: *const u8, tmp_stride: isize, var: *mut u32) -> i32; type CdefDirHBDFn = unsafe extern fn( tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, ) -> i32; #[inline(always)] #[allow(clippy::let_and_return)] pub(crate) fn cdef_find_dir( img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize, cpu: CpuFeatureLevel, ) -> i32 { let call_rust = |var: &mut u32| rust::cdef_find_dir::(img, var, coeff_shift, cpu); #[cfg(feature = "check_asm")] let (ref_dir, ref_var) = { let mut var: u32 = 0; let dir = call_rust(&mut var); (dir, var) }; let dir = match T::type_enum() { PixelType::U8 => { if let Some(func) = CDEF_DIR_LBD_FNS[cpu.as_index()] { // SAFETY: Calls Assembly code. unsafe { (func)( img.as_ptr() as *const _, T::to_asm_stride(img.plane.cfg.stride), var as *mut u32, ) } } else { call_rust(var) } } PixelType::U16 if coeff_shift > 0 => { if let Some(func) = CDEF_DIR_HBD_FNS[cpu.as_index()] { // SAFETY: Calls Assembly code. unsafe { (func)( img.as_ptr() as *const _, T::to_asm_stride(img.plane.cfg.stride), var as *mut u32, (1 << (coeff_shift + 8)) - 1, ) } } else { call_rust(var) } } _ => call_rust(var), }; #[cfg(feature = "check_asm")] { assert_eq!(dir, ref_dir); assert_eq!(*var, ref_var); } dir } extern { fn rav1e_cdef_dir_8bpc_ssse3( tmp: *const u8, tmp_stride: isize, var: *mut u32, ) -> i32; fn rav1e_cdef_dir_8bpc_avx2( tmp: *const u8, tmp_stride: isize, var: *mut u32, ) -> i32; fn rav1e_cdef_dir_16bpc_ssse3( tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, ) -> i32; fn rav1e_cdef_dir_16bpc_sse4( tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, ) -> i32; fn rav1e_cdef_dir_16bpc_avx2( tmp: *const u16, tmp_stride: isize, var: *mut u32, bitdepth_max: i32, ) -> i32; } cpu_function_lookup_table!( CDEF_DIR_LBD_FNS: [Option], default: None, [ (SSSE3, Some(rav1e_cdef_dir_8bpc_ssse3)), (AVX2, Some(rav1e_cdef_dir_8bpc_avx2)) ] ); cpu_function_lookup_table!( CDEF_DIR_HBD_FNS: [Option], default: None, [ (SSSE3, Some(rav1e_cdef_dir_16bpc_ssse3)), (SSE4_1, Some(rav1e_cdef_dir_16bpc_sse4)), (AVX2, Some(rav1e_cdef_dir_16bpc_avx2)) ] ); #[cfg(test)] mod test { pub const CDEF_HAVE_NONE: u8 = 0; use super::*; use crate::frame::{AsRegion, Plane}; use interpolate_name::interpolate_test; use rand::random; use std::str::FromStr; macro_rules! test_cdef_filter_block { ($(($XDEC:expr, $YDEC:expr)),*, $OPT:ident, $OPTLIT:literal) => { $( paste::item! 
{ #[interpolate_test(dir_0, 0)] #[interpolate_test(dir_1, 1)] #[interpolate_test(dir_2, 2)] #[interpolate_test(dir_3, 3)] #[interpolate_test(dir_4, 4)] #[interpolate_test(dir_5, 5)] #[interpolate_test(dir_6, 6)] #[interpolate_test(dir_7, 7)] fn [](dir: usize) { if CpuFeatureLevel::default() < CpuFeatureLevel::from_str($OPTLIT).unwrap() { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } let width = 8 >> $XDEC; let height = 8 >> $YDEC; let area = width * height; // dynamic allocation: test let mut src = vec![0u8; area]; // dynamic allocation: test let mut dst = Plane::from_slice(&vec![0u8; area], width); for (s, d) in src.iter_mut().zip(dst.data.iter_mut()) { *s = random::(); *d = random::(); } let mut rust_dst = dst.clone(); let src_stride = width as isize; let pri_strength = 1; let sec_strength = 0; let damping = 2; let bit_depth = 8; // SAFETY: Calling functions with raw pointers--we created the // planes above and only read from the start. unsafe { cdef_filter_block(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap()); cdef_filter_block(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST); assert_eq!(rust_dst.data_origin(), dst.data_origin()); } } } )* } } macro_rules! test_cdef_filter_block_hbd { ($(($XDEC:expr, $YDEC:expr)),*, $OPT:ident, $OPTLIT:literal) => { $( paste::item! { #[interpolate_test(dir_0, 0)] #[interpolate_test(dir_1, 1)] #[interpolate_test(dir_2, 2)] #[interpolate_test(dir_3, 3)] #[interpolate_test(dir_4, 4)] #[interpolate_test(dir_5, 5)] #[interpolate_test(dir_6, 6)] #[interpolate_test(dir_7, 7)] fn [](dir: usize) { if !is_x86_feature_detected!($OPTLIT) { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } let width = 8 >> $XDEC; let height = 8 >> $YDEC; let area = width * height; // dynamic allocation: test let mut src = vec![0u16; area]; // dynamic allocation: test let mut dst = Plane::from_slice(&vec![0u16; area], width); for (s, d) in src.iter_mut().zip(dst.data.iter_mut()) { *s = (random::() as u16) << 2; *d = (random::() as u16) << 2; } let mut rust_dst = dst.clone(); let src_stride = width as isize; let pri_strength = 1; let sec_strength = 0; let damping = 2; let bit_depth = 10; // SAFETY: Calling functions with raw pointers--we created the // planes above and only read from the start. unsafe { cdef_filter_block(&mut dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::from_str($OPTLIT).unwrap()); cdef_filter_block(&mut rust_dst.as_region_mut(), src.as_ptr(), src_stride, pri_strength, sec_strength, dir, damping, bit_depth, $XDEC, $YDEC, CDEF_HAVE_NONE, CpuFeatureLevel::RUST); assert_eq!(rust_dst.data_origin(), dst.data_origin()); } } } )* } } macro_rules! test_cdef_dir { ($OPT:ident, $OPTLIT:literal) => { paste::item! 
{ #[test] fn []() { use crate::context::{TileSuperBlockOffset, SuperBlockOffset}; if !is_x86_feature_detected!($OPTLIT) { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } let width = 8; let height = 8; let area = width * height; // dynamic allocation: test let mut src = vec![0u8; area]; for s in src.iter_mut() { *s = random::(); } let src = Plane::from_slice(&src, width); let bit_depth = 8; let coeff_shift = bit_depth - 8; let sbo = TileSuperBlockOffset(SuperBlockOffset{ x:0, y:0 }); let mut var_asm: u32 = 0; let mut var_rust: u32 = 0; let in_po = sbo.plane_offset(&src.cfg); let dir_asm = cdef_find_dir::( &src.slice(in_po), &mut var_asm, coeff_shift, CpuFeatureLevel::from_str($OPTLIT).unwrap(), ); let dir_rust = cdef_find_dir::( &src.slice(in_po), &mut var_rust, coeff_shift, CpuFeatureLevel::RUST, ); assert_eq!(var_asm, var_rust); assert_eq!(dir_asm, dir_rust); } } } } macro_rules! test_cdef_dir_hbd { ($OPT:ident, $OPTLIT:literal) => { paste::item! { #[test] fn []() { use crate::context::{TileSuperBlockOffset, SuperBlockOffset}; if !is_x86_feature_detected!($OPTLIT) { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } let width = 8; let height = 8; let area = width * height; // dynamic allocation: test let mut src = vec![0u16; area]; for s in src.iter_mut() { *s = (random::() as u16) << 2; } let src = Plane::from_slice(&src, width); let bit_depth = 10; let coeff_shift = bit_depth - 8; let sbo = TileSuperBlockOffset(SuperBlockOffset{ x:0, y:0 }); let mut var_asm: u32 = 0; let mut var_rust: u32 = 0; let in_po = sbo.plane_offset(&src.cfg); let dir_asm = cdef_find_dir::( &src.slice(in_po), &mut var_asm, coeff_shift, CpuFeatureLevel::from_str($OPTLIT).unwrap(), ); let dir_rust = cdef_find_dir::( &src.slice(in_po), &mut var_rust, coeff_shift, CpuFeatureLevel::RUST, ); assert_eq!(var_asm, var_rust); assert_eq!(dir_asm, dir_rust); } } } } test_cdef_filter_block!((1, 1), (1, 0), (0, 0), avx2, "avx2"); test_cdef_filter_block_hbd!((1, 1), (1, 0), (0, 0), avx2, "avx2"); test_cdef_dir!(ssse3, "ssse3"); test_cdef_dir!(avx2, "avx2"); test_cdef_dir_hbd!(ssse3, "ssse3"); test_cdef_dir_hbd!(sse4, "sse4.1"); test_cdef_dir_hbd!(avx2, "avx2"); } rav1e-0.7.1/src/asm/x86/dist/cdef_dist.rs000064400000000000000000000162311046102023000161030ustar 00000000000000// Copyright (c) 2022-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
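// Editor's sketch (not part of the upstream crate): the kernels in this file
// return a (source variance, destination variance, SSE) triple for a block of
// at most 8x8 pixels, which is then weighted by `apply_ssim_boost`. The
// hypothetical scalar helper below mirrors the arithmetic of the 8x8
// high-bit-depth AVX2 kernel further down in this file: raw sums, sums of
// squares, and the cross sum are accumulated; the SSE is
// `sum_s2 + sum_d2 - 2 * sum_sd`; and each variance subtracts the squared sum
// divided by the 64 pixels of the block (with `+ 32` rounding). It assumes a
// flat 64-element slice per block instead of a strided plane region.
#[allow(dead_code)]
fn cdef_dist_8x8_scalar_sketch(src: &[u16], dst: &[u16]) -> (u32, u32, u32) {
  assert!(src.len() >= 64 && dst.len() >= 64);
  let (mut sum_s, mut sum_d) = (0u64, 0u64);
  let (mut sum_s2, mut sum_d2, mut sum_sd) = (0u64, 0u64, 0u64);
  for (&s, &d) in src[..64].iter().zip(&dst[..64]) {
    let (s, d) = (u64::from(s), u64::from(d));
    sum_s += s;
    sum_d += d;
    sum_s2 += s * s;
    sum_d2 += d * d;
    sum_sd += s * d;
  }
  // Sum of squared errors, expanded so it can be built from the running sums.
  let sse = (sum_d2 + sum_s2 - 2 * sum_sd) as u32;
  // Scaled variances: sum of squares minus the rounded squared-sum / 64.
  let svar = (sum_s2 - (sum_s * sum_s + 32) / 64) as u32;
  let dvar = (sum_d2 - (sum_d * sum_d + 32) / 64) as u32;
  (svar, dvar, sse)
}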
use crate::activity::apply_ssim_boost;
use crate::cpu_features::CpuFeatureLevel;
use crate::dist::*;
use crate::tiling::PlaneRegion;
use crate::util::Pixel;
use crate::util::PixelType;

use std::arch::x86_64::*;

type CdefDistKernelFn = unsafe extern fn(
  src: *const u8,
  src_stride: isize,
  dst: *const u8,
  dst_stride: isize,
  ret_ptr: *mut u32,
);

type CdefDistKernelHBDFn = unsafe fn(
  src: *const u16,
  src_stride: isize,
  dst: *const u16,
  dst_stride: isize,
) -> (u32, u32, u32);

extern {
  fn rav1e_cdef_dist_kernel_4x4_sse2(
    src: *const u8,
    src_stride: isize,
    dst: *const u8,
    dst_stride: isize,
    ret_ptr: *mut u32,
  );
  fn rav1e_cdef_dist_kernel_4x8_sse2(
    src: *const u8,
    src_stride: isize,
    dst: *const u8,
    dst_stride: isize,
    ret_ptr: *mut u32,
  );
  fn rav1e_cdef_dist_kernel_8x4_sse2(
    src: *const u8,
    src_stride: isize,
    dst: *const u8,
    dst_stride: isize,
    ret_ptr: *mut u32,
  );
  fn rav1e_cdef_dist_kernel_8x8_sse2(
    src: *const u8,
    src_stride: isize,
    dst: *const u8,
    dst_stride: isize,
    ret_ptr: *mut u32,
  );
}

/// # Panics
///
/// - If in `check_asm` mode, panics on mismatch between native and ASM results.
#[allow(clippy::let_and_return)]
pub fn cdef_dist_kernel<T: Pixel>(
  src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize,
  bit_depth: usize, cpu: CpuFeatureLevel,
) -> u32 {
  debug_assert!(src.plane_cfg.xdec == 0);
  debug_assert!(src.plane_cfg.ydec == 0);
  debug_assert!(dst.plane_cfg.xdec == 0);
  debug_assert!(dst.plane_cfg.ydec == 0);

  // Limit kernel to 8x8
  debug_assert!(w <= 8);
  debug_assert!(h <= 8);

  let call_rust =
    || -> u32 { rust::cdef_dist_kernel(dst, src, w, h, bit_depth, cpu) };
  #[cfg(feature = "check_asm")]
  let ref_dist = call_rust();

  let (svar, dvar, sse) = match T::type_enum() {
    PixelType::U8 => {
      if let Some(func) =
        CDEF_DIST_KERNEL_FNS[cpu.as_index()][kernel_fn_index(w, h)]
      {
        let mut ret_buf = [0u32; 3];
        // SAFETY: Calls Assembly code.
        unsafe {
          func(
            src.data_ptr() as *const _,
            T::to_asm_stride(src.plane_cfg.stride),
            dst.data_ptr() as *const _,
            T::to_asm_stride(dst.plane_cfg.stride),
            ret_buf.as_mut_ptr(),
          )
        }
        (ret_buf[0], ret_buf[1], ret_buf[2])
      } else {
        return call_rust();
      }
    }
    PixelType::U16 => {
      if let Some(func) =
        CDEF_DIST_KERNEL_HBD_FNS[cpu.as_index()][kernel_fn_index(w, h)]
      {
        // SAFETY: Calls Assembly code.
        unsafe {
          func(
            src.data_ptr() as *const _,
            T::to_asm_stride(src.plane_cfg.stride),
            dst.data_ptr() as *const _,
            T::to_asm_stride(dst.plane_cfg.stride),
          )
        }
      } else {
        return call_rust();
      }
    }
  };

  let dist = apply_ssim_boost(sse, svar, dvar, bit_depth);
  #[cfg(feature = "check_asm")]
  assert_eq!(
    dist, ref_dist,
    "CDEF Distortion {}x{}: Assembly doesn't match reference code.",
    w, h
  );

  dist
}

/// Store functions in an 8x8 grid. Most will be empty.
const CDEF_DIST_KERNEL_FNS_LENGTH: usize = 8 * 8; const fn kernel_fn_index(w: usize, h: usize) -> usize { ((w - 1) << 3) | (h - 1) } static CDEF_DIST_KERNEL_FNS_SSE2: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = { let mut out: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = [None; CDEF_DIST_KERNEL_FNS_LENGTH]; out[kernel_fn_index(4, 4)] = Some(rav1e_cdef_dist_kernel_4x4_sse2); out[kernel_fn_index(4, 8)] = Some(rav1e_cdef_dist_kernel_4x8_sse2); out[kernel_fn_index(8, 4)] = Some(rav1e_cdef_dist_kernel_8x4_sse2); out[kernel_fn_index(8, 8)] = Some(rav1e_cdef_dist_kernel_8x8_sse2); out }; cpu_function_lookup_table!( CDEF_DIST_KERNEL_FNS: [[Option; CDEF_DIST_KERNEL_FNS_LENGTH]], default: [None; CDEF_DIST_KERNEL_FNS_LENGTH], [SSE2] ); #[target_feature(enable = "avx2")] #[inline] unsafe fn mm256_sum_i32(ymm: __m256i) -> i32 { // We split the vector in half and then add the two halves and sum. let m1 = _mm256_extracti128_si256(ymm, 1); let m2 = _mm256_castsi256_si128(ymm); let m2 = _mm_add_epi32(m2, m1); let m1 = _mm_shuffle_epi32(m2, 0b11_10_11_10); let m2 = _mm_add_epi32(m2, m1); let m1 = _mm_shuffle_epi32(m2, 0b01_01_01_01); let m2 = _mm_add_epi32(m2, m1); _mm_cvtsi128_si32(m2) } #[target_feature(enable = "avx2")] #[inline] unsafe fn rav1e_cdef_dist_kernel_8x8_hbd_avx2( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> (u32, u32, u32) { let src = src as *const u8; let dst = dst as *const u8; unsafe fn sum16(src: *const u8, src_stride: isize) -> u32 { let h = 8; let res = (0..h) .map(|row| _mm_load_si128(src.offset(row * src_stride) as *const _)) .reduce(|a, b| _mm_add_epi16(a, b)) .unwrap(); let m32 = _mm256_cvtepi16_epi32(res); mm256_sum_i32(m32) as u32 } unsafe fn mpadd32( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ) -> u32 { let h = 8; let res = (0..h / 2) .map(|row| { let s = _mm256_loadu2_m128i( src.offset(2 * row * src_stride) as *const _, src.offset((2 * row + 1) * src_stride) as *const _, ); let d = _mm256_loadu2_m128i( dst.offset(2 * row * dst_stride) as *const _, dst.offset((2 * row + 1) * dst_stride) as *const _, ); _mm256_madd_epi16(s, d) }) .reduce(|a, b| _mm256_add_epi32(a, b)) .unwrap(); mm256_sum_i32(res) as u32 } let sum_s = sum16(src, src_stride); let sum_d = sum16(dst, dst_stride); let sum_s2 = mpadd32(src, src_stride, src, src_stride); let sum_d2 = mpadd32(dst, dst_stride, dst, dst_stride); let sum_sd = mpadd32(src, src_stride, dst, dst_stride); // To get the distortion, compute sum of squared error and apply a weight // based on the variance of the two planes. let sse = sum_d2 + sum_s2 - 2 * sum_sd; // Convert to 64-bits to avoid overflow when squaring let sum_s = sum_s as u64; let sum_d = sum_d as u64; let svar = (sum_s2 as u64 - (sum_s * sum_s + 32) / 64) as u32; let dvar = (sum_d2 as u64 - (sum_d * sum_d + 32) / 64) as u32; (svar, dvar, sse) } static CDEF_DIST_KERNEL_HBD_FNS_AVX2: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = { let mut out: [Option; CDEF_DIST_KERNEL_FNS_LENGTH] = [None; CDEF_DIST_KERNEL_FNS_LENGTH]; out[kernel_fn_index(8, 8)] = Some(rav1e_cdef_dist_kernel_8x8_hbd_avx2); out }; cpu_function_lookup_table!( CDEF_DIST_KERNEL_HBD_FNS: [[Option; CDEF_DIST_KERNEL_FNS_LENGTH]], default: [None; CDEF_DIST_KERNEL_FNS_LENGTH], [AVX2] ); rav1e-0.7.1/src/asm/x86/dist/mod.rs000064400000000000000000000657551046102023000147550ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub use self::cdef_dist::*; pub use self::sse::*; use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::partition::BlockSize; use crate::tiling::*; use crate::util::*; mod cdef_dist; mod sse; type SadFn = unsafe extern fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, ) -> u32; type SatdFn = SadFn; type SadHBDFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32; type SatdHBDFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, bdmax: u32, ) -> u32; macro_rules! declare_asm_dist_fn { ($(($name: ident, $T: ident)),+) => ( $( extern { fn $name ( src: *const $T, src_stride: isize, dst: *const $T, dst_stride: isize ) -> u32; } )+ ) } macro_rules! declare_asm_satd_hbd_fn { ($($name: ident),+) => ( $( extern { pub(crate) fn $name ( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, bdmax: u32 ) -> u32; } )+ ) } #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; /// Horizontally sum 8 32-bit values in a YMM register #[inline] #[target_feature(enable = "avx2")] unsafe fn mm256_sum_i32(ymm: __m256i) -> i32 { // We split the vector in half and then add (2, 3) + (0, 1), and finally 0 + 1. let m1 = _mm256_extracti128_si256(ymm, 1); let m2 = _mm256_castsi256_si128(ymm); let m2 = _mm_add_epi32(m2, m1); let m1 = _mm_shuffle_epi32(m2, 0b11_10_11_10); let m2 = _mm_add_epi32(m2, m1); let m1 = _mm_shuffle_epi32(m2, 0b01_01_01_01); let m2 = _mm_add_epi32(m2, m1); _mm_cvtsi128_si32(m2) } /// Perform SAD over at most 256 (10 or 12-bit) values /// /// Returns a YMM register of 8 32-bit sums, which can /// be used as an intermediate result for larger summations /// /// Limited to 256 values as any more 12-bit SADs may overflow /// a 16-bit value. 
/// /// # Panics /// - If `W` * `n_rows` > 256 /// - If `W` is not a multiple of 16 #[inline] #[target_feature(enable = "avx2")] unsafe fn rav1e_sad_wxh_hbd_avx2_inner( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, n_rows: usize, ) -> __m256i { const LOADS_PER_REGISTER: usize = 32 / core::mem::size_of::(); // 16 // Check we can't overflow a 16-bit register assert!(W * n_rows <= 256); // Check we load the right number of values assert_eq!(W % LOADS_PER_REGISTER, 0); let mut sum16 = _mm256_setzero_si256(); (0..n_rows as isize).for_each(|n| { (0..(W / LOADS_PER_REGISTER) as isize).for_each(|x| { let res = _mm256_abs_epi16(_mm256_sub_epi16( _mm256_loadu_si256( src.offset(n * src_stride + x * LOADS_PER_REGISTER as isize * 2) as *const _, ), _mm256_loadu_si256( dst.offset(n * dst_stride + x * LOADS_PER_REGISTER as isize * 2) as *const _, ), )); sum16 = _mm256_add_epi16(sum16, res); }) }); let b = _mm256_unpackhi_epi16(sum16, _mm256_setzero_si256()); let c = _mm256_unpacklo_epi16(sum16, _mm256_setzero_si256()); _mm256_add_epi32(b, c) } /// Sum of Absolute Differences /// /// By convention, `src_stride` and `dst_stride` are measured in bytes, not u16's #[inline] #[target_feature(enable = "avx2")] unsafe fn rav1e_sad_wxh_hbd_avx2( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, n_rows: usize, ) -> u32 { const MAX_INNER_SIZE: usize = 256; let src = src as *const u8; let dst = dst as *const u8; // Can we sum at least 1 entire row at once? if W <= MAX_INNER_SIZE { // If we can fit the entire area in one inner iteration, do it if W * n_rows <= MAX_INNER_SIZE { return mm256_sum_i32(rav1e_sad_wxh_hbd_avx2_inner::( src, src_stride, dst, dst_stride, n_rows, )) as u32; } // Otherwise, sum rows in sets of 256 assert_eq!(MAX_INNER_SIZE % W, 0); let row_step = (MAX_INNER_SIZE / W).min(n_rows); assert_eq!(n_rows % row_step, 0); let mut sum = _mm256_setzero_si256(); (0..(n_rows / row_step) as isize).for_each(|w| { let src = src.offset(w * row_step as isize * src_stride); let dst = dst.offset(w * row_step as isize * dst_stride); let res = rav1e_sad_wxh_hbd_avx2_inner::( src, src_stride, dst, dst_stride, row_step, ); sum = _mm256_add_epi32(sum, res) }); mm256_sum_i32(sum) as u32 } else { // If we can't sum an entire row in one shot, split each row up assert_eq!(W % MAX_INNER_SIZE, 0); let sum = (0..n_rows) .flat_map(|_| { (0..(W / MAX_INNER_SIZE)).map(move |h| { let src = src.offset(src_stride + (h * MAX_INNER_SIZE * 2) as isize); let dst = dst.offset(dst_stride + (h * MAX_INNER_SIZE * 2) as isize); rav1e_sad_wxh_hbd_avx2_inner::( src, src_stride, dst, dst_stride, 1, ) }) }) .reduce(|a, b| unsafe { _mm256_add_epi32(a, b) }) .unwrap(); mm256_sum_i32(sum) as u32 } } declare_asm_dist_fn![ // SSSE3 (rav1e_sad_4x4_hbd_ssse3, u16), (rav1e_sad_16x16_hbd_ssse3, u16), (rav1e_satd_8x8_ssse3, u8), // SSE2 (rav1e_sad4x4_sse2, u8), (rav1e_sad4x8_sse2, u8), (rav1e_sad4x16_sse2, u8), (rav1e_sad8x4_sse2, u8), (rav1e_sad8x8_sse2, u8), (rav1e_sad8x16_sse2, u8), (rav1e_sad8x32_sse2, u8), (rav1e_sad16x4_sse2, u8), (rav1e_sad16x8_sse2, u8), (rav1e_sad16x16_sse2, u8), (rav1e_sad16x32_sse2, u8), (rav1e_sad16x64_sse2, u8), (rav1e_sad32x8_sse2, u8), (rav1e_sad32x16_sse2, u8), (rav1e_sad32x32_sse2, u8), (rav1e_sad32x64_sse2, u8), (rav1e_sad64x16_sse2, u8), (rav1e_sad64x32_sse2, u8), (rav1e_sad64x64_sse2, u8), (rav1e_sad64x128_sse2, u8), (rav1e_sad128x64_sse2, u8), (rav1e_sad128x128_sse2, u8), // SSE4 (rav1e_satd_4x4_sse4, u8), // AVX (rav1e_sad32x8_avx2, u8), (rav1e_sad32x16_avx2, u8), 
(rav1e_sad32x32_avx2, u8), (rav1e_sad32x64_avx2, u8), (rav1e_sad64x16_avx2, u8), (rav1e_sad64x32_avx2, u8), (rav1e_sad64x64_avx2, u8), (rav1e_sad64x128_avx2, u8), (rav1e_sad128x64_avx2, u8), (rav1e_sad128x128_avx2, u8), (rav1e_satd_4x4_avx2, u8), (rav1e_satd_8x8_avx2, u8), (rav1e_satd_16x16_avx2, u8), (rav1e_satd_32x32_avx2, u8), (rav1e_satd_64x64_avx2, u8), (rav1e_satd_128x128_avx2, u8), (rav1e_satd_4x8_avx2, u8), (rav1e_satd_8x4_avx2, u8), (rav1e_satd_8x16_avx2, u8), (rav1e_satd_16x8_avx2, u8), (rav1e_satd_16x32_avx2, u8), (rav1e_satd_32x16_avx2, u8), (rav1e_satd_32x64_avx2, u8), (rav1e_satd_64x32_avx2, u8), (rav1e_satd_64x128_avx2, u8), (rav1e_satd_128x64_avx2, u8), (rav1e_satd_4x16_avx2, u8), (rav1e_satd_16x4_avx2, u8), (rav1e_satd_8x32_avx2, u8), (rav1e_satd_32x8_avx2, u8), (rav1e_satd_16x64_avx2, u8), (rav1e_satd_64x16_avx2, u8) ]; declare_asm_satd_hbd_fn![ rav1e_satd_4x4_hbd_avx2, rav1e_satd_8x4_hbd_avx2, rav1e_satd_4x8_hbd_avx2, rav1e_satd_8x8_hbd_avx2, rav1e_satd_16x8_hbd_avx2, rav1e_satd_16x16_hbd_avx2, rav1e_satd_32x32_hbd_avx2, rav1e_satd_64x64_hbd_avx2, rav1e_satd_128x128_hbd_avx2, rav1e_satd_16x32_hbd_avx2, rav1e_satd_16x64_hbd_avx2, rav1e_satd_32x16_hbd_avx2, rav1e_satd_32x64_hbd_avx2, rav1e_satd_64x16_hbd_avx2, rav1e_satd_64x32_hbd_avx2, rav1e_satd_64x128_hbd_avx2, rav1e_satd_128x64_hbd_avx2, rav1e_satd_32x8_hbd_avx2, rav1e_satd_8x16_hbd_avx2, rav1e_satd_8x32_hbd_avx2, rav1e_satd_16x4_hbd_avx2, rav1e_satd_4x16_hbd_avx2 ]; // BlockSize::BLOCK_SIZES.next_power_of_two(); pub(crate) const DIST_FNS_LENGTH: usize = 32; #[inline] pub(crate) const fn to_index(bsize: BlockSize) -> usize { bsize as usize & (DIST_FNS_LENGTH - 1) } /// # Panics /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[inline(always)] #[allow(clippy::let_and_return)] pub fn get_sad( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { rust::get_sad(dst, src, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SAD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SAD_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!(dist, ref_dist); dist } /// # Panics /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. 
#[inline(always)] #[allow(clippy::let_and_return)] pub fn get_satd( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u32 { let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u32 { rust::get_satd(dst, src, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SATD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), ) }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SATD_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { (func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), (1 << bit_depth) - 1, ) }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!(dist, ref_dist); dist } // We have hand-written ASM for 4x4 and 16x16 HBD blocks, // so we can use those for other block sizes as well. macro_rules! get_sad_hbd_ssse3 { ($(($W:expr, $H:expr, $BS:expr)),*) => { $( paste::item! { #[target_feature(enable = "ssse3")] unsafe extern fn []( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32 { let mut sum = 0; for w in (0..$W).step_by($BS) { for h in (0..$H).step_by($BS) { sum += []( src.offset(w + h * src_stride / 2), src_stride, dst.offset(w + h * dst_stride / 2), dst_stride ); } } sum } } )* } } get_sad_hbd_ssse3!( // 4x4 base (8, 8, 4), (4, 8, 4), (8, 4, 4), (8, 16, 4), (16, 8, 4), (4, 16, 4), (16, 4, 4), (8, 32, 4), (32, 8, 4), (32, 32, 16), (64, 64, 16), (128, 128, 16), (16, 32, 16), (32, 16, 16), (32, 64, 16), (64, 32, 16), (64, 128, 16), (128, 64, 16), (16, 64, 16), (64, 16, 16) ); macro_rules! get_sad_hbd_avx2_WxH { ($(($W:expr, $H:expr)),*) => { $( paste::item! 
{ #[target_feature(enable = "avx2")] unsafe extern fn []( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, ) -> u32 { rav1e_sad_wxh_hbd_avx2::<$W>( src, src_stride, dst, dst_stride, $H, ) } } )* } } get_sad_hbd_avx2_WxH!( // No advantage to AVX below 16 width (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128) ); static SAD_FNS_SSE2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad4x4_sse2); out[BLOCK_4X8 as usize] = Some(rav1e_sad4x8_sse2); out[BLOCK_4X16 as usize] = Some(rav1e_sad4x16_sse2); out[BLOCK_8X4 as usize] = Some(rav1e_sad8x4_sse2); out[BLOCK_8X8 as usize] = Some(rav1e_sad8x8_sse2); out[BLOCK_8X16 as usize] = Some(rav1e_sad8x16_sse2); out[BLOCK_8X32 as usize] = Some(rav1e_sad8x32_sse2); out[BLOCK_16X4 as usize] = Some(rav1e_sad16x4_sse2); out[BLOCK_16X8 as usize] = Some(rav1e_sad16x8_sse2); out[BLOCK_16X16 as usize] = Some(rav1e_sad16x16_sse2); out[BLOCK_16X32 as usize] = Some(rav1e_sad16x32_sse2); out[BLOCK_16X64 as usize] = Some(rav1e_sad16x64_sse2); out[BLOCK_32X8 as usize] = Some(rav1e_sad32x8_sse2); out[BLOCK_32X16 as usize] = Some(rav1e_sad32x16_sse2); out[BLOCK_32X32 as usize] = Some(rav1e_sad32x32_sse2); out[BLOCK_32X64 as usize] = Some(rav1e_sad32x64_sse2); out[BLOCK_64X16 as usize] = Some(rav1e_sad64x16_sse2); out[BLOCK_64X32 as usize] = Some(rav1e_sad64x32_sse2); out[BLOCK_64X64 as usize] = Some(rav1e_sad64x64_sse2); out[BLOCK_64X128 as usize] = Some(rav1e_sad64x128_sse2); out[BLOCK_128X64 as usize] = Some(rav1e_sad128x64_sse2); out[BLOCK_128X128 as usize] = Some(rav1e_sad128x128_sse2); out }; static SAD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad4x4_sse2); out[BLOCK_4X8 as usize] = Some(rav1e_sad4x8_sse2); out[BLOCK_4X16 as usize] = Some(rav1e_sad4x16_sse2); out[BLOCK_8X4 as usize] = Some(rav1e_sad8x4_sse2); out[BLOCK_8X8 as usize] = Some(rav1e_sad8x8_sse2); out[BLOCK_8X16 as usize] = Some(rav1e_sad8x16_sse2); out[BLOCK_8X32 as usize] = Some(rav1e_sad8x32_sse2); out[BLOCK_16X4 as usize] = Some(rav1e_sad16x4_sse2); out[BLOCK_16X8 as usize] = Some(rav1e_sad16x8_sse2); out[BLOCK_16X16 as usize] = Some(rav1e_sad16x16_sse2); out[BLOCK_16X32 as usize] = Some(rav1e_sad16x32_sse2); out[BLOCK_16X64 as usize] = Some(rav1e_sad16x64_sse2); out[BLOCK_32X8 as usize] = Some(rav1e_sad32x8_avx2); out[BLOCK_32X16 as usize] = Some(rav1e_sad32x16_avx2); out[BLOCK_32X32 as usize] = Some(rav1e_sad32x32_avx2); out[BLOCK_32X64 as usize] = Some(rav1e_sad32x64_avx2); out[BLOCK_64X16 as usize] = Some(rav1e_sad64x16_avx2); out[BLOCK_64X32 as usize] = Some(rav1e_sad64x32_avx2); out[BLOCK_64X64 as usize] = Some(rav1e_sad64x64_avx2); out[BLOCK_64X128 as usize] = Some(rav1e_sad64x128_avx2); out[BLOCK_128X64 as usize] = Some(rav1e_sad128x64_avx2); out[BLOCK_128X128 as usize] = Some(rav1e_sad128x128_avx2); out }; cpu_function_lookup_table!( SAD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [SSE2, AVX2] ); static SAD_HBD_FNS_SSSE3: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad_4x4_hbd_ssse3); out[BLOCK_8X8 as usize] = Some(rav1e_sad_8x8_hbd_ssse3); out[BLOCK_16X16 as usize] = Some(rav1e_sad_16x16_hbd_ssse3); out[BLOCK_32X32 as 
usize] = Some(rav1e_sad_32x32_hbd_ssse3); out[BLOCK_64X64 as usize] = Some(rav1e_sad_64x64_hbd_ssse3); out[BLOCK_128X128 as usize] = Some(rav1e_sad_128x128_hbd_ssse3); out[BLOCK_4X8 as usize] = Some(rav1e_sad_4x8_hbd_ssse3); out[BLOCK_8X4 as usize] = Some(rav1e_sad_8x4_hbd_ssse3); out[BLOCK_8X16 as usize] = Some(rav1e_sad_8x16_hbd_ssse3); out[BLOCK_16X8 as usize] = Some(rav1e_sad_16x8_hbd_ssse3); out[BLOCK_16X32 as usize] = Some(rav1e_sad_16x32_hbd_ssse3); out[BLOCK_32X16 as usize] = Some(rav1e_sad_32x16_hbd_ssse3); out[BLOCK_32X64 as usize] = Some(rav1e_sad_32x64_hbd_ssse3); out[BLOCK_64X32 as usize] = Some(rav1e_sad_64x32_hbd_ssse3); out[BLOCK_64X128 as usize] = Some(rav1e_sad_64x128_hbd_ssse3); out[BLOCK_128X64 as usize] = Some(rav1e_sad_128x64_hbd_ssse3); out[BLOCK_4X16 as usize] = Some(rav1e_sad_4x16_hbd_ssse3); out[BLOCK_16X4 as usize] = Some(rav1e_sad_16x4_hbd_ssse3); out[BLOCK_8X32 as usize] = Some(rav1e_sad_8x32_hbd_ssse3); out[BLOCK_32X8 as usize] = Some(rav1e_sad_32x8_hbd_ssse3); out[BLOCK_16X64 as usize] = Some(rav1e_sad_16x64_hbd_ssse3); out[BLOCK_64X16 as usize] = Some(rav1e_sad_64x16_hbd_ssse3); out }; static SAD_HBD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_sad_4x4_hbd_ssse3); out[BLOCK_8X8 as usize] = Some(rav1e_sad_8x8_hbd_ssse3); out[BLOCK_16X16 as usize] = Some(rav1e_sad_16x16_hbd_avx2); out[BLOCK_32X32 as usize] = Some(rav1e_sad_32x32_hbd_avx2); out[BLOCK_64X64 as usize] = Some(rav1e_sad_64x64_hbd_avx2); out[BLOCK_128X128 as usize] = Some(rav1e_sad_128x128_hbd_avx2); out[BLOCK_4X8 as usize] = Some(rav1e_sad_4x8_hbd_ssse3); out[BLOCK_8X4 as usize] = Some(rav1e_sad_8x4_hbd_ssse3); out[BLOCK_8X16 as usize] = Some(rav1e_sad_8x16_hbd_ssse3); out[BLOCK_16X8 as usize] = Some(rav1e_sad_16x8_hbd_avx2); out[BLOCK_16X32 as usize] = Some(rav1e_sad_16x32_hbd_avx2); out[BLOCK_32X16 as usize] = Some(rav1e_sad_32x16_hbd_avx2); out[BLOCK_32X64 as usize] = Some(rav1e_sad_32x64_hbd_avx2); out[BLOCK_64X32 as usize] = Some(rav1e_sad_64x32_hbd_avx2); out[BLOCK_64X128 as usize] = Some(rav1e_sad_64x128_hbd_avx2); out[BLOCK_128X64 as usize] = Some(rav1e_sad_128x64_hbd_avx2); out[BLOCK_4X16 as usize] = Some(rav1e_sad_4x16_hbd_ssse3); out[BLOCK_16X4 as usize] = Some(rav1e_sad_16x4_hbd_avx2); out[BLOCK_32X8 as usize] = Some(rav1e_sad_32x8_hbd_avx2); out[BLOCK_16X64 as usize] = Some(rav1e_sad_16x64_hbd_avx2); out[BLOCK_64X16 as usize] = Some(rav1e_sad_64x16_hbd_avx2); out }; cpu_function_lookup_table!( SAD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [SSSE3, AVX2] ); static SATD_FNS_SSSE3: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_8X8 as usize] = Some(rav1e_satd_8x8_ssse3); out }; static SATD_FNS_SSE4_1: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_satd_4x4_sse4); out[BLOCK_8X8 as usize] = Some(rav1e_satd_8x8_ssse3); out }; static SATD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_satd_4x4_avx2); out[BLOCK_8X8 as usize] = Some(rav1e_satd_8x8_avx2); out[BLOCK_16X16 as usize] = Some(rav1e_satd_16x16_avx2); out[BLOCK_32X32 as usize] = Some(rav1e_satd_32x32_avx2); out[BLOCK_64X64 as usize] = Some(rav1e_satd_64x64_avx2); out[BLOCK_128X128 as usize] = 
Some(rav1e_satd_128x128_avx2); out[BLOCK_4X8 as usize] = Some(rav1e_satd_4x8_avx2); out[BLOCK_8X4 as usize] = Some(rav1e_satd_8x4_avx2); out[BLOCK_8X16 as usize] = Some(rav1e_satd_8x16_avx2); out[BLOCK_16X8 as usize] = Some(rav1e_satd_16x8_avx2); out[BLOCK_16X32 as usize] = Some(rav1e_satd_16x32_avx2); out[BLOCK_32X16 as usize] = Some(rav1e_satd_32x16_avx2); out[BLOCK_32X64 as usize] = Some(rav1e_satd_32x64_avx2); out[BLOCK_64X32 as usize] = Some(rav1e_satd_64x32_avx2); out[BLOCK_64X128 as usize] = Some(rav1e_satd_64x128_avx2); out[BLOCK_128X64 as usize] = Some(rav1e_satd_128x64_avx2); out[BLOCK_4X16 as usize] = Some(rav1e_satd_4x16_avx2); out[BLOCK_16X4 as usize] = Some(rav1e_satd_16x4_avx2); out[BLOCK_8X32 as usize] = Some(rav1e_satd_8x32_avx2); out[BLOCK_32X8 as usize] = Some(rav1e_satd_32x8_avx2); out[BLOCK_16X64 as usize] = Some(rav1e_satd_16x64_avx2); out[BLOCK_64X16 as usize] = Some(rav1e_satd_64x16_avx2); out }; cpu_function_lookup_table!( SATD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [SSSE3, SSE4_1, AVX2] ); static SATD_HBD_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_satd_4x4_hbd_avx2); out[BLOCK_8X8 as usize] = Some(rav1e_satd_8x8_hbd_avx2); out[BLOCK_16X16 as usize] = Some(rav1e_satd_16x16_hbd_avx2); out[BLOCK_32X32 as usize] = Some(rav1e_satd_32x32_hbd_avx2); out[BLOCK_64X64 as usize] = Some(rav1e_satd_64x64_hbd_avx2); out[BLOCK_128X128 as usize] = Some(rav1e_satd_128x128_hbd_avx2); out[BLOCK_4X8 as usize] = Some(rav1e_satd_4x8_hbd_avx2); out[BLOCK_8X4 as usize] = Some(rav1e_satd_8x4_hbd_avx2); out[BLOCK_8X16 as usize] = Some(rav1e_satd_8x16_hbd_avx2); out[BLOCK_16X8 as usize] = Some(rav1e_satd_16x8_hbd_avx2); out[BLOCK_16X32 as usize] = Some(rav1e_satd_16x32_hbd_avx2); out[BLOCK_32X16 as usize] = Some(rav1e_satd_32x16_hbd_avx2); out[BLOCK_32X64 as usize] = Some(rav1e_satd_32x64_hbd_avx2); out[BLOCK_64X32 as usize] = Some(rav1e_satd_64x32_hbd_avx2); out[BLOCK_64X128 as usize] = Some(rav1e_satd_64x128_hbd_avx2); out[BLOCK_128X64 as usize] = Some(rav1e_satd_128x64_hbd_avx2); out[BLOCK_4X16 as usize] = Some(rav1e_satd_4x16_hbd_avx2); out[BLOCK_16X4 as usize] = Some(rav1e_satd_16x4_hbd_avx2); out[BLOCK_8X32 as usize] = Some(rav1e_satd_8x32_hbd_avx2); out[BLOCK_32X8 as usize] = Some(rav1e_satd_32x8_hbd_avx2); out[BLOCK_16X64 as usize] = Some(rav1e_satd_16x64_hbd_avx2); out[BLOCK_64X16 as usize] = Some(rav1e_satd_64x16_hbd_avx2); out }; cpu_function_lookup_table!( SATD_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [AVX2] ); #[cfg(test)] mod test { use super::*; use crate::frame::{AsRegion, Plane}; use rand::random; use std::str::FromStr; macro_rules! test_dist_fns { ($(($W:expr, $H:expr)),*, $DIST_TY:ident, $BD:expr, $OPT:ident, $OPTLIT:tt) => { $( paste::item! 
{ #[test] fn []() { if CpuFeatureLevel::default() < CpuFeatureLevel::from_str($OPTLIT).unwrap() { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } if $BD > 8 { // dynamic allocation: test let mut src = Plane::from_slice(&[0u16; $W * $H], $W); // dynamic allocation: test let mut dst = Plane::from_slice(&[0u16; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::() as u16 * $BD / 8; *d = random::() as u16 * $BD / 8; } let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } else { // dynamic allocation: test let mut src = Plane::from_slice(&[0u8; $W * $H], $W); // dynamic allocation: test let mut dst = Plane::from_slice(&[0u8; $W * $H], $W); for (s, d) in src.data.iter_mut().zip(dst.data.iter_mut()) { *s = random::(); *d = random::(); } let result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::from_str($OPTLIT).unwrap()); let rust_result = [](&src.as_region(), &dst.as_region(), $W, $H, $BD, CpuFeatureLevel::RUST); assert_eq!(rust_result, result); } } } )* } } test_dist_fns!( (4, 4), (16, 16), (8, 8), (4, 8), (8, 4), (8, 16), (16, 8), (4, 16), (16, 4), (8, 32), (32, 8), (32, 32), (64, 64), (128, 128), (16, 32), (32, 16), (32, 64), (64, 32), (64, 128), (128, 64), (16, 64), (64, 16), sad, 10, ssse3, "ssse3" ); test_dist_fns!( (4, 4), (16, 16), (8, 8), (4, 8), (8, 4), (8, 16), (16, 8), (4, 16), (16, 4), (8, 32), (32, 8), (32, 32), (64, 64), (128, 128), (16, 32), (32, 16), (32, 64), (64, 32), (64, 128), (128, 64), (16, 64), (64, 16), sad, 10, avx2, "avx2" ); test_dist_fns!( (4, 4), (4, 8), (4, 16), (8, 4), (8, 8), (8, 16), (8, 32), (16, 16), (32, 32), (64, 64), (128, 128), sad, 8, sse2, "sse2" ); test_dist_fns!( (16, 4), (16, 8), (16, 16), (16, 32), (16, 64), (32, 8), (32, 16), (32, 32), (32, 64), (64, 16), (64, 32), (64, 64), (64, 128), (128, 64), (128, 128), sad, 8, avx2, "avx2" ); test_dist_fns!((8, 8), satd, 8, ssse3, "ssse3"); test_dist_fns!((4, 4), satd, 8, sse4, "sse4.1"); test_dist_fns!( (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (4, 8), (8, 4), (8, 16), (16, 8), (16, 32), (32, 16), (32, 64), (64, 32), (64, 128), (128, 64), (4, 16), (16, 4), (8, 32), (32, 8), (16, 64), (64, 16), satd, 8, avx2, "avx2" ); test_dist_fns!( (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (4, 8), (8, 4), (8, 16), (16, 8), (16, 32), (32, 16), (32, 64), (64, 32), (64, 128), (128, 64), (4, 16), (16, 4), (8, 32), (32, 8), (16, 64), (64, 16), satd, 10, avx2, "avx2" ); test_dist_fns!( (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128), (4, 8), (8, 4), (8, 16), (16, 8), (16, 32), (32, 16), (32, 64), (64, 32), (64, 128), (128, 64), (4, 16), (16, 4), (8, 32), (32, 8), (16, 64), (64, 16), satd, 12, avx2, "avx2" ); } rav1e-0.7.1/src/asm/x86/dist/sse.rs000064400000000000000000000155561046102023000147620ustar 00000000000000// Copyright (c) 2020-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. 
If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; use crate::dist::*; use crate::encoder::IMPORTANCE_BLOCK_SIZE; use crate::partition::BlockSize; use crate::rdo::DistortionScale; use crate::tiling::PlaneRegion; use crate::util::*; type WeightedSseFn = unsafe extern fn( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, scale: *const u32, scale_stride: isize, ) -> u64; type WeightedSseHBDFn = unsafe extern fn( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, scale: *const u32, scale_stride: isize, ) -> u64; macro_rules! declare_asm_sse_fn { ($($name: ident),+) => ( $( extern { fn $name ( src: *const u8, src_stride: isize, dst: *const u8, dst_stride: isize, scale: *const u32, scale_stride: isize ) -> u64; } )+ ) } macro_rules! declare_asm_hbd_sse_fn { ($($name: ident),+) => ( $( extern { fn $name ( src: *const u16, src_stride: isize, dst: *const u16, dst_stride: isize, scale: *const u32, scale_stride: isize ) -> u64; } )+ ) } declare_asm_sse_fn![ // SSSE3 rav1e_weighted_sse_4x4_ssse3, rav1e_weighted_sse_4x8_ssse3, rav1e_weighted_sse_4x16_ssse3, rav1e_weighted_sse_8x4_ssse3, rav1e_weighted_sse_8x8_ssse3, rav1e_weighted_sse_8x16_ssse3, rav1e_weighted_sse_8x32_ssse3, // AVX2 rav1e_weighted_sse_16x4_avx2, rav1e_weighted_sse_16x8_avx2, rav1e_weighted_sse_16x16_avx2, rav1e_weighted_sse_16x32_avx2, rav1e_weighted_sse_16x64_avx2, rav1e_weighted_sse_32x8_avx2, rav1e_weighted_sse_32x16_avx2, rav1e_weighted_sse_32x32_avx2, rav1e_weighted_sse_32x64_avx2, rav1e_weighted_sse_64x16_avx2, rav1e_weighted_sse_64x32_avx2, rav1e_weighted_sse_64x64_avx2, rav1e_weighted_sse_64x128_avx2, rav1e_weighted_sse_128x64_avx2, rav1e_weighted_sse_128x128_avx2 ]; declare_asm_hbd_sse_fn![ // SSE2 rav1e_weighted_sse_4x4_hbd_sse2 ]; /// # Panics /// /// - If in `check_asm` mode, panics on mismatch between native and ASM results. #[inline(always)] #[allow(clippy::let_and_return)] pub fn get_weighted_sse( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, scale: &[u32], scale_stride: usize, w: usize, h: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) -> u64 { // Assembly breaks if imp block size changes. assert_eq!(IMPORTANCE_BLOCK_SIZE >> 1, 4); let bsize_opt = BlockSize::from_width_and_height_opt(w, h); let call_rust = || -> u64 { rust::get_weighted_sse(dst, src, scale, scale_stride, w, h, bit_depth, cpu) }; #[cfg(feature = "check_asm")] let ref_dist = call_rust(); #[inline] const fn size_of_element(_: &[T]) -> usize { std::mem::size_of::() } let den = DistortionScale::new(1, 1 << rust::GET_WEIGHTED_SSE_SHIFT).0 as u64; let dist = match (bsize_opt, T::type_enum()) { (Err(_), _) => call_rust(), (Ok(bsize), PixelType::U8) => { match SSE_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. Some(func) => unsafe { ((func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), scale.as_ptr(), (scale_stride * size_of_element(scale)) as isize, ) + (den >> 1)) / den }, None => call_rust(), } } (Ok(bsize), PixelType::U16) => { match SSE_HBD_FNS[cpu.as_index()][to_index(bsize)] { // SAFETY: Calls Assembly code. 
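// The dispatch arms here normalise the raw assembly sum by `den` with
// round-to-nearest integer division: add half of the denominator, then
// divide. A tiny stand-alone version of that rounding step (the name
// `div_round` is illustrative, not rav1e's):

const fn div_round(value: u64, den: u64) -> u64 {
  (value + (den >> 1)) / den
}

#[cfg(test)]
#[test]
fn div_round_rounds_to_nearest() {
  assert_eq!(div_round(7, 16), 0);  // 7/16 rounds down
  assert_eq!(div_round(8, 16), 1);  // 8/16 rounds up
  assert_eq!(div_round(24, 16), 2); // 24/16 = 1.5 rounds up
}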
Some(func) => unsafe { ((func)( src.data_ptr() as *const _, T::to_asm_stride(src.plane_cfg.stride), dst.data_ptr() as *const _, T::to_asm_stride(dst.plane_cfg.stride), scale.as_ptr(), (scale_stride * size_of_element(scale)) as isize, ) + (den >> 1)) / den }, None => call_rust(), } } }; #[cfg(feature = "check_asm")] assert_eq!( dist, ref_dist, "Weighted SSE {:?}: Assembly doesn't match reference code.", bsize_opt ); dist } static SSE_FNS_SSSE3: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_weighted_sse_4x4_ssse3); out[BLOCK_4X8 as usize] = Some(rav1e_weighted_sse_4x8_ssse3); out[BLOCK_4X16 as usize] = Some(rav1e_weighted_sse_4x16_ssse3); out[BLOCK_8X4 as usize] = Some(rav1e_weighted_sse_8x4_ssse3); out[BLOCK_8X8 as usize] = Some(rav1e_weighted_sse_8x8_ssse3); out[BLOCK_8X16 as usize] = Some(rav1e_weighted_sse_8x16_ssse3); out[BLOCK_8X32 as usize] = Some(rav1e_weighted_sse_8x32_ssse3); out }; static SSE_FNS_AVX2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = SSE_FNS_SSSE3; use BlockSize::*; out[BLOCK_16X4 as usize] = Some(rav1e_weighted_sse_16x4_avx2); out[BLOCK_16X8 as usize] = Some(rav1e_weighted_sse_16x8_avx2); out[BLOCK_16X16 as usize] = Some(rav1e_weighted_sse_16x16_avx2); out[BLOCK_16X32 as usize] = Some(rav1e_weighted_sse_16x32_avx2); out[BLOCK_16X64 as usize] = Some(rav1e_weighted_sse_16x64_avx2); out[BLOCK_32X8 as usize] = Some(rav1e_weighted_sse_32x8_avx2); out[BLOCK_32X16 as usize] = Some(rav1e_weighted_sse_32x16_avx2); out[BLOCK_32X32 as usize] = Some(rav1e_weighted_sse_32x32_avx2); out[BLOCK_32X64 as usize] = Some(rav1e_weighted_sse_32x64_avx2); out[BLOCK_64X16 as usize] = Some(rav1e_weighted_sse_64x16_avx2); out[BLOCK_64X32 as usize] = Some(rav1e_weighted_sse_64x32_avx2); out[BLOCK_64X64 as usize] = Some(rav1e_weighted_sse_64x64_avx2); out[BLOCK_64X128 as usize] = Some(rav1e_weighted_sse_64x128_avx2); out[BLOCK_128X64 as usize] = Some(rav1e_weighted_sse_128x64_avx2); out[BLOCK_128X128 as usize] = Some(rav1e_weighted_sse_128x128_avx2); out }; static SSE_HBD_FNS_SSE2: [Option; DIST_FNS_LENGTH] = { let mut out: [Option; DIST_FNS_LENGTH] = [None; DIST_FNS_LENGTH]; use BlockSize::*; out[BLOCK_4X4 as usize] = Some(rav1e_weighted_sse_4x4_hbd_sse2); out }; cpu_function_lookup_table!( SSE_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [SSSE3, AVX2] ); cpu_function_lookup_table!( SSE_HBD_FNS: [[Option; DIST_FNS_LENGTH]], default: [None; DIST_FNS_LENGTH], [SSE2] ); rav1e-0.7.1/src/asm/x86/ec.rs000064400000000000000000000076311046102023000136070ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::ec::rust; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[inline(always)] pub fn update_cdf(cdf: &mut [u16; N], val: u32) { if cdf.len() == 4 { // SAFETY: Calls Assembly code, which is only valid when the length of // `cdf` is 4. 
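// Under the `check_asm` feature these wrappers first compute the portable
// Rust result and then assert that the accelerated path matches it exactly.
// A minimal sketch of that verification pattern, assuming the same
// `check_asm` cfg flag; `slow_sum`/`checked_sum` are illustrative names:

fn slow_sum(v: &[u32]) -> u64 {
  v.iter().map(|&x| u64::from(x)).sum()
}

fn checked_sum(v: &[u32]) -> u64 {
  #[cfg(feature = "check_asm")]
  let reference = slow_sum(v);

  // Stand-in for the optimised (e.g. SIMD or assembly) implementation.
  let result = slow_sum(v);

  #[cfg(feature = "check_asm")]
  assert_eq!(result, reference, "optimised path doesn't match the reference");

  result
}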
return unsafe { update_cdf_4_sse2(cdf, val); }; } rust::update_cdf(cdf, val); } #[target_feature(enable = "sse2")] #[inline] unsafe fn update_cdf_4_sse2(cdf: &mut [u16], val: u32) { let nsymbs = 4; let rate = 5 + (cdf[nsymbs - 1] >> 4) as usize; let count = cdf[nsymbs - 1] + (cdf[nsymbs - 1] < 32) as u16; // A bit of explanation of what is happening down here. First of all, let's look at the simple // implementation: // // ``` // if i as u32 >= val { // *v -= *v >> rate; // } else { // *v += (32768 - *v) >> rate; // } // ``` // // We want to perform the same arithmetic operation in the two branches, therefore we can // transform in something like: // // ``` // if i as u32 >= val { // *v -= *v >> rate; // } else { // *v -= -((32768 - *v) >> rate); // } // ``` // // It is possible to bring the "minus" for the second branch logically before the right shift // using the following rule: // -(x >> y) = (-1 - x) >> y + 1 // So we obtain // // ``` // if i as u32 >= val { // *v -= *v >> rate; // } else { // *v -= (-1 - (32768 - *v)) >> rate + 1; // } // ``` // // Good. A range `0..4` can be compared against `val` in order to have a starting point to work // in different ways on the two branches. `cmplt` returns `-1` if `lhs < rhs`, 0 otherwise. // It is possible to use the `avg` SIMD operator, which performs `(lhs + rhs + 1) >> 1`. This is // useful because `avg` treats numbers as unsigned, `-1 = 0xFFFF`, therefore `(0xFFFF + 0 + 1) >> // 1 = 0x8000 = 32768`. Obviously `(0 + 0 + 1) >> 1 = 0`. // // Now the result of `cmplt` can be used along with the result from `avg` and the data in `cdf` // in order to obtain the right hand side of the subtraction from `cdf`. let val_splat = _mm_set1_epi16(val as i16); let indices = _mm_set_epi16(0, 0, 0, 0, 3, 2, 1, 0); let index_lt_val = _mm_cmplt_epi16(indices, val_splat); let k = _mm_avg_epu16(index_lt_val, _mm_setzero_si128()); let cdf_simd = _mm_loadl_epi64(cdf.as_mut_ptr() as *const __m128i); let k_minus_v = _mm_sub_epi16(k, cdf_simd); let negated_if_lt_val = _mm_sub_epi16(index_lt_val, k_minus_v); let shifted = _mm_sra_epi16(negated_if_lt_val, _mm_set_epi32(0, 0, 0, rate as i32)); let fixed_if_lt_val = _mm_sub_epi16(shifted, index_lt_val); let result = _mm_sub_epi16(cdf_simd, fixed_if_lt_val); _mm_storel_epi64(cdf.as_mut_ptr() as *mut __m128i, result); cdf[nsymbs - 1] = count; } #[cfg(test)] mod test { use crate::ec::rust; #[test] fn update_cdf_4_sse2() { let mut cdf = [7296, 3819, 1616, 0]; let mut cdf2 = [7296, 3819, 1616, 0]; for i in 0..4 { rust::update_cdf(&mut cdf, i); // SAFETY: We are only testing on cdfs of size 4 unsafe { super::update_cdf_4_sse2(&mut cdf2, i); } assert_eq!(cdf, cdf2); } let mut cdf = [7297, 3820, 1617, 0]; let mut cdf2 = cdf; for i in 0..4 { rust::update_cdf(&mut cdf, i); // SAFETY: We are only testing on cdfs of size 4 unsafe { super::update_cdf_4_sse2(&mut cdf2, i); } assert_eq!(cdf, cdf2); } } } rav1e-0.7.1/src/asm/x86/lrf.rs000064400000000000000000000426631046102023000140070ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
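// The branch-free SSE2 CDF update above relies on the shift identity quoted
// in its comment; with the parenthesisation made explicit it reads, for
// non-negative x and an arithmetic right shift:
//   -(x >> r) == ((-1 - x) >> r) + 1
// A scalar check of that identity over the 15-bit range the CDF values use:

#[cfg(test)]
#[test]
fn negated_shift_identity() {
  for x in 0i32..=32768 {
    for r in 1..=9 {
      assert_eq!(-(x >> r), ((-1 - x) >> r) + 1);
    }
  }
}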
use crate::cpu_features::CpuFeatureLevel; use crate::frame::PlaneSlice; use crate::lrf::*; use crate::util::Pixel; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use std::mem; // computes an intermediate (ab) row for stripe_w + 2 columns at row y #[inline] pub fn sgrproj_box_ab_r1( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, cpu: CpuFeatureLevel, ) { // only use 8-bit AVX2 assembly when bitdepth equals 8 if cpu >= CpuFeatureLevel::AVX2 && BD == 8 { // SAFETY: Calls Assembly code. return unsafe { sgrproj_box_ab_r1_avx2::( af, bf, iimg, iimg_sq, iimg_stride, y, stripe_w, s, ); }; } rust::sgrproj_box_ab_r1::( af, bf, iimg, iimg_sq, iimg_stride, y, stripe_w, s, cpu, ); } // computes an intermediate (ab) row for stripe_w + 2 columns at row y #[inline] pub fn sgrproj_box_ab_r2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, cpu: CpuFeatureLevel, ) { // only use 8-bit AVX2 assembly when bitdepth equals 8 if cpu >= CpuFeatureLevel::AVX2 && BD == 8 { // SAFETY: Calls Assembly code. return unsafe { sgrproj_box_ab_r2_avx2::( af, bf, iimg, iimg_sq, iimg_stride, y, stripe_w, s, ); }; } rust::sgrproj_box_ab_r2::( af, bf, iimg, iimg_sq, iimg_stride, y, stripe_w, s, cpu, ); } #[inline] pub fn sgrproj_box_f_r0( f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { // SAFETY: Calls Assembly code. return unsafe { sgrproj_box_f_r0_avx2(f, y, w, cdeffed); }; } rust::sgrproj_box_f_r0(f, y, w, cdeffed, cpu); } #[inline] pub fn sgrproj_box_f_r1( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { // SAFETY: Calls Assembly code. return unsafe { sgrproj_box_f_r1_avx2(af, bf, f, y, w, cdeffed); }; } rust::sgrproj_box_f_r1(af, bf, f, y, w, cdeffed, cpu); } #[inline] pub fn sgrproj_box_f_r2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, cpu: CpuFeatureLevel, ) { if cpu >= CpuFeatureLevel::AVX2 { // SAFETY: Calls Assembly code. return unsafe { sgrproj_box_f_r2_avx2(af, bf, f0, f1, y, w, cdeffed); }; } rust::sgrproj_box_f_r2(af, bf, f0, f1, y, w, cdeffed, cpu); } static X_BY_XPLUS1: [u32; 256] = [ // Special case: Map 0 -> 1 (corresponding to a value of 1/256) // instead of 0. 
See comments in selfguided_restoration_internal() for why 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 256, ]; #[inline] #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_ab_8_avx2( r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, x: usize, y: usize, s: u32, ) { let bdm8 = BD - 8; let d: usize = r * 2 + 1; let n: i32 = (d * d) as i32; let one_over_n = if r == 1 { 455 } else { 164 }; // Using an integral image, compute the sum of a square region #[inline] #[target_feature(enable = "avx2")] unsafe fn get_integral_square_avx2( iimg: &[u32], stride: usize, x: usize, y: usize, size: usize, ) -> __m256i { let iimg = iimg.as_ptr().add(y * stride + x); // Cancel out overflow in iimg by using wrapping arithmetic _mm256_sub_epi32( _mm256_add_epi32( _mm256_loadu_si256(iimg as *const _), _mm256_loadu_si256(iimg.add(size * stride + size) as *const _), ), _mm256_add_epi32( _mm256_loadu_si256(iimg.add(size * stride) as *const _), _mm256_loadu_si256(iimg.add(size) as *const _), ), ) } let sum = get_integral_square_avx2(iimg, iimg_stride, x, y, d); let ssq = get_integral_square_avx2(iimg_sq, iimg_stride, x, y, d); let scaled_sum = _mm256_srlv_epi32( _mm256_add_epi32(sum, _mm256_set1_epi32(1 << bdm8 as i32 >> 1)), _mm256_set1_epi32(bdm8 as i32), ); let scaled_ssq = _mm256_srlv_epi32( _mm256_add_epi32(ssq, _mm256_set1_epi32(1 << (2 * bdm8) as i32 >> 1)), _mm256_set1_epi32(2 * bdm8 as i32), ); let p = _mm256_max_epi32( _mm256_setzero_si256(), _mm256_sub_epi32( _mm256_mullo_epi32(scaled_ssq, _mm256_set1_epi32(n)), _mm256_madd_epi16(scaled_sum, scaled_sum), ), ); let z = _mm256_srli_epi32( _mm256_add_epi32( _mm256_mullo_epi32(p, _mm256_set1_epi32(s as i32)), _mm256_set1_epi32(1 << SGRPROJ_MTABLE_BITS as i32 >> 1), ), SGRPROJ_MTABLE_BITS as i32, ); let a = _mm256_i32gather_epi32( X_BY_XPLUS1.as_ptr() as *const _, _mm256_min_epi32(z, _mm256_set1_epi32(255)), 4, ); let b = _mm256_mullo_epi32( _mm256_madd_epi16( _mm256_sub_epi32(_mm256_set1_epi32(1 << SGRPROJ_SGR_BITS as i32), a), sum, ), _mm256_set1_epi32(one_over_n), ); let b = _mm256_srlv_epi32( _mm256_add_epi32( b, _mm256_set1_epi32(1 << SGRPROJ_RECIP_BITS as i32 >> 1), ), _mm256_set1_epi32(SGRPROJ_RECIP_BITS as i32), ); _mm256_storeu_si256(af.as_mut_ptr().add(x) as *mut _, a); 
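// `get_integral_square_avx2` above reads the sum of a d×d box out of an
// integral image with just four lookups, and uses wrapping arithmetic so
// that overflow in the running sums cancels as long as the true box sum
// fits in a `u32`. A scalar sketch of the same lookup (row-major integral
// image with row stride `stride`; the name `box_sum` is illustrative):

fn box_sum(iimg: &[u32], stride: usize, x: usize, y: usize, size: usize) -> u32 {
  let a = iimg[y * stride + x];                 // above-left of the box
  let b = iimg[y * stride + x + size];          // above-right
  let c = iimg[(y + size) * stride + x];        // below-left
  let d = iimg[(y + size) * stride + x + size]; // below-right
  // Same add/sub structure as the AVX2 version: (a + d) - (b + c).
  a.wrapping_add(d).wrapping_sub(b.wrapping_add(c))
}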
_mm256_storeu_si256(bf.as_mut_ptr().add(x) as *mut _, b); } #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, ) { for x in (0..stripe_w + 2).step_by(8) { if x + 8 <= stripe_w + 2 { sgrproj_box_ab_8_avx2::( 1, af, bf, iimg, iimg_sq, iimg_stride, x, y, s, ); } else { // finish using scalar rust::sgrproj_box_ab_internal::( 1, af, bf, iimg, iimg_sq, iimg_stride, x, y, stripe_w, s, ); } } #[cfg(feature = "check_asm")] { let mut af_ref: Vec = vec![0; stripe_w + 2]; let mut bf_ref: Vec = vec![0; stripe_w + 2]; rust::sgrproj_box_ab_internal::( 1, &mut af_ref, &mut bf_ref, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, ); assert_eq!(&af[..stripe_w + 2], &af_ref[..]); assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]); } } #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, ) { for x in (0..stripe_w + 2).step_by(8) { if x + 8 <= stripe_w + 2 { sgrproj_box_ab_8_avx2::( 2, af, bf, iimg, iimg_sq, iimg_stride, x, y, s, ); } else { // finish using scalar rust::sgrproj_box_ab_internal::( 2, af, bf, iimg, iimg_sq, iimg_stride, x, y, stripe_w, s, ); } } #[cfg(feature = "check_asm")] { let mut af_ref: Vec = vec![0; stripe_w + 2]; let mut bf_ref: Vec = vec![0; stripe_w + 2]; rust::sgrproj_box_ab_internal::( 2, &mut af_ref, &mut bf_ref, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, ); assert_eq!(&af[..stripe_w + 2], &af_ref[..]); assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]); } } #[inline] #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r0_8_avx2( f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, ) { _mm256_storeu_si256( f.as_mut_ptr().add(x) as *mut _, _mm256_slli_epi32( if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( cdeffed.subslice(x, y).as_ptr() as *const _ )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( cdeffed.subslice(x, y).as_ptr() as *const _ )) }, SGRPROJ_RST_BITS as i32, ), ); } #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r0_avx2( f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, ) { for x in (0..w).step_by(8) { if x + 8 <= w { sgrproj_box_f_r0_8_avx2(f, x, y, cdeffed); } else { // finish using scalar rust::sgrproj_box_f_r0_internal(f, x, y, w, cdeffed); } } #[cfg(feature = "check_asm")] { let mut f_ref: Vec = vec![0; w]; rust::sgrproj_box_f_r0_internal(&mut f_ref, 0, y, w, cdeffed); assert_eq!(&f[..w], &f_ref[..]); } } #[inline] #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r1_8_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, ) { let three = _mm256_set1_epi32(3); let four = _mm256_set1_epi32(4); let a0 = af[0].as_ptr(); let a1 = af[1].as_ptr(); let a2 = af[2].as_ptr(); let b0 = bf[0].as_ptr(); let b1 = bf[1].as_ptr(); let b2 = bf[2].as_ptr(); let a = _mm256_add_epi32( _mm256_madd_epi16( _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(a0.add(x) as *const _), _mm256_loadu_si256(a0.add(x + 2) as *const _), ), _mm256_add_epi32( _mm256_loadu_si256(a2.add(x) as *const _), _mm256_loadu_si256(a2.add(x + 2) as *const _), ), ), three, ), _mm256_madd_epi16( _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(a1.add(x) as *const _), _mm256_loadu_si256(a0.add(x + 1) as *const _), ), _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(a1.add(x + 1) as *const _), 
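// The sgrproj row kernels above all share the same loop shape: process eight
// 32-bit lanes per iteration and hand anything that no longer fills a full
// vector to the portable `rust::*_internal` helper. A small illustrative
// sketch of that "wide main loop, scalar tail" structure (names are
// placeholders, not rav1e's):

fn run_in_chunks(
  n: usize,
  mut process8: impl FnMut(usize),
  mut process_tail: impl FnMut(usize, usize),
) {
  for x in (0..n).step_by(8) {
    if x + 8 <= n {
      process8(x);        // full 8-wide vector iteration starting at x
    } else {
      process_tail(x, n); // scalar finish for the final n - x elements
    }
  }
}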
_mm256_loadu_si256(a2.add(x + 1) as *const _), ), _mm256_loadu_si256(a1.add(x + 2) as *const _), ), ), four, ), ); let b = _mm256_add_epi32( _mm256_mullo_epi32( _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(b0.add(x) as *const _), _mm256_loadu_si256(b0.add(x + 2) as *const _), ), _mm256_add_epi32( _mm256_loadu_si256(b2.add(x) as *const _), _mm256_loadu_si256(b2.add(x + 2) as *const _), ), ), three, ), _mm256_mullo_epi32( _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(b1.add(x) as *const _), _mm256_loadu_si256(b0.add(x + 1) as *const _), ), _mm256_add_epi32( _mm256_add_epi32( _mm256_loadu_si256(b1.add(x + 1) as *const _), _mm256_loadu_si256(b2.add(x + 1) as *const _), ), _mm256_loadu_si256(b1.add(x + 2) as *const _), ), ), four, ), ); let v = _mm256_add_epi32( _mm256_madd_epi16( a, if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( cdeffed.subslice(x, y).as_ptr() as *const _ )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( cdeffed.subslice(x, y).as_ptr() as *const _ )) }, ), b, ); const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32; _mm256_storeu_si256( f.as_mut_ptr().add(x) as *mut _, _mm256_srli_epi32( _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)), SHIFT, ), ); } #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r1_avx2( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, ) { for x in (0..w).step_by(8) { if x + 8 <= w { sgrproj_box_f_r1_8_avx2(af, bf, f, x, y, cdeffed); } else { // finish using scalar rust::sgrproj_box_f_r1_internal(af, bf, f, x, y, w, cdeffed); } } #[cfg(feature = "check_asm")] { let mut f_ref: Vec = vec![0; w]; rust::sgrproj_box_f_r1_internal(af, bf, &mut f_ref, 0, y, w, cdeffed); assert_eq!(&f[..w], &f_ref[..]); } } #[inline] #[target_feature(enable = "avx2")] unsafe fn sgrproj_box_f_r2_8_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], x: usize, y: usize, cdeffed: &PlaneSlice, ) { let five = _mm256_set1_epi32(5); let six = _mm256_set1_epi32(6); let a0 = af[0].as_ptr(); let a2 = af[1].as_ptr(); let b0 = bf[0].as_ptr(); let b2 = bf[1].as_ptr(); let a = _mm256_add_epi32( _mm256_madd_epi16( _mm256_add_epi32( _mm256_loadu_si256(a0.add(x) as *const _), _mm256_loadu_si256(a0.add(x + 2) as *const _), ), five, ), _mm256_madd_epi16(_mm256_loadu_si256(a0.add(x + 1) as *const _), six), ); let b = _mm256_add_epi32( _mm256_mullo_epi32( _mm256_add_epi32( _mm256_loadu_si256(b0.add(x) as *const _), _mm256_loadu_si256(b0.add(x + 2) as *const _), ), five, ), _mm256_mullo_epi32(_mm256_loadu_si256(b0.add(x + 1) as *const _), six), ); let ao = _mm256_add_epi32( _mm256_madd_epi16( _mm256_add_epi32( _mm256_loadu_si256(a2.add(x) as *const _), _mm256_loadu_si256(a2.add(x + 2) as *const _), ), five, ), _mm256_madd_epi16(_mm256_loadu_si256(a2.add(x + 1) as *const _), six), ); let bo = _mm256_add_epi32( _mm256_mullo_epi32( _mm256_add_epi32( _mm256_loadu_si256(b2.add(x) as *const _), _mm256_loadu_si256(b2.add(x + 2) as *const _), ), five, ), _mm256_mullo_epi32(_mm256_loadu_si256(b2.add(x + 1) as *const _), six), ); let v = _mm256_add_epi32( _mm256_madd_epi16( _mm256_add_epi32(a, ao), if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( cdeffed.subslice(x, y).as_ptr() as *const _ )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( cdeffed.subslice(x, y).as_ptr() as *const _ )) }, ), _mm256_add_epi32(b, bo), ); let vo = _mm256_add_epi32( _mm256_madd_epi16( ao, if mem::size_of::() == 1 { _mm256_cvtepu8_epi32(_mm_loadl_epi64( cdeffed.subslice(x, y 
+ 1).as_ptr() as *const _, )) } else { _mm256_cvtepu16_epi32(_mm_loadu_si128( cdeffed.subslice(x, y + 1).as_ptr() as *const _, )) }, ), bo, ); const SHIFT: i32 = (5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32; _mm256_storeu_si256( f0.as_mut_ptr().add(x) as *mut _, _mm256_srli_epi32( _mm256_add_epi32(v, _mm256_set1_epi32(1 << SHIFT >> 1)), SHIFT, ), ); const SHIFTO: i32 = (4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS) as i32; _mm256_storeu_si256( f1.as_mut_ptr().add(x) as *mut _, _mm256_srli_epi32( _mm256_add_epi32(vo, _mm256_set1_epi32(1 << SHIFTO >> 1)), SHIFTO, ), ); } #[target_feature(enable = "avx2")] pub(crate) unsafe fn sgrproj_box_f_r2_avx2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, ) { for x in (0..w).step_by(8) { if x + 8 <= w { sgrproj_box_f_r2_8_avx2(af, bf, f0, f1, x, y, cdeffed); } else { // finish using scalar rust::sgrproj_box_f_r2_internal(af, bf, f0, f1, x, y, w, cdeffed); } } #[cfg(feature = "check_asm")] { let mut f0_ref: Vec = vec![0; w]; let mut f1_ref: Vec = vec![0; w]; rust::sgrproj_box_f_r2_internal( af, bf, &mut f0_ref, &mut f1_ref, 0, y, w, cdeffed, ); assert_eq!(&f0[..w], &f0_ref[..]); assert_eq!(&f1[..w], &f1_ref[..]); } } rav1e-0.7.1/src/asm/x86/mc.rs000064400000000000000000000633471046102023000136250ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
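// The filter outputs in the kernels above are normalised with the usual
// rounding right shift: add half of the final divisor (`1 << SHIFT >> 1`)
// before shifting. A tiny stand-alone version of that idiom (the name
// `round_shift` is illustrative):

const fn round_shift(v: u32, shift: u32) -> u32 {
  (v + ((1u32 << shift) >> 1)) >> shift
}

// e.g. round_shift(15, 4) == 1 and round_shift(24, 4) == 2.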
use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::mc::FilterMode::*; use crate::mc::*; use crate::tiling::*; use crate::util::*; type PutFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PutHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); type PrepFn = unsafe extern fn( tmp: *mut i16, src: *const u8, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, ); type PrepHBDFn = unsafe extern fn( tmp: *mut i16, src: *const u16, src_stride: isize, width: i32, height: i32, col_frac: i32, row_frac: i32, bitdepth_max: i32, ); type AvgFn = unsafe extern fn( dst: *mut u8, dst_stride: isize, tmp1: *const i16, tmp2: *const i16, width: i32, height: i32, ); type AvgHBDFn = unsafe extern fn( dst: *mut u16, dst_stride: isize, tmp1: *const i16, tmp2: *const i16, width: i32, height: i32, bitdepth_max: i32, ); // gets an index that can be mapped to a function for a pair of filter modes #[inline] const fn get_2d_mode_idx(mode_x: FilterMode, mode_y: FilterMode) -> usize { (mode_x as usize + 4 * (mode_y as usize)) & 15 } /// # Panics /// /// - If `width` is not a power of 2 /// - If `width` is not between 2 and 128 /// - If `height` is odd /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// - If `width` and `height` do not fit within the bounds of `src` #[inline(always)] pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { rust::put_8tap( dst, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, cpu, ); }; #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; // SAFETY: The assembly only supports even heights and valid uncropped // widths unsafe { assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height + 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => { match PUT_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(dst), } } PixelType::U16 => { match PUT_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(dst), } } } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } /// # Panics /// /// - If `width` is not a power of 2 /// - If `width` is not between 2 and 128 /// - If `height` is odd /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// 
- If `width` and `height` do not fit within the bounds of `src` #[inline(always)] pub fn prep_8tap( tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |tmp: &mut [i16]| { rust::prep_8tap( tmp, src, width, height, col_frac, row_frac, mode_x, mode_y, bit_depth, cpu, ); }; #[cfg(feature = "check_asm")] let ref_tmp = { let mut copy = vec![0; width * height]; copy[..].copy_from_slice(&tmp[..width * height]); call_rust(&mut copy); copy }; // SAFETY: The assembly only supports even heights and valid uncropped // widths unsafe { assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check length of tmp assert!(tmp.len() >= width * height); // SAFETY: Check bounds of src assert!(src.accessible(width + 4, height + 4)); assert!(src.accessible_neg(3, 3)); match T::type_enum() { PixelType::U8 => { match PREP_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( tmp.as_mut_ptr(), src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, ), None => call_rust(tmp), } } PixelType::U16 if bit_depth > 8 => { match PREP_HBD_FNS[cpu.as_index()][get_2d_mode_idx(mode_x, mode_y)] { Some(func) => (func)( tmp.as_mut_ptr() as *mut _, src.as_ptr() as *const _, T::to_asm_stride(src.plane.cfg.stride), width as i32, height as i32, col_frac, row_frac, (1 << bit_depth) - 1, ), None => call_rust(tmp), } } _ => call_rust(tmp), } } #[cfg(feature = "check_asm")] { assert_eq!(&tmp[..width * height], &ref_tmp[..]); } } /// # Panics /// /// - If `width` is not a power of 2 /// - If `width` is not between 2 and 128 /// - If `width * height` is greater than the length of `tmp1` or `tmp2` /// - If `width` and `height` do not fit within the bounds of `dst` pub fn mc_avg( dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, height: usize, bit_depth: usize, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { rust::mc_avg(dst, tmp1, tmp2, width, height, bit_depth, cpu); }; #[cfg(feature = "check_asm")] let ref_dst = { let mut copy = dst.scratch_copy(); call_rust(&mut copy.as_region_mut()); copy }; // SAFETY: The assembly only supports even heights and valid uncropped // widths unsafe { assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); // SAFETY: Check bounds of dst assert!(dst.rect().width >= width && dst.rect().height >= height); // SAFETY: Check length of tmp1 & tmp2 assert!(tmp1.len() >= width * height); assert!(tmp2.len() >= width * height); match T::type_enum() { PixelType::U8 => match AVG_FNS[cpu.as_index()] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), tmp1.as_ptr(), tmp2.as_ptr(), width as i32, height as i32, ), None => call_rust(dst), }, PixelType::U16 if bit_depth > 8 => match AVG_HBD_FNS[cpu.as_index()] { Some(func) => (func)( dst.data_ptr_mut() as *mut _, T::to_asm_stride(dst.plane_cfg.stride), tmp1.as_ptr(), tmp2.as_ptr(), width as i32, height as i32, (1 << bit_depth) - 1, ), None => call_rust(dst), }, _ => call_rust(dst), } } #[cfg(feature = "check_asm")] { for (dst_row, ref_row) in dst.rows_iter().zip(ref_dst.as_region().rows_iter()) { for (dst, reference) in dst_row.iter().zip(ref_row) { assert_eq!(*dst, *reference); } } } } macro_rules! 
decl_mc_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { paste::item! { extern { $( fn [<$func_name _ssse3>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx2>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx512icl>]( dst: *mut u8, dst_stride: isize, src: *const u8, src_stride: isize, w: i32, h: i32, mx: i32, my: i32 ); )* } static PUT_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PUT_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; static PUT_FNS_AVX512ICL: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx512icl>]); )* out }; } } } decl_mc_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular_8bpc), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth_8bpc), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp_8bpc), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular_8bpc), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth_8bpc), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp_8bpc), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular_8bpc), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth_8bpc), (SHARP, SHARP, rav1e_put_8tap_sharp_8bpc), (BILINEAR, BILINEAR, rav1e_put_bilin_8bpc) ); cpu_function_lookup_table!( PUT_FNS: [[Option; 16]], default: [None; 16], [SSSE3, AVX2, AVX512ICL] ); macro_rules! decl_mc_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { paste::item! { extern { $( fn [<$func_name _ssse3>]( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); fn [<$func_name _avx2>]( dst: *mut u16, dst_stride: isize, src: *const u16, src_stride: isize, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PUT_HBD_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PUT_HBD_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; } } } decl_mc_hbd_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular_16bpc), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth_16bpc), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp_16bpc), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular_16bpc), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth_16bpc), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp_16bpc), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular_16bpc), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth_16bpc), (SHARP, SHARP, rav1e_put_8tap_sharp_16bpc), (BILINEAR, BILINEAR, rav1e_put_bilin_16bpc) ); cpu_function_lookup_table!( PUT_HBD_FNS: [[Option; 16]], default: [None; 16], [SSSE3, AVX2] ); macro_rules! decl_mct_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { paste::item! 
{ extern { $( fn [<$func_name _sse2>]( tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _ssse3>]( tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx2>]( tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32 ); fn [<$func_name _avx512icl>]( tmp: *mut i16, src: *const u8, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32 ); )* } static PREP_FNS_SSE2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _sse2>]); )* out }; static PREP_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PREP_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; static PREP_FNS_AVX512ICL: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx512icl>]); )* out }; } } } decl_mct_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular_8bpc), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth_8bpc), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp_8bpc), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular_8bpc), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth_8bpc), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp_8bpc), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular_8bpc), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth_8bpc), (SHARP, SHARP, rav1e_prep_8tap_sharp_8bpc), (BILINEAR, BILINEAR, rav1e_prep_bilin_8bpc) ); cpu_function_lookup_table!( PREP_FNS: [[Option; 16]], default: [None; 16], [SSE2, SSSE3, AVX2, AVX512ICL] ); macro_rules! decl_mct_hbd_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),+) => { paste::item! 
{ extern { $( fn [<$func_name _ssse3>]( tmp: *mut i16, src: *const u16, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); fn [<$func_name _avx2>]( tmp: *mut i16, src: *const u16, src_stride: libc::ptrdiff_t, w: i32, h: i32, mx: i32, my: i32, bitdepth_max: i32, ); )* } static PREP_HBD_FNS_SSSE3: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _ssse3>]); )* out }; static PREP_HBD_FNS_AVX2: [Option; 16] = { let mut out: [Option; 16] = [None; 16]; $( out[get_2d_mode_idx($mode_x, $mode_y)] = Some([<$func_name _avx2>]); )* out }; } } } decl_mct_hbd_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular_16bpc), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth_16bpc), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp_16bpc), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular_16bpc), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth_16bpc), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp_16bpc), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular_16bpc), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth_16bpc), (SHARP, SHARP, rav1e_prep_8tap_sharp_16bpc), (BILINEAR, BILINEAR, rav1e_prep_bilin_16bpc) ); cpu_function_lookup_table!( PREP_HBD_FNS: [[Option; 16]], default: [None; 16], [SSSE3, AVX2] ); extern { fn rav1e_avg_8bpc_ssse3( dst: *mut u8, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, ); fn rav1e_avg_8bpc_avx2( dst: *mut u8, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, ); fn rav1e_avg_8bpc_avx512icl( dst: *mut u8, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, ); fn rav1e_avg_16bpc_ssse3( dst: *mut u16, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, bitdepth_max: i32, ); fn rav1e_avg_16bpc_avx2( dst: *mut u16, dst_stride: libc::ptrdiff_t, tmp1: *const i16, tmp2: *const i16, w: i32, h: i32, bitdepth_max: i32, ); } cpu_function_lookup_table!( AVG_FNS: [Option], default: None, [ (SSSE3, Some(rav1e_avg_8bpc_ssse3)), (AVX2, Some(rav1e_avg_8bpc_avx2)), (AVX512ICL, Some(rav1e_avg_8bpc_avx512icl)) ] ); cpu_function_lookup_table!( AVG_HBD_FNS: [Option], default: None, [(SSSE3, Some(rav1e_avg_16bpc_ssse3)), (AVX2, Some(rav1e_avg_16bpc_avx2))] ); #[cfg(test)] mod test { use super::*; use rand::random; use std::str::FromStr; macro_rules! test_put_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => { $( paste::item! 
{ #[test] fn []() { if CpuFeatureLevel::default() < CpuFeatureLevel::from_str($OPTLIT).unwrap() { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } let test_mvs = [MotionVector { row: 0, col: 0 }, MotionVector { row: 4, col: 0 }, MotionVector { row: 0, col: 4 }, MotionVector { row: 4, col: 4 }]; if $BD > 8 { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u16; 64 * 64], 64); for s in src.data.iter_mut() { *s = random::() as u16 * $BD / 8; } // dynamic allocation: test let mut dst1 = Plane::from_slice(&vec![0u16; 64 * 64], 64); // dynamic allocation: test let mut dst2 = Plane::from_slice(&vec![0u16; 64 * 64], 64); for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); assert_eq!(&*dst1.data, &*dst2.data); } } else { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u8; 64 * 64], 64); for s in src.data.iter_mut() { *s = random::(); } // dynamic allocation: test let mut dst1 = Plane::from_slice(&vec![0u8; 64 * 64], 64); // dynamic allocation: test let mut dst2 = Plane::from_slice(&vec![0u8; 64 * 64], 64); for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); super::put_8tap(&mut dst1.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); super::put_8tap(&mut dst2.as_region_mut(), src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); assert_eq!(&*dst1.data, &*dst2.data); } }; } } )* } } test_put_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth), (SHARP, SHARP, rav1e_put_8tap_sharp), (BILINEAR, BILINEAR, rav1e_put_bilin), ssse3, "ssse3", 8 ); test_put_fns!( (REGULAR, REGULAR, rav1e_put_8tap_regular), (REGULAR, SMOOTH, rav1e_put_8tap_regular_smooth), (REGULAR, SHARP, rav1e_put_8tap_regular_sharp), (SMOOTH, REGULAR, rav1e_put_8tap_smooth_regular), (SMOOTH, SMOOTH, rav1e_put_8tap_smooth), (SMOOTH, SHARP, rav1e_put_8tap_smooth_sharp), (SHARP, REGULAR, rav1e_put_8tap_sharp_regular), (SHARP, SMOOTH, rav1e_put_8tap_sharp_smooth), (SHARP, SHARP, rav1e_put_8tap_sharp), (BILINEAR, BILINEAR, rav1e_put_bilin), avx2, "avx2", 8 ); macro_rules! test_prep_fns { ($(($mode_x:expr, $mode_y:expr, $func_name:ident)),*, $OPT:ident, $OPTLIT:tt, $BD:expr) => { $( paste::item! 
{ #[test] fn []() { if CpuFeatureLevel::default() < CpuFeatureLevel::from_str($OPTLIT).unwrap() { eprintln!("Ignoring {} test, not supported on this machine!", $OPTLIT); return; } // dynamic allocation: test let mut dst1 = Aligned::new([0i16; 128 * 128]); // dynamic allocation: test let mut dst2 = Aligned::new([0i16; 128 * 128]); let test_mvs = [MotionVector { row: 0, col: 0 }, MotionVector { row: 4, col: 0 }, MotionVector { row: 0, col: 4 }, MotionVector { row: 4, col: 4 }]; if $BD > 8 { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u16; 64 * 64], 64); for s in src.data.iter_mut() { *s = random::() as u16 * $BD / 8; } for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); } } else { // dynamic allocation: test let mut src = Plane::from_slice(&vec![0u8; 64 * 64], 64); for s in src.data.iter_mut() { *s = random::(); } for mv in &test_mvs { let (row_frac, col_frac, src) = get_params(&src, PlaneOffset { x: 0, y: 0 }, *mv); super::prep_8tap(&mut dst1.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::from_str($OPTLIT).unwrap()); super::prep_8tap(&mut dst2.data, src, 8, 8, col_frac, row_frac, $mode_x, $mode_y, 8, CpuFeatureLevel::RUST); } }; assert_eq!(&dst1.data.to_vec(), &dst2.data.to_vec()); } } )* } } test_prep_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth), (SHARP, SHARP, rav1e_prep_8tap_sharp), (BILINEAR, BILINEAR, rav1e_prep_bilin), ssse3, "ssse3", 8 ); test_prep_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth), (SHARP, SHARP, rav1e_prep_8tap_sharp), (BILINEAR, BILINEAR, rav1e_prep_bilin), avx2, "avx2", 8 ); test_prep_fns!( (REGULAR, REGULAR, rav1e_prep_8tap_regular), (REGULAR, SMOOTH, rav1e_prep_8tap_regular_smooth), (REGULAR, SHARP, rav1e_prep_8tap_regular_sharp), (SMOOTH, REGULAR, rav1e_prep_8tap_smooth_regular), (SMOOTH, SMOOTH, rav1e_prep_8tap_smooth), (SMOOTH, SHARP, rav1e_prep_8tap_smooth_sharp), (SHARP, REGULAR, rav1e_prep_8tap_sharp_regular), (SHARP, SMOOTH, rav1e_prep_8tap_sharp_smooth), (SHARP, SHARP, rav1e_prep_8tap_sharp), (BILINEAR, BILINEAR, rav1e_prep_bilin), avx512icl, "avx512vpclmulqdq", 8 ); fn get_params( rec_plane: &Plane, po: PlaneOffset, mv: MotionVector, ) -> (i32, i32, PlaneSlice) { let rec_cfg = &rec_plane.cfg; let shift_row = 3 + rec_cfg.ydec; let shift_col = 3 + rec_cfg.xdec; let row_offset = mv.row as i32 >> shift_row; let col_offset = mv.col as i32 >> shift_col; let row_frac = (mv.row as i32 - (row_offset << shift_row)) << (4 - shift_row); let col_frac = (mv.col as i32 - (col_offset << shift_col)) << (4 - shift_col); let qo = PlaneOffset { x: po.x + col_offset as 
isize - 3, y: po.y + row_offset as isize - 3, }; (row_frac, col_frac, rec_plane.slice(qo).clamp().subslice(3, 3)) } } rav1e-0.7.1/src/asm/x86/mod.rs000064400000000000000000000012311046102023000137650ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod cdef; pub mod dist; pub mod ec; pub mod lrf; pub mod mc; pub mod predict; pub mod quantize; pub mod sad_plane; pub mod transform; rav1e-0.7.1/src/asm/x86/predict.rs000064400000000000000000001037501046102023000146510ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; use crate::partition::{BlockSize, IntraEdge}; use crate::predict::{ rust, IntraEdgeFilterParameters, PredictionMode, PredictionVariant, }; use crate::tiling::{PlaneRegion, PlaneRegionMut}; use crate::transform::TxSize; use crate::Pixel; use std::mem::MaybeUninit; use v_frame::pixel::PixelType; macro_rules! decl_angular_ipred_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, ); )* } }; } decl_angular_ipred_fn! { rav1e_ipred_h_8bpc_ssse3, rav1e_ipred_h_8bpc_avx2, rav1e_ipred_h_8bpc_avx512icl, rav1e_ipred_v_8bpc_ssse3, rav1e_ipred_v_8bpc_avx2, rav1e_ipred_v_8bpc_avx512icl, rav1e_ipred_dc_8bpc_ssse3, rav1e_ipred_dc_8bpc_avx2, rav1e_ipred_dc_8bpc_avx512icl, rav1e_ipred_dc_left_8bpc_ssse3, rav1e_ipred_dc_left_8bpc_avx2, rav1e_ipred_dc_left_8bpc_avx512icl, rav1e_ipred_dc_128_8bpc_ssse3, rav1e_ipred_dc_128_8bpc_avx2, rav1e_ipred_dc_128_8bpc_avx512icl, rav1e_ipred_dc_top_8bpc_ssse3, rav1e_ipred_dc_top_8bpc_avx2, rav1e_ipred_dc_top_8bpc_avx512icl, rav1e_ipred_smooth_v_8bpc_ssse3, rav1e_ipred_smooth_v_8bpc_avx2, rav1e_ipred_smooth_v_8bpc_avx512icl, rav1e_ipred_smooth_h_8bpc_ssse3, rav1e_ipred_smooth_h_8bpc_avx2, rav1e_ipred_smooth_h_8bpc_avx512icl, rav1e_ipred_smooth_8bpc_ssse3, rav1e_ipred_smooth_8bpc_avx2, rav1e_ipred_smooth_8bpc_avx512icl, rav1e_ipred_z1_8bpc_ssse3, rav1e_ipred_z1_8bpc_avx2, rav1e_ipred_z3_8bpc_ssse3, rav1e_ipred_z3_8bpc_avx2, rav1e_ipred_paeth_8bpc_ssse3, rav1e_ipred_paeth_8bpc_avx2, rav1e_ipred_paeth_8bpc_avx512icl } macro_rules! decl_angular_ipred_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, angle: libc::c_int, max_width: libc::c_int, max_height: libc::c_int, bit_depth_max: libc::c_int, ); )* } }; } decl_angular_ipred_hbd_fn! 
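// `get_params` above splits an eighth-pel motion-vector component into a
// whole-sample offset and a 4-bit sub-sample fraction, widening the shift by
// the plane's chroma decimation (`xdec`/`ydec`). A small worked check of
// that split; `split_mv_component` is an illustrative helper, not rav1e's:

fn split_mv_component(mv: i32, dec: usize) -> (i32, i32) {
  let shift = 3 + dec;                                // 1/8-pel MV units plus subsampling
  let offset = mv >> shift;                           // whole samples in this plane
  let frac = (mv - (offset << shift)) << (4 - shift); // 4-bit sub-sample phase
  (offset, frac)
}

#[cfg(test)]
#[test]
fn mv_split_examples() {
  assert_eq!(split_mv_component(4, 0), (0, 8));   // half-sample offset, full-res plane
  assert_eq!(split_mv_component(20, 0), (2, 8));  // two whole samples plus a half
  assert_eq!(split_mv_component(12, 1), (0, 12)); // decimated (chroma) plane
}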
{ rav1e_ipred_h_16bpc_ssse3, rav1e_ipred_h_16bpc_avx2, rav1e_ipred_v_16bpc_ssse3, rav1e_ipred_v_16bpc_avx2, rav1e_ipred_dc_16bpc_ssse3, rav1e_ipred_dc_16bpc_avx2, rav1e_ipred_dc_left_16bpc_ssse3, rav1e_ipred_dc_left_16bpc_avx2, rav1e_ipred_dc_128_16bpc_ssse3, rav1e_ipred_dc_128_16bpc_avx2, rav1e_ipred_dc_top_16bpc_ssse3, rav1e_ipred_dc_top_16bpc_avx2, rav1e_ipred_smooth_v_16bpc_ssse3, rav1e_ipred_smooth_v_16bpc_avx2, rav1e_ipred_smooth_v_16bpc_avx512icl, rav1e_ipred_smooth_h_16bpc_ssse3, rav1e_ipred_smooth_h_16bpc_avx2, rav1e_ipred_smooth_h_16bpc_avx512icl, rav1e_ipred_smooth_16bpc_ssse3, rav1e_ipred_smooth_16bpc_avx2, rav1e_ipred_smooth_16bpc_avx512icl, rav1e_ipred_z1_16bpc_ssse3, rav1e_ipred_z1_16bpc_avx2, rav1e_ipred_z2_16bpc_ssse3, rav1e_ipred_z3_16bpc_ssse3, rav1e_ipred_z3_16bpc_avx2, rav1e_ipred_paeth_16bpc_ssse3, rav1e_ipred_paeth_16bpc_avx2, rav1e_ipred_paeth_16bpc_avx512icl } // For z2 prediction, we need to provide extra parameters, dx and dy, which indicate // the distance between the predicted block's top-left pixel and the frame's edge. // It is required for the intra edge filtering process. extern { fn rav1e_ipred_z2_8bpc_ssse3( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, dx: libc::c_int, dy: libc::c_int, ); fn rav1e_ipred_z2_8bpc_avx2( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, angle: libc::c_int, dx: libc::c_int, dy: libc::c_int, ); fn rav1e_ipred_z2_16bpc_avx2( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, angle: libc::c_int, dx: libc::c_int, dy: libc::c_int, bit_depth_max: libc::c_int, ); } macro_rules! decl_cfl_ac_fn { ($($f:ident),+) => { extern { $( fn $f( ac: *mut MaybeUninit, src: *const u8, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); )* } }; } decl_cfl_ac_fn! { rav1e_ipred_cfl_ac_420_8bpc_avx2, rav1e_ipred_cfl_ac_420_8bpc_ssse3, rav1e_ipred_cfl_ac_422_8bpc_avx2, rav1e_ipred_cfl_ac_422_8bpc_ssse3, rav1e_ipred_cfl_ac_444_8bpc_avx2, rav1e_ipred_cfl_ac_444_8bpc_ssse3 } macro_rules! decl_cfl_ac_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( ac: *mut MaybeUninit, src: *const u16, stride: libc::ptrdiff_t, w_pad: libc::c_int, h_pad: libc::c_int, width: libc::c_int, height: libc::c_int, ); )* } }; } decl_cfl_ac_hbd_fn! { rav1e_ipred_cfl_ac_420_16bpc_ssse3, rav1e_ipred_cfl_ac_420_16bpc_avx2, rav1e_ipred_cfl_ac_422_16bpc_ssse3, rav1e_ipred_cfl_ac_422_16bpc_avx2, rav1e_ipred_cfl_ac_444_16bpc_ssse3, rav1e_ipred_cfl_ac_444_16bpc_avx2 } macro_rules! decl_cfl_pred_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u8, stride: libc::ptrdiff_t, topleft: *const u8, width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, ); )* } }; } decl_cfl_pred_fn! { rav1e_ipred_cfl_8bpc_ssse3, rav1e_ipred_cfl_8bpc_avx2, rav1e_ipred_cfl_left_8bpc_ssse3, rav1e_ipred_cfl_left_8bpc_avx2, rav1e_ipred_cfl_top_8bpc_ssse3, rav1e_ipred_cfl_top_8bpc_avx2, rav1e_ipred_cfl_128_8bpc_ssse3, rav1e_ipred_cfl_128_8bpc_avx2 } macro_rules! decl_cfl_pred_hbd_fn { ($($f:ident),+) => { extern { $( fn $f( dst: *mut u16, stride: libc::ptrdiff_t, topleft: *const u16, width: libc::c_int, height: libc::c_int, ac: *const i16, alpha: libc::c_int, bit_depth_max: libc::c_int, ); )* } }; } decl_cfl_pred_hbd_fn! 
{ rav1e_ipred_cfl_16bpc_ssse3, rav1e_ipred_cfl_16bpc_avx2, rav1e_ipred_cfl_128_16bpc_ssse3, rav1e_ipred_cfl_128_16bpc_avx2, rav1e_ipred_cfl_left_16bpc_ssse3, rav1e_ipred_cfl_left_16bpc_avx2, rav1e_ipred_cfl_top_16bpc_ssse3, rav1e_ipred_cfl_top_16bpc_avx2 } #[inline(always)] pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, ac: &[i16], angle: isize, ief_params: Option, edge_buf: &IntraEdge, cpu: CpuFeatureLevel, ) { let call_rust = |dst: &mut PlaneRegionMut<'_, T>| { rust::dispatch_predict_intra( mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf, cpu, ); }; // SAFETY: Calls Assembly code. unsafe { let stride = T::to_asm_stride(dst.plane_cfg.stride) as libc::ptrdiff_t; let w = tx_size.width() as libc::c_int; let h = tx_size.height() as libc::c_int; let angle = angle as libc::c_int; match T::type_enum() { PixelType::U8 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.top_left_ptr() as *const _; if cpu >= CpuFeatureLevel::AVX512ICL { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_8bpc_avx512icl, PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_avx512icl, PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_avx512icl, PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_avx512icl, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } else if angle < 180 { rav1e_ipred_z2_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, ); } else { rav1e_ipred_z3_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_8bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_avx2, 
PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), } } else if cpu >= CpuFeatureLevel::AVX2 { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_8bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_avx2, PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_8bpc_avx2(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_8bpc_avx2(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } else if angle < 180 { rav1e_ipred_z2_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, ); } else { rav1e_ipred_z3_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_8bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_avx2, PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_avx2, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), } } else if cpu >= CpuFeatureLevel::SSSE3 { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_8bpc_ssse3, PredictionVariant::LEFT => rav1e_ipred_dc_left_8bpc_ssse3, PredictionVariant::TOP => rav1e_ipred_dc_top_8bpc_ssse3, PredictionVariant::BOTH => rav1e_ipred_dc_8bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_8bpc_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_8bpc_ssse3(dst_ptr, stride, edge_ptr, w, h, angle); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | 
PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } else if angle < 180 { rav1e_ipred_z2_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, ); } else { rav1e_ipred_z3_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_8bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_8bpc_ssse3, PredictionVariant::LEFT => rav1e_ipred_cfl_left_8bpc_ssse3, PredictionVariant::TOP => rav1e_ipred_cfl_top_8bpc_ssse3, PredictionVariant::BOTH => rav1e_ipred_cfl_8bpc_ssse3, })(dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle); } _ => call_rust(dst), } } else { call_rust(dst) } } PixelType::U16 => { let dst_ptr = dst.data_ptr_mut() as *mut _; let edge_ptr = edge_buf.top_left_ptr() as *const _; let bd_max = (1 << bit_depth) - 1; if cpu >= CpuFeatureLevel::AVX512ICL { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_dc_left_16bpc_avx2, PredictionVariant::TOP => rav1e_ipred_dc_top_16bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_dc_16bpc_avx2, })( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max ); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the 
frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } else if angle < 180 { rav1e_ipred_z2_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, bd_max, ); } else { rav1e_ipred_z3_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_16bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_16bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_16bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_16bpc_avx512icl( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_16bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_cfl_left_16bpc_avx2, PredictionVariant::TOP => rav1e_ipred_cfl_top_16bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_cfl_16bpc_avx2, })( dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle, bd_max ); } _ => call_rust(dst), } } else if cpu >= CpuFeatureLevel::AVX2 { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_dc_left_16bpc_avx2, PredictionVariant::TOP => rav1e_ipred_dc_top_16bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_dc_16bpc_avx2, })( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max ); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } else if angle < 180 { rav1e_ipred_z2_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, bd_max, ); } else { rav1e_ipred_z3_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_H_PRED => { 
rav1e_ipred_smooth_h_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_16bpc_avx2( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_16bpc_avx2, PredictionVariant::LEFT => rav1e_ipred_cfl_left_16bpc_avx2, PredictionVariant::TOP => rav1e_ipred_cfl_top_16bpc_avx2, PredictionVariant::BOTH => rav1e_ipred_cfl_16bpc_avx2, })( dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle, bd_max ); } _ => call_rust(dst), } } else if cpu >= CpuFeatureLevel::SSSE3 { match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => rav1e_ipred_dc_128_16bpc_ssse3, PredictionVariant::LEFT => rav1e_ipred_dc_left_16bpc_ssse3, PredictionVariant::TOP => rav1e_ipred_dc_top_16bpc_ssse3, PredictionVariant::BOTH => rav1e_ipred_dc_16bpc_ssse3, })( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max ); } PredictionMode::V_PRED if angle == 90 => { rav1e_ipred_v_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::H_PRED if angle == 180 => { rav1e_ipred_h_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => { let (enable_ief, ief_smooth_filter) = if let Some(params) = ief_params { ( true as libc::c_int, params.use_smooth_filter() as libc::c_int, ) } else { (false as libc::c_int, false as libc::c_int) }; // dav1d assembly uses the unused integer bits to hold IEF parameters let angle_arg = angle | (enable_ief << 10) | (ief_smooth_filter << 9); // From dav1d, bw and bh are the frame width and height rounded to 8px units let (bw, bh) = ( ((dst.plane_cfg.width + 7) >> 3) << 3, ((dst.plane_cfg.height + 7) >> 3) << 3, ); // From dav1d, dx and dy are the distance from the predicted block to the frame edge let (dx, dy) = ( (bw as isize - dst.rect().x) as libc::c_int, (bh as isize - dst.rect().y) as libc::c_int, ); if angle <= 90 { rav1e_ipred_z1_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } else if angle < 180 { rav1e_ipred_z2_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, dx, dy, bd_max, ); } else { rav1e_ipred_z3_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle_arg, 0, 0, bd_max, ); } } PredictionMode::SMOOTH_PRED => { rav1e_ipred_smooth_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_V_PRED => { rav1e_ipred_smooth_v_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::SMOOTH_H_PRED => { rav1e_ipred_smooth_h_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::PAETH_PRED => { rav1e_ipred_paeth_16bpc_ssse3( dst_ptr, stride, edge_ptr, w, h, angle, 0, 0, bd_max, ); } PredictionMode::UV_CFL_PRED => { let ac_ptr = ac.as_ptr() as *const _; (match variant { PredictionVariant::NONE => rav1e_ipred_cfl_128_16bpc_ssse3, PredictionVariant::LEFT => rav1e_ipred_cfl_left_16bpc_ssse3, PredictionVariant::TOP => rav1e_ipred_cfl_top_16bpc_ssse3, PredictionVariant::BOTH => rav1e_ipred_cfl_16bpc_ssse3, })( dst_ptr, stride, edge_ptr, w, h, ac_ptr, angle, bd_max ); } _ => call_rust(dst), } } else { call_rust(dst) } } } } } // The implementation MUST inititialize all `ac` elements #[inline(always)] 
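// That is, every one of the `bsize.area()` `MaybeUninit<i16>` slots in `ac`
// must be written on every path (the SSSE3/AVX2 assembly or the
// `rust::pred_cfl_ac` fallback), since callers are expected to treat the whole
// slice as initialized CfL AC data afterwards.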
pub(crate) fn pred_cfl_ac( ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, bsize: BlockSize, w_pad: usize, h_pad: usize, cpu: CpuFeatureLevel, ) { debug_assert_eq!(ac.len(), bsize.area()); let call_rust = |ac: &mut [MaybeUninit]| { rust::pred_cfl_ac::(ac, luma, bsize, w_pad, h_pad, cpu); }; let stride = T::to_asm_stride(luma.plane_cfg.stride) as libc::ptrdiff_t; let w = bsize.width() as libc::c_int; let h = bsize.height() as libc::c_int; let w_pad = w_pad as libc::c_int; let h_pad = h_pad as libc::c_int; // SAFETY: Calls Assembly code. unsafe { let ac_ptr = ac.as_mut_ptr(); match T::type_enum() { PixelType::U8 if cpu >= CpuFeatureLevel::SSSE3 => { let luma_ptr = luma.data_ptr() as *const u8; (if cpu >= CpuFeatureLevel::AVX2 { match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_8bpc_avx2, (1, 0) => rav1e_ipred_cfl_ac_422_8bpc_avx2, _ => rav1e_ipred_cfl_ac_420_8bpc_avx2, } } else { match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_8bpc_ssse3, (1, 0) => rav1e_ipred_cfl_ac_422_8bpc_ssse3, _ => rav1e_ipred_cfl_ac_420_8bpc_ssse3, } })(ac_ptr, luma_ptr, stride, w_pad, h_pad, w, h) } PixelType::U16 if cpu >= CpuFeatureLevel::SSSE3 => { let luma_ptr = luma.data_ptr() as *const u16; (if cpu >= CpuFeatureLevel::AVX2 { match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_16bpc_avx2, (1, 0) => rav1e_ipred_cfl_ac_422_16bpc_avx2, _ => rav1e_ipred_cfl_ac_420_16bpc_avx2, } } else { match (XDEC, YDEC) { (0, 0) => rav1e_ipred_cfl_ac_444_16bpc_ssse3, (1, 0) => rav1e_ipred_cfl_ac_422_16bpc_ssse3, _ => rav1e_ipred_cfl_ac_420_16bpc_ssse3, } })(ac_ptr, luma_ptr, stride, w_pad, h_pad, w, h) } _ => call_rust(ac), } } } rav1e-0.7.1/src/asm/x86/quantize.rs000064400000000000000000000150071046102023000150540ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; use crate::context::av1_get_coded_tx_size; use crate::cpu_features::CpuFeatureLevel; use crate::quantize::*; use crate::transform::TxSize; use crate::util::*; use std::mem::MaybeUninit; type DequantizeFn = unsafe fn( qindex: u8, coeffs_ptr: *const i16, _eob: u16, rcoeffs_ptr: *mut i16, tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, ); cpu_function_lookup_table!( DEQUANTIZE_FNS: [Option], default: None, [(AVX2, Some(dequantize_avx2))] ); #[inline(always)] pub fn dequantize( qindex: u8, coeffs: &[T], eob: u16, rcoeffs: &mut [MaybeUninit], tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel, ) { let call_rust = |rcoeffs: &mut [MaybeUninit]| { crate::quantize::rust::dequantize( qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q, ac_delta_q, cpu, ); }; #[cfg(any(feature = "check_asm", test))] let mut ref_rcoeffs = { let area = av1_get_coded_tx_size(tx_size).area(); let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area]; call_rust(&mut copy); copy }; match T::Pixel::type_enum() { PixelType::U8 => { if let Some(func) = DEQUANTIZE_FNS[cpu.as_index()] { // SAFETY: Calls Assembly code. 
unsafe { (func)( qindex, coeffs.as_ptr() as *const _, eob, rcoeffs.as_mut_ptr() as *mut _, tx_size, bit_depth, dc_delta_q, ac_delta_q, ) } } else { call_rust(rcoeffs) } } PixelType::U16 => call_rust(rcoeffs), } #[cfg(any(feature = "check_asm", test))] { let area = av1_get_coded_tx_size(tx_size).area(); let rcoeffs = unsafe { slice_assume_init_mut(&mut rcoeffs[..area]) }; let ref_rcoeffs = unsafe { slice_assume_init_mut(&mut ref_rcoeffs[..]) }; assert_eq!(rcoeffs, ref_rcoeffs); } } #[target_feature(enable = "avx2")] unsafe fn dequantize_avx2( qindex: u8, coeffs_ptr: *const i16, _eob: u16, rcoeffs_ptr: *mut i16, tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, ) { let log_tx_scale = _mm256_set1_epi32(get_log_tx_scale(tx_size) as i32); let quants_ac = _mm256_set1_epi32(ac_q(qindex, ac_delta_q, bit_depth).get() as i32); // Use the dc quantize as first vector element for the first iteration let mut quants = _mm256_insert_epi32( quants_ac, dc_q(qindex, dc_delta_q, bit_depth).get() as i32, 0, ); let area: usize = av1_get_coded_tx_size(tx_size).area(); // Step by 16 (256/16) coefficients for each iteration let step: usize = 16; assert!(area >= step); // Increase the pointers as we iterate let mut coeffs_ptr: *const i16 = coeffs_ptr; let mut rcoeffs_ptr: *mut i16 = rcoeffs_ptr; for _i in (0..area).step_by(step) { let coeffs = _mm256_load_si256(coeffs_ptr as *const _); let coeffs_abs = _mm256_abs_epi16(coeffs); // TODO: Since log_tx_scale is at most 2 and we gain an extra bit by taking // the abs value (unless the range is [-(2^15), 2^15 + 1]), it might be // possible to perform a 16-bit multiply and get the highest bit by // comparing coeffs to (1<<16) / quant. The would cost 1 compare, 1 blend, // and 1 add, but would get rid of 1 pack, 2 unpacks, 1 shift, and 1 // multiply. 
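// Per coefficient `c`, the intrinsics below compute
//   sign(c) * ((|c| * quant) >> log_tx_scale)
// where the DC quantizer is applied only to the very first coefficient
// (lane 0 of `quants` in the first iteration); all other coefficients use the
// AC quantizer (`quants_ac`). Worked example (hypothetical quant value): with
// quant = 1000 and log_tx_scale = 1, a coefficient of -3 dequantizes to
// -((3 * 1000) >> 1) = -1500.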
let rcoeffs = _mm256_sign_epi16( _mm256_packs_epi32( // (abs_coeff * quant) >> log_tx_scale _mm256_srlv_epi32( _mm256_madd_epi16( quants, // Convert the first half of each lane to 32-bits _mm256_unpacklo_epi16(coeffs_abs, _mm256_setzero_si256()), ), log_tx_scale, ), // Second half _mm256_srlv_epi32( _mm256_madd_epi16( quants_ac, _mm256_unpackhi_epi16(coeffs_abs, _mm256_setzero_si256()), ), log_tx_scale, ), ), coeffs, ); _mm256_store_si256(rcoeffs_ptr as *mut _, rcoeffs); // Only use a dc quantizer for the first iteration quants = quants_ac; coeffs_ptr = coeffs_ptr.add(step); rcoeffs_ptr = rcoeffs_ptr.add(step); } } #[cfg(test)] mod test { use super::*; use rand::distributions::{Distribution, Uniform}; use rand::{thread_rng, Rng}; use std::mem::MaybeUninit; #[test] fn dequantize_test() { let mut rng = thread_rng(); use TxSize::*; let tx_sizes = [ TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, ]; let bd: usize = 8; for &tx_size in &tx_sizes { let qindex: u8 = rng.gen_range((MINQ as u8)..(MAXQ as u8)); let dc_quant = dc_q(qindex, 0, bd).get() as i16; let ac_quant = ac_q(qindex, 0, bd).get() as i16; // Test the min, max, and random eobs let eobs = { let mut out = [0u16; 16]; let area: usize = av1_get_coded_tx_size(tx_size).area(); out[0] = 0; out[1] = area as u16; for eob in out.iter_mut().skip(2) { *eob = rng.gen_range(0..area as u16); } out }; for &eob in &eobs { let mut qcoeffs = Aligned::new([0i16; 32 * 32]); let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]); // Generate quantized coefficients up to the eob let between = Uniform::from(-i16::MAX..=i16::MAX); for (i, qcoeff) in qcoeffs.data.iter_mut().enumerate().take(eob as usize) { *qcoeff = between.sample(&mut rng) / if i == 0 { dc_quant } else { ac_quant }; } // Rely on quantize's internal tests dequantize( qindex, &qcoeffs.data, eob, &mut rcoeffs.data, tx_size, bd, 0, 0, CpuFeatureLevel::default(), ); } } } } rav1e-0.7.1/src/asm/x86/sad_plane.rs000064400000000000000000000044251046102023000151440ustar 00000000000000// Copyright (c) 2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; use crate::sad_plane::*; use crate::util::{Pixel, PixelType}; use v_frame::plane::Plane; use std::mem; macro_rules! decl_sad_plane_fn { ($($f:ident),+) => { extern { $( fn $f( src: *const u8, dst: *const u8, stride: libc::size_t, width: libc::size_t, rows: libc::size_t ) -> u64; )* } }; } decl_sad_plane_fn!(rav1e_sad_plane_8bpc_sse2, rav1e_sad_plane_8bpc_avx2); pub(crate) fn sad_plane_internal( src: &Plane, dst: &Plane, cpu: CpuFeatureLevel, ) -> u64 { debug_assert!(src.cfg.width == dst.cfg.width); debug_assert!(src.cfg.stride == dst.cfg.stride); debug_assert!(src.cfg.height == dst.cfg.height); debug_assert!(src.cfg.width <= src.cfg.stride); match T::type_enum() { PixelType::U8 => { // helper macro to reduce boilerplate macro_rules! call_asm { ($func:ident, $src:expr, $dst:expr, $cpu:expr) => { // SAFETY: Calls Assembly code. 
unsafe { let result = $func( mem::transmute(src.data_origin().as_ptr()), mem::transmute(dst.data_origin().as_ptr()), src.cfg.stride, src.cfg.width, src.cfg.height, ); #[cfg(feature = "check_asm")] assert_eq!(result, rust::sad_plane_internal($src, $dst, $cpu)); result } }; } if cpu >= CpuFeatureLevel::AVX2 { call_asm!(rav1e_sad_plane_8bpc_avx2, src, dst, cpu) } else if cpu >= CpuFeatureLevel::SSE2 { call_asm!(rav1e_sad_plane_8bpc_sse2, src, dst, cpu) } else { rust::sad_plane_internal(src, dst, cpu) } } PixelType::U16 => rust::sad_plane_internal(src, dst, cpu), } } rav1e-0.7.1/src/asm/x86/transform/forward.rs000064400000000000000000000355071046102023000167020ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::asm::shared::transform::forward::*; use crate::cpu_features::CpuFeatureLevel; use crate::transform::forward::rust; use crate::transform::forward_shared::*; use crate::transform::*; use crate::util::*; use std::mem::MaybeUninit; use debug_unreachable::debug_unreachable; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[derive(Copy, Clone)] struct I32X8 { data: __m256i, } impl I32X8 { #[target_feature(enable = "avx2")] #[inline] const unsafe fn vec(self) -> __m256i { self.data } #[target_feature(enable = "avx2")] #[inline] const unsafe fn new(a: __m256i) -> I32X8 { I32X8 { data: a } } } type TxfmFunc = unsafe fn(&mut [I32X8]); impl_1d_tx!(target_feature(enable = "avx2"), unsafe); impl TxOperations for I32X8 { #[target_feature(enable = "avx2")] #[inline] unsafe fn zero() -> Self { I32X8::new(_mm256_setzero_si256()) } #[target_feature(enable = "avx2")] #[inline] unsafe fn tx_mul(self, mul: i32) -> Self { I32X8::new(_mm256_srav_epi32( _mm256_add_epi32( _mm256_mullo_epi32(self.vec(), _mm256_set1_epi32(mul)), _mm256_set1_epi32(1 << SHIFT >> 1), ), _mm256_set1_epi32(SHIFT), )) } #[target_feature(enable = "avx2")] #[inline] unsafe fn rshift1(self) -> Self { I32X8::new(_mm256_srai_epi32( _mm256_sub_epi32( self.vec(), _mm256_cmpgt_epi32(_mm256_setzero_si256(), self.vec()), ), 1, )) } #[target_feature(enable = "avx2")] #[inline] unsafe fn add(self, b: Self) -> Self { I32X8::new(_mm256_add_epi32(self.vec(), b.vec())) } #[target_feature(enable = "avx2")] #[inline] unsafe fn sub(self, b: Self) -> Self { I32X8::new(_mm256_sub_epi32(self.vec(), b.vec())) } #[target_feature(enable = "avx2")] #[inline] unsafe fn add_avg(self, b: Self) -> Self { I32X8::new(_mm256_srai_epi32(_mm256_add_epi32(self.vec(), b.vec()), 1)) } #[target_feature(enable = "avx2")] #[inline] unsafe fn sub_avg(self, b: Self) -> Self { I32X8::new(_mm256_srai_epi32(_mm256_sub_epi32(self.vec(), b.vec()), 1)) } } #[target_feature(enable = "avx2")] unsafe fn transpose_8x8_avx2(input: &[I32X8; 8], into: &mut [I32X8; 8]) { let stage1 = ( _mm256_unpacklo_epi32(input[0].vec(), input[1].vec()), _mm256_unpackhi_epi32(input[0].vec(), input[1].vec()), _mm256_unpacklo_epi32(input[2].vec(), input[3].vec()), 
_mm256_unpackhi_epi32(input[2].vec(), input[3].vec()), _mm256_unpacklo_epi32(input[4].vec(), input[5].vec()), _mm256_unpackhi_epi32(input[4].vec(), input[5].vec()), _mm256_unpacklo_epi32(input[6].vec(), input[7].vec()), _mm256_unpackhi_epi32(input[6].vec(), input[7].vec()), ); let stage2 = ( _mm256_unpacklo_epi64(stage1.0, stage1.2), _mm256_unpackhi_epi64(stage1.0, stage1.2), _mm256_unpacklo_epi64(stage1.1, stage1.3), _mm256_unpackhi_epi64(stage1.1, stage1.3), _mm256_unpacklo_epi64(stage1.4, stage1.6), _mm256_unpackhi_epi64(stage1.4, stage1.6), _mm256_unpacklo_epi64(stage1.5, stage1.7), _mm256_unpackhi_epi64(stage1.5, stage1.7), ); #[allow(clippy::identity_op)] const LO: i32 = (2 << 4) | 0; const HI: i32 = (3 << 4) | 1; into[0] = I32X8::new(_mm256_permute2x128_si256(stage2.0, stage2.4, LO)); into[1] = I32X8::new(_mm256_permute2x128_si256(stage2.1, stage2.5, LO)); into[2] = I32X8::new(_mm256_permute2x128_si256(stage2.2, stage2.6, LO)); into[3] = I32X8::new(_mm256_permute2x128_si256(stage2.3, stage2.7, LO)); into[4] = I32X8::new(_mm256_permute2x128_si256(stage2.0, stage2.4, HI)); into[5] = I32X8::new(_mm256_permute2x128_si256(stage2.1, stage2.5, HI)); into[6] = I32X8::new(_mm256_permute2x128_si256(stage2.2, stage2.6, HI)); into[7] = I32X8::new(_mm256_permute2x128_si256(stage2.3, stage2.7, HI)); } #[target_feature(enable = "avx2")] unsafe fn transpose_8x4_avx2(input: &[I32X8; 8], into: &mut [I32X8; 4]) { // Last 8 are empty let stage1 = ( //0101 _mm256_unpacklo_epi32(input[0].vec(), input[1].vec()), _mm256_unpackhi_epi32(input[0].vec(), input[1].vec()), _mm256_unpacklo_epi32(input[2].vec(), input[3].vec()), _mm256_unpackhi_epi32(input[2].vec(), input[3].vec()), _mm256_unpacklo_epi32(input[4].vec(), input[5].vec()), _mm256_unpackhi_epi32(input[4].vec(), input[5].vec()), _mm256_unpacklo_epi32(input[6].vec(), input[7].vec()), _mm256_unpackhi_epi32(input[6].vec(), input[7].vec()), ); let stage2 = ( _mm256_unpacklo_epi64(stage1.0, stage1.2), _mm256_unpackhi_epi64(stage1.0, stage1.2), _mm256_unpacklo_epi64(stage1.1, stage1.3), _mm256_unpackhi_epi64(stage1.1, stage1.3), _mm256_unpacklo_epi64(stage1.4, stage1.6), _mm256_unpackhi_epi64(stage1.4, stage1.6), _mm256_unpacklo_epi64(stage1.5, stage1.7), _mm256_unpackhi_epi64(stage1.5, stage1.7), ); #[allow(clippy::identity_op)] const LO: i32 = (2 << 4) | 0; into[0] = I32X8::new(_mm256_permute2x128_si256(stage2.0, stage2.4, LO)); into[1] = I32X8::new(_mm256_permute2x128_si256(stage2.1, stage2.5, LO)); into[2] = I32X8::new(_mm256_permute2x128_si256(stage2.2, stage2.6, LO)); into[3] = I32X8::new(_mm256_permute2x128_si256(stage2.3, stage2.7, LO)); } #[target_feature(enable = "avx2")] unsafe fn transpose_4x8_avx2(input: &[I32X8; 4], into: &mut [I32X8; 8]) { let stage1 = ( // 0101 _mm256_unpacklo_epi32(input[0].vec(), input[1].vec()), _mm256_unpackhi_epi32(input[0].vec(), input[1].vec()), // 2323 _mm256_unpacklo_epi32(input[2].vec(), input[3].vec()), _mm256_unpackhi_epi32(input[2].vec(), input[3].vec()), ); let stage2 = ( // 01234567 _mm256_unpacklo_epi64(stage1.0, stage1.2), _mm256_unpackhi_epi64(stage1.0, stage1.2), _mm256_unpacklo_epi64(stage1.1, stage1.3), _mm256_unpackhi_epi64(stage1.1, stage1.3), ); into[0] = I32X8::new(stage2.0); into[1] = I32X8::new(stage2.1); into[2] = I32X8::new(stage2.2); into[3] = I32X8::new(stage2.3); into[4] = I32X8::new(_mm256_castsi128_si256(_mm256_extractf128_si256(stage2.0, 1))); into[5] = I32X8::new(_mm256_castsi128_si256(_mm256_extractf128_si256(stage2.1, 1))); into[6] = 
I32X8::new(_mm256_castsi128_si256(_mm256_extractf128_si256(stage2.2, 1))); into[7] = I32X8::new(_mm256_castsi128_si256(_mm256_extractf128_si256(stage2.3, 1))); } #[target_feature(enable = "avx2")] unsafe fn transpose_4x4_avx2(input: &[I32X8; 4], into: &mut [I32X8; 4]) { let stage1 = ( _mm256_unpacklo_epi32(input[0].vec(), input[1].vec()), _mm256_unpackhi_epi32(input[0].vec(), input[1].vec()), _mm256_unpacklo_epi32(input[2].vec(), input[3].vec()), _mm256_unpackhi_epi32(input[2].vec(), input[3].vec()), ); into[0] = I32X8::new(_mm256_unpacklo_epi64(stage1.0, stage1.2)); into[1] = I32X8::new(_mm256_unpackhi_epi64(stage1.0, stage1.2)); into[2] = I32X8::new(_mm256_unpacklo_epi64(stage1.1, stage1.3)); into[3] = I32X8::new(_mm256_unpackhi_epi64(stage1.1, stage1.3)); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left(a: I32X8, shift: u8) -> I32X8 { I32X8::new(_mm256_sllv_epi32(a.vec(), _mm256_set1_epi32(shift as i32))) } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right(a: I32X8, shift: u8) -> I32X8 { I32X8::new(_mm256_srav_epi32( _mm256_add_epi32(a.vec(), _mm256_set1_epi32(1 << (shift as i32) >> 1)), _mm256_set1_epi32(shift as i32), )) } #[target_feature(enable = "avx2")] #[inline] unsafe fn round_shift_array_avx2(arr: &mut [I32X8], bit: i8) { if arr.len() % 4 != 0 { debug_unreachable!(); } if bit == 0 { return; } if bit > 0 { let shift = bit as u8; for s in arr.chunks_exact_mut(4) { for chunk in s { *chunk = shift_right(*chunk, shift); } } } else { let shift = (-bit) as u8; for s in arr.chunks_exact_mut(4) { for chunk in s { *chunk = shift_left(*chunk, shift); } } } } #[allow(clippy::identity_op, clippy::erasing_op)] #[target_feature(enable = "avx2")] unsafe fn forward_transform_avx2( input: &[i16], output: &mut [MaybeUninit], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, ) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to // accurately perform rectangular transforms. When the transform is // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. 
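// Concrete example: for a rectangular TX_8X16 (8 pixels wide, 16 tall) this
// yields txfm_size_col = 8 and txfm_size_row = 16, so the column pass below
// applies a 16-point 1-D transform down each of the 8 columns and the row
// pass applies an 8-point 1-D transform across each of the 16 rows; for
// square sizes the two lengths coincide.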
let txfm_size_col = tx_size.width(); let txfm_size_row = tx_size.height(); let col_class = SizeClass1D::from_length(txfm_size_col); let row_class = SizeClass1D::from_length(txfm_size_row); let mut tmp: Aligned<[I32X8; 64 * 64 / 8]> = Aligned::uninitialized(); let buf = &mut tmp.data[..txfm_size_col * (txfm_size_row / 8).max(1)]; let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); let txfm_func_col = get_func(cfg.txfm_type_col); let txfm_func_row = get_func(cfg.txfm_type_row); // Columns for cg in (0..txfm_size_col).step_by(8) { let shift = cfg.shift[0] as u8; #[target_feature(enable = "avx2")] #[inline] unsafe fn load_columns(input_ptr: *const i16, shift: u8) -> I32X8 { // TODO: load 64-bits for x4 wide columns shift_left( I32X8::new(_mm256_cvtepi16_epi32(_mm_loadu_si128( input_ptr as *const _, ))), shift, ) } // Avoid zero initialization let tx_in = &mut [MaybeUninit::::uninit(); 64][..txfm_size_row]; if cfg.ud_flip { // flip upside down for (in_slice, out_reg) in input[cg..].chunks(stride).zip(tx_in.iter_mut().rev()) { *out_reg = MaybeUninit::new(load_columns(in_slice.as_ptr(), shift)); } } else { for (in_slice, out_reg) in input[cg..].chunks(stride).zip(tx_in.iter_mut()) { *out_reg = MaybeUninit::new(load_columns(in_slice.as_ptr(), shift)); } } let col_coeffs = slice_assume_init_mut(tx_in); txfm_func_col(col_coeffs); round_shift_array_avx2(col_coeffs, -cfg.shift[1]); // Transpose the array. Select the appropriate method to do so. match (row_class, col_class) { (SizeClass1D::X8UP, SizeClass1D::X8UP) => { for rg in (0..txfm_size_row).step_by(8) { let buf = &mut buf[(rg / 8 * txfm_size_col) + cg..]; let buf = cast_mut::<8, _>(buf); let input = &col_coeffs[rg..]; let input = cast::<8, _>(input); transpose_8x8_avx2(input, buf); } } (SizeClass1D::X8UP, SizeClass1D::X4) => { for rg in (0..txfm_size_row).step_by(8) { let buf = &mut buf[(rg / 8 * txfm_size_col) + cg..]; let buf = cast_mut::<4, _>(buf); let input = &col_coeffs[rg..]; let input = cast::<8, _>(input); transpose_8x4_avx2(input, buf); } } (SizeClass1D::X4, SizeClass1D::X8UP) => { // Don't need to loop over rows let buf = &mut buf[cg..]; let buf = cast_mut::<8, _>(buf); let input = cast::<4, _>(col_coeffs); transpose_4x8_avx2(input, buf); } (SizeClass1D::X4, SizeClass1D::X4) => { // Don't need to loop over rows let buf = cast_mut::<4, _>(buf); let input = cast::<4, _>(col_coeffs); transpose_4x4_avx2(input, buf); } } } // Rows for rg in (0..txfm_size_row).step_by(8) { let row_coeffs = &mut buf[rg / 8 * txfm_size_col..][..txfm_size_col]; if cfg.lr_flip { row_coeffs.reverse(); } txfm_func_row(row_coeffs); round_shift_array_avx2(row_coeffs, -cfg.shift[2]); // Write out the coefficients using the correct method for transforms of // this size. match row_class { SizeClass1D::X8UP => { // Store output in at most 32x32 chunks. See rust code for details. 
// Output is grouped into 32x32 chunks so a stride of at most 32 is // used for each chunk let output_stride = txfm_size_row.min(32); // Split the first 32 rows from the last 32 rows and offset by rg % 32 let output = &mut output[(rg & 31) + (rg >= 32) as usize * output_stride * txfm_size_col.min(32)..]; for cg in (0..txfm_size_col).step_by(32) { // Offset by zero or half of output let output = &mut output[txfm_size_row * cg..]; for c in 0..txfm_size_col.min(32) { match T::Pixel::type_enum() { PixelType::U8 => { let vec = row_coeffs[c + cg].vec(); let lo = _mm256_castsi256_si128(vec); let hi = _mm256_extracti128_si256(vec, 1); _mm_storeu_si128( output[c * output_stride..].as_mut_ptr() as *mut _, _mm_packs_epi32(lo, hi), ); } PixelType::U16 => { _mm256_storeu_si256( output[c * output_stride..].as_mut_ptr() as *mut _, row_coeffs[c + cg].vec(), ); } } } } } SizeClass1D::X4 => { // Write out coefficients in normal order - it isn't possible to have // more than 32 rows. for c in 0..txfm_size_col { match T::Pixel::type_enum() { PixelType::U8 => { let lo = _mm256_castsi256_si128(row_coeffs[c].vec()); _mm_storel_epi64( output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _, _mm_packs_epi32(lo, lo), ); } PixelType::U16 => { _mm256_storeu_si256( output[c * txfm_size_row + rg..].as_mut_ptr() as *mut _, row_coeffs[c].vec(), ); } } } } } } } /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` #[inline] pub fn forward_transform( input: &[i16], output: &mut [MaybeUninit], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); if cpu >= CpuFeatureLevel::AVX2 { // SAFETY: Calls Assembly code. unsafe { forward_transform_avx2(input, output, stride, tx_size, tx_type, bd); } } else { rust::forward_transform(input, output, stride, tx_size, tx_type, bd, cpu); } } rav1e-0.7.1/src/asm/x86/transform/inverse.rs000064400000000000000000000402601046102023000167010ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
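// A minimal caller-side sketch of the `forward_transform` wrapper defined in
// forward.rs above (names such as `residual`, `stride`, `bit_depth` and `cpu`
// are assumed to be in scope; i16 coefficients as in the 8-bit pixel case):
//
//   let mut coeffs =
//     vec![MaybeUninit::<i16>::uninit(); tx_size.width() * tx_size.height()];
//   forward_transform(
//     residual,      // &[i16], `stride` samples per row
//     &mut coeffs,
//     stride,
//     tx_size,
//     tx_type,
//     bit_depth,
//     cpu,
//   );
//
// The wrapper asserts `valid_av1_transform(tx_size, tx_type)` and only takes
// the AVX2 assembly path when `cpu >= CpuFeatureLevel::AVX2`, otherwise it
// falls back to `rust::forward_transform`.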
use crate::cpu_features::CpuFeatureLevel; use crate::tiling::PlaneRegionMut; use crate::transform::inverse::*; use crate::transform::*; use crate::{Pixel, PixelType}; use crate::asm::shared::transform::inverse::*; pub fn inverse_transform_add( input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { match T::type_enum() { PixelType::U8 => { if let Some(func) = INV_TXFM_FNS[cpu.as_index()][tx_size][tx_type] { return call_inverse_func( func, input, output, eob, tx_size.width(), tx_size.height(), bd, ); } } PixelType::U16 if bd == 10 => { if let Some(func) = INV_TXFM_HBD_FNS_10[cpu.as_index()][tx_size][tx_type] { return call_inverse_hbd_func( func, input, output, eob, tx_size.width(), tx_size.height(), bd, ); } } PixelType::U16 => { if let Some(func) = INV_TXFM_HBD_FNS_12[cpu.as_index()][tx_size][tx_type] { return call_inverse_hbd_func( func, input, output, eob, tx_size.width(), tx_size.height(), bd, ); } } }; rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu); } macro_rules! decl_itx_fns { // Takes a 2d list of tx types for W and H ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr, $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { // For each tx type, declare an function for the current WxH $( $( extern { // Note: type1 and type2 are flipped fn []( dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32 ); } )* )* // Create a lookup table for the tx types declared above const []: [Option; TX_TYPES_PLUS_LL] = { let mut out: [Option; TX_TYPES_PLUS_LL] = [None; TX_TYPES_PLUS_LL]; $( $( out[$ENUM as usize] = Some([]); )* )* out }; } }; } macro_rules! decl_itx_hbd_fns { // Takes a 2d list of tx types for W and H ([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr, $BPC:expr, $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { // For each tx type, declare an function for the current WxH $( $( extern { // Note: type1 and type2 are flipped fn []( dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32, bitdepth_max: i32, ); } )* )* // Create a lookup table for the tx types declared above const []: [Option; TX_TYPES_PLUS_LL] = { #[allow(unused_mut)] let mut out: [Option; TX_TYPES_PLUS_LL] = [None; TX_TYPES_PLUS_LL]; $( $( out[$ENUM as usize] = Some([]); )* )* out }; } }; } macro_rules! create_wxh_tables { // Create a lookup table for each cpu feature ([$([$(($W:expr, $H:expr)),*]),*], $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! { const []: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = { let mut out: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]; // For each dimension, add an entry to the table $( $( out[TxSize::[] as usize] = []; )* )* out }; } }; // Loop through cpu features ($DIMS:tt, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( create_wxh_tables!($DIMS, $OPT_LOWER, $OPT_UPPER); )* }; } macro_rules! create_wxh_hbd_tables { // Create a lookup table for each cpu feature ([$([$(($W:expr, $H:expr)),*]),*], $EXT:ident, $BPC:expr, $OPT_LOWER:ident, $OPT_UPPER:ident) => { paste::item! 
{ const []: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = { let mut out: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]; // For each dimension, add an entry to the table $( $( out[TxSize::[] as usize] = []; )* )* out }; } }; // Loop through cpu features ($DIMS:tt, $EXT:ident, [$(($BPC:expr, $OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( create_wxh_hbd_tables!($DIMS, $EXT, $BPC, $OPT_LOWER, $OPT_UPPER); )* }; } macro_rules! impl_itx_fns { ($TYPES:tt, $W:expr, $H:expr, [$(($OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( decl_itx_fns!($TYPES, $W, $H, $OPT_LOWER, $OPT_UPPER); )* }; // Loop over a list of dimensions ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*], $OPT:tt) => { $( impl_itx_fns!($TYPES_VALID, $W, $H, $OPT); )* }; ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $OPT:tt) => { // Make 2d list of tx types for each set of dimensions. Each set of // dimensions uses a superset of the previous set of tx types. impl_itx_fns!([$TYPES64], $DIMS64, $OPT); impl_itx_fns!([$TYPES64, $TYPES32], $DIMS32, $OPT); impl_itx_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16, $OPT); impl_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT ); impl_itx_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT ); // Pool all of the dimensions together to create a table for each cpu // feature level. create_wxh_tables!( [$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $OPT ); }; } impl_itx_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16 and 4x4) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)], [(avx2, AVX2)] ); impl_itx_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], // 4x4 [], [], [(avx512icl, AVX512ICL), (ssse3, SSSE3)] ); impl_itx_fns!( // 64x [], [], // 32x [], [], // 16x16 [], [], // 8x, 4x and 16x (minus 16x16 and 4x4) [], [], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)], [(sse2, SSE2)] ); 
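// Roughly how the tables generated by the `impl_itx_fns!` invocations above
// are consumed by `inverse_transform_add` at the top of this file (sketch,
// arguments elided):
//
//   match INV_TXFM_FNS[cpu.as_index()][tx_size][tx_type] {
//     Some(asm_fn) => call_inverse_func(asm_fn, /* ... */),
//     None => rust::inverse_transform_add(/* ... */), // pure-Rust fallback
//   }
//
// Any size/type/CPU combination not declared above resolves to `None` and
// transparently uses the Rust implementation.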
cpu_function_lookup_table!( INV_TXFM_FNS: [[[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]], default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], [SSE2, SSSE3, AVX2, AVX512ICL] ); macro_rules! impl_itx_hbd_fns { ($TYPES:tt, $W:expr, $H:expr, [$(($BPC:expr, $OPT_LOWER:ident, $OPT_UPPER:ident)),+]) => { $( decl_itx_hbd_fns!($TYPES, $W, $H, $BPC, $OPT_LOWER, $OPT_UPPER); )* }; // Loop over a list of dimensions ($TYPES_VALID:tt, [$(($W:expr, $H:expr)),*], $OPT:tt) => { $( impl_itx_hbd_fns!($TYPES_VALID, $W, $H, $OPT); )* }; ($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt, $TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $EXT:ident, $OPT:tt) => { // Make 2d list of tx types for each set of dimensions. Each set of // dimensions uses a superset of the previous set of tx types. impl_itx_hbd_fns!([$TYPES64], $DIMS64, $OPT); impl_itx_hbd_fns!([$TYPES64, $TYPES32], $DIMS32, $OPT); impl_itx_hbd_fns!([$TYPES64, $TYPES32, $TYPES16], $DIMS16, $OPT); impl_itx_hbd_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT ); impl_itx_hbd_fns!( [$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT ); // Pool all of the dimensions together to create a table for each cpu // feature level. create_wxh_hbd_tables!( [$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $EXT, $OPT ); }; } impl_itx_hbd_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (64, 16), (16, 64)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (8, 8)], // 4x4 [], [], _10, [(10, avx512icl, AVX512ICL)] ); impl_itx_hbd_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], // 4x4 [], [], _10_, [(10, avx2, AVX2)] ); impl_itx_hbd_fns!( // 64x [(TxType::DCT_DCT, dct, dct)], [(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)], // 32x [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), 
(TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], // 4x4 [], [], _10, [(16, sse4, SSE4_1)] ); impl_itx_hbd_fns!( // 64x [], [], // 32x [], [], // 16x16 [], [], // 8x, 4x and 16x (minus 16x16 and 4x4) [], [], // 4x4 [(TxType::WHT_WHT, wht, wht)], [(4, 4)], _16, [(16, sse2, SSE2), (16, avx2, AVX2)] ); const INV_TXFM_HBD_FNS_10_SSE2: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = INV_TXFM_HBD_FNS_16_SSE2; const INV_TXFM_HBD_FNS_12_SSE2: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = INV_TXFM_HBD_FNS_16_SSE2; const fn merge_hbd_fns( a: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], b: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], ) -> [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] { let mut out = b; let mut tx_size = 0; loop { let mut tx_type = 0; loop { if a[tx_size][tx_type].is_some() { out[tx_size][tx_type] = a[tx_size][tx_type]; } tx_type += 1; if tx_type == TX_TYPES_PLUS_LL { break; } } tx_size += 1; if tx_size == TxSize::TX_SIZES_ALL { break; } } out } const INV_TXFM_HBD_FNS_10_AVX2: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = merge_hbd_fns(INV_TXFM_HBD_FNS_10__AVX2, INV_TXFM_HBD_FNS_16_AVX2); cpu_function_lookup_table!( INV_TXFM_HBD_FNS_10: [[[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]], default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], [SSE2, SSE4_1, AVX2, AVX512ICL] ); impl_itx_hbd_fns!( // 32x (DCT and IDTX swapped due to incomplete DCT implementation) [(TxType::IDTX, identity, identity)], [(32, 32), (32, 16), (16, 32)], [(TxType::DCT_DCT, dct, dct)], [(32, 8), (8, 32)], // 16x16 [ (TxType::DCT_ADST, dct, adst), (TxType::ADST_DCT, adst, dct), (TxType::DCT_FLIPADST, dct, flipadst), (TxType::FLIPADST_DCT, flipadst, dct), (TxType::V_DCT, dct, identity), (TxType::H_DCT, identity, dct), (TxType::ADST_ADST, adst, adst), (TxType::ADST_FLIPADST, adst, flipadst), (TxType::FLIPADST_ADST, flipadst, adst), (TxType::FLIPADST_FLIPADST, flipadst, flipadst) ], [(16, 16)], // 8x, 4x and 16x (minus 16x16) [ (TxType::V_ADST, adst, identity), (TxType::H_ADST, identity, adst), (TxType::V_FLIPADST, flipadst, identity), (TxType::H_FLIPADST, identity, flipadst) ], [(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)], // 4x4 [], [], _12_, [(12, avx2, AVX2)] ); const INV_TXFM_HBD_FNS_12_AVX2: [[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] = merge_hbd_fns(INV_TXFM_HBD_FNS_12__AVX2, INV_TXFM_HBD_FNS_16_AVX2); cpu_function_lookup_table!( INV_TXFM_HBD_FNS_12: [[[Option; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]], default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL], [SSE2, AVX2] ); rav1e-0.7.1/src/asm/x86/transform/mod.rs000064400000000000000000000010541046102023000160030ustar 00000000000000// Copyright (c) 2019, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. 
If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub mod forward; pub mod inverse; rav1e-0.7.1/src/bin/common.rs000064400000000000000000000553601046102023000140550ustar 00000000000000// Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::error::*; use crate::muxer::{create_muxer, Muxer}; use crate::stats::MetricsEnabled; use crate::{ColorPrimaries, MatrixCoefficients, TransferCharacteristics}; use clap::{CommandFactory, Parser as Clap, Subcommand}; use clap_complete::{generate, Shell}; use once_cell::sync::Lazy; use rav1e::prelude::*; use scan_fmt::scan_fmt; use rav1e::config::CpuFeatureLevel; use std::fs::File; use std::io; use std::io::prelude::*; use std::path::PathBuf; pub mod built_info { // The file has been placed there by the build script. include!(concat!(env!("OUT_DIR"), "/built.rs")); } #[derive(Clap)] #[clap( name = "rav1e", version = get_version(), long_version = get_long_version(), about = "AV1 video encoder", long_about = None )] pub struct CliOptions { /// Uncompressed YUV4MPEG2 video input #[clap(value_parser, help_heading = "INPUT/OUTPUT")] pub input: PathBuf, /// Compressed AV1 in IVF video output #[clap(long, short, value_parser, help_heading = "INPUT/OUTPUT")] pub output: PathBuf, /// Overwrite output file. #[clap(short = 'y', help_heading = "INPUT/OUTPUT")] pub overwrite: bool, /// Set the threadpool size. If 0, will use the number of logical CPUs. /// rav1e will use up to this many threads. Additional tiles may be needed to /// increase thread utilization. #[clap(long, value_parser, default_value_t = 0, help_heading = "THREADING")] pub threads: usize, /// Number of tile rows. Must be a power of 2. rav1e may override this based on video resolution. #[clap(long, value_parser, default_value_t = 0, help_heading = "THREADING")] pub tile_rows: usize, /// Number of tile columns. Must be a power of 2. rav1e may override this based on video resolution. #[clap(long, value_parser, default_value_t = 0, help_heading = "THREADING")] pub tile_cols: usize, /// Number of tiles. Tile-cols and tile-rows are overridden /// so that the video has at least this many tiles. 
#[clap( long, value_parser, conflicts_with = "tile_rows", conflicts_with = "tile_cols", help_heading = "THREADING" )] pub tiles: Option, /// Maximum number of GOPs that can be encoded in parallel #[cfg(feature = "unstable")] #[clap(long, value_parser, default_value_t = 0, help_heading = "THREADING")] pub slots: usize, /// Perform the first pass of a two-pass encode, /// saving the pass data to the specified file for future passes #[clap( long, value_parser, value_name = "STATS_FILE", help_heading = "ENCODE SETTINGS" )] pub first_pass: Option, /// Perform the second pass of a two-pass encode, /// reading the pass data saved from a previous pass from the specified file #[clap( long, value_parser, value_name = "STATS_FILE", help_heading = "ENCODE SETTINGS" )] pub second_pass: Option, /// Maximum number of frames to encode #[clap( long, short, value_parser, default_value_t = 0, help_heading = "ENCODE SETTINGS" )] pub limit: usize, /// Skip n number of frames and encode #[clap( long, value_parser, default_value_t = 0, help_heading = "ENCODE SETTINGS" )] pub skip: usize, /// Quantizer (0-255), smaller values are higher quality [default: 100] #[clap(long, value_parser, help_heading = "ENCODE SETTINGS")] pub quantizer: Option, /// Minimum quantizer (0-255) to use in bitrate mode [default: 0] #[clap(long, value_parser, help_heading = "ENCODE SETTINGS")] pub min_quantizer: Option, /// Bitrate (kbps) #[clap(long, short, value_parser, help_heading = "ENCODE SETTINGS")] pub bitrate: Option, /// Speed level (0 is best quality, 10 is fastest). /// Speeds 10 and 0 are extremes and are generally not recommended. #[clap(long, short, value_parser = clap::value_parser!(u8).range(0..=10), default_value_t = 6, help_heading = "ENCODE SETTINGS", long_help = build_speed_long_help())] pub speed: u8, /// Speed level for scene-change detection, 0: best quality, 1: fastest mode. /// [default: 0 for s0-s9, 1 for s10] #[clap(long, value_parser = clap::value_parser!(u8).range(0..=1), help_heading = "ENCODE SETTINGS")] pub scd_speed: Option, /// Minimum interval between keyframes #[clap( long, short = 'i', value_parser, default_value_t = 12, help_heading = "ENCODE SETTINGS" )] pub min_keyint: u64, /// Maximum interval between keyframes. When set to 0, disables fixed-interval keyframes. #[clap( long, short = 'I', value_parser, default_value_t = 240, help_heading = "ENCODE SETTINGS" )] pub keyint: u64, /// Maximum interval between switch frames. When set to 0, disables switch frames. #[clap( long, short = 'S', value_parser, default_value_t = 0, help_heading = "ENCODE SETTINGS" )] pub switch_frame_interval: u64, /// "Number of frames over which rate control should distribute the reservoir /// [default: min(240, 1.5x keyint)] /// A minimum value of 12 is enforced. #[clap(long, value_parser = clap::value_parser!(i32).range(12..), help_heading = "ENCODE SETTINGS")] pub reservoir_frame_delay: Option, /// Low latency mode; disables frame reordering. /// Has a significant speed-to-quality trade-off #[clap(long, help_heading = "ENCODE SETTINGS")] pub low_latency: bool, /// Disables scene detection entirely. /// Has a significant speed-to-quality trade-off in full encodes. /// Useful for chunked encoding. 
#[clap(long, help_heading = "ENCODE SETTINGS")] pub no_scene_detection: bool, /// Number of frames encoder should lookahead for RDO purposes\n\ /// [default value for speed levels: 10,9 - 10; 8,7,6 - 20; 5,4,3 - 30; 2,1,0 - 40] #[clap(long, value_parser, help_heading = "ENCODE SETTINGS")] pub rdo_lookahead_frames: Option, /// Quality tuning #[clap(long, value_parser, default_value_t = Tune::Psychovisual, help_heading = "ENCODE SETTINGS")] pub tune: Tune, /// Still picture mode #[clap(long, help_heading = "ENCODE SETTINGS")] pub still_picture: bool, /// Uses grain synthesis to add photon noise to the resulting encode. /// Takes a strength value 0-64. #[clap( long, conflicts_with = "film_grain_table", value_parser = clap::value_parser!(u8).range(0..=64), default_value_t = 0, help_heading = "ENCODE SETTINGS" )] pub photon_noise: u8, /// Uses a film grain table file to apply grain synthesis to the encode. /// Uses the same table file format as aomenc and svt-av1. #[clap( long, alias = "photon-noise-table", value_parser, help_heading = "ENCODE SETTINGS" )] pub film_grain_table: Option, /// Force the high bitdepth codepath even for 8bit content. /// Mainly for debugging purposes. #[clap(long, help_heading = "ENCODE SETTINGS")] pub high_bitdepth: bool, /// Pixel range #[clap(long, value_parser, help_heading = "VIDEO METADATA")] pub range: Option, /// Color primaries used to describe color parameters #[clap(long, value_parser, help_heading = "VIDEO METADATA")] pub primaries: Option, /// Transfer characteristics used to describe color parameters #[clap(long, value_parser, help_heading = "VIDEO METADATA")] pub transfer: Option, /// Matrix coefficients used to describe color parameters #[clap(long, value_parser, help_heading = "VIDEO METADATA")] pub matrix: Option, /// Mastering display primaries in the form of G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min) #[clap(long, help_heading = "VIDEO METADATA")] pub mastering_display: Option, /// Content light level used to describe content luminosity (cll,fall) #[clap(long, help_heading = "VIDEO METADATA")] pub content_light: Option, /// AV1 level to target in the form ., e.g. 3.1. 
/// Specify "unconstrained" for no level constraints or "auto" to let /// the encoder choose (default) #[clap(long, help_heading = "LEVEL")] pub level: Option, /// Constant frame rate to set at the output (inferred from input when omitted) #[clap(long, value_parser, help_heading = "VIDEO METADATA")] pub frame_rate: Option, /// The time scale associated with the frame rate if provided (ignored otherwise) #[clap( long, value_parser, default_value_t = 0, help_heading = "VIDEO METADATA" )] pub time_scale: u64, /// Provide a benchmark report at the end of the encoding #[clap(long, help_heading = "DEBUGGING")] pub benchmark: bool, /// Verbose logging; outputs info for every frame #[clap(long, short, help_heading = "DEBUGGING")] pub verbose: bool, /// Do not output any status message #[clap(long, short, conflicts_with = "verbose", help_heading = "DEBUGGING")] pub quiet: bool, /// Calculate and display PSNR metrics #[clap(long, help_heading = "DEBUGGING")] pub psnr: bool, /// Calculate and display several metrics including PSNR, SSIM, CIEDE2000 etc #[clap(long, conflicts_with = "psnr", help_heading = "DEBUGGING")] pub metrics: bool, /// Outputs a Y4M file containing the output from the decoder #[clap(long, short, value_parser, help_heading = "DEBUGGING")] pub reconstruction: Option, #[clap(subcommand)] pub command: Option, } fn get_version() -> &'static str { static VERSION_STR: Lazy = Lazy::new(|| { format!( "{} ({})", rav1e::version::full(), // We cannot use `built_info::DEBUG` because that tells us if there are debug symbols, // not if there are optimizations. if cfg!(debug_assertions) { "debug" } else { "release" } ) }); &VERSION_STR } fn get_long_version() -> &'static str { static LONG_VERSION_STR: Lazy = Lazy::new(|| { let mut rustflags = env!("CARGO_ENCODED_RUSTFLAGS"); if rustflags.trim().is_empty() { rustflags = "(None)"; } format!( "{}\n{} {}\nCompiled CPU Features: {}\nRuntime Assembly Support: {}{}\nThreading: {}\nUnstable Features: {}\nCompiler Flags: {}", get_version(), built_info::RUSTC_VERSION, built_info::TARGET, option_env!("CARGO_CFG_TARGET_FEATURE").unwrap_or("(None)"), if cfg!(feature = "asm") { "Enabled" } else { "Disabled" }, if cfg!(feature = "asm") { format!("\nRuntime Assembly Level: {}", CpuFeatureLevel::default()) } else { String::new() }, if cfg!(feature = "threading") { "Enabled" } else { "Disabled" }, if cfg!(feature = "unstable") { "Enabled" } else { "Disabled" }, rustflags ) }); &LONG_VERSION_STR } #[derive(Subcommand)] pub enum Commands { /// Advanced features Advanced { /// Output to stdout the completion definition for the shell #[clap(long, short, value_parser)] completion: Option, /// Save the current configuration in a toml file #[clap(long, short, value_parser)] save_config: Option, /// Load the encoder configuration from a toml file #[clap(long, short, value_parser, conflicts_with = "save-config")] load_config: Option, }, } pub struct EncoderIO { pub input: Box, pub output: Box, pub rec: Option>, } #[derive(Clone, Copy, PartialEq, Eq)] pub enum Verboseness { Quiet, Normal, Verbose, } pub struct ParsedCliOptions { pub io: EncoderIO, pub enc: EncoderConfig, pub limit: usize, pub color_range_specified: bool, pub override_time_base: bool, pub skip: usize, pub verbose: Verboseness, pub benchmark: bool, pub threads: usize, pub metrics_enabled: MetricsEnabled, pub pass1file_name: Option, pub pass2file_name: Option, pub save_config: Option, pub photon_noise: u8, #[cfg(feature = "unstable")] pub slots: usize, pub force_highbitdepth: bool, } #[cfg(feature = 
"serialize")] static HELP_TEXT: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); #[cfg(feature = "serialize")] fn build_speed_long_help() -> Option<&'static str> { let help = HELP_TEXT.get_or_init(|| { let levels = (0..=10) .map(|speed| { let s = SpeedSettings::from_preset(speed); let o = crate::kv::to_string(&s).unwrap().replace(", ", "\n "); format!("{:2} :\n {}", speed, o) }) .collect::>() .join("\n"); format!( "Speed level (0 is best quality, 10 is fastest)\n\ Speeds 10 and 0 are extremes and are generally not recommended\n\ {}", levels ) }); Some(&help) } #[cfg(not(feature = "serialize"))] #[allow(clippy::missing_const_for_fn)] fn build_speed_long_help() -> Option<&'static str> { Some( "Speed level (0 is best quality, 10 is fastest)\n\ Speeds 10 and 0 are extremes and are generally not recommended", ) } #[allow(unused_mut)] /// Only call this once at the start of the app, /// otherwise bad things will happen. pub fn parse_cli() -> Result { let matches = CliOptions::parse(); let mut save_config_path = None; let mut enc = None; if let Some(command) = matches.command.as_ref() { match command { Commands::Advanced { completion, save_config, load_config } => { if let Some(shell) = completion { let mut app = CliOptions::command(); let app_name = app.get_name().to_string(); generate(*shell, &mut app, app_name, &mut std::io::stdout()); std::process::exit(0); } #[cfg(feature = "serialize")] { save_config_path = save_config.clone(); if let Some(load_config) = load_config { let mut config = String::new(); File::open(load_config) .and_then(|mut f| f.read_to_string(&mut config)) .map_err(|e| e.context("Cannot open the configuration file"))?; enc = Some(toml::from_str(&config).unwrap()); } } #[cfg(not(feature = "serialize"))] { if save_config.is_some() || load_config.is_some() { let e: io::Error = io::ErrorKind::InvalidInput.into(); return Err(e.context( "The load/save config advanced option requires the `serialize` feature, rebuild adding it.", )); } } } } } let rec = match matches.reconstruction.as_ref() { Some(f) => Some(Box::new( File::create(f) .map_err(|e| e.context("Cannot create reconstruction file"))?, ) as Box), None => None, }; let os_input = &matches.input; let io = EncoderIO { input: match os_input.to_str() { Some("-") => Box::new(io::stdin()) as Box, _ => Box::new( File::open(os_input) .map_err(|e| e.context("Cannot open input file"))?, ) as Box, }, output: create_muxer(&matches.output, matches.overwrite)?, rec, }; let enc = enc.map_or_else(|| parse_config(&matches), Ok)?; let verbose = if matches.quiet { Verboseness::Quiet } else if matches.verbose { Verboseness::Verbose } else { Verboseness::Normal }; let metrics_enabled = if matches.metrics { MetricsEnabled::All } else if matches.psnr { MetricsEnabled::Psnr } else { MetricsEnabled::None }; let limit = matches.limit; if enc.still_picture && limit > 1 { panic!("A limit cannot be set above 1 in still picture mode"); } #[cfg(feature = "unstable")] let slots = matches.slots; Ok(ParsedCliOptions { io, enc, limit, color_range_specified: matches.range.is_some(), override_time_base: matches.frame_rate.is_some(), metrics_enabled, skip: matches.skip, benchmark: matches.benchmark, verbose, threads: matches.threads, pass1file_name: matches.first_pass.clone(), pass2file_name: matches.second_pass.clone(), save_config: save_config_path, photon_noise: matches.photon_noise, force_highbitdepth: matches.high_bitdepth, #[cfg(feature = "unstable")] slots, }) } fn parse_config(matches: &CliOptions) -> Result { let maybe_quantizer = 
matches.quantizer; let maybe_bitrate = matches.bitrate; let quantizer = maybe_quantizer.unwrap_or_else(|| { if maybe_bitrate.is_some() { // If a bitrate is specified, the quantizer is the maximum allowed (e.g., // the minimum quality allowed), which by default should be // unconstrained. 255 } else { 100 } }) as usize; let bitrate: i32 = maybe_bitrate.unwrap_or(0); if bitrate <= 0 && (matches.first_pass.is_some() || matches.second_pass.is_some()) { panic!("A target bitrate must be specified when using passes"); } if quantizer == 0 { unimplemented!("Lossless encoding not yet implemented"); } else if quantizer > 255 { panic!("Quantizer must be between 0-255"); } let speed = matches.speed; let scene_detection_speed = matches.scd_speed; let max_interval = matches.keyint; let min_interval = matches.min_keyint.min(max_interval); if speed > 10 { panic!("Speed must be between 0-10"); } else if min_interval > max_interval { panic!("Maximum keyframe interval must be greater than or equal to minimum keyframe interval"); } let color_primaries = matches.primaries.unwrap_or_default(); let transfer_characteristics = matches.transfer.unwrap_or_default(); let matrix_coefficients = matches.matrix.unwrap_or_default(); let mut cfg = EncoderConfig::with_speed_preset(speed); if let Some(level_str) = &matches.level { cfg.level_idx = match level_str.as_str() { "auto" => None, "unconstrained" => Some(31), _ => { let (major, minor) = scan_fmt!(level_str, "{}.{}", u8, u8) .expect("Could not parse AV1 level"); if major > 7 || minor > 3 { panic!("Invalid AV1 level") } Some(((major - 2) << 2) + minor) } }; }; if let Some(scd_speed) = scene_detection_speed { cfg.speed_settings.scene_detection_mode = if scd_speed == 0 { SceneDetectionSpeed::Standard } else { SceneDetectionSpeed::Fast }; } cfg.set_key_frame_interval(min_interval, max_interval); cfg.switch_frame_interval = matches.switch_frame_interval; cfg.pixel_range = matches.range.unwrap_or_default(); cfg.color_description = if color_primaries == ColorPrimaries::Unspecified && transfer_characteristics == TransferCharacteristics::Unspecified && matrix_coefficients == MatrixCoefficients::Unspecified { // No need to set a color description with all parameters unspecified. None } else { Some(ColorDescription { color_primaries, transfer_characteristics, matrix_coefficients, }) }; cfg.mastering_display = matches.mastering_display.as_ref().map(|mastering_display| { let (g_x, g_y, b_x, b_y, r_x, r_y, wp_x, wp_y, max_lum, min_lum) = scan_fmt!( mastering_display, "G({},{})B({},{})R({},{})WP({},{})L({},{})", f64, f64, f64, f64, f64, f64, f64, f64, f64, f64 ) .expect("Cannot parse the mastering display option"); /* AV1 spec sec. 6.7.4 "Metadata high dynamic range mastering display color volume semantics" * specifies chromaticity coords as 0.16 fixed-point numbers, which have a max float value * of 0.9999847412109375 (rounding to 1). */ let chromaticity_range = 0.0..=1.0; if !chromaticity_range.contains(&g_x) || !chromaticity_range.contains(&g_y) || !chromaticity_range.contains(&b_x) || !chromaticity_range.contains(&b_y) || !chromaticity_range.contains(&r_x) || !chromaticity_range.contains(&r_y) || !chromaticity_range.contains(&wp_x) || !chromaticity_range.contains(&wp_y) { warn!( "Chromaticity coordinates will be trimmed to the range 0.0 to 1.0 (see AV1 spec sec. 6.7.4)." 
); } MasteringDisplay { primaries: [ ChromaticityPoint { x: (r_x * ((1 << 16) as f64)).round() as u16, y: (r_y * ((1 << 16) as f64)).round() as u16, }, ChromaticityPoint { x: (g_x * ((1 << 16) as f64)).round() as u16, y: (g_y * ((1 << 16) as f64)).round() as u16, }, ChromaticityPoint { x: (b_x * ((1 << 16) as f64)).round() as u16, y: (b_y * ((1 << 16) as f64)).round() as u16, }, ], white_point: ChromaticityPoint { x: (wp_x * ((1 << 16) as f64)).round() as u16, y: (wp_y * ((1 << 16) as f64)).round() as u16, }, max_luminance: (max_lum * ((1 << 8) as f64)).round() as u32, min_luminance: (min_lum * ((1 << 14) as f64)).round() as u32, } }); cfg.content_light = matches.content_light.as_ref().and_then(|content_light| { let (cll, fall) = scan_fmt!(content_light, "{},{}", u16, u16) .expect("Cannot parse the content light option"); if cll == 0 && fall == 0 { None } else { Some(ContentLight { max_content_light_level: cll, max_frame_average_light_level: fall, }) } }); cfg.still_picture = matches.still_picture; cfg.quantizer = quantizer; cfg.min_quantizer = matches.min_quantizer.unwrap_or(0); cfg.bitrate = bitrate.checked_mul(1000).expect("Bitrate too high"); cfg.reservoir_frame_delay = matches.reservoir_frame_delay; if let Some(rdo_frames) = matches.rdo_lookahead_frames { cfg.speed_settings.rdo_lookahead_frames = rdo_frames; } cfg.tune = matches.tune; if cfg.tune == Tune::Psychovisual { cfg.speed_settings.transform.tx_domain_distortion = false; } cfg.tile_cols = matches.tile_cols; cfg.tile_rows = matches.tile_rows; cfg.tiles = matches.tiles.unwrap_or(0); if cfg.tile_cols > 64 || cfg.tile_rows > 64 { panic!("Tile columns and rows may not be greater than 64"); } if let Some(table_file) = matches.film_grain_table.as_ref() { let contents = std::fs::read_to_string(table_file) .expect("Failed to read film grain table file"); let table = av1_grain::parse_grain_table(&contents) .expect("Failed to parse film grain table"); if !table.is_empty() { cfg.film_grain_params = Some(table); } } if let Some(frame_rate) = matches.frame_rate { cfg.time_base = Rational::new(matches.time_scale, frame_rate); } cfg.low_latency = matches.low_latency; // Disables scene_detection if matches.no_scene_detection { cfg.speed_settings.scene_detection_mode = SceneDetectionSpeed::None; } Ok(cfg) } rav1e-0.7.1/src/bin/decoder/mod.rs000064400000000000000000000033171046102023000147440ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
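// Editor's note: the sketch below is illustrative and not part of the original
// crate. It restates the fixed-point conversion that parse_config() in
// common.rs (above) applies to mastering-display chromaticity coordinates:
// AV1 stores them as 0.16 fixed-point values (AV1 spec sec. 6.7.4), so each
// coordinate is multiplied by 1 << 16 and rounded; values at or above 1.0
// saturate to u16::MAX under Rust's float-to-int cast.
#[cfg(test)]
mod chromaticity_fixed_point_sketch {
  // Hypothetical helper name; the real code inlines this expression per coordinate.
  fn to_0_16_fixed(coord: f64) -> u16 {
    (coord * f64::from(1u32 << 16)).round() as u16
  }

  #[test]
  fn d65_white_point_converts_as_expected() {
    // 0.3127 * 65536 = 20493.11..., which rounds to 20493.
    assert_eq!(to_0_16_fixed(0.3127), 20493);
    // 0.3290 * 65536 = 21561.34..., which rounds to 21561.
    assert_eq!(to_0_16_fixed(0.3290), 21561);
  }
}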
use rav1e::prelude::*; use std::io; pub mod y4m; pub trait FrameBuilder { fn new_frame(&self) -> Frame; } pub trait Decoder: Send { fn get_video_details(&self) -> VideoDetails; fn read_frame>( &mut self, ctx: &F, cfg: &VideoDetails, ) -> Result, DecodeError>; } #[derive(Debug)] #[allow(clippy::upper_case_acronyms)] pub enum DecodeError { EOF, BadInput, UnknownColorspace, ParseError, IoError(io::Error), MemoryLimitExceeded, } #[derive(Debug, Clone, Copy)] pub struct VideoDetails { pub width: usize, pub height: usize, pub sample_aspect_ratio: Rational, pub bit_depth: usize, pub chroma_sampling: ChromaSampling, pub chroma_sample_position: ChromaSamplePosition, pub time_base: Rational, } impl Default for VideoDetails { fn default() -> Self { VideoDetails { width: 640, height: 480, sample_aspect_ratio: Rational { num: 1, den: 1 }, bit_depth: 8, chroma_sampling: ChromaSampling::Cs420, chroma_sample_position: ChromaSamplePosition::Unknown, time_base: Rational { num: 30, den: 1 }, } } } rav1e-0.7.1/src/bin/decoder/y4m.rs000064400000000000000000000071651046102023000147030ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use std::io::Read; use crate::color::ChromaSampling::Cs400; use crate::decoder::{DecodeError, Decoder, FrameBuilder, VideoDetails}; use crate::Frame; use rav1e::prelude::*; impl Decoder for y4m::Decoder> { fn get_video_details(&self) -> VideoDetails { let width = self.get_width(); let height = self.get_height(); let aspect_ratio = self.get_pixel_aspect(); let color_space = self.get_colorspace(); let bit_depth = color_space.get_bit_depth(); let (chroma_sampling, chroma_sample_position) = map_y4m_color_space(color_space); let framerate = self.get_framerate(); let time_base = Rational::new(framerate.den as u64, framerate.num as u64); VideoDetails { width, height, sample_aspect_ratio: if aspect_ratio.num == 0 && aspect_ratio.den == 0 { Rational::new(1, 1) } else { Rational::new(aspect_ratio.num as u64, aspect_ratio.den as u64) }, bit_depth, chroma_sampling, chroma_sample_position, time_base, } } fn read_frame>( &mut self, ctx: &F, cfg: &VideoDetails, ) -> Result, DecodeError> { let bytes = self.get_bytes_per_sample(); self .read_frame() .map(|frame| { let mut f = ctx.new_frame(); let (chroma_width, _) = cfg.chroma_sampling.get_chroma_dimensions(cfg.width, cfg.height); f.planes[0].copy_from_raw_u8( frame.get_y_plane(), cfg.width * bytes, bytes, ); if cfg.chroma_sampling != Cs400 { f.planes[1].copy_from_raw_u8( frame.get_u_plane(), chroma_width * bytes, bytes, ); f.planes[2].copy_from_raw_u8( frame.get_v_plane(), chroma_width * bytes, bytes, ); } f }) .map_err(Into::into) } } impl From for DecodeError { fn from(e: y4m::Error) -> DecodeError { match e { y4m::Error::EOF => DecodeError::EOF, y4m::Error::BadInput => DecodeError::BadInput, y4m::Error::UnknownColorspace => DecodeError::UnknownColorspace, y4m::Error::ParseError(_) => DecodeError::ParseError, y4m::Error::IoError(e) => DecodeError::IoError(e), // Note that this error code 
has nothing to do with the system running out of memory, // it means the y4m decoder has exceeded its memory allocation limit. y4m::Error::OutOfMemory => DecodeError::MemoryLimitExceeded, } } } pub const fn map_y4m_color_space( color_space: y4m::Colorspace, ) -> (ChromaSampling, ChromaSamplePosition) { use crate::ChromaSamplePosition::*; use crate::ChromaSampling::*; use y4m::Colorspace::*; match color_space { Cmono | Cmono12 => (Cs400, Unknown), C420jpeg | C420paldv => (Cs420, Unknown), C420mpeg2 => (Cs420, Vertical), C420 | C420p10 | C420p12 => (Cs420, Colocated), C422 | C422p10 | C422p12 => (Cs422, Colocated), C444 | C444p10 | C444p12 => (Cs444, Colocated), _ => unimplemented!(), } } rav1e-0.7.1/src/bin/error.rs000064400000000000000000000031761046102023000137140ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #[derive(Debug, thiserror::Error)] pub enum CliError { #[error("Cannot parse option `{opt}`: {err}")] ParseInt { opt: String, err: std::num::ParseIntError }, #[error("{msg}")] Message { msg: String }, #[error("{msg}: {e}")] Generic { msg: String, e: String }, } impl CliError { pub fn new(msg: &str) -> CliError { CliError::Message { msg: msg.to_owned() } } } pub trait ToError: std::error::Error + Sized { fn context(self, msg: &str) -> CliError { CliError::Generic { msg: msg.to_owned(), e: self.to_string() } } } impl ToError for std::num::ParseIntError { fn context(self, opt: &str) -> CliError { CliError::ParseInt { opt: opt.to_lowercase(), err: self } } } impl ToError for std::io::Error {} impl ToError for rav1e::InvalidConfig {} impl ToError for rav1e::EncoderStatus {} impl ToError for rav1e::config::RateControlError {} pub fn print_error(e: &dyn std::error::Error) { error!("{}", e); let mut cause = e.source(); while let Some(e) = cause { error!("Caused by: {}", e); cause = e.source(); } } rav1e-0.7.1/src/bin/kv.rs000064400000000000000000000125741046102023000132050ustar 00000000000000#![allow(unused_variables)] use serde::{ser, Serialize, Serializer}; use thiserror::*; use std::fmt; struct KVString { output: String, } #[derive(Copy, Clone, PartialEq, Eq, Debug, Error)] enum Error { #[error("unsupported")] Unsupported, } impl ser::Error for Error { #[cold] fn custom(msg: T) -> Error { Error::Unsupported } } /// Serialize a configuration as a Key-Value string. 
pub fn to_string(value: &T) -> Result where T: Serialize, { let mut serializer = KVString { output: String::new() }; value.serialize(&mut serializer).map_err(|_| ())?; Ok(serializer.output) } impl Serializer for &mut KVString { type Ok = (); type Error = Error; type SerializeSeq = ser::Impossible<(), Self::Error>; type SerializeTuple = ser::Impossible<(), Self::Error>; type SerializeTupleStruct = ser::Impossible<(), Self::Error>; type SerializeTupleVariant = ser::Impossible<(), Self::Error>; type SerializeMap = ser::Impossible<(), Self::Error>; type SerializeStruct = Self; type SerializeStructVariant = ser::Impossible<(), Self::Error>; fn serialize_bool(self, v: bool) -> Result { self.output += if v { "true" } else { "false" }; Ok(()) } fn serialize_i8(self, v: i8) -> Result<(), Self::Error> { self.serialize_i64(i64::from(v)) } fn serialize_i16(self, v: i16) -> Result<(), Self::Error> { self.serialize_i64(i64::from(v)) } fn serialize_i32(self, v: i32) -> Result<(), Self::Error> { self.serialize_i64(i64::from(v)) } fn serialize_i64(self, v: i64) -> Result<(), Self::Error> { self.output += &v.to_string(); Ok(()) } fn serialize_u8(self, v: u8) -> Result<(), Self::Error> { self.serialize_u64(u64::from(v)) } fn serialize_u16(self, v: u16) -> Result<(), Self::Error> { self.serialize_u64(u64::from(v)) } fn serialize_u32(self, v: u32) -> Result<(), Self::Error> { self.serialize_u64(u64::from(v)) } fn serialize_u64(self, v: u64) -> Result<(), Self::Error> { self.output += &v.to_string(); Ok(()) } fn serialize_f32(self, v: f32) -> Result<(), Self::Error> { self.serialize_f64(f64::from(v)) } fn serialize_f64(self, v: f64) -> Result<(), Self::Error> { self.output += &v.to_string(); Ok(()) } fn serialize_char(self, v: char) -> Result { unimplemented!() } fn serialize_str(self, v: &str) -> Result { self.output += v; Ok(()) } fn serialize_bytes(self, v: &[u8]) -> Result { unimplemented!() } fn serialize_none(self) -> Result { self.output += "None"; Ok(()) } fn serialize_some( self, value: &T, ) -> Result where T: Serialize, { value.serialize(self) } fn serialize_unit(self) -> Result { self.output += ""; Ok(()) } fn serialize_unit_struct( self, name: &'static str, ) -> Result { self.serialize_unit() } fn serialize_unit_variant( self, name: &'static str, variant_index: u32, variant: &'static str, ) -> Result { self.serialize_str(variant) } fn serialize_newtype_struct( self, name: &'static str, value: &T, ) -> Result where T: Serialize, { unimplemented!() } fn serialize_newtype_variant( self, name: &'static str, variant_index: u32, variant: &'static str, value: &T, ) -> Result where T: Serialize, { unimplemented!() } fn serialize_seq( self, len: Option, ) -> Result { unimplemented!() } fn serialize_tuple( self, len: usize, ) -> Result { unimplemented!() } fn serialize_tuple_struct( self, name: &'static str, len: usize, ) -> Result { unimplemented!() } fn serialize_tuple_variant( self, name: &'static str, variant_index: u32, variant: &'static str, len: usize, ) -> Result { unimplemented!() } fn serialize_map( self, len: Option, ) -> Result { unimplemented!() } fn serialize_struct( self, name: &'static str, len: usize, ) -> Result { Ok(self) } fn serialize_struct_variant( self, name: &'static str, variant_index: u32, variant: &'static str, len: usize, ) -> Result { unimplemented!() } } impl<'a> ser::SerializeStruct for &mut KVString { type Ok = (); type Error = Error; // Assume a single flat struct for now fn serialize_field( &mut self, key: &'static str, value: &T, ) -> Result<(), Self::Error> where T: 
?Sized + Serialize, { if !self.output.is_empty() { self.output += ", " } self.output += key; self.output += " = "; value.serialize(&mut **self) } fn end(self) -> Result<(), Self::Error> { Ok(()) } } #[cfg(test)] mod test { use rav1e::prelude::SpeedSettings; #[test] fn serialize_speed_settings() { for preset in 0..=10 { let s = SpeedSettings::from_preset(preset); let out = super::to_string(&s).unwrap(); println!("preset {}: {}", preset, out); } } } rav1e-0.7.1/src/bin/muxer/ivf.rs000064400000000000000000000051531046102023000145040ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::Muxer; use crate::error::*; use ivf::*; use rav1e::prelude::*; use std::fs; use std::fs::File; use std::io; use std::io::Write; use std::path::Path; pub struct IvfMuxer { output: Box, } impl Muxer for IvfMuxer { fn write_header( &mut self, width: usize, height: usize, framerate_num: usize, framerate_den: usize, ) { write_ivf_header( &mut self.output, width, height, framerate_num, framerate_den, ); } #[profiling::function] fn write_frame(&mut self, pts: u64, data: &[u8], _frame_type: FrameType) { write_ivf_frame(&mut self.output, pts, data); } fn flush(&mut self) -> io::Result<()> { self.output.flush() } } cfg_if::cfg_if! { if #[cfg(unix)] { use std::os::unix::fs::*; fn is_file>(path: P) -> bool { fs::metadata(path).map(|meta| { !meta.file_type().is_char_device() && !meta.file_type().is_socket() }).unwrap_or(false) } } else { fn is_file>(path: P) -> bool { fs::metadata(path).is_ok() } } } impl IvfMuxer { pub fn check_file>(path: P) -> Result<(), CliError> { if is_file(path.as_ref()) { eprint!( "File '{}' already exists. Overwrite ? [y/N] ", path.as_ref().display() ); io::stdout().flush().unwrap(); let mut option_input = String::new(); io::stdin() .read_line(&mut option_input) .expect("Failed to read option."); match option_input.as_str().trim() { "y" | "Y" => return Ok(()), _ => return Err(CliError::new("Not overwriting, exiting.")), }; } Ok(()) } pub fn open>( path: P, ) -> Result, CliError> { let ivf = IvfMuxer { output: match path.as_ref().to_str() { Some("-") => Box::new(std::io::stdout()), _ => Box::new( File::create(path) .map_err(|e| e.context("Cannot open output file"))?, ), }, }; Ok(Box::new(ivf)) } } rav1e-0.7.1/src/bin/muxer/mod.rs000064400000000000000000000030271046102023000144750ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
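// Editor's note (illustrative, not part of the original source): the
// kv::to_string serializer defined above flattens a single flat struct into a
// "key = value, key = value" string; sequences, maps, tuples and newtype
// values are not handled and hit unimplemented!(). Assuming a serde-derived
// struct such as
//
//   #[derive(serde::Serialize)]
//   struct Example { speed: u8, low_latency: bool }
//
// kv::to_string(&Example { speed: 6, low_latency: false }) would return
// Ok("speed = 6, low_latency = false".to_string()).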
mod ivf; use self::ivf::IvfMuxer; mod y4m; pub use self::y4m::write_y4m_frame; use rav1e::prelude::*; use std::ffi::OsStr; use std::io; use std::path::Path; use crate::error::*; pub trait Muxer: Send { fn write_header( &mut self, width: usize, height: usize, framerate_num: usize, framerate_den: usize, ); fn write_frame(&mut self, pts: u64, data: &[u8], frame_type: FrameType); fn flush(&mut self) -> io::Result<()>; } pub fn create_muxer>( path: P, overwrite: bool, ) -> Result, CliError> { if !overwrite { IvfMuxer::check_file(path.as_ref())?; } if let Some(path) = path.as_ref().to_str() { if path == "-" { return IvfMuxer::open(path); } } let ext = path .as_ref() .extension() .and_then(OsStr::to_str) .map(str::to_lowercase) .unwrap_or_else(|| "ivf".into()); match &ext[..] { "ivf" => IvfMuxer::open(path), _e => { panic!("{ext} is not a supported extension, please change to .ivf"); } } } rav1e-0.7.1/src/bin/muxer/y4m.rs000064400000000000000000000064141046102023000144320ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::decoder::VideoDetails; use rav1e::prelude::*; use std::io::Write; use std::slice; #[profiling::function] pub fn write_y4m_frame( y4m_enc: &mut y4m::Encoder>, rec: &Frame, y4m_details: VideoDetails, ) { let planes = if y4m_details.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; let bytes_per_sample = if y4m_details.bit_depth > 8 { 2 } else { 1 }; let (chroma_width, chroma_height) = y4m_details .chroma_sampling .get_chroma_dimensions(y4m_details.width, y4m_details.height); let pitch_y = y4m_details.width * bytes_per_sample; let pitch_uv = chroma_width * bytes_per_sample; let (mut rec_y, mut rec_u, mut rec_v) = ( vec![128u8; pitch_y * y4m_details.height], vec![128u8; pitch_uv * chroma_height], vec![128u8; pitch_uv * chroma_height], ); let (stride_y, stride_u, stride_v) = ( rec.planes[0].cfg.stride, rec.planes[1].cfg.stride, rec.planes[2].cfg.stride, ); for (line, line_out) in rec.planes[0].data_origin().chunks(stride_y).zip(rec_y.chunks_mut(pitch_y)) { if y4m_details.bit_depth > 8 { // SAFETY: This is essentially doing a transmute to u16, but safer. unsafe { line_out.copy_from_slice(slice::from_raw_parts::( line.as_ptr() as *const u8, pitch_y, )); } } else { line_out.copy_from_slice( &line.iter().map(|&v| u8::cast_from(v)).collect::>() [..pitch_y], ); } } if planes > 1 { for (line, line_out) in rec.planes[1] .data_origin() .chunks(stride_u) .zip(rec_u.chunks_mut(pitch_uv)) { if y4m_details.bit_depth > 8 { // SAFETY: This is essentially doing a transmute to u16, but safer. unsafe { line_out.copy_from_slice(slice::from_raw_parts::( line.as_ptr() as *const u8, pitch_uv, )); } } else { line_out.copy_from_slice( &line.iter().map(|&v| u8::cast_from(v)).collect::>() [..pitch_uv], ); } } for (line, line_out) in rec.planes[2] .data_origin() .chunks(stride_v) .zip(rec_v.chunks_mut(pitch_uv)) { if y4m_details.bit_depth > 8 { // SAFETY: This is essentially doing a transmute to u16, but safer. 
unsafe { line_out.copy_from_slice(slice::from_raw_parts::( line.as_ptr() as *const u8, pitch_uv, )); } } else { line_out.copy_from_slice( &line.iter().map(|&v| u8::cast_from(v)).collect::>() [..pitch_uv], ); } } } let rec_frame = y4m::Frame::new([&rec_y, &rec_u, &rec_v], None); y4m_enc.write_frame(&rec_frame).unwrap(); } rav1e-0.7.1/src/bin/rav1e-ch.rs000064400000000000000000000504221046102023000141650ustar 00000000000000// Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. // Safety lints #![deny(bare_trait_objects)] #![deny(clippy::as_ptr_cast_mut)] #![deny(clippy::large_stack_arrays)] // Performance lints #![warn(clippy::inefficient_to_string)] #![warn(clippy::invalid_upcast_comparisons)] #![warn(clippy::iter_with_drain)] #![warn(clippy::linkedlist)] #![warn(clippy::mutex_integer)] #![warn(clippy::naive_bytecount)] #![warn(clippy::needless_bitwise_bool)] #![warn(clippy::needless_collect)] #![warn(clippy::or_fun_call)] #![warn(clippy::stable_sort_primitive)] #![warn(clippy::suboptimal_flops)] #![warn(clippy::trivial_regex)] #![warn(clippy::trivially_copy_pass_by_ref)] #![warn(clippy::unnecessary_join)] #![warn(clippy::unused_async)] #![warn(clippy::zero_sized_map_values)] // Correctness lints #![deny(clippy::case_sensitive_file_extension_comparisons)] #![deny(clippy::copy_iterator)] #![deny(clippy::expl_impl_clone_on_copy)] #![deny(clippy::float_cmp)] #![warn(clippy::imprecise_flops)] #![deny(clippy::manual_instant_elapsed)] #![deny(clippy::mem_forget)] #![deny(clippy::path_buf_push_overwrite)] #![deny(clippy::same_functions_in_if_condition)] #![deny(clippy::unchecked_duration_subtraction)] #![deny(clippy::unicode_not_nfc)] // Clarity/formatting lints #![warn(clippy::checked_conversions)] #![allow(clippy::comparison_chain)] #![warn(clippy::derive_partial_eq_without_eq)] #![allow(clippy::enum_variant_names)] #![warn(clippy::explicit_deref_methods)] #![warn(clippy::filter_map_next)] #![warn(clippy::flat_map_option)] #![warn(clippy::fn_params_excessive_bools)] #![warn(clippy::implicit_clone)] #![warn(clippy::iter_not_returning_iterator)] #![warn(clippy::iter_on_empty_collections)] #![warn(clippy::macro_use_imports)] #![warn(clippy::manual_clamp)] #![warn(clippy::manual_let_else)] #![warn(clippy::manual_ok_or)] #![warn(clippy::manual_string_new)] #![warn(clippy::map_flatten)] #![warn(clippy::match_bool)] #![warn(clippy::mut_mut)] #![warn(clippy::needless_borrow)] #![warn(clippy::needless_continue)] #![allow(clippy::needless_range_loop)] #![allow(clippy::too_many_arguments)] #![warn(clippy::range_minus_one)] #![warn(clippy::range_plus_one)] #![warn(clippy::ref_binding_to_reference)] #![warn(clippy::ref_option_ref)] #![warn(clippy::trait_duplication_in_bounds)] #![warn(clippy::unused_peekable)] #![warn(clippy::unused_rounding)] #![warn(clippy::unused_self)] #![allow(clippy::upper_case_acronyms)] #![warn(clippy::verbose_bit_mask)] #![warn(clippy::verbose_file_reads)] // Documentation lints #![warn(clippy::doc_link_with_quotes)] #![warn(clippy::doc_markdown)] #![warn(clippy::missing_errors_doc)] #![warn(clippy::missing_panics_doc)] 
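// Editor's note: the sketch below is illustrative and not part of the original
// crate. It restates the row-size arithmetic used by write_y4m_frame in
// muxer/y4m.rs (above): each output row is width * bytes_per_sample bytes,
// with bytes_per_sample = 2 for content deeper than 8 bits, and chroma rows
// using the subsampled chroma width.
#[cfg(test)]
mod y4m_pitch_sketch {
  #[test]
  fn pitches_for_1080p_420_10bit() {
    let (width, height, bit_depth) = (1920usize, 1080usize, 10usize);
    let bytes_per_sample = if bit_depth > 8 { 2 } else { 1 };
    // 4:2:0 subsampling halves both chroma dimensions: 960x540.
    let (chroma_width, chroma_height) = (width / 2, height / 2);
    assert_eq!(width * bytes_per_sample, 3840); // luma row, in bytes
    assert_eq!(chroma_width * bytes_per_sample, 1920); // chroma row, in bytes
    assert_eq!((chroma_width, chroma_height), (960, 540));
  }
}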
#[macro_use] extern crate log; mod common; mod decoder; mod error; #[cfg(feature = "serialize")] mod kv; mod muxer; mod stats; use crate::common::*; use crate::error::*; use crate::stats::*; use rav1e::config::CpuFeatureLevel; use rav1e::prelude::*; use crate::decoder::{Decoder, FrameBuilder, VideoDetails}; use crate::muxer::*; use std::fs::File; use std::io::{Read, Seek, Write}; use std::sync::Arc; impl FrameBuilder for FrameSender { fn new_frame(&self) -> Frame { FrameSender::new_frame(self) } } struct Source { limit: usize, count: usize, input: D, #[cfg(all(unix, feature = "signal-hook"))] exit_requested: Arc, } impl Source { cfg_if::cfg_if! { if #[cfg(all(unix, feature = "signal-hook"))] { fn new(limit: usize, input: D) -> Self { use signal_hook::{flag, consts}; // Make sure double CTRL+C and similar kills let exit_requested = Arc::new(std::sync::atomic::AtomicBool::new(false)); for sig in consts::TERM_SIGNALS { // When terminated by a second term signal, exit with exit code 1. // This will do nothing the first time (because term_now is false). flag::register_conditional_shutdown(*sig, 1, Arc::clone(&exit_requested)).unwrap(); // But this will "arm" the above for the second time, by setting it to true. // The order of registering these is important, if you put this one first, it will // first arm and then terminate ‒ all in the first round. flag::register(*sig, Arc::clone(&exit_requested)).unwrap(); } Self { limit, input, count: 0, exit_requested, } } } else { fn new(limit: usize, input: D) -> Self { Self { limit, input, count: 0, } } } } fn read_frame( &mut self, send_frame: &mut FrameSender, video_info: VideoDetails, ) -> bool { if self.limit != 0 && self.count == self.limit { return false; } #[cfg(all(unix, feature = "signal-hook"))] { if self.exit_requested.load(std::sync::atomic::Ordering::SeqCst) { return false; } } match self.input.read_frame(send_frame, &video_info) { Ok(frame) => { self.count += 1; let _ = send_frame.send(frame); true } _ => false, } } } fn do_encode( cfg: Config, verbose: Verboseness, mut progress: ProgressInfo, output: &mut dyn Muxer, mut source: Source, pass1file: Option, pass2file: Option, mut y4m_enc: Option>>, metrics_enabled: MetricsEnabled, ) -> Result<(), CliError> { let ((mut send_frame, receive_packet), (send_rc, receive_rc)) = match (pass1file.is_some(), pass2file.is_some()) { (true, true) => { let (channel, (send_rc, receive_rc)) = cfg .new_multipass_channel::() .map_err(|e| e.context("Invalid setup"))?; (channel, (Some(send_rc), Some(receive_rc))) } (true, false) => { let (channel, receive_rc) = cfg .new_firstpass_channel() .map_err(|e| e.context("Invalid setup"))?; (channel, (None, Some(receive_rc))) } (false, true) => { let (channel, send_rc) = cfg .new_secondpass_channel() .map_err(|e| e.context("Invalid setup"))?; (channel, (Some(send_rc), None)) } (false, false) => { let channel = cfg.new_channel().map_err(|e| e.context("Invalid setup"))?; (channel, (None, None)) } }; let y4m_details = source.input.get_video_details(); crossbeam::thread::scope(move |s| -> Result<(), CliError> { // Receive pass data let receive_pass_data = s.spawn(move |_| -> Result<(), CliError> { if let (Some(mut passfile), Some(receive_rc)) = (pass1file, receive_rc) { let len = receive_rc.summary_size(); let buf = vec![0u8; len]; passfile .write_all(&(len as u64).to_be_bytes()) .map_err(|e| e.context("Unable to write to two-pass data file."))?; passfile .write_all(&buf) .map_err(|e| e.context("Unable to write to two-pass data file."))?; for data in receive_rc.iter() { 
match data { RcData::Frame(outbuf) => { let len = outbuf.len() as u64; passfile.write_all(&len.to_be_bytes()).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; passfile.write_all(&outbuf).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; } RcData::Summary(outbuf) => { // Write an end marker passfile.write_all(&0u64.to_be_bytes()).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; // The last packet of rate control data we get is the summary data. // Let's put it at the start of the file. passfile.seek(std::io::SeekFrom::Start(0)).map_err(|e| { e.context("Unable to seek in the two-pass data file.") })?; let len = outbuf.len() as u64; passfile.write_all(&len.to_be_bytes()).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; passfile.write_all(&outbuf).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; } } } } Ok(()) }); // Send frames let send_frames = s.spawn(move |_| -> Result<(), CliError> { while source.read_frame(&mut send_frame, y4m_details) {} // send_frame.result() Ok(()) }); // Send pass data let send_pass_data = s.spawn(move |_| -> Result<(), CliError> { if let (Some(mut passfile), Some(mut send_rc)) = (pass2file, send_rc) { let mut buflen = [0u8; 8]; passfile .read_exact(&mut buflen) .map_err(|e| e.context("Unable to read the two-pass data file."))?; let mut data = vec![0u8; u64::from_be_bytes(buflen) as usize]; passfile .read_exact(&mut data) .map_err(|e| e.context("Unable to read the two-pass data file."))?; while send_rc.send(RcData::Frame(data.into_boxed_slice())).is_ok() { passfile.read_exact(&mut buflen).map_err(|e| { e.context("Unable to read the two-pass data file.") })?; if u64::from_be_bytes(buflen) == 0 { break; } data = vec![0u8; u64::from_be_bytes(buflen) as usize]; passfile.read_exact(&mut data).map_err(|e| { e.context("Unable to read the two-pass data file.") })?; } } Ok(()) }); // Receive Packets let receive_packets = s.spawn(move |_| -> Result<(), CliError> { for pkt in receive_packet.iter() { output.write_frame( pkt.input_frameno as u64, pkt.data.as_ref(), pkt.frame_type, ); output.flush().unwrap(); if let (Some(ref mut y4m_enc_uw), Some(ref rec)) = (y4m_enc.as_mut(), &pkt.rec) { write_y4m_frame(y4m_enc_uw, rec, y4m_details); } let summary = build_frame_summary( pkt, y4m_details.bit_depth, y4m_details.chroma_sampling, metrics_enabled, ); if verbose != Verboseness::Quiet { progress.add_frame(summary.clone()); if verbose == Verboseness::Verbose { info!("{} - {}", summary, progress); } else { // Print a one-line progress indicator that overrides itself with every update eprint!("\r{} ", progress); }; } } if verbose != Verboseness::Quiet { if verbose == Verboseness::Verbose { // Clear out the temporary progress indicator eprint!("\r"); } progress.print_summary(verbose == Verboseness::Verbose); } // receive_packet.result() Ok(()) }); send_pass_data.join().expect("The send pass data thread panicked ")?; receive_pass_data .join() .expect("The receive pass data thread panicked")?; send_frames.join().expect("The send frames thread panicked")?; receive_packets.join().expect("The receive packets thread panicked")?; Ok(()) }) .unwrap() } fn main() -> Result<(), Box> { init_logger(); #[cfg(feature = "tracing")] let (chrome_layer, _guard) = tracing_chrome::ChromeLayerBuilder::new().build(); #[cfg(feature = "tracing")] { use tracing_subscriber::layer::subscriberext; tracing::subscriber::set_global_default( tracing_subscriber::registry().with(chrome_layer), ) .unwrap(); } 
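  // Editor's note (illustrative, not part of the original source): the two-pass
  // data file produced and consumed by do_encode above is a simple
  // length-prefixed stream:
  //
  //   [u64 BE: summary length][summary bytes]        placeholder, rewritten last
  //   [u64 BE: frame-data length][frame-data bytes]  repeated once per packet
  //   [u64 BE: 0]                                    end marker
  //
  // The writer first emits a zero-filled placeholder of summary_size() bytes,
  // then seeks back to offset 0 and overwrites it with the real summary once
  // the RcData::Summary packet arrives.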
run().map_err(|e| { error::print_error(&e); Box::new(e) as Box }) } fn init_logger() { use std::str::FromStr; fn level_colored(l: log::Level) -> console::StyledObject<&'static str> { use console::style; use log::Level; match l { Level::Trace => style("??").dim(), Level::Debug => style("? ").dim(), Level::Info => style("> ").green(), Level::Warn => style("! ").yellow(), Level::Error => style("!!").red(), } } let level = std::env::var("RAV1E_LOG") .ok() .and_then(|l| log::LevelFilter::from_str(&l).ok()) .unwrap_or(log::LevelFilter::Info); fern::Dispatch::new() .format(move |out, message, record| { out.finish(format_args!( "{level} {message}", level = level_colored(record.level()), message = message, )); }) // set the default log level. to filter out verbose log messages from dependencies, set // this to Warn and overwrite the log level for your crate. .level(log::LevelFilter::Warn) // change log levels for individual modules. Note: This looks for the record's target // field which defaults to the module path but can be overwritten with the `target` // parameter: // `info!(target="special_target", "This log message is about special_target");` .level_for("rav1e", level) .level_for("rav1e_ch", level) // output to stdout .chain(std::io::stderr()) .apply() .unwrap(); } cfg_if::cfg_if! { if #[cfg(any(target_os = "windows", target_arch = "wasm32"))] { fn print_rusage() { eprintln!("Resource usage reporting is not currently supported on this platform"); } } else { fn print_rusage() { // SAFETY: This uses an FFI, it is safe because we call it correctly. let (utime, stime, maxrss) = unsafe { let mut usage = std::mem::zeroed(); let _ = libc::getrusage(libc::RUSAGE_SELF, &mut usage); (usage.ru_utime, usage.ru_stime, usage.ru_maxrss) }; eprintln!( "user time: {} s", utime.tv_sec as f64 + utime.tv_usec as f64 / 1_000_000f64 ); eprintln!( "system time: {} s", stime.tv_sec as f64 + stime.tv_usec as f64 / 1_000_000f64 ); eprintln!("maximum rss: {} KB", maxrss); } } } fn run() -> Result<(), error::CliError> { let mut cli = parse_cli()?; // Maximum frame size by specification + maximum y4m header let limit = y4m::Limits { // Use saturating operations to gracefully handle 32-bit architectures bytes: 64usize .saturating_mul(64) .saturating_mul(4096) .saturating_mul(2304) .saturating_add(1024), }; let mut y4m_dec = match y4m::Decoder::new_with_limits(cli.io.input, limit) { Err(e) => { return Err(CliError::new(match e { y4m::Error::ParseError(_) => { "Could not parse input video. Is it a y4m file?" } y4m::Error::IoError(_) => { "Could not read input file. Check that the path is correct and you have read permissions." } y4m::Error::UnknownColorspace => { "Unknown colorspace or unsupported bit depth." 
} y4m::Error::OutOfMemory => "The video's frame size exceeds the limit.", y4m::Error::EOF => "Unexpected end of input.", y4m::Error::BadInput => "Bad y4m input parameters provided.", })) } Ok(d) => d, }; let video_info = y4m_dec.get_video_details(); let y4m_enc = cli.io.rec.map(|rec| { y4m::encode( video_info.width, video_info.height, y4m::Ratio::new( video_info.time_base.den as usize, video_info.time_base.num as usize, ), ) .with_colorspace(y4m_dec.get_colorspace()) .with_pixel_aspect(y4m::Ratio { num: video_info.sample_aspect_ratio.num as usize, den: video_info.sample_aspect_ratio.den as usize, }) .write_header(rec) .unwrap() }); match video_info.bit_depth { 8 | 10 | 12 => {} _ => return Err(CliError::new("Unsupported bit depth")), } cli.enc.width = video_info.width; cli.enc.height = video_info.height; cli.enc.bit_depth = video_info.bit_depth; cli.enc.sample_aspect_ratio = video_info.sample_aspect_ratio; cli.enc.chroma_sampling = video_info.chroma_sampling; cli.enc.chroma_sample_position = video_info.chroma_sample_position; // If no pixel range is specified via CLI, assume limited, // as it is the default for the Y4M format. if !cli.color_range_specified { cli.enc.pixel_range = PixelRange::Limited; } if !cli.override_time_base { cli.enc.time_base = video_info.time_base; } if cli.photon_noise > 0 && cli.enc.film_grain_params.is_none() { cli.enc.film_grain_params = Some(vec![generate_photon_noise_params( 0, u64::MAX, NoiseGenArgs { iso_setting: cli.photon_noise as u32 * 100, width: video_info.width as u32, height: video_info.height as u32, transfer_function: if cli.enc.is_hdr() { TransferFunction::SMPTE2084 } else { TransferFunction::BT1886 }, chroma_grain: false, random_seed: None, }, )]); } let mut rc = RateControlConfig::new(); let pass2file = match cli.pass2file_name { Some(f) => { let mut f = File::open(f).map_err(|e| { e.context("Unable to open file for reading two-pass data") })?; let mut buflen = [0u8; 8]; f.read_exact(&mut buflen) .map_err(|e| e.context("Summary data too short"))?; let len = i64::from_be_bytes(buflen); let mut buf = vec![0u8; len as usize]; f.read_exact(&mut buf) .map_err(|e| e.context("Summary data too short"))?; rc = RateControlConfig::from_summary_slice(&buf) .map_err(|e| e.context("Invalid summary"))?; Some(f) } None => None, }; let pass1file = match cli.pass1file_name { Some(f) => { let f = File::create(f).map_err(|e| { e.context("Unable to open file for writing two-pass data") })?; rc = rc.with_emit_data(true); Some(f) } None => None, }; let cfg = Config::new() .with_encoder_config(cli.enc.clone()) .with_threads(cli.threads) .with_rate_control(rc) .with_parallel_gops(cli.slots); #[cfg(feature = "serialize")] { if let Some(save_config) = cli.save_config { let mut out = File::create(save_config) .map_err(|e| e.context("Cannot create configuration file"))?; let s = toml::to_string(&cli.enc).unwrap(); out .write_all(s.as_bytes()) .map_err(|e| e.context("Cannot write the configuration file"))? 
} } cli.io.output.write_header( video_info.width, video_info.height, cli.enc.time_base.den as usize, cli.enc.time_base.num as usize, ); let tiling = cfg.tiling_info().map_err(|e| e.context("Invalid configuration"))?; if cli.verbose != Verboseness::Quiet { info!("CPU Feature Level: {}", CpuFeatureLevel::default()); info!( "Using y4m decoder: {}x{}p @ {}/{} fps, {}, {}-bit", video_info.width, video_info.height, video_info.time_base.den, video_info.time_base.num, video_info.chroma_sampling, video_info.bit_depth ); info!("Encoding settings: {}", cli.enc); if tiling.tile_count() == 1 { info!("Using 1 tile"); } else { info!( "Using {} tiles ({}x{})", tiling.tile_count(), tiling.cols, tiling.rows ); } } let progress = ProgressInfo::new( Rational { num: video_info.time_base.den, den: video_info.time_base.num }, if cli.limit == 0 { None } else { Some(cli.limit) }, cli.metrics_enabled, ); for _ in 0..cli.skip { match y4m_dec.read_frame() { Ok(f) => f, Err(_) => { return Err(CliError::new("Skipped more frames than in the input")) } }; } let source = Source::new(cli.limit, y4m_dec); if video_info.bit_depth == 8 && !cli.force_highbitdepth { do_encode::>>( cfg, cli.verbose, progress, &mut *cli.io.output, source, pass1file, pass2file, y4m_enc, cli.metrics_enabled, )? } else { do_encode::>>( cfg, cli.verbose, progress, &mut *cli.io.output, source, pass1file, pass2file, y4m_enc, cli.metrics_enabled, )? } if cli.benchmark { print_rusage(); } Ok(()) } rav1e-0.7.1/src/bin/rav1e.rs000064400000000000000000000465111046102023000136010ustar 00000000000000// Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
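// Editor's note (illustrative, not part of the original source): in run()
// above, `--photon-noise N` is translated into synthetic film grain by calling
// generate_photon_noise_params with iso_setting = N * 100, so --photon-noise 8
// corresponds to ISO 800, using the SMPTE 2084 transfer function for HDR
// encodes and BT.1886 otherwise. The option conflicts with
// --film-grain-table, so only one grain source can be supplied at a time.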
// Safety lints #![deny(bare_trait_objects)] #![deny(clippy::as_ptr_cast_mut)] #![deny(clippy::large_stack_arrays)] // Performance lints #![warn(clippy::inefficient_to_string)] #![warn(clippy::invalid_upcast_comparisons)] #![warn(clippy::iter_with_drain)] #![warn(clippy::linkedlist)] #![warn(clippy::mutex_integer)] #![warn(clippy::naive_bytecount)] #![warn(clippy::needless_bitwise_bool)] #![warn(clippy::needless_collect)] #![warn(clippy::or_fun_call)] #![warn(clippy::stable_sort_primitive)] #![warn(clippy::suboptimal_flops)] #![warn(clippy::trivial_regex)] #![warn(clippy::trivially_copy_pass_by_ref)] #![warn(clippy::unnecessary_join)] #![warn(clippy::unused_async)] #![warn(clippy::zero_sized_map_values)] // Correctness lints #![deny(clippy::case_sensitive_file_extension_comparisons)] #![deny(clippy::copy_iterator)] #![deny(clippy::expl_impl_clone_on_copy)] #![deny(clippy::float_cmp)] #![warn(clippy::imprecise_flops)] #![deny(clippy::manual_instant_elapsed)] #![deny(clippy::mem_forget)] #![deny(clippy::path_buf_push_overwrite)] #![deny(clippy::same_functions_in_if_condition)] #![deny(clippy::unchecked_duration_subtraction)] #![deny(clippy::unicode_not_nfc)] // Clarity/formatting lints #![warn(clippy::checked_conversions)] #![allow(clippy::comparison_chain)] #![warn(clippy::derive_partial_eq_without_eq)] #![allow(clippy::enum_variant_names)] #![warn(clippy::explicit_deref_methods)] #![warn(clippy::filter_map_next)] #![warn(clippy::flat_map_option)] #![warn(clippy::fn_params_excessive_bools)] #![warn(clippy::implicit_clone)] #![warn(clippy::iter_not_returning_iterator)] #![warn(clippy::iter_on_empty_collections)] #![warn(clippy::macro_use_imports)] #![warn(clippy::manual_clamp)] #![warn(clippy::manual_let_else)] #![warn(clippy::manual_ok_or)] #![warn(clippy::manual_string_new)] #![warn(clippy::map_flatten)] #![warn(clippy::match_bool)] #![warn(clippy::mut_mut)] #![warn(clippy::needless_borrow)] #![warn(clippy::needless_continue)] #![allow(clippy::needless_range_loop)] #![allow(clippy::too_many_arguments)] #![warn(clippy::range_minus_one)] #![warn(clippy::range_plus_one)] #![warn(clippy::ref_binding_to_reference)] #![warn(clippy::ref_option_ref)] #![warn(clippy::trait_duplication_in_bounds)] #![warn(clippy::unused_peekable)] #![warn(clippy::unused_rounding)] #![warn(clippy::unused_self)] #![allow(clippy::upper_case_acronyms)] #![warn(clippy::verbose_bit_mask)] #![warn(clippy::verbose_file_reads)] // Documentation lints #![warn(clippy::doc_link_with_quotes)] #![warn(clippy::doc_markdown)] #![warn(clippy::missing_errors_doc)] #![warn(clippy::missing_panics_doc)] #[macro_use] extern crate log; mod common; mod decoder; mod error; #[cfg(feature = "serialize")] mod kv; mod muxer; mod stats; use crate::common::*; use crate::error::*; use crate::stats::*; use rav1e::config::CpuFeatureLevel; use rav1e::prelude::*; use crate::decoder::{Decoder, FrameBuilder, VideoDetails}; use crate::muxer::*; use std::fs::File; use std::io::{Read, Seek, Write}; use std::process::exit; use std::sync::Arc; impl FrameBuilder for Context { fn new_frame(&self) -> Frame { Context::new_frame(self) } } struct Source { limit: usize, count: usize, input: D, #[cfg(all(unix, feature = "signal-hook"))] exit_requested: Arc, } impl Source { cfg_if::cfg_if! 
{ if #[cfg(all(unix, feature = "signal-hook"))] { fn new(limit: usize, input: D) -> Self { use signal_hook::{flag, consts}; // Make sure double CTRL+C and similar kills let exit_requested = Arc::new(std::sync::atomic::AtomicBool::new(false)); for sig in consts::TERM_SIGNALS { // When terminated by a second term signal, exit with exit code 1. // This will do nothing the first time (because term_now is false). flag::register_conditional_shutdown(*sig, 1, Arc::clone(&exit_requested)).unwrap(); // But this will "arm" the above for the second time, by setting it to true. // The order of registering these is important, if you put this one first, it will // first arm and then terminate ‒ all in the first round. flag::register(*sig, Arc::clone(&exit_requested)).unwrap(); } Self { limit, input, count: 0, exit_requested, } } } else { #[allow(clippy::missing_const_for_fn)] fn new(limit: usize, input: D) -> Self { Self { limit, input, count: 0, } } } } #[profiling::function] fn read_frame( &mut self, ctx: &mut Context, video_info: VideoDetails, ) -> Result<(), CliError> { if self.limit != 0 && self.count == self.limit { ctx.flush(); return Ok(()); } #[cfg(all(unix, feature = "signal-hook"))] { if self.exit_requested.load(std::sync::atomic::Ordering::SeqCst) { ctx.flush(); return Ok(()); } } match self.input.read_frame(ctx, &video_info) { Ok(frame) => { match video_info.bit_depth { 8 | 10 | 12 => {} _ => return Err(CliError::new("Unsupported bit depth")), } self.count += 1; let _ = ctx.send_frame(Some(Arc::new(frame))); } _ => { ctx.flush(); } }; Ok(()) } } // Encode and write a frame. // Returns frame information in a `Result`. #[profiling::function] fn process_frame( ctx: &mut Context, output_file: &mut dyn Muxer, source: &mut Source, pass1file: Option<&mut File>, pass2file: Option<&mut File>, mut y4m_enc: Option<&mut y4m::Encoder>>, metrics_cli: MetricsEnabled, ) -> Result>, CliError> { let y4m_details = source.input.get_video_details(); let mut frame_summaries = Vec::new(); let mut pass1file = pass1file; let mut pass2file = pass2file; // Submit first pass data to pass 2. if let Some(passfile) = pass2file.as_mut() { while ctx.rc_second_pass_data_required() > 0 { let mut buflen = [0u8; 8]; passfile .read_exact(&mut buflen) .map_err(|e| e.context("Unable to read the two-pass data file."))?; let mut data = vec![0u8; u64::from_be_bytes(buflen) as usize]; passfile .read_exact(&mut data) .map_err(|e| e.context("Unable to read the two-pass data file."))?; ctx .rc_send_pass_data(&data) .map_err(|e| e.context("Corrupted first pass data"))?; } } let pkt_wrapped = ctx.receive_packet(); let (ret, emit_pass_data) = match pkt_wrapped { Ok(pkt) => { output_file.write_frame( pkt.input_frameno, pkt.data.as_ref(), pkt.frame_type, ); if let (Some(ref mut y4m_enc_uw), Some(ref rec)) = (y4m_enc.as_mut(), &pkt.rec) { write_y4m_frame(y4m_enc_uw, rec, y4m_details); } frame_summaries.push(build_frame_summary( pkt, y4m_details.bit_depth, y4m_details.chroma_sampling, metrics_cli, )); Ok((Some(frame_summaries), true)) } Err(EncoderStatus::NeedMoreData) => { source.read_frame(ctx, y4m_details)?; Ok((Some(frame_summaries), false)) } Err(EncoderStatus::EnoughData) => { unreachable!() } Err(EncoderStatus::LimitReached) => Ok((None, true)), Err(e @ EncoderStatus::Failure) => { Err(e.context("Failed to encode video")) } Err(e @ EncoderStatus::NotReady) => { Err(e.context("Mismanaged handling of two-pass stats data")) } Err(EncoderStatus::Encoded) => Ok((Some(frame_summaries), true)), }?; // Save first pass data from pass 1. 
if let Some(passfile) = pass1file.as_mut() { if emit_pass_data { match ctx.rc_receive_pass_data() { Some(RcData::Frame(outbuf)) => { let len = outbuf.len() as u64; passfile.write_all(&len.to_be_bytes()).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; passfile.write_all(&outbuf).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; } Some(RcData::Summary(outbuf)) => { // The last packet of rate control data we get is the summary data. // Let's put it at the start of the file. passfile.rewind().map_err(|e| { e.context("Unable to seek in the two-pass data file.") })?; let len = outbuf.len() as u64; passfile.write_all(&len.to_be_bytes()).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; passfile.write_all(&outbuf).map_err(|e| { e.context("Unable to write to two-pass data file.") })?; } None => {} } } } Ok(ret) } fn do_encode( cfg: Config, verbose: Verboseness, mut progress: ProgressInfo, output: &mut dyn Muxer, mut source: Source, mut pass1file: Option, mut pass2file: Option, mut y4m_enc: Option>>, metrics_enabled: MetricsEnabled, ) -> Result<(), CliError> { let mut ctx: Context = cfg.new_context().map_err(|e| e.context("Invalid encoder settings"))?; // Let's write down a placeholder. if let Some(passfile) = pass1file.as_mut() { let len = ctx.rc_summary_size(); let buf = vec![0u8; len]; passfile .write_all(&(len as u64).to_be_bytes()) .map_err(|e| e.context("Unable to write to two-pass data file."))?; passfile .write_all(&buf) .map_err(|e| e.context("Unable to write to two-pass data file."))?; } while let Some(frame_info) = process_frame( &mut ctx, &mut *output, &mut source, pass1file.as_mut(), pass2file.as_mut(), y4m_enc.as_mut(), metrics_enabled, )? { if verbose != Verboseness::Quiet { for frame in frame_info { progress.add_frame(frame.clone()); if verbose == Verboseness::Verbose { info!("{} - {}", frame, progress); } else { // Print a one-line progress indicator that overrides itself with every update eprint!("\r{progress} "); }; } output.flush().unwrap(); } } if verbose != Verboseness::Quiet { if verbose == Verboseness::Verbose { // Clear out the temporary progress indicator eprint!("\r"); } progress.print_summary(verbose == Verboseness::Verbose); } Ok(()) } fn main() { init_logger(); #[cfg(feature = "tracing")] let (chrome_layer, _guard) = tracing_chrome::ChromeLayerBuilder::new().build(); #[cfg(feature = "tracing")] { use tracing_subscriber::layer::SubscriberExt; tracing::subscriber::set_global_default( tracing_subscriber::registry().with(chrome_layer), ) .unwrap(); } run().unwrap_or_else(|e| { error::print_error(&e); exit(1); }); } fn init_logger() { use std::str::FromStr; fn level_colored(l: log::Level) -> console::StyledObject<&'static str> { use console::style; use log::Level; match l { Level::Trace => style("??").dim(), Level::Debug => style("? ").dim(), Level::Info => style("> ").green(), Level::Warn => style("! ").yellow(), Level::Error => style("!!").red(), } } let level = std::env::var("RAV1E_LOG") .ok() .and_then(|l| log::LevelFilter::from_str(&l).ok()) .unwrap_or(log::LevelFilter::Info); fern::Dispatch::new() .format(move |out, message, record| { out.finish(format_args!( "{level} {message}", level = level_colored(record.level()), message = message, )); }) // set the default log level. to filter out verbose log messages from dependencies, set // this to Warn and overwrite the log level for your crate. .level(log::LevelFilter::Warn) // change log levels for individual modules. 
Note: This looks for the record's target // field which defaults to the module path but can be overwritten with the `target` // parameter: // `info!(target="special_target", "This log message is about special_target");` .level_for("rav1e", level) // output to stdout .chain(std::io::stderr()) .apply() .unwrap(); } cfg_if::cfg_if! { if #[cfg(any(target_os = "windows", target_arch = "wasm32"))] { fn print_rusage() { eprintln!("Resource usage reporting is not currently supported on this platform"); } } else { fn print_rusage() { // SAFETY: This uses an FFI, it is safe because we call it correctly. let (utime, stime, maxrss) = unsafe { let mut usage = std::mem::zeroed(); let _ = libc::getrusage(libc::RUSAGE_SELF, &mut usage); (usage.ru_utime, usage.ru_stime, usage.ru_maxrss) }; eprintln!( "user time: {} s", utime.tv_sec as f64 + utime.tv_usec as f64 / 1_000_000f64 ); eprintln!( "system time: {} s", stime.tv_sec as f64 + stime.tv_usec as f64 / 1_000_000f64 ); eprintln!("maximum rss: {maxrss} KB"); } } } fn run() -> Result<(), error::CliError> { let mut cli = parse_cli()?; // Maximum frame size by specification + maximum y4m header let limit = y4m::Limits { // Use saturating operations to gracefully handle 32-bit architectures bytes: 64usize .saturating_mul(64) .saturating_mul(4096) .saturating_mul(2304) .saturating_add(1024), }; let mut y4m_dec = match y4m::Decoder::new_with_limits(cli.io.input, limit) { Err(e) => { return Err(CliError::new(match e { y4m::Error::ParseError(_) => { "Could not parse input video. Is it a y4m file?" } y4m::Error::IoError(_) => { "Could not read input file. Check that the path is correct and you have read permissions." } y4m::Error::UnknownColorspace => { "Unknown colorspace or unsupported bit depth." } y4m::Error::OutOfMemory => "The video's frame size exceeds the limit.", y4m::Error::EOF => "Unexpected end of input.", y4m::Error::BadInput => "Bad y4m input parameters provided.", })) } Ok(d) => d, }; let video_info = y4m_dec.get_video_details(); let y4m_enc = cli.io.rec.map(|rec| { y4m::encode( video_info.width, video_info.height, y4m::Ratio::new( video_info.time_base.den as usize, video_info.time_base.num as usize, ), ) .with_colorspace(y4m_dec.get_colorspace()) .with_pixel_aspect(y4m::Ratio { num: video_info.sample_aspect_ratio.num as usize, den: video_info.sample_aspect_ratio.den as usize, }) .write_header(rec) .unwrap() }); cli.enc.width = video_info.width; cli.enc.height = video_info.height; cli.enc.sample_aspect_ratio = video_info.sample_aspect_ratio; cli.enc.bit_depth = video_info.bit_depth; cli.enc.chroma_sampling = video_info.chroma_sampling; cli.enc.chroma_sample_position = video_info.chroma_sample_position; // If no pixel range is specified via CLI, assume limited, // as it is the default for the Y4M format. 
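// ("Limited" is the studio/video range, e.g. 16..=235 for 8-bit luma, as
// opposed to the full 0..=255 range.)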
if !cli.color_range_specified { cli.enc.pixel_range = PixelRange::Limited; } if !cli.override_time_base { cli.enc.time_base = video_info.time_base; } if cli.photon_noise > 0 && cli.enc.film_grain_params.is_none() { cli.enc.film_grain_params = Some(vec![generate_photon_noise_params( 0, u64::MAX, NoiseGenArgs { iso_setting: cli.photon_noise as u32 * 100, width: video_info.width as u32, height: video_info.height as u32, transfer_function: if cli.enc.is_hdr() { TransferFunction::SMPTE2084 } else { TransferFunction::BT1886 }, chroma_grain: false, random_seed: None, }, )]); } let mut rc = RateControlConfig::new(); let pass2file = match cli.pass2file_name { Some(f) => { let mut f = File::open(f).map_err(|e| { e.context("Unable to open file for reading two-pass data") })?; let mut buflen = [0u8; 8]; f.read_exact(&mut buflen) .map_err(|e| e.context("Summary data too short"))?; let len = i64::from_be_bytes(buflen); let mut buf = vec![0u8; len as usize]; f.read_exact(&mut buf) .map_err(|e| e.context("Summary data too short"))?; rc = RateControlConfig::from_summary_slice(&buf) .map_err(|e| e.context("Invalid summary"))?; Some(f) } None => None, }; let pass1file = match cli.pass1file_name { Some(f) => { let f = File::create(f).map_err(|e| { e.context("Unable to open file for writing two-pass data") })?; rc = rc.with_emit_data(true); Some(f) } None => None, }; let cfg = Config::new() .with_encoder_config(cli.enc.clone()) .with_threads(cli.threads) .with_rate_control(rc); #[cfg(feature = "serialize")] { if let Some(save_config) = cli.save_config { let mut out = File::create(save_config) .map_err(|e| e.context("Cannot create configuration file"))?; let s = toml::to_string(&cli.enc).unwrap(); out .write_all(s.as_bytes()) .map_err(|e| e.context("Cannot write the configuration file"))? } } cli.io.output.write_header( video_info.width, video_info.height, cli.enc.time_base.den as usize, cli.enc.time_base.num as usize, ); let tiling = cfg.tiling_info().map_err(|e| e.context("Invalid configuration"))?; if cli.verbose != Verboseness::Quiet { info!("CPU Feature Level: {}", CpuFeatureLevel::default()); info!( "Using y4m decoder: {}x{}p @ {}/{} fps, {}, {}-bit", video_info.width, video_info.height, video_info.time_base.den, video_info.time_base.num, video_info.chroma_sampling, video_info.bit_depth ); info!("Encoding settings: {}", cli.enc); if tiling.tile_count() == 1 { info!("Using 1 tile"); } else { info!( "Using {} tiles ({}x{})", tiling.tile_count(), tiling.cols, tiling.rows ); } } let progress = ProgressInfo::new( Rational { num: video_info.time_base.den, den: video_info.time_base.num }, if cli.limit == 0 { None } else { Some(cli.limit) }, cli.metrics_enabled, ); for _ in 0..cli.skip { match y4m_dec.read_frame() { Ok(f) => f, Err(_) => { return Err(CliError::new("Skipped more frames than in the input")) } }; } let source = Source::new(cli.limit, y4m_dec); if video_info.bit_depth == 8 && !cli.force_highbitdepth { do_encode::>>( cfg, cli.verbose, progress, &mut *cli.io.output, source, pass1file, pass2file, y4m_enc, cli.metrics_enabled, )? } else { do_encode::>>( cfg, cli.verbose, progress, &mut *cli.io.output, source, pass1file, pass2file, y4m_enc, cli.metrics_enabled, )? } if cli.benchmark { print_rusage(); } Ok(()) } rav1e-0.7.1/src/bin/stats.rs000064400000000000000000000643621046102023000137250ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use av_metrics::video::*; use rav1e::data::EncoderStats; use rav1e::prelude::Rational; use rav1e::prelude::*; use rav1e::{Packet, Pixel}; use std::fmt; use std::time::Instant; #[derive(Debug, Clone)] pub struct FrameSummary { /// Frame size in bytes pub size: usize, pub input_frameno: u64, pub frame_type: FrameType, /// Contains metrics such as PSNR, SSIM, etc. pub metrics: QualityMetrics, /// QP selected for the frame. pub qp: u8, /// Block-level encoding stats for the frame pub enc_stats: EncoderStats, } #[profiling::function] pub fn build_frame_summary( packets: Packet, bit_depth: usize, chroma_sampling: ChromaSampling, metrics_cli: MetricsEnabled, ) -> FrameSummary { let metrics_input_frame: &Frame = packets.source.as_ref().unwrap(); let metrics_output_frame: &Frame = packets.rec.as_ref().unwrap(); let encode_metrics: QualityMetrics = calculate_frame_metrics( metrics_input_frame, metrics_output_frame, bit_depth, chroma_sampling, metrics_cli, ); FrameSummary { size: packets.data.len(), input_frameno: packets.input_frameno, frame_type: packets.frame_type, metrics: encode_metrics, qp: packets.qp, enc_stats: packets.enc_stats, } } impl fmt::Display for FrameSummary { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "Input Frame {} - {} - {} bytes{}", self.input_frameno, self.frame_type, self.size, if let Some(psnr) = self.metrics.psnr { format!( " - PSNR: Y: {:.4} Cb: {:.4} Cr: {:.4}", psnr.y, psnr.u, psnr.v ) } else { String::new() } ) } } #[derive(Debug, Clone)] pub struct ProgressInfo { // Frame rate of the video frame_rate: Rational, // The length of the whole video, in frames, if known total_frames: Option, // The time the encode was started time_started: Instant, // List of frames encoded so far frame_info: Vec, // Video size so far in bytes. // // This value will be updated in the CLI very frequently, so we cache the previous value // to reduce the overall complexity. 
encoded_size: usize, // Which Metrics to display during and at end of encode metrics_enabled: MetricsEnabled, } impl ProgressInfo { pub fn new( frame_rate: Rational, total_frames: Option, metrics_enabled: MetricsEnabled, ) -> Self { Self { frame_rate, total_frames, time_started: Instant::now(), frame_info: Vec::with_capacity(total_frames.unwrap_or_default()), encoded_size: 0, metrics_enabled, } } pub fn add_frame(&mut self, frame: FrameSummary) { self.encoded_size += frame.size; self.frame_info.push(frame); } pub fn frames_encoded(&self) -> usize { self.frame_info.len() } pub fn encoding_fps(&self) -> f64 { let duration = Instant::now().duration_since(self.time_started); self.frame_info.len() as f64 / (duration.as_secs() as f64 + duration.subsec_millis() as f64 / 1000f64) } pub fn video_fps(&self) -> f64 { self.frame_rate.num as f64 / self.frame_rate.den as f64 } // Returns the bitrate of the frames so far, in bits/second pub fn bitrate(&self) -> usize { let bits = self.encoded_size * 8; let seconds = self.frame_info.len() as f64 / self.video_fps(); (bits as f64 / seconds) as usize } // Estimates the final filesize in bytes, if the number of frames is known pub fn estimated_size(&self) -> usize { self .total_frames .map(|frames| self.encoded_size * frames / self.frames_encoded()) .unwrap_or_default() } // Estimates the remaining encoding time in seconds, if the number of frames is known pub fn estimated_time(&self) -> u64 { self .total_frames .map(|frames| { (frames - self.frames_encoded()) as f64 / self.encoding_fps() }) .unwrap_or_default() as u64 } // Elapsed time in seconds pub fn elapsed_time(&self) -> u64 { Instant::now().duration_since(self.time_started).as_secs() } // Number of frames of given type which appear in the video fn get_frame_type_count(&self, frame_type: FrameType) -> usize { self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .count() } fn get_frame_type_avg_size(&self, frame_type: FrameType) -> usize { let count = self.get_frame_type_count(frame_type); if count == 0 { return 0; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.size) .sum::() / count } fn get_frame_type_avg_qp(&self, frame_type: FrameType) -> f32 { let count = self.get_frame_type_count(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.qp as f32) .sum::() / count as f32 } fn get_block_count_by_frame_type(&self, frame_type: FrameType) -> usize { self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.block_size_counts.iter().sum::()) .sum() } fn get_tx_count_by_frame_type(&self, frame_type: FrameType) -> usize { self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.tx_type_counts.iter().sum::()) .sum() } fn get_bsize_pct_by_frame_type( &self, bsize: BlockSize, frame_type: FrameType, ) -> f32 { let count = self.get_block_count_by_frame_type(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.block_size_counts[bsize as usize]) .sum::() as f32 / count as f32 * 100. } fn get_skip_pct_by_frame_type(&self, frame_type: FrameType) -> f32 { let count = self.get_block_count_by_frame_type(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.skip_block_count) .sum::() as f32 / count as f32 * 100. 
} fn get_txtype_pct_by_frame_type( &self, txtype: TxType, frame_type: FrameType, ) -> f32 { let count = self.get_tx_count_by_frame_type(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.tx_type_counts[txtype as usize]) .sum::() as f32 / count as f32 * 100. } fn get_luma_pred_count_by_frame_type(&self, frame_type: FrameType) -> usize { self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.luma_pred_mode_counts.iter().sum::()) .sum() } fn get_chroma_pred_count_by_frame_type( &self, frame_type: FrameType, ) -> usize { self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| { frame.enc_stats.chroma_pred_mode_counts.iter().sum::() }) .sum() } fn get_luma_pred_mode_pct_by_frame_type( &self, pred_mode: PredictionMode, frame_type: FrameType, ) -> f32 { let count = self.get_luma_pred_count_by_frame_type(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.luma_pred_mode_counts[pred_mode as usize]) .sum::() as f32 / count as f32 * 100. } fn get_chroma_pred_mode_pct_by_frame_type( &self, pred_mode: PredictionMode, frame_type: FrameType, ) -> f32 { let count = self.get_chroma_pred_count_by_frame_type(frame_type); if count == 0 { return 0.; } self .frame_info .iter() .filter(|frame| frame.frame_type == frame_type) .map(|frame| frame.enc_stats.chroma_pred_mode_counts[pred_mode as usize]) .sum::() as f32 / count as f32 * 100. } pub fn print_summary(&self, verbose: bool) { eprint!("\r"); info!("{}", self); info!("----------"); self.print_frame_type_summary(FrameType::KEY); self.print_frame_type_summary(FrameType::INTER); self.print_frame_type_summary(FrameType::INTRA_ONLY); self.print_frame_type_summary(FrameType::SWITCH); if verbose { self.print_block_type_summary(); self.print_transform_type_summary(); self.print_prediction_modes_summary(); } match self.metrics_enabled { MetricsEnabled::None => info!("----"), MetricsEnabled::Psnr => self.print_video_psnr(), MetricsEnabled::All => { self.print_video_psnr(); self.print_video_all(); } } } fn print_frame_type_summary(&self, frame_type: FrameType) { let count = self.get_frame_type_count(frame_type); let size = self.get_frame_type_avg_size(frame_type); let avg_qp = self.get_frame_type_avg_qp(frame_type); info!( "{:17} {:>6} | avg QP: {:6.2} | avg size: {:>7} B", format!("{frame_type}:"), count, avg_qp, size ); } fn print_video_psnr(&self) { info!("----------"); let psnr_y = sum_metric(&self.frame_info, |fi| fi.metrics.psnr.unwrap().y); let psnr_u = sum_metric(&self.frame_info, |fi| fi.metrics.psnr.unwrap().u); let psnr_v = sum_metric(&self.frame_info, |fi| fi.metrics.psnr.unwrap().v); let psnr_avg = sum_metric(&self.frame_info, |fi| fi.metrics.psnr.unwrap().avg); info!( "Mean PSNR: Avg: {:.4} Y: {:.4} Cb: {:.4} Cr: {:.4}", psnr_avg, psnr_y, psnr_u, psnr_v ); } fn print_video_all(&self) { info!("----------"); let psnr_hvs = sum_metric(&self.frame_info, |fi| fi.metrics.psnr_hvs.unwrap().avg); let ssim = sum_metric(&self.frame_info, |fi| fi.metrics.ssim.unwrap().avg); let ms_ssim = sum_metric(&self.frame_info, |fi| fi.metrics.ms_ssim.unwrap().avg); let ciede = sum_metric(&self.frame_info, |fi| fi.metrics.ciede.unwrap()); info!("PSNR HVS: {:.4}", psnr_hvs); info!("SSIM: {:.4} MS SSIM: {:.4}", ssim, ms_ssim); info!("CIEDE2000: {:.4}", ciede); info!("----------"); } fn print_block_type_summary(&self) { 
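// Prints one block-size table per frame type (keyframes as 'I', inter frames
// as 'P'): rows are block widths from 128 down to 4, columns are block heights,
// and each cell is that size's share of all coded blocks for that frame type,
// in percent. Inter tables also report the overall skip-block percentage.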
self.print_block_type_summary_for_frame_type(FrameType::KEY, 'I'); self.print_block_type_summary_for_frame_type(FrameType::INTER, 'P'); } fn print_block_type_summary_for_frame_type( &self, frame_type: FrameType, type_label: char, ) { info!("----------"); info!( "bsize {}: {:>6} {:>6} {:>6} {:>6} {:>6} {:>6}", type_label, "x128", "x64", "x32", "x16", "x8", "x4" ); info!( " 128x: {:>5.1}% {:>5.1}% {}", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_128X128, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_128X64, frame_type), if frame_type == FrameType::INTER { format!("skip: {:>5.1}%", self.get_skip_pct_by_frame_type(frame_type)) } else { String::new() } ); info!( " 64x: {:>5.1}% {:>5.1}% {:>5.1}% {:>5.1}%", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_64X128, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_64X64, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_64X32, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_64X16, frame_type), ); info!( " 32x: {:>5.1}% {:>5.1}% {:>5.1}% {:>5.1}%", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_32X64, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_32X32, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_32X16, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_32X8, frame_type), ); info!( " 16x: {:>5.1}% {:>5.1}% {:>5.1}% {:>5.1}% {:>5.1}%", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_16X64, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_16X32, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_16X16, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_16X8, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_16X4, frame_type), ); info!( " 8x: {:>5.1}% {:>5.1}% {:>5.1}% {:>5.1}%", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_8X32, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_8X16, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_8X8, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_8X4, frame_type), ); info!( " 4x: {:>5.1}% {:>5.1}% {:>5.1}%", self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_4X16, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_4X8, frame_type), self.get_bsize_pct_by_frame_type(BlockSize::BLOCK_4X4, frame_type), ); } fn print_transform_type_summary(&self) { info!("----------"); self.print_transform_type_summary_by_frame_type(FrameType::KEY, 'I'); self.print_transform_type_summary_by_frame_type(FrameType::INTER, 'P'); } fn print_transform_type_summary_by_frame_type( &self, frame_type: FrameType, type_label: char, ) { info!( "txtypes {}: DCT_DCT: {:.1}% | ADST_DCT: {:.1}% | DCT_ADST: {:.1}% | ADST_ADST: {:.1}%", type_label, self.get_txtype_pct_by_frame_type(TxType::DCT_DCT, frame_type), self.get_txtype_pct_by_frame_type(TxType::ADST_DCT, frame_type), self.get_txtype_pct_by_frame_type(TxType::DCT_ADST, frame_type), self.get_txtype_pct_by_frame_type(TxType::ADST_ADST, frame_type) ); info!( " IDTX: {:.1}% | V_DCT: {:.1}% | H_DCT: {:.1}%", self.get_txtype_pct_by_frame_type(TxType::IDTX, frame_type), self.get_txtype_pct_by_frame_type(TxType::V_DCT, frame_type), self.get_txtype_pct_by_frame_type(TxType::H_DCT, frame_type), ) } fn print_prediction_modes_summary(&self) { info!("----------"); self.print_luma_prediction_mode_summary_by_frame_type(FrameType::KEY, 'I'); self .print_chroma_prediction_mode_summary_by_frame_type(FrameType::KEY, 'I'); info!("----------"); self 
.print_luma_prediction_mode_summary_by_frame_type(FrameType::INTER, 'P'); self.print_chroma_prediction_mode_summary_by_frame_type( FrameType::INTER, 'P', ); } fn print_luma_prediction_mode_summary_by_frame_type( &self, frame_type: FrameType, type_label: char, ) { if frame_type == FrameType::KEY { info!( "y modes {}: DC: {:.1}% | V: {:.1}% | H: {:.1}% | Paeth: {:.1}%", type_label, self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::DC_PRED, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::V_PRED, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::H_PRED, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::PAETH_PRED, frame_type ), ); info!( " Smooth: {:.1}% | Smooth V: {:.1}% | Smooth H: {:.1}%", self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::SMOOTH_PRED, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::SMOOTH_V_PRED, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::SMOOTH_H_PRED, frame_type ), ); // Keep angular order for presentation here, rather than enum order. info!( " D: 45: {:.1}% | 67: {:.1}% | 113: {:.1}% | 135: {:.1}% | 157: {:.1}% | 203: {:.1}%", self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D45_PRED, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D67_PRED, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D113_PRED, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D135_PRED, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D157_PRED, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::D203_PRED, frame_type), ); } else if frame_type == FrameType::INTER { info!( "y modes {}: Nearest: {:.1}% | Near0: {:.1}% | Near1: {:.1}% | NearNear0: {:.1}% | NearNear1: {:.1}% | NearNear2: {:.1}%", type_label, self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEARESTMV, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEAR0MV, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEAR1MV, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR0MV, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR1MV, frame_type), self.get_luma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR2MV, frame_type), ); info!( "y modes {}: NearNew0: {:.1}% | NearNew1: {:.1}% | NearNew2: {:.1}%", type_label, self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW0MV, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW1MV, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW2MV, frame_type ), ); info!( "y modes {}: NewNear0: {:.1}% | NewNear1: {:.1}% | NewNear2: {:.1}%", type_label, self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR0MV, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR1MV, frame_type ), self.get_luma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR2MV, frame_type ), ); } } fn print_chroma_prediction_mode_summary_by_frame_type( &self, frame_type: FrameType, type_label: char, ) { if frame_type == FrameType::KEY { info!( "uv modes {}: DC: {:.1}% | V: {:.1}% | H: {:.1}% | Paeth: {:.1}%", type_label, self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::DC_PRED, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::V_PRED, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::H_PRED, 
frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::PAETH_PRED, frame_type ), ); info!( " Smooth: {:.1}% | Smooth V: {:.1}% | Smooth H: {:.1}% | UV CFL: {:.1}%", self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::SMOOTH_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::SMOOTH_V_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::SMOOTH_H_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::UV_CFL_PRED, frame_type), ); // Keep angular order for presentation here, rather than enum order. info!( " D: 45: {:.1}% | 67: {:.1}% | 113: {:.1}% | 135: {:.1}% | 157: {:.1}% | 203: {:.1}%", self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D45_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D67_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D113_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D135_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D157_PRED, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::D203_PRED, frame_type), ); } else if frame_type == FrameType::INTER { info!( "uv modes {}: Nearest: {:.1}% | Near0: {:.1}% | Near1: {:.1}% | NearNear0: {:.1}% | NearNear1: {:.1}% | NearNear2: {:.1}%", type_label, self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEARESTMV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAR0MV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAR1MV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR0MV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR1MV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAR_NEAR2MV, frame_type), ); info!( "uv modes {}: NearNew0: {:.1}% | NearNew1: {:.1}% | NearNew2: {:.1}%", type_label, self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW0MV, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW1MV, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEAR_NEW2MV, frame_type ), ); info!( "uv modes {}: NewNear0: {:.1}% | NewNear1: {:.1}% | NewNear2: {:.1}%", type_label, self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR0MV, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR1MV, frame_type ), self.get_chroma_pred_mode_pct_by_frame_type( PredictionMode::NEW_NEAR2MV, frame_type ), ); info!(" New: {:.1}% | NewNew: {:.1}% | NearestNearest: {:.1}% | GlobalGlobal: {:.1}%", self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEWMV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEW_NEWMV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::NEAREST_NEARESTMV, frame_type), self.get_chroma_pred_mode_pct_by_frame_type(PredictionMode::GLOBAL_GLOBALMV, frame_type),); } } } fn sum_metric f64>( frame_info: &[FrameSummary], map_fn: F, ) -> f64 { frame_info.iter().map(map_fn).sum::() / frame_info.len() as f64 } impl fmt::Display for ProgressInfo { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(total_frames) = self.total_frames { write!( f, "encoded {}/{} frames, {:.3} fps, {:.2} Kb/s, est: {}, {:.2} MB, elapsed: {}", self.frames_encoded(), total_frames, self.encoding_fps(), self.bitrate() as f64 / 1000f64, 
secs_to_human_time(self.estimated_time()), self.estimated_size() as f64 / (1024 * 1024) as f64, secs_to_human_time(self.elapsed_time()) ) } else { write!( f, "encoded {} frames, {:.3} fps, {:.2} Kb/s, elapsed: {}", self.frames_encoded(), self.encoding_fps(), self.bitrate() as f64 / 1000f64, secs_to_human_time(self.elapsed_time()) ) } } } fn secs_to_human_time(mut secs: u64) -> String { let mut mins = secs / 60; secs %= 60; let hours = mins / 60; mins %= 60; if hours > 0 { format!("{hours}h {mins}m {secs}s") } else if mins > 0 { format!("{mins}m {secs}s") } else { format!("{secs}s") } } #[derive(Debug, Clone, Copy, Default, PartialEq)] pub struct QualityMetrics { /// Peak Signal-to-Noise Ratio for Y, U, and V planes pub psnr: Option, /// Peak Signal-to-Noise Ratio as perceived by the Human Visual System-- /// taking into account Contrast Sensitivity Function (CSF) pub psnr_hvs: Option, /// Structural Similarity pub ssim: Option, /// Multi-Scale Structural Similarity pub ms_ssim: Option, /// CIEDE 2000 color difference algorithm: https://en.wikipedia.org/wiki/Color_difference#CIEDE2000 pub ciede: Option, /// Aligned Peak Signal-to-Noise Ratio for Y, U, and V planes pub apsnr: Option, /// Netflix's Video Multimethod Assessment Fusion pub vmaf: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum MetricsEnabled { /// Don't calculate any metrics. None, /// Calculate the PSNR of each plane, but no other metrics. Psnr, /// Calculate all implemented metrics. Currently implemented metrics match what is available via AWCY. All, } pub fn calculate_frame_metrics( frame1: &Frame, frame2: &Frame, bit_depth: usize, cs: ChromaSampling, metrics: MetricsEnabled, ) -> QualityMetrics { match metrics { MetricsEnabled::None => QualityMetrics::default(), MetricsEnabled::Psnr => QualityMetrics { psnr: Some( psnr::calculate_frame_psnr(frame1, frame2, bit_depth, cs).unwrap(), ), ..Default::default() }, MetricsEnabled::All => { let mut metrics = QualityMetrics { psnr: Some( psnr::calculate_frame_psnr(frame1, frame2, bit_depth, cs).unwrap(), ), psnr_hvs: Some( psnr_hvs::calculate_frame_psnr_hvs(frame1, frame2, bit_depth, cs) .unwrap(), ), ..Default::default() }; let ssim = ssim::calculate_frame_ssim(frame1, frame2, bit_depth, cs); metrics.ssim = Some(ssim.unwrap()); let ms_ssim = ssim::calculate_frame_msssim(frame1, frame2, bit_depth, cs); metrics.ms_ssim = Some(ms_ssim.unwrap()); let ciede = ciede::calculate_frame_ciede(frame1, frame2, bit_depth, cs); metrics.ciede = Some(ciede.unwrap()); // TODO APSNR // TODO VMAF metrics } } } rav1e-0.7.1/src/capi.rs000064400000000000000000001302001046102023000127140ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. //! # C API for rav1e //! //! [rav1e](https://github.com/xiph/rav1e/) is an [AV1](https://aomediacodec.github.io/av1-spec/) //! encoder written in [Rust](https://rust-lang.org) //! //! 
This is the C-compatible API #![deny(missing_docs)] // Basically everything will be unsafe since this is a FFI #![allow(clippy::undocumented_unsafe_blocks)] // const extern fns are unstable #![allow(clippy::missing_const_for_fn)] use std::slice; use std::sync::Arc; use std::ffi::CStr; use std::ffi::CString; use std::mem; use std::os::raw::c_char; use std::os::raw::c_int; use std::os::raw::c_void; use libc::ptrdiff_t; use libc::size_t; use num_derive::*; use num_traits::cast::FromPrimitive; use scan_fmt::scan_fmt; use crate::prelude as rav1e; type PixelRange = rav1e::PixelRange; type ChromaSamplePosition = rav1e::ChromaSamplePosition; type ChromaSampling = rav1e::ChromaSampling; type MatrixCoefficients = rav1e::MatrixCoefficients; type ColorPrimaries = rav1e::ColorPrimaries; type TransferCharacteristics = rav1e::TransferCharacteristics; type Rational = rav1e::Rational; type FrameTypeOverride = rav1e::FrameTypeOverride; type FrameOpaqueCb = Option; type T35 = rav1e::T35; #[derive(Clone)] enum FrameInternal { U8(Arc>), U16(Arc>), } impl From> for FrameInternal { fn from(f: rav1e::Frame) -> FrameInternal { FrameInternal::U8(Arc::new(f)) } } impl From> for FrameInternal { fn from(f: rav1e::Frame) -> FrameInternal { FrameInternal::U16(Arc::new(f)) } } impl From>> for FrameInternal { fn from(f: Arc>) -> FrameInternal { FrameInternal::U8(f) } } impl From>> for FrameInternal { fn from(f: Arc>) -> FrameInternal { FrameInternal::U16(f) } } struct FrameOpaque { opaque: *mut c_void, cb: FrameOpaqueCb, } unsafe impl Send for FrameOpaque {} unsafe impl Sync for FrameOpaque {} impl Default for FrameOpaque { fn default() -> Self { FrameOpaque { opaque: std::ptr::null_mut(), cb: None } } } impl Drop for FrameOpaque { fn drop(&mut self) { let FrameOpaque { opaque, cb } = self; if let Some(cb) = cb { cb(*opaque); } } } /// Raw video Frame /// /// It can be allocated through `rav1e_frame_new()`, /// populated using `rav1e_frame_fill_plane()`, /// and freed using `rav1e_frame_unref()`. pub struct Frame { fi: FrameInternal, frame_type: FrameTypeOverride, opaque: Option, t35_metadata: Vec, } /// Status that can be returned by encoder functions. #[repr(C)] #[derive(Copy, Clone, Debug, FromPrimitive, PartialEq)] pub enum EncoderStatus { /// Normal operation. Success = 0, /// The encoder needs more data to produce an output packet. /// /// May be emitted by `rav1e_receive_packet` when frame reordering is /// enabled. NeedMoreData, /// There are enough frames in the queue. /// /// May be emitted by `rav1e_send_frame` when trying to send a frame after /// the encoder has been flushed or the internal queue is full. EnoughData, /// The encoder has already produced the number of frames requested. /// /// May be emitted by `rav1e_receive_packet` after a flush request had been /// processed or the frame limit had been reached. LimitReached, /// A Frame had been encoded but not emitted yet. Encoded, /// Generic fatal error. Failure = -1, /// A frame was encoded in the first pass of a 2-pass encode, but its stats /// data was not retrieved with `rav1e_twopass_out`, or not enough stats data /// was provided in the second pass of a 2-pass encode to encode the next /// frame. 
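  ///
  /// Like `Failure`, this maps to a negative value, so C callers can treat any
  /// negative status as an error condition.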
NotReady = -2, } impl EncoderStatus { const fn to_c(&self) -> *const u8 { use self::EncoderStatus::*; match self { Success => "Normal operation\0".as_ptr(), NeedMoreData => "The encoder needs more data to produce an output packet\0".as_ptr(), EnoughData => "There are enough frames in the queue\0".as_ptr(), LimitReached => "The encoder has already produced the number of frames requested\0".as_ptr(), Encoded => "A Frame had been encoded but not emitted yet\0".as_ptr(), Failure => "Generic fatal error\0".as_ptr(), NotReady => "First-pass stats data not retrieved or not enough second-pass data provided\0".as_ptr(), } } } impl From> for EncoderStatus { fn from(status: Option) -> Self { match status { None => EncoderStatus::Success, Some(s) => match s { rav1e::EncoderStatus::NeedMoreData => EncoderStatus::NeedMoreData, rav1e::EncoderStatus::EnoughData => EncoderStatus::EnoughData, rav1e::EncoderStatus::LimitReached => EncoderStatus::LimitReached, rav1e::EncoderStatus::Encoded => EncoderStatus::Encoded, rav1e::EncoderStatus::Failure => EncoderStatus::Failure, rav1e::EncoderStatus::NotReady => EncoderStatus::NotReady, }, } } } /// Encoder configuration /// /// Instantiate it using `rav1e_config_default()` and fine-tune it using /// `rav1e_config_parse()`. /// /// Use `rav1e_config_unref()` to free its memory. pub struct Config { cfg: rav1e::Config, } enum EncContext { U8(rav1e::Context), U16(rav1e::Context), } impl EncContext { fn new_frame(&self) -> FrameInternal { match self { EncContext::U8(ctx) => ctx.new_frame().into(), EncContext::U16(ctx) => ctx.new_frame().into(), } } fn send_frame( &mut self, frame: Option, frame_type: FrameTypeOverride, opaque: Option, t35_metadata: Box<[T35]>, ) -> Result<(), rav1e::EncoderStatus> { let info = rav1e::FrameParameters { frame_type_override: frame_type, opaque, t35_metadata, }; if let Some(frame) = frame { match (self, frame) { (EncContext::U8(ctx), FrameInternal::U8(ref f)) => { ctx.send_frame((f.clone(), info)) } (EncContext::U16(ctx), FrameInternal::U16(ref f)) => { ctx.send_frame((f.clone(), info)) } _ => Err(rav1e::EncoderStatus::Failure), } } else { match self { EncContext::U8(ctx) => ctx.send_frame(None), EncContext::U16(ctx) => ctx.send_frame(None), } } } fn receive_packet(&mut self) -> Result { fn receive_packet( ctx: &mut rav1e::Context, ) -> Result where FrameInternal: From>>, { ctx.receive_packet().map(|p| { let mut p = std::mem::ManuallyDrop::new(p); let opaque = p.opaque.take().map_or_else(std::ptr::null_mut, |o| { let mut opaque = o.downcast::().unwrap(); opaque.cb = None; opaque.opaque }); let p = std::mem::ManuallyDrop::into_inner(p); let rav1e::Packet { data, rec, source, input_frameno, frame_type, .. 
} = p; let len = data.len(); let data = Box::into_raw(data.into_boxed_slice()) as *const u8; let rec = if let Some(rec) = rec { let rec = FrameInternal::from(rec); Box::into_raw(Box::new(Frame { fi: rec, frame_type: FrameTypeOverride::No, opaque: None, t35_metadata: Vec::new(), })) } else { std::ptr::null_mut() }; let source = if let Some(source) = source { let source = FrameInternal::from(source); Box::into_raw(Box::new(Frame { fi: source, frame_type: FrameTypeOverride::No, opaque: None, t35_metadata: Vec::new(), })) } else { std::ptr::null_mut() }; Packet { data, rec, source, len, input_frameno, frame_type, opaque } }) } match self { EncContext::U8(ctx) => receive_packet(ctx), EncContext::U16(ctx) => receive_packet(ctx), } } fn container_sequence_header(&self) -> Vec { match self { EncContext::U8(ctx) => ctx.container_sequence_header(), EncContext::U16(ctx) => ctx.container_sequence_header(), } } fn twopass_bytes_needed(&mut self) -> usize { match self { EncContext::U8(ctx) => ctx.twopass_bytes_needed(), EncContext::U16(ctx) => ctx.twopass_bytes_needed(), } } fn twopass_in(&mut self, buf: &[u8]) -> Result { match self { EncContext::U8(ctx) => ctx.twopass_in(buf), EncContext::U16(ctx) => ctx.twopass_in(buf), } } fn twopass_out(&mut self) -> Option<&[u8]> { match self { EncContext::U8(ctx) => ctx.twopass_out(), EncContext::U16(ctx) => ctx.twopass_out(), } } fn rc_summary_size(&self) -> usize { match self { EncContext::U8(ctx) => ctx.rc_summary_size(), EncContext::U16(ctx) => ctx.rc_summary_size(), } } fn rc_receive_pass_data(&mut self) -> Option { match self { EncContext::U8(ctx) => ctx.rc_receive_pass_data(), EncContext::U16(ctx) => ctx.rc_receive_pass_data(), } } fn rc_second_pass_data_required(&self) -> usize { match self { EncContext::U8(ctx) => ctx.rc_second_pass_data_required(), EncContext::U16(ctx) => ctx.rc_second_pass_data_required(), } } fn rc_send_pass_data( &mut self, data: &[u8], ) -> Result<(), rav1e::EncoderStatus> { match self { EncContext::U8(ctx) => ctx.rc_send_pass_data(data), EncContext::U16(ctx) => ctx.rc_send_pass_data(data), } } fn config(&self) -> rav1e::EncoderConfig { // Ideally this would return a reference instead of cloning, // but that would require a breaking change in the CAPI. match self { EncContext::U8(ctx) => ctx.config.clone(), EncContext::U16(ctx) => ctx.config.clone(), } } } /// Encoder context /// /// Contains the encoding state, it is created by `rav1e_context_new()` using an /// Encoder configuration. /// /// Use `rav1e_context_unref()` to free its memory. pub struct Context { ctx: EncContext, last_err: Option, } type FrameType = rav1e::FrameType; /// Encoded Packet /// /// The encoded packets are retrieved using `rav1e_receive_packet()`. /// /// Use `rav1e_packet_unref()` to free its memory. #[repr(C)] pub struct Packet { /// Encoded data buffer pub data: *const u8, /// Encoded data buffer size pub len: size_t, /// Frame sequence number pub input_frameno: u64, /// Frame type pub frame_type: FrameType, /// User provided opaque data pub opaque: *mut c_void, /// The reconstruction of the shown frame. /// This is freed automatically by `rav1e_packet_unref()`. pub rec: *mut Frame, /// The Reference Frame /// This is freed automatically by `rav1e_packet_unref()`. pub source: *mut Frame, } /// Version information as presented in `[package]` `version`. /// /// e.g. `0.1.0` /// /// Can be parsed by [semver](https://crates.io/crates/semver). 
/// This returns the version of the loaded library, regardless /// of which version the library user was built against. #[no_mangle] pub unsafe extern fn rav1e_version_short() -> *const c_char { concat!(env!("CARGO_PKG_VERSION"), "\0").as_ptr() as *const c_char } static FULL_VERSION_C: once_cell::sync::OnceCell = once_cell::sync::OnceCell::new(); /// Version information with the information /// provided by `git describe --tags`. /// /// e.g. `0.1.0 (v0.1.0-1-g743d464)` /// /// This returns the version of the loaded library, regardless /// of which version the library user was built against. #[no_mangle] pub unsafe extern fn rav1e_version_full() -> *const c_char { FULL_VERSION_C .get_or_init(|| { CString::new(crate::version::full()).expect("Bogus version data") }) .as_ptr() as *const c_char } /// Simple Data /// /// Use `rav1e_data_unref()` to free its memory. #[repr(C)] pub struct Data { /// Pointer to the data buffer pub data: *const u8, /// Data buffer size pub len: size_t, } /// Free a `RaData` buffer #[no_mangle] pub unsafe extern fn rav1e_data_unref(data: *mut Data) { if !data.is_null() { let data = Box::from_raw(data); let _ = Vec::from_raw_parts( data.data as *mut u8, data.len as usize, data.len as usize, ); } } /// Create a `RaConfig` filled with default parameters. #[no_mangle] pub unsafe extern fn rav1e_config_default() -> *mut Config { let cfg = rav1e::Config::default(); let c = Box::new(Config { cfg }); Box::into_raw(c) } unsafe fn decode_slice<'a>( data: *mut *const u8, len: *mut size_t, ) -> (c_int, Option<&'a [u8]>) { if *len < 8 { return (8, None); } let buf = slice::from_raw_parts(*data, *len as usize); let (len_bytes, rest) = buf.split_at(std::mem::size_of::()); let buf_len = u64::from_be_bytes(len_bytes.try_into().unwrap()) as usize; let full_len = buf_len + 8; if buf_len > rest.len() { return (full_len as c_int, None); } *len -= full_len; *data = (*data).offset(full_len.try_into().unwrap()); (0, Some(&rest[..buf_len])) } /// Setup a second pass rate control using the provided summary /// /// Passing `NULL` data resets the rate control settings. /// /// If additional data is required, pointer and len stay unchanged, otherwise /// they are updated. /// /// Return: /// `0` on success /// `> 0` if the buffer has to be larger /// `< 0` on failure #[no_mangle] // Panic can never occur here #[allow(clippy::missing_panics_doc)] pub unsafe extern fn rav1e_config_set_rc_summary( cfg: *mut Config, data: *mut *const u8, len: *mut size_t, ) -> c_int { if data.is_null() { (*cfg).cfg.rate_control.summary = None; return 0; } let (needed, maybe_buf) = decode_slice(data, len); if maybe_buf.is_none() { return needed; } let summary = rav1e::RateControlSummary::from_slice(maybe_buf.unwrap()).ok(); if summary.is_none() { -1 } else { (*cfg).cfg.rate_control.summary = summary; 0 } } /// Request to emit pass data /// /// Set emit to 0 to not emit pass data, non-zero to emit pass data. /// #[no_mangle] pub unsafe extern fn rav1e_config_set_emit_data( cfg: *mut Config, emit: c_int, ) { (*cfg).cfg.rate_control.emit_pass_data = emit != 0; } /// Set the display aspect ratio of the stream /// /// Needed for anamorphic video. #[no_mangle] pub unsafe extern fn rav1e_config_set_sample_aspect_ratio( cfg: *mut Config, sample_aspect_ratio: Rational, ) { (*cfg).cfg.enc.sample_aspect_ratio = sample_aspect_ratio } /// Set the time base of the stream /// /// Needed for rate control. 
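///
/// For example, a constant 25 fps stream would typically use a time base of
/// `1/25` (`num = 1`, `den = 25`).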
#[no_mangle] pub unsafe extern fn rav1e_config_set_time_base( cfg: *mut Config, time_base: Rational, ) { (*cfg).cfg.enc.time_base = time_base } /// Set pixel format of the stream. /// /// Supported values for `subsampling` and `chroma_pos` are defined by the /// enum types `RaChromaSampling` and `RaChromaSamplePosition` respectively. /// Valid values for `pixel_range` are 0 and 1. /// /// Returns a negative value on error or 0. #[no_mangle] pub unsafe extern fn rav1e_config_set_pixel_format( cfg: *mut Config, bit_depth: u8, subsampling: ChromaSampling, chroma_pos: ChromaSamplePosition, pixel_range: PixelRange, ) -> c_int { if bit_depth != 8 && bit_depth != 10 && bit_depth != 12 { return -1; } (*cfg).cfg.enc.bit_depth = bit_depth as usize; let subsampling_val = std::mem::transmute::(subsampling); if ChromaSampling::from_i32(subsampling_val).is_none() { return -1; } (*cfg).cfg.enc.chroma_sampling = subsampling; let chroma_pos_val = std::mem::transmute::(chroma_pos); if ChromaSamplePosition::from_i32(chroma_pos_val).is_none() { return -1; } (*cfg).cfg.enc.chroma_sample_position = chroma_pos; let pixel_range_val = std::mem::transmute::(pixel_range); if PixelRange::from_i32(pixel_range_val).is_none() { return -1; } (*cfg).cfg.enc.pixel_range = pixel_range; 0 } /// Set color properties of the stream. /// /// Supported values are defined by the enum types /// `RaMatrixCoefficients`, `RaColorPrimaries`, and `RaTransferCharacteristics` /// respectively. /// /// Return a negative value on error or 0. #[no_mangle] pub unsafe extern fn rav1e_config_set_color_description( cfg: *mut Config, matrix: MatrixCoefficients, primaries: ColorPrimaries, transfer: TransferCharacteristics, ) -> c_int { (*cfg).cfg.enc.color_description = Some(rav1e::ColorDescription { matrix_coefficients: matrix, color_primaries: primaries, transfer_characteristics: transfer, }); if (*cfg).cfg.enc.color_description.is_some() { 0 } else { -1 } } /// Set the content light level information for HDR10 streams. /// /// Return a negative value on error or 0. #[no_mangle] pub unsafe extern fn rav1e_config_set_content_light( cfg: *mut Config, max_content_light_level: u16, max_frame_average_light_level: u16, ) -> c_int { (*cfg).cfg.enc.content_light = Some(rav1e::ContentLight { max_content_light_level, max_frame_average_light_level, }); if (*cfg).cfg.enc.content_light.is_some() { 0 } else { -1 } } /// Set the mastering display information for HDR10 streams. /// /// `primaries` and `white_point` arguments are `RaChromaticityPoint`, /// containing 0.16 fixed point values. /// `max_luminance` is a 24.8 fixed point value. /// `min_luminance` is a 18.14 fixed point value. /// /// Returns a negative value on error or 0. /// cbindgen:ptrs-as-arrays=[[primaries;3]] #[no_mangle] pub unsafe extern fn rav1e_config_set_mastering_display( cfg: *mut Config, primaries: *const rav1e::ChromaticityPoint, white_point: rav1e::ChromaticityPoint, max_luminance: u32, min_luminance: u32, ) -> c_int { let primaries = *(primaries as *const [rav1e::ChromaticityPoint; 3]); (*cfg).cfg.enc.mastering_display = Some(rav1e::MasteringDisplay { primaries, white_point, max_luminance, min_luminance, }); if (*cfg).cfg.enc.mastering_display.is_some() { 0 } else { -1 } } /// Free the `RaConfig`. 
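///
/// Passing `NULL` is a no-op: the pointer is checked before being freed.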
#[no_mangle] pub unsafe extern fn rav1e_config_unref(cfg: *mut Config) { if !cfg.is_null() { let _ = Box::from_raw(cfg); } } unsafe fn option_match( cfg: *mut Config, key: *const c_char, value: *const c_char, ) -> Result<(), ()> { let key = CStr::from_ptr(key).to_str().map_err(|_| ())?; let value = CStr::from_ptr(value).to_str().map_err(|_| ())?; let enc = &mut (*cfg).cfg.enc; match key { "width" => enc.width = value.parse().map_err(|_| ())?, "height" => enc.height = value.parse().map_err(|_| ())?, "speed" => { enc.speed_settings = rav1e::SpeedSettings::from_preset(value.parse().map_err(|_| ())?) } "threads" => (*cfg).cfg.threads = value.parse().map_err(|_| ())?, "tiles" => enc.tiles = value.parse().map_err(|_| ())?, "tile_rows" => enc.tile_rows = value.parse().map_err(|_| ())?, "tile_cols" => enc.tile_cols = value.parse().map_err(|_| ())?, "tune" => enc.tune = value.parse().map_err(|_| ())?, "quantizer" => enc.quantizer = value.parse().map_err(|_| ())?, "min_quantizer" => enc.min_quantizer = value.parse().map_err(|_| ())?, "bitrate" => enc.bitrate = value.parse().map_err(|_| ())?, "key_frame_interval" => { enc.set_key_frame_interval( enc.min_key_frame_interval, value.parse().map_err(|_| ())?, ); } "min_key_frame_interval" => { enc.set_key_frame_interval( value.parse().map_err(|_| ())?, enc.max_key_frame_interval, ); } "switch_frame_interval" => { enc.switch_frame_interval = value.parse().map_err(|_| ())? } "reservoir_frame_delay" => { enc.reservoir_frame_delay = Some(value.parse().map_err(|_| ())?) } "rdo_lookahead_frames" => { enc.speed_settings.rdo_lookahead_frames = value.parse().map_err(|_| ())? } "low_latency" => enc.low_latency = value.parse().map_err(|_| ())?, "enable_timing_info" => { enc.enable_timing_info = value.parse().map_err(|_| ())? } "still_picture" => enc.still_picture = value.parse().map_err(|_| ())?, "level" => { enc.level_idx = match value { "auto" => None, "unconstrained" => Some(31), _ => { let (major, minor) = scan_fmt!(value, "{}.{}", u8, u8).map_err(|_| ())?; if major > 7 || minor > 3 { return Err(()); } Some(((major - 2) << 2) + minor) } }; } _ => return Err(()), } Ok(()) } /// Set a configuration parameter using its key and value as string. 
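///
/// A minimal, illustrative call sequence (assuming `cfg` was obtained from
/// `rav1e_config_default()`; the full list of keys follows below):
///
/// ``` c
/// rav1e_config_parse(cfg, "speed", "6");
/// rav1e_config_parse(cfg, "low_latency", "true");
/// ```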
/// /// Available keys and values /// - `"width"`: width of the frame, default `640` /// - `"height"`: height of the frame, default `480` /// - `"speed"`: 0-10, default `6` /// - `"threads"`: maximum number of threads to be used, default auto /// - `"tune"`: `"psnr"` or `"psychovisual"`, default `"psychovisual"` /// - `"quantizer"`: 0-255, default `100` /// - `"tiles"`: total number of tiles desired (0 denotes auto), default `0` /// - `"tile_rows"`: number of tiles horizontally (must be a power of two, overridden by tiles if present), default `0` /// - `"tile_cols"`: number of tiles vertically (must be a power of two, overridden by tiles if present), default `0` /// - `"min_quantizer"`: minimum allowed base quantizer to use in bitrate mode, default `0` /// - `"bitrate"`: target bitrate for the bitrate mode (required for two pass mode), default `0` /// - `"key_frame_interval"`: maximum interval between two keyframes, default `240` /// - `"min_key_frame_interval"`: minimum interval between two keyframes, default `12` /// - `"switch_frame_interval"`: interval between switch frames, default `0` /// - `"reservoir_frame_delay"`: number of temporal units over which to distribute the reservoir usage, default `None` /// - `"rdo_lookahead_frames"`: number of frames to read ahead for the RDO lookahead computation, default `40` /// - `"low_latency"`: flag to enable low latency mode, default `false` /// - `"enable_timing_info"`: flag to enable signaling timing info in the bitstream, default `false` /// - `"still_picture"`: flag for still picture mode, default `false` /// /// Return a negative value on error or 0. #[no_mangle] pub unsafe extern fn rav1e_config_parse( cfg: *mut Config, key: *const c_char, value: *const c_char, ) -> c_int { if option_match(cfg, key, value) == Ok(()) { 0 } else { -1 } } /// Set a configuration parameter using its key and value as integer. /// /// Available keys and values are the same as `rav1e_config_parse()` /// /// Return a negative value on error or 0. #[no_mangle] // Panic can never occur here #[allow(clippy::missing_panics_doc)] pub unsafe extern fn rav1e_config_parse_int( cfg: *mut Config, key: *const c_char, value: c_int, ) -> c_int { let val = CString::new(value.to_string()).unwrap(); if option_match(cfg, key, val.as_ptr()) == Ok(()) { 0 } else { config_parse_bool(cfg, key, value) } } unsafe fn config_parse_bool( cfg: *mut Config, key: *const c_char, value: c_int, ) -> c_int { let val = CString::new(if value != 0 { "true" } else { "false" }).unwrap(); if option_match(cfg, key, val.as_ptr()) == Ok(()) { 0 } else { -1 } } /// Generate a new encoding context from a populated encoder configuration /// /// Multiple contexts can be generated through it. /// Returns `Null` if context creation failed, e.g. by passing /// an invalid `Config`. #[no_mangle] pub unsafe extern fn rav1e_context_new(cfg: *const Config) -> *mut Context { let cfg = &(*cfg).cfg; let enc = &cfg.enc; let ctx = match enc.bit_depth { 8 => cfg.new_context().map(EncContext::U8), _ => cfg.new_context().map(EncContext::U16), }; if let Ok(ctx) = ctx { Box::into_raw(Box::new(Context { ctx, last_err: None })) } else { std::ptr::null_mut() } } /// Free the `RaContext`. #[no_mangle] pub unsafe extern fn rav1e_context_unref(ctx: *mut Context) { if !ctx.is_null() { let _ = Box::from_raw(ctx); } } /// Produce a new frame from the encoding context /// /// It must be populated using `rav1e_frame_fill_plane()`. 
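///
/// A sketch of the usual lifecycle, for illustration (error handling omitted;
/// see the reference-counting notes below):
///
/// ``` c
/// RaFrame *f = rav1e_frame_new(ctx);
/// /* fill the planes with rav1e_frame_fill_plane(), then: */
/// rav1e_send_frame(ctx, f);
/// rav1e_frame_unref(f);
/// ```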
/// /// The frame is reference counted and must be released passing it to `rav1e_frame_unref()`, /// see `rav1e_send_frame()`. #[no_mangle] pub unsafe extern fn rav1e_frame_new(ctx: *const Context) -> *mut Frame { let fi = (*ctx).ctx.new_frame(); let frame_type = rav1e::FrameTypeOverride::No; let f = Frame { fi, frame_type, opaque: None, t35_metadata: Vec::new() }; let frame = Box::new(f); Box::into_raw(frame) } /// Free the `RaFrame`. #[no_mangle] pub unsafe extern fn rav1e_frame_unref(frame: *mut Frame) { if !frame.is_null() { let _ = Box::from_raw(frame); } } /// Overrides the encoders frame type decision for a frame /// /// Must be called before `rav1e_send_frame()` if used. #[no_mangle] pub unsafe extern fn rav1e_frame_set_type( frame: *mut Frame, frame_type: FrameTypeOverride, ) -> c_int { let frame_type_val = std::mem::transmute::(frame_type); if FrameTypeOverride::from_i32(frame_type_val).is_none() { return -1; } (*frame).frame_type = frame_type; 0 } /// Register an opaque data and a destructor to the frame /// /// It takes the ownership of its memory: /// - it will relinquish the ownership to the context if /// `rav1e_send_frame` is called. /// - it will call the destructor if `rav1e_frame_unref` is called /// otherwise. #[no_mangle] pub unsafe extern fn rav1e_frame_set_opaque( frame: *mut Frame, opaque: *mut c_void, cb: FrameOpaqueCb, ) { if opaque.is_null() { (*frame).opaque = None; } else { (*frame).opaque = Some(FrameOpaque { opaque, cb }); } } /// Add generic T35 metadata to a frame /// /// The buffer will be copied into the frame and can be freed /// immediately after this call. /// /// Can be called multiple times to add multiple T35 metadata /// blocks. #[no_mangle] pub unsafe extern fn rav1e_frame_add_t35_metadata( frame: *mut Frame, country_code: u8, country_code_extension_byte: u8, data: *const u8, data_len: size_t, ) { (*frame).t35_metadata.push(T35 { country_code, country_code_extension_byte, data: slice::from_raw_parts(data, data_len).into(), }); } /// Retrieve the first-pass data of a two-pass encode for the frame that was /// just encoded. This should be called BEFORE every call to `rav1e_receive_packet()` /// (including the very first one), even if no packet was produced by the /// last call to `rav1e_receive_packet`, if any (i.e., `RA_ENCODER_STATUS_ENCODED` /// was returned). It needs to be called once more after /// `RA_ENCODER_STATUS_LIMIT_REACHED` is returned, to retrieve the header that /// should be written to the front of the stats file (overwriting the /// placeholder header that was emitted at the start of encoding). /// /// It is still safe to call this function when `rav1e_receive_packet()` returns any /// other error. It will return `NULL` instead of returning a duplicate copy /// of the previous frame's data. /// /// Must be freed with `rav1e_data_unref()`. #[no_mangle] // Panic can never occur here #[allow(clippy::missing_panics_doc)] pub unsafe extern fn rav1e_twopass_out(ctx: *mut Context) -> *mut Data { let buf = (*ctx).ctx.twopass_out(); if buf.is_none() { return std::ptr::null_mut(); } let v = buf.unwrap().to_vec(); Box::into_raw(Box::new(Data { len: v.len(), data: Box::into_raw(v.into_boxed_slice()) as *mut u8, })) } /// Rate Control Data #[derive(Debug, PartialEq)] #[repr(C)] pub enum RcDataKind { /// A Rate Control Summary Packet /// /// It is emitted once, after the encoder is flushed. /// /// It contains a summary of the rate control information for the /// encoding process that just terminated. 
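  ///
  /// When summary and frame packets are saved to a single file, the summary is
  /// conventionally written first (see `rav1e_rc_summary_size`).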
Summary, /// A Rate Control Frame-specific Packet /// /// It is emitted every time a frame is processed. /// /// The information contained is required to encode its matching /// frame in a second pass encoding. Frame, /// There is no pass data available for now /// /// This is emitted if `rav1e_rc_receive_pass_data` is called more /// often than it should. Empty, } /// Return the Rate Control Summary Packet size /// /// It is useful mainly to preserve space when saving /// both Rate Control Summary and Frame Packets in a single file #[no_mangle] pub unsafe extern fn rav1e_rc_summary_size(ctx: *const Context) -> size_t { (*ctx).ctx.rc_summary_size() as size_t + 8 } /// Return the first pass data /// /// Call it after `rav1e_receive_packet()` returns a normal condition status: /// - `EncoderStatus::Encoded`, /// - `EncoderStatus::Success`, /// - `EncoderStatus::LimitReached`. /// /// use `rav1e_data_unref()` to free the data. /// /// It will return a `RcDataKind::Summary` once the encoder is flushed. #[no_mangle] pub unsafe extern fn rav1e_rc_receive_pass_data( ctx: *mut Context, data: *mut *mut Data, ) -> RcDataKind { use crate::api::RcData::*; let (buf, kind) = match (*ctx).ctx.rc_receive_pass_data() { Some(Summary(data)) => (data, RcDataKind::Summary), Some(Frame(data)) => (data, RcDataKind::Frame), None => return RcDataKind::Empty, }; let mut full_buf = Vec::with_capacity(buf.len() + 8); full_buf.extend_from_slice(&(buf.len() as u64).to_be_bytes()); full_buf.extend_from_slice(&buf); let full_buf = full_buf.into_boxed_slice(); *data = Box::into_raw(Box::new(Data { len: full_buf.len(), data: Box::into_raw(full_buf) as *mut u8, })); kind } /// Number of pass data packets required to progress the encoding process. /// /// At least that number of packets must be passed before the encoder can /// progress. /// /// Stop feeding-in pass data packets once the function returns 0. /// /// ``` c /// while (rav1e_rc_second_pass_data_required(ctx) > 0) { /// int more = rav1e_rc_send_pass_data(ctx, &data, &len); /// if (more > 0) { /// refill(&data, &len); /// } else if (more < 0) { /// goto fail; /// } /// } /// ``` /// #[no_mangle] pub unsafe extern fn rav1e_rc_second_pass_data_required( ctx: *const Context, ) -> i32 { (*ctx).ctx.rc_second_pass_data_required() as i32 } /// Feed the first pass Rate Control data to the encoder, /// Frame-specific Packets only. /// /// Call it before `receive_packet()` /// /// If additional data is required, pointer and len stay unchanged, otherwise /// they are updated. /// /// Returns: /// - `0` on success, /// - `> 0` the amount of bytes needed /// - `< 0` on unrecoverable failure #[no_mangle] // Panic can never occur here #[allow(clippy::missing_panics_doc)] pub unsafe extern fn rav1e_rc_send_pass_data( ctx: *mut Context, data: *mut *const u8, len: *mut size_t, ) -> c_int { let (need, maybe_buf) = decode_slice(data, len); if maybe_buf.is_none() { return need; } let ret = (*ctx) .ctx .rc_send_pass_data(maybe_buf.unwrap()) .map(|_v| None) .unwrap_or_else(Some); (*ctx).last_err = ret; if ret.is_some() { -1 } else { 0 } } /// Ask how many bytes of the stats file are needed before the next frame /// of the second pass in a two-pass encode can be encoded. This is a lower /// bound (more might be required), but if 0 is returned, then encoding can /// proceed. 
This is just a hint to the application, and does not need to /// be called for encoding the second pass to work, so long as the /// application continues to provide more data to `rav1e_twopass_in()` in a loop /// until `rav1e_twopass_in()` returns 0. #[no_mangle] pub unsafe extern fn rav1e_twopass_bytes_needed(ctx: *mut Context) -> size_t { (*ctx).ctx.twopass_bytes_needed() as size_t } /// Provide stats data produced in the first pass of a two-pass encode to the /// second pass. On success this returns the number of bytes of that data /// which were consumed. When encoding the second pass of a two-pass encode, /// this should be called repeatedly in a loop before every call to /// `rav1e_receive_packet()` (including the very first one) until no bytes are /// consumed, or until `twopass_bytes_needed()` returns 0. Returns -1 on failure. #[no_mangle] pub unsafe extern fn rav1e_twopass_in( ctx: *mut Context, buf: *mut u8, buf_size: size_t, ) -> c_int { let buf_slice = slice::from_raw_parts(buf, buf_size as usize); let r = (*ctx).ctx.twopass_in(buf_slice); match r { Ok(v) => v as c_int, Err(v) => { (*ctx).last_err = Some(v); -1 } } } /// Send the frame for encoding /// /// The function increases the frame internal reference count and it can be passed multiple /// times to different `rav1e_send_frame()` with a caveat: /// /// The opaque data, if present, will be moved from the `Frame` to the `Context` /// and returned by `rav1e_receive_packet` in the `Packet` `opaque` field or /// the destructor will be called on `rav1e_context_unref` if the frame is /// still pending in the encoder. /// /// Returns: /// - `0` on success, /// - `> 0` if the input queue is full /// - `< 0` on unrecoverable failure #[no_mangle] pub unsafe extern fn rav1e_send_frame( ctx: *mut Context, frame: *mut Frame, ) -> EncoderStatus { if !frame.is_null() { let rav1e::EncoderConfig { width, height, chroma_sampling, .. } = (*ctx).ctx.config(); let planes = if chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; match (*frame).fi { FrameInternal::U8(ref mut f) => { rav1e_frame_pad_internal(f, planes, width, height) } FrameInternal::U16(ref mut f) => { rav1e_frame_pad_internal(f, planes, width, height) } } } let frame_internal = if frame.is_null() { None } else { Some((*frame).fi.clone()) }; let frame_type = if frame.is_null() { rav1e::FrameTypeOverride::No } else { (*frame).frame_type }; let maybe_opaque = if frame.is_null() { None } else { (*frame).opaque.take().map(rav1e::Opaque::new) }; let t35_metadata = if frame.is_null() { Box::new([]) } else { mem::take(&mut (*frame).t35_metadata).into_boxed_slice() }; let ret = (*ctx) .ctx .send_frame(frame_internal, frame_type, maybe_opaque, t35_metadata) .map(|_v| None) .unwrap_or_else(Some); (*ctx).last_err = ret; ret.into() } /// Return the last encoder status #[no_mangle] pub unsafe extern fn rav1e_last_status(ctx: *const Context) -> EncoderStatus { (*ctx).last_err.into() } /// Return a static string matching the `EncoderStatus` variant. 
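///
/// A sketch of typical use from C for error reporting (the logging call is
/// illustrative only):
///
/// ``` c
/// fprintf(stderr, "rav1e: %s\n",
///         rav1e_status_to_str(rav1e_last_status(ctx)));
/// ```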
/// #[no_mangle] pub unsafe extern fn rav1e_status_to_str( status: EncoderStatus, ) -> *const c_char { if EncoderStatus::from_i32(std::mem::transmute(status)).is_none() { return std::ptr::null(); } status.to_c() as *const c_char } /// Receive encoded data /// /// Returns: /// - `0` on success /// - `> 0` if additional frame data is required /// - `< 0` on unrecoverable failure #[no_mangle] pub unsafe extern fn rav1e_receive_packet( ctx: *mut Context, pkt: *mut *mut Packet, ) -> EncoderStatus { let ret = (*ctx) .ctx .receive_packet() .map(|packet| { *pkt = Box::into_raw(Box::new(packet)); None }) .unwrap_or_else(Some); (*ctx).last_err = ret; ret.into() } /// Free the `RaPacket`. #[no_mangle] pub unsafe extern fn rav1e_packet_unref(pkt: *mut Packet) { if !pkt.is_null() { let pkt = Box::from_raw(pkt); let _ = Vec::from_raw_parts( pkt.data as *mut u8, pkt.len as usize, pkt.len as usize, ); rav1e_frame_unref(pkt.rec); rav1e_frame_unref(pkt.source); } } /// Produce a sequence header matching the current encoding context /// /// Its format is compatible with the AV1 Matroska and ISOBMFF specification. /// /// Use `rav1e_data_unref()` to free it. #[no_mangle] pub unsafe extern fn rav1e_container_sequence_header( ctx: *const Context, ) -> *mut Data { let buf = (*ctx).ctx.container_sequence_header(); Box::into_raw(Box::new(Data { len: buf.len(), data: Box::into_raw(buf.into_boxed_slice()) as *mut u8, })) } fn rav1e_frame_fill_plane_internal( f: &mut Arc>, plane: c_int, data_slice: &[u8], stride: ptrdiff_t, bytewidth: c_int, ) { let input = Arc::get_mut(f).unwrap(); input.planes[plane as usize].copy_from_raw_u8( data_slice, stride as usize, bytewidth as usize, ); } fn rav1e_frame_pad_internal( f: &mut Arc>, planes: usize, width: usize, height: usize, ) { if let Some(ref mut input) = Arc::get_mut(f) { for plane in input.planes[..planes].iter_mut() { plane.pad(width, height); } } } fn rav1e_frame_extract_plane_internal( f: &Arc>, plane: c_int, data_slice: &mut [u8], stride: ptrdiff_t, bytewidth: c_int, ) { f.planes[plane as usize].copy_to_raw_u8( data_slice, stride as usize, bytewidth as usize, ); } /// Fill a frame plane /// /// Currently the frame contains 3 planes, the first is luminance followed by /// chrominance. /// /// The data is copied and this function has to be called for each plane. /// /// `frame`: A frame provided by `rav1e_frame_new()` /// `plane`: The index of the plane starting from 0 /// `data`: The data to be copied /// `data_len`: Length of the buffer /// `stride`: Plane line in bytes, including padding /// `bytewidth`: Number of bytes per component, either 1 or 2 #[no_mangle] pub unsafe extern fn rav1e_frame_fill_plane( frame: *mut Frame, plane: c_int, data: *const u8, data_len: size_t, stride: ptrdiff_t, bytewidth: c_int, ) { let data_slice = slice::from_raw_parts(data, data_len as usize); match (*frame).fi { FrameInternal::U8(ref mut f) => { rav1e_frame_fill_plane_internal(f, plane, data_slice, stride, bytewidth) } FrameInternal::U16(ref mut f) => { rav1e_frame_fill_plane_internal(f, plane, data_slice, stride, bytewidth) } } } /// Extract a frame plane /// /// This is the reverse of `rav1e_frame_fill_plane()`, primarily used for /// extracting the source and reconstruction data from a `RaPacket`. /// /// Currently the frame contains 3 planes, the first is luminance followed by /// chrominance. /// /// The data is copied out of the frame for a single plane. 
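///
/// A sketch of copying out the luma plane of a packet's `source` frame
/// (8-bit input assumed; `pkt`, `width`, `height` and the includes are the
/// caller's):
///
/// ``` c
/// uint8_t *luma = malloc((size_t)width * height);
/// rav1e_frame_extract_plane(pkt->source, 0, luma, (size_t)width * height,
///                           width, 1);
/// ```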
/// /// `frame`: A frame provided inside a packet returned by `rav1e_receive_packet()` /// `plane`: The index of the plane starting from 0 /// `data`: The destination for the data /// `data_len`: Length of the buffer /// `stride`: Plane line in bytes, including padding /// `bytewidth`: Number of bytes per component, either 1 or 2 #[no_mangle] pub unsafe extern fn rav1e_frame_extract_plane( frame: *const Frame, plane: c_int, data: *mut u8, data_len: size_t, stride: ptrdiff_t, bytewidth: c_int, ) { let data_slice = slice::from_raw_parts_mut(data, data_len as usize); match (*frame).fi { FrameInternal::U8(ref f) => rav1e_frame_extract_plane_internal( f, plane, data_slice, stride, bytewidth, ), FrameInternal::U16(ref f) => rav1e_frame_extract_plane_internal( f, plane, data_slice, stride, bytewidth, ), } } #[cfg(test)] mod test { use super::*; use std::ffi::CString; #[test] fn forward_opaque() { unsafe { let rac = rav1e_config_default(); let w = CString::new("width").unwrap(); rav1e_config_parse_int(rac, w.as_ptr(), 64); let h = CString::new("height").unwrap(); rav1e_config_parse_int(rac, h.as_ptr(), 64); let s = CString::new("speed").unwrap(); rav1e_config_parse_int(rac, s.as_ptr(), 10); let l = CString::new("level").unwrap(); let lo = CString::new("6.2").unwrap(); rav1e_config_parse(rac, l.as_ptr(), lo.as_ptr()); let rax = rav1e_context_new(rac); let f = rav1e_frame_new(rax); let pixels = [42; 64 * 64]; rav1e_frame_fill_plane(f, 0, pixels.as_ptr(), pixels.len(), 64, 1); for i in 0..30 { let v = Box::new(i as u8); extern fn cb(o: *mut c_void) { let v = unsafe { Box::from_raw(o as *mut u8) }; eprintln!("Would free {}", v); } rav1e_frame_set_opaque(f, Box::into_raw(v) as *mut c_void, Some(cb)); rav1e_send_frame(rax, f); } rav1e_send_frame(rax, std::ptr::null_mut()); for _ in 0..15 { let mut p: *mut Packet = std::ptr::null_mut(); let ret = rav1e_receive_packet(rax, &mut p); if ret == EncoderStatus::Success { let mut source = vec![1; 64 * 64]; rav1e_frame_extract_plane( (*p).source, 0, source.as_mut_ptr(), 64 * 64, 64, 1, ); assert_eq!(source, vec![42; 64 * 64]); let v = Box::from_raw((*p).opaque as *mut u8); eprintln!("Opaque {}", v); } if ret == EncoderStatus::LimitReached { break; } } let v = Box::new(42u64); extern fn cb(o: *mut c_void) { let v = unsafe { Box::from_raw(o as *mut u64) }; eprintln!("Would free {}", v); } rav1e_frame_set_opaque(f, Box::into_raw(v) as *mut c_void, Some(cb)); // 42 would be freed after this rav1e_frame_unref(f); // 15 - reorder delay .. 
29 would be freed after this rav1e_context_unref(rax); rav1e_config_unref(rac); } } #[test] fn two_pass_encoding() { unsafe { let rac = rav1e_config_default(); let w = CString::new("width").unwrap(); rav1e_config_parse_int(rac, w.as_ptr(), 64); let h = CString::new("height").unwrap(); rav1e_config_parse_int(rac, h.as_ptr(), 64); let s = CString::new("speed").unwrap(); rav1e_config_parse_int(rac, s.as_ptr(), 10); let s = CString::new("bitrate").unwrap(); rav1e_config_parse_int(rac, s.as_ptr(), 1000); rav1e_config_set_emit_data(rac, 1); let rax = rav1e_context_new(rac); let f = rav1e_frame_new(rax); let pixels = [42; 64 * 64]; rav1e_frame_fill_plane(f, 0, pixels.as_ptr(), pixels.len(), 64, 1); for _ in 0..10 { rav1e_send_frame(rax, f); } rav1e_send_frame(rax, std::ptr::null_mut()); let mut frame_data = std::collections::VecDeque::new(); let mut summary: *mut Data = std::ptr::null_mut(); loop { let mut p: *mut Packet = std::ptr::null_mut(); let ret = rav1e_receive_packet(rax, &mut p); rav1e_packet_unref(p); if ret == EncoderStatus::LimitReached { let kind = rav1e_rc_receive_pass_data(rax, &mut summary); assert_eq!(kind, RcDataKind::Summary); eprintln!("Got rc summary {} bytes", (*summary).len); break; } else if ret == EncoderStatus::Encoded || ret == EncoderStatus::Success { let mut p: *mut Data = std::ptr::null_mut(); let kind = rav1e_rc_receive_pass_data(rax, &mut p); assert_eq!(kind, RcDataKind::Frame); eprintln!("Got rc frame data {} bytes", (*p).len); frame_data.push_back(p); } } rav1e_config_set_emit_data(rac, 0); let mut data = (*summary).data; let mut len = (*summary).len; let ret = rav1e_config_set_rc_summary(rac, &mut data, &mut len); assert_eq!(ret, 0); rav1e_data_unref(summary); for _ in 0..10 { rav1e_send_frame(rax, f); } rav1e_send_frame(rax, std::ptr::null_mut()); loop { let mut p: *mut Packet = std::ptr::null_mut(); while rav1e_rc_second_pass_data_required(rax) > 0 { let d = frame_data.pop_front().unwrap(); let mut data = (*d).data; let mut len = (*d).len; rav1e_rc_send_pass_data(rax, &mut data, &mut len); rav1e_data_unref(d); } let ret = rav1e_receive_packet(rax, &mut p); rav1e_packet_unref(p); if ret == EncoderStatus::LimitReached { break; } } rav1e_frame_unref(f); rav1e_context_unref(rax); rav1e_config_unref(rac); } } #[test] fn invalid_level() { unsafe { let rac = rav1e_config_default(); let l = CString::new("level").unwrap(); let lo = CString::new("8.3").unwrap(); assert_eq!(rav1e_config_parse(rac, l.as_ptr(), lo.as_ptr()), -1); rav1e_config_unref(rac); } } } rav1e-0.7.1/src/cdef.rs000064400000000000000000000540331046102023000127120ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::color::ChromaSampling::Cs400; use crate::context::*; use crate::encoder::FrameInvariants; use crate::frame::*; use crate::tiling::*; use crate::util::{clamp, msb, CastFromPrimitive, Pixel}; use crate::cpu_features::CpuFeatureLevel; use std::cmp; cfg_if::cfg_if! 
{ if #[cfg(nasm_x86_64)] { pub(crate) use crate::asm::x86::cdef::*; } else if #[cfg(asm_neon)] { pub(crate) use crate::asm::aarch64::cdef::*; } else { pub(crate) use self::rust::*; } } pub const CDEF_VERY_LARGE: u16 = 0x8000; // These values match dav1d; flags indicating where padding exists pub const CDEF_HAVE_LEFT: u8 = 1 << 0; pub const CDEF_HAVE_RIGHT: u8 = 1 << 1; pub const CDEF_HAVE_TOP: u8 = 1 << 2; pub const CDEF_HAVE_BOTTOM: u8 = 1 << 3; pub const CDEF_HAVE_ALL: u8 = CDEF_HAVE_LEFT | CDEF_HAVE_RIGHT | CDEF_HAVE_TOP | CDEF_HAVE_BOTTOM; pub(crate) const CDEF_SEC_STRENGTHS: u8 = 4; pub struct CdefDirections { dir: [[u8; 8]; 8], var: [[i32; 8]; 8], } pub(crate) mod rust { use super::*; use simd_helpers::cold_for_target_arch; // Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n. // The output is then 840 times larger, but we don't care for finding // the max. const CDEF_DIV_TABLE: [i32; 9] = [0, 840, 420, 280, 210, 168, 140, 120, 105]; /// Returns the position and value of the first instance of the max element in /// a slice as a tuple. /// /// # Arguments /// /// * `elems` - A non-empty slice of integers /// /// # Panics /// /// Panics if `elems` is empty #[inline] fn first_max_element(elems: &[i32]) -> (usize, i32) { // In case of a tie, the first element must be selected. let (max_idx, max_value) = elems .iter() .enumerate() .max_by_key(|&(i, v)| (v, -(i as isize))) .unwrap(); (max_idx, *max_value) } // Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. // The search minimizes the weighted variance along all the lines in a // particular direction, i.e. the squared error between the input and a // "predicted" block where each pixel is replaced by the average along a line // in a particular direction. Since each direction have the same sum(x^2) term, // that term is never computed. See Section 2, step 2, of: // http://jmvalin.ca/notes/intra_paint.pdf pub fn cdef_find_dir( img: &PlaneSlice<'_, T>, var: &mut u32, coeff_shift: usize, _cpu: CpuFeatureLevel, ) -> i32 { let mut cost: [i32; 8] = [0; 8]; let mut partial: [[i32; 15]; 8] = [[0; 15]; 8]; for i in 0..8 { for j in 0..8 { let p: i32 = i32::cast_from(img[i][j]); // We subtract 128 here to reduce the maximum range of the squared // partial sums. debug_assert!(p >> coeff_shift <= 255); let x = (p >> coeff_shift) - 128; partial[0][i + j] += x; partial[1][i + j / 2] += x; partial[2][i] += x; partial[3][3 + i - j / 2] += x; partial[4][7 + i - j] += x; partial[5][3 - i / 2 + j] += x; partial[6][j] += x; partial[7][i / 2 + j] += x; } } for i in 0..8 { cost[2] += partial[2][i] * partial[2][i]; cost[6] += partial[6][i] * partial[6][i]; } cost[2] *= CDEF_DIV_TABLE[8]; cost[6] *= CDEF_DIV_TABLE[8]; for i in 0..7 { cost[0] += (partial[0][i] * partial[0][i] + partial[0][14 - i] * partial[0][14 - i]) * CDEF_DIV_TABLE[i + 1]; cost[4] += (partial[4][i] * partial[4][i] + partial[4][14 - i] * partial[4][14 - i]) * CDEF_DIV_TABLE[i + 1]; } cost[0] += partial[0][7] * partial[0][7] * CDEF_DIV_TABLE[8]; cost[4] += partial[4][7] * partial[4][7] * CDEF_DIV_TABLE[8]; for i in (1..8).step_by(2) { for j in 0..5 { cost[i] += partial[i][3 + j] * partial[i][3 + j]; } cost[i] *= CDEF_DIV_TABLE[8]; for j in 0..3 { cost[i] += (partial[i][j] * partial[i][j] + partial[i][10 - j] * partial[i][10 - j]) * CDEF_DIV_TABLE[2 * j + 2]; } } let (best_dir, best_cost) = first_max_element(&cost); // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. 
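    // (Both costs here are on the 840x scale established by CDEF_DIV_TABLE:
    // each squared line sum is weighted by 840/n for a line of n pixels, so
    // the eight direction costs are directly comparable and are 840 times
    // their true size.)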
// We'd normally divide by 840, but dividing by 1024 is close enough // for what we're going to do with this. */ *var = ((best_cost - cost[(best_dir + 4) & 7]) >> 10) as u32; best_dir as i32 } #[inline(always)] fn constrain(diff: i32, threshold: i32, damping: i32) -> i32 { if threshold != 0 { let shift = cmp::max(0, damping - msb(threshold)); let magnitude = (threshold - (diff.abs() >> shift)).clamp(0, diff.abs()); if diff < 0 { -magnitude } else { magnitude } } else { 0 } } pub unsafe fn pad_into_tmp16( dst: *mut u16, dst_stride: isize, src: *const T, src_stride: isize, block_width: usize, block_height: usize, edges: u8, ) { let mut w = block_width; let mut h = block_height; let (dst_col, src_col) = if (edges & CDEF_HAVE_LEFT) != 0 { w += 2; (dst, src.offset(-2)) } else { (dst.offset(2), src) }; if (edges & CDEF_HAVE_RIGHT) != 0 { w += 2; }; let (mut dst_ptr, mut src_ptr) = if (edges & CDEF_HAVE_TOP) != 0 { h += 2; (dst_col, src_col.offset(-2 * src_stride)) } else { (dst_col.offset(2 * dst_stride), src_col) }; if (edges & CDEF_HAVE_BOTTOM) != 0 { h += 2; }; for _y in 0..h { for x in 0..w { *dst_ptr.add(x) = u16::cast_from(*src_ptr.add(x)); } src_ptr = src_ptr.offset(src_stride); dst_ptr = dst_ptr.offset(dst_stride); } } #[cold_for_target_arch("x86_64")] #[allow(clippy::erasing_op, clippy::identity_op, clippy::neg_multiply)] pub(crate) unsafe fn cdef_filter_block( dst: &mut PlaneRegionMut<'_, T>, input: *const U, istride: isize, pri_strength: i32, sec_strength: i32, dir: usize, damping: i32, bit_depth: usize, xdec: usize, ydec: usize, edges: u8, _cpu: CpuFeatureLevel, ) { if edges != CDEF_HAVE_ALL { // slowpath for unpadded border[s] let tmpstride = 2 + (8 >> xdec) + 2; let mut tmp = [CDEF_VERY_LARGE; (2 + 8 + 2) * (2 + 8 + 2)]; // copy in what pixels we have/are allowed to use pad_into_tmp16( tmp.as_mut_ptr(), // points to *padding* upper left tmpstride, input, // points to *block* upper left istride, 8 >> xdec, 8 >> ydec, edges, ); cdef_filter_block( dst, tmp.as_ptr().offset(2 * tmpstride + 2), tmpstride, pri_strength, sec_strength, dir, damping, bit_depth, xdec, ydec, CDEF_HAVE_ALL, _cpu, ); } else { let xsize = (8 >> xdec) as isize; let ysize = (8 >> ydec) as isize; let coeff_shift = bit_depth - 8; let cdef_pri_taps = [[4, 2], [3, 3]]; let cdef_sec_taps = [[2, 1], [2, 1]]; let pri_taps = cdef_pri_taps[((pri_strength >> coeff_shift) & 1) as usize]; let sec_taps = cdef_sec_taps[((pri_strength >> coeff_shift) & 1) as usize]; let cdef_directions = [ [-1 * istride + 1, -2 * istride + 2], [0 * istride + 1, -1 * istride + 2], [0 * istride + 1, 0 * istride + 2], [0 * istride + 1, 1 * istride + 2], [1 * istride + 1, 2 * istride + 2], [1 * istride + 0, 2 * istride + 1], [1 * istride + 0, 2 * istride + 0], [1 * istride + 0, 2 * istride - 1], ]; for i in 0..ysize { for j in 0..xsize { let ptr_in = input.offset(i * istride + j); let x = i32::cast_from(*ptr_in); let mut sum: i32 = 0; let mut max = x; let mut min = x; for k in 0..2usize { let cdef_dirs = [ cdef_directions[dir][k], cdef_directions[(dir + 2) & 7][k], cdef_directions[(dir + 6) & 7][k], ]; let pri_tap = pri_taps[k]; let p = [ i32::cast_from(*ptr_in.offset(cdef_dirs[0])), i32::cast_from(*ptr_in.offset(-cdef_dirs[0])), ]; for p_elem in p.iter() { sum += pri_tap * constrain(*p_elem - x, pri_strength, damping); if *p_elem != CDEF_VERY_LARGE as i32 { max = cmp::max(*p_elem, max); } min = cmp::min(*p_elem, min); } let s = [ i32::cast_from(*ptr_in.offset(cdef_dirs[1])), i32::cast_from(*ptr_in.offset(-cdef_dirs[1])), 
i32::cast_from(*ptr_in.offset(cdef_dirs[2])), i32::cast_from(*ptr_in.offset(-cdef_dirs[2])), ]; let sec_tap = sec_taps[k]; for s_elem in s.iter() { if *s_elem != CDEF_VERY_LARGE as i32 { max = cmp::max(*s_elem, max); } min = cmp::min(*s_elem, min); sum += sec_tap * constrain(*s_elem - x, sec_strength, damping); } } let v = x + ((8 + sum - (sum < 0) as i32) >> 4); dst[i as usize][j as usize] = T::cast_from(clamp(v, min, max)); } } } } #[cfg(test)] mod test { use super::*; #[test] fn check_max_element() { assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 6, 6]), (6, 6)); assert_eq!(first_max_element(&[-1, -1, 1, 2, 3, 4, 7, 6]), (6, 7)); assert_eq!(first_max_element(&[0, 0]), (0, 0)); } } } // We use the variance of an 8x8 block to adjust the effective filter strength. #[inline] fn adjust_strength(strength: i32, var: i32) -> i32 { let i = if (var >> 6) != 0 { cmp::min(msb(var >> 6), 12) } else { 0 }; if var != 0 { (strength * (4 + i) + 8) >> 4 } else { 0 } } #[profiling::function] pub fn cdef_analyze_superblock_range( fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, sb_w: usize, sb_h: usize, ) -> Vec { let mut ret = Vec::::with_capacity(sb_h * sb_w); for sby in 0..sb_h { for sbx in 0..sb_w { let sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); ret.push(cdef_analyze_superblock(fi, in_frame, blocks, sbo)); } } ret } #[profiling::function] pub fn cdef_analyze_superblock( fi: &FrameInvariants, in_frame: &Frame, blocks: &TileBlocks<'_>, sbo: TileSuperBlockOffset, ) -> CdefDirections { let coeff_shift = fi.sequence.bit_depth - 8; let mut dir: CdefDirections = CdefDirections { dir: [[0; 8]; 8], var: [[0; 8]; 8] }; // Each direction block is 8x8 in y, and direction computation only looks at y for by in 0..8 { for bx in 0..8 { let block_offset = sbo.block_offset(bx << 1, by << 1); if block_offset.0.x < blocks.cols() && block_offset.0.y < blocks.rows() { let skip = blocks[block_offset].skip & blocks[sbo.block_offset(2 * bx + 1, 2 * by)].skip & blocks[sbo.block_offset(2 * bx, 2 * by + 1)].skip & blocks[sbo.block_offset(2 * bx + 1, 2 * by + 1)].skip; if !skip { let mut var: u32 = 0; let in_plane = &in_frame.planes[0]; let in_po = sbo.plane_offset(&in_plane.cfg); let in_slice = in_plane.slice(in_po); dir.dir[bx][by] = cdef_find_dir::( &in_slice.reslice(8 * bx as isize, 8 * by as isize), &mut var, coeff_shift, fi.cpu_feature_level, ) as u8; dir.var[bx][by] = var as i32; } } } } dir } // input: A Frame of reconstructed/deblocked pixels prepared to // undergo CDEF. Note that the input is a Frame and not a Tile due to // Tiles not allowing [supervised] out-of-rect access for padding // pixels. This will be corrected at some point in the future. // tile_sbo: specifies an offset into the output Tile, not an // absolute offset in the visible frame. The Tile's own offset is // added to this in order to address into the input Frame. // tb: the TileBlocks associated with the filtered region; the // provided blocks co-locate with the output region. The TileBlocks // provide by-[super]qblock CDEF parameters. // output: TileMut destination for filtered pixels. The output's // rect specifies the region of the input to be processed (x and y // are relative to the input Frame's origin). Note that an // additional area of 2 pixels of padding is used for CDEF. When // these pixels are unavailable (beyond the visible frame or at a // tile boundary), the filtering process ignores input pixels that // don't exist. 
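// A worked example of the strength adjustment used below: for a block with
// variance 4096, msb(4096 >> 6) = msb(64) = 6, so a configured primary luma
// strength of 8 becomes (8 * (4 + 6) + 8) >> 4 = 5 in adjust_strength()
// before the filter taps are applied.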
/// # Panics /// /// - If called with invalid parameters #[profiling::function] pub fn cdef_filter_superblock( fi: &FrameInvariants, input: &Frame, output: &mut TileMut<'_, T>, blocks: &TileBlocks<'_>, tile_sbo: TileSuperBlockOffset, cdef_index: u8, cdef_dirs: &CdefDirections, ) { let bit_depth = fi.sequence.bit_depth; let coeff_shift = fi.sequence.bit_depth as i32 - 8; let cdef_damping = fi.cdef_damping as i32; let cdef_y_strength = fi.cdef_y_strengths[cdef_index as usize]; let cdef_uv_strength = fi.cdef_uv_strengths[cdef_index as usize]; let cdef_pri_y_strength = (cdef_y_strength / CDEF_SEC_STRENGTHS) as i32; let mut cdef_sec_y_strength = (cdef_y_strength % CDEF_SEC_STRENGTHS) as i32; let cdef_pri_uv_strength = (cdef_uv_strength / CDEF_SEC_STRENGTHS) as i32; let planes = if fi.sequence.chroma_sampling == Cs400 { 1 } else { 3 }; let mut cdef_sec_uv_strength = (cdef_uv_strength % CDEF_SEC_STRENGTHS) as i32; if cdef_sec_y_strength == 3 { cdef_sec_y_strength += 1; } if cdef_sec_uv_strength == 3 { cdef_sec_uv_strength += 1; } let tile_rect = *output.planes[0].rect(); let input_xoffset = tile_rect.x + tile_sbo.plane_offset(&input.planes[0].cfg).x; let input_yoffset = tile_rect.y + tile_sbo.plane_offset(&input.planes[0].cfg).y; let input_xavail = input.planes[0].cfg.width as isize - input_xoffset; let input_yavail = input.planes[0].cfg.height as isize - input_yoffset; /* determine what edge padding we have, and what padding we don't. * We don't pad here, but rather tell the filter_block call what it * needs to do, then let it handle the specifics (following dav1d's * lead). We make one assumption that's not obvious: Because the * cdef clipping area is rounded up to an even 8x8 luma block, we * don't need to guard against having only one (as opposed to two) * pixels of padding past the current block boundary. The padding * is all-or-nothing. */ // Slightly harder than in dav1d; we're not always doing full-frame. 
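  // The have_top_p/have_left_p tests below are frame-relative: the sum of the
  // tile offset and the superblock offset is zero only when this superblock
  // touches the top or left edge of the frame itself (the two terms are in
  // different units, but only zero-vs-nonzero matters here).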
let have_top_p = if tile_sbo.0.y as isize + tile_rect.y > 0 { CDEF_HAVE_TOP } else { 0 }; let have_left_p = if tile_sbo.0.x as isize + tile_rect.x > 0 { CDEF_HAVE_LEFT } else { 0 }; let mut edges = have_top_p | CDEF_HAVE_BOTTOM; // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma for by in 0..8usize { if by + 1 >= (input_yavail as usize >> 3) { edges &= !CDEF_HAVE_BOTTOM }; edges &= !CDEF_HAVE_LEFT; edges |= have_left_p; edges |= CDEF_HAVE_RIGHT; for bx in 0..8usize { if bx + 1 >= (input_xavail as usize >> 3) { edges &= !CDEF_HAVE_RIGHT }; let block_offset = tile_sbo.block_offset(bx << 1, by << 1); if block_offset.0.x < blocks.cols() && block_offset.0.y < blocks.rows() { let skip = blocks[block_offset].skip & blocks[tile_sbo.block_offset(2 * bx + 1, 2 * by)].skip & blocks[tile_sbo.block_offset(2 * bx, 2 * by + 1)].skip & blocks[tile_sbo.block_offset(2 * bx + 1, 2 * by + 1)].skip; let dir = cdef_dirs.dir[bx][by]; let var = cdef_dirs.var[bx][by]; for p in 0..planes { let out_plane = &mut output.planes[p]; let in_plane = &input.planes[p]; let xdec = in_plane.cfg.xdec; let ydec = in_plane.cfg.ydec; let xsize = 8 >> xdec; let ysize = 8 >> ydec; let in_po = PlaneOffset { x: (input_xoffset >> xdec) + (bx * xsize) as isize, y: (input_yoffset >> ydec) + (by * ysize) as isize, }; let in_stride = in_plane.cfg.stride; let in_slice = &in_plane.slice(in_po); let out_block = &mut out_plane.subregion_mut(Area::BlockRect { bo: tile_sbo.block_offset(2 * bx, 2 * by).0, width: xsize, height: ysize, }); if !skip { let local_pri_strength; let local_sec_strength; let mut local_damping: i32 = cdef_damping + coeff_shift; // See `Cdef_Uv_Dir` constant lookup table in Section 7.15.1 // let local_dir = if p == 0 { local_pri_strength = adjust_strength(cdef_pri_y_strength << coeff_shift, var); local_sec_strength = cdef_sec_y_strength << coeff_shift; if cdef_pri_y_strength != 0 { dir as usize } else { 0 } } else { local_pri_strength = cdef_pri_uv_strength << coeff_shift; local_sec_strength = cdef_sec_uv_strength << coeff_shift; local_damping -= 1; if cdef_pri_uv_strength != 0 { if xdec != ydec { [7, 0, 2, 4, 5, 6, 6, 6][dir as usize] } else { dir as usize } } else { 0 } }; // SAFETY: `cdef_filter_block` may call Assembly code. // The asserts here verify that we are not calling it // with invalid parameters. unsafe { assert!( input.planes[p].cfg.width as isize >= in_po.x + xsize as isize + if edges & CDEF_HAVE_RIGHT > 0 { 2 } else { 0 } ); assert!( 0 <= in_po.x - if edges & CDEF_HAVE_LEFT > 0 { 2 } else { 0 } ); assert!( input.planes[p].cfg.height as isize >= in_po.y + ysize as isize + if edges & CDEF_HAVE_BOTTOM > 0 { 2 } else { 0 } ); assert!( 0 <= in_po.y - if edges & CDEF_HAVE_TOP > 0 { 2 } else { 0 } ); cdef_filter_block( out_block, in_slice.as_ptr(), in_stride as isize, local_pri_strength, local_sec_strength, local_dir, local_damping, bit_depth, xdec, ydec, edges, fi.cpu_feature_level, ); } } else { // no filtering, but we need to copy input to output for i in 0..ysize { for j in 0..xsize { out_block[i][j] = in_slice[i][j]; } } } } } edges |= CDEF_HAVE_LEFT; } edges |= CDEF_HAVE_TOP; } } // The purpose of CDEF is to perform deringing based on the detected // direction of blocks. CDEF parameters are stored for each 64 by 64 // block of pixels. The CDEF filter is applied on each 8 by 8 block // of pixels. Reference: // http://av1-spec.argondesign.com/av1-spec/av1-spec.html#cdef-process // input: A Frame of reconstructed/deblocked pixels prepared to // undergo CDEF. 
cdef_filter_tile acts on a subset of these input // pixels, as specified by the PlaneRegion rect of the output. Note // that the input is a Frame and not a Tile due to Tiles not // allowing [supervised] out-of-rect access for padding pixels. // This will be corrected at some point in the future. // tb: the TileBlocks associated with the filtered region; the // provided blocks co-locate with the output region. // output: TileMut destination for filtered pixels. The output's // rect specifies the region of the input to be processed (x and y // are relative to the input Frame's origin). Note that an // additional area of 2 pixels of padding is used for CDEF. When // these pixels are unavailable (beyond the visible frame or at a // tile boundary), the filtering process ignores input pixels that // don't exist. #[profiling::function] pub fn cdef_filter_tile( fi: &FrameInvariants, input: &Frame, tb: &TileBlocks, output: &mut TileMut<'_, T>, ) { // Each filter block is 64x64, except right and/or bottom for non-multiple-of-64 sizes. // FIXME: 128x128 SB support will break this, we need FilterBlockOffset etc. // No need to guard against having fewer actual coded blocks than // the output.rect() area. Inner code already guards this case. let fb_width = (output.planes[0].rect().width + 63) / 64; let fb_height = (output.planes[0].rect().height + 63) / 64; // should parallelize this for fby in 0..fb_height { for fbx in 0..fb_width { // tile_sbo is treated as an offset into the Tiles' plane // regions, not as an absolute offset in the visible frame. The // Tile's own offset is added to this in order to address into // the input Frame. let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: fbx, y: fby }); let cdef_index = tb.get_cdef(tile_sbo); let cdef_dirs = cdef_analyze_superblock(fi, input, tb, tile_sbo); cdef_filter_superblock( fi, input, output, tb, tile_sbo, cdef_index, &cdef_dirs, ); } } } rav1e-0.7.1/src/context/block_unit.rs000064400000000000000000001645461046102023000156410ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use std::mem::MaybeUninit; use super::*; use crate::predict::PredictionMode; use crate::util::cdf; pub const MAX_PLANES: usize = 3; pub const BLOCK_SIZE_GROUPS: usize = 4; pub const MAX_ANGLE_DELTA: usize = 3; pub const DIRECTIONAL_MODES: usize = 8; pub const KF_MODE_CONTEXTS: usize = 5; pub const INTRA_INTER_CONTEXTS: usize = 4; pub const INTER_MODE_CONTEXTS: usize = 8; pub const DRL_MODE_CONTEXTS: usize = 3; pub const COMP_INTER_CONTEXTS: usize = 5; pub const COMP_REF_TYPE_CONTEXTS: usize = 5; pub const UNI_COMP_REF_CONTEXTS: usize = 3; pub const PLANE_TYPES: usize = 2; const REF_TYPES: usize = 2; pub const COMP_INDEX_CONTEXTS: usize = 6; pub const COMP_GROUP_IDX_CONTEXTS: usize = 6; pub const COEFF_CONTEXT_MAX_WIDTH: usize = MAX_TILE_WIDTH / MI_SIZE; /// Absolute offset in blocks, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. 
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct BlockOffset { pub x: usize, pub y: usize, } /// Absolute offset in blocks inside a plane, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct PlaneBlockOffset(pub BlockOffset); /// Absolute offset in blocks inside a tile, where a block is defined /// to be an `N*N` square where `N == (1 << BLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct TileBlockOffset(pub BlockOffset); impl BlockOffset { /// Offset of the superblock in which this block is located. #[inline] const fn sb_offset(self) -> SuperBlockOffset { SuperBlockOffset { x: self.x >> SUPERBLOCK_TO_BLOCK_SHIFT, y: self.y >> SUPERBLOCK_TO_BLOCK_SHIFT, } } /// Offset of the top-left pixel of this block. #[inline] const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { PlaneOffset { x: (self.x >> plane.xdec << BLOCK_TO_PLANE_SHIFT) as isize, y: (self.y >> plane.ydec << BLOCK_TO_PLANE_SHIFT) as isize, } } /// Convert to plane offset without decimation. #[inline] const fn to_luma_plane_offset(self) -> PlaneOffset { PlaneOffset { x: (self.x as isize) << BLOCK_TO_PLANE_SHIFT, y: (self.y as isize) << BLOCK_TO_PLANE_SHIFT, } } #[inline] const fn y_in_sb(self) -> usize { self.y % MIB_SIZE } #[inline] fn with_offset(self, col_offset: isize, row_offset: isize) -> BlockOffset { let x = self.x as isize + col_offset; let y = self.y as isize + row_offset; debug_assert!(x >= 0); debug_assert!(y >= 0); BlockOffset { x: x as usize, y: y as usize } } } impl PlaneBlockOffset { /// Offset of the superblock in which this block is located. #[inline] pub const fn sb_offset(self) -> PlaneSuperBlockOffset { PlaneSuperBlockOffset(self.0.sb_offset()) } /// Offset of the top-left pixel of this block. #[inline] pub const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { self.0.plane_offset(plane) } /// Convert to plane offset without decimation. #[inline] pub const fn to_luma_plane_offset(self) -> PlaneOffset { self.0.to_luma_plane_offset() } #[inline] pub const fn y_in_sb(self) -> usize { self.0.y_in_sb() } #[inline] pub fn with_offset( self, col_offset: isize, row_offset: isize, ) -> PlaneBlockOffset { Self(self.0.with_offset(col_offset, row_offset)) } } impl TileBlockOffset { /// Offset of the superblock in which this block is located. #[inline] pub const fn sb_offset(self) -> TileSuperBlockOffset { TileSuperBlockOffset(self.0.sb_offset()) } /// Offset of the top-left pixel of this block. #[inline] pub const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { self.0.plane_offset(plane) } /// Convert to plane offset without decimation. 
#[inline] pub const fn to_luma_plane_offset(self) -> PlaneOffset { self.0.to_luma_plane_offset() } #[inline] pub const fn y_in_sb(self) -> usize { self.0.y_in_sb() } #[inline] pub fn with_offset( self, col_offset: isize, row_offset: isize, ) -> TileBlockOffset { Self(self.0.with_offset(col_offset, row_offset)) } } #[derive(Copy, Clone)] pub struct Block { pub mode: PredictionMode, pub partition: PartitionType, pub skip: bool, pub ref_frames: [RefType; 2], pub mv: [MotionVector; 2], // note: indexes are reflist index, NOT the same as libaom pub neighbors_ref_counts: [u8; INTER_REFS_PER_FRAME], pub cdef_index: u8, pub bsize: BlockSize, pub n4_w: u8, /* block width in the unit of mode_info */ pub n4_h: u8, /* block height in the unit of mode_info */ pub txsize: TxSize, // The block-level deblock_deltas are left-shifted by // fi.deblock.block_delta_shift and added to the frame-configured // deltas pub deblock_deltas: [i8; FRAME_LF_COUNT], pub segmentation_idx: u8, } impl Block { pub fn is_inter(&self) -> bool { self.mode >= PredictionMode::NEARESTMV } pub fn has_second_ref(&self) -> bool { self.ref_frames[1] != INTRA_FRAME && self.ref_frames[1] != NONE_FRAME } } impl Default for Block { fn default() -> Block { Block { mode: PredictionMode::DC_PRED, partition: PartitionType::PARTITION_NONE, skip: false, ref_frames: [INTRA_FRAME; 2], mv: [MotionVector::default(); 2], neighbors_ref_counts: [0; INTER_REFS_PER_FRAME], cdef_index: 0, bsize: BLOCK_64X64, n4_w: BLOCK_64X64.width_mi() as u8, n4_h: BLOCK_64X64.height_mi() as u8, txsize: TX_64X64, deblock_deltas: [0, 0, 0, 0], segmentation_idx: 0, } } } #[derive(Clone)] pub struct BlockContextCheckpoint { x: usize, chroma_sampling: ChromaSampling, cdef_coded: bool, above_partition_context: [u8; MIB_SIZE >> 1], // left context is also at 8x8 granularity left_partition_context: [u8; MIB_SIZE >> 1], above_tx_context: [u8; MIB_SIZE], left_tx_context: [u8; MIB_SIZE], above_coeff_context: [[u8; MIB_SIZE]; MAX_PLANES], left_coeff_context: [[u8; MIB_SIZE]; MAX_PLANES], } pub struct BlockContext<'a> { pub cdef_coded: bool, pub code_deltas: bool, pub update_seg: bool, pub preskip_segid: bool, pub above_partition_context: [u8; PARTITION_CONTEXT_MAX_WIDTH], pub left_partition_context: [u8; MIB_SIZE >> 1], pub above_tx_context: [u8; COEFF_CONTEXT_MAX_WIDTH], pub left_tx_context: [u8; MIB_SIZE], pub above_coeff_context: [[u8; COEFF_CONTEXT_MAX_WIDTH]; MAX_PLANES], pub left_coeff_context: [[u8; MIB_SIZE]; MAX_PLANES], pub blocks: &'a mut TileBlocksMut<'a>, } impl<'a> BlockContext<'a> { pub fn new(blocks: &'a mut TileBlocksMut<'a>) -> Self { BlockContext { cdef_coded: false, code_deltas: false, update_seg: false, preskip_segid: false, above_partition_context: [0; PARTITION_CONTEXT_MAX_WIDTH], left_partition_context: [0; MIB_SIZE >> 1], above_tx_context: [0; COEFF_CONTEXT_MAX_WIDTH], left_tx_context: [0; MIB_SIZE], above_coeff_context: [ [0; COEFF_CONTEXT_MAX_WIDTH], [0; COEFF_CONTEXT_MAX_WIDTH], [0; COEFF_CONTEXT_MAX_WIDTH], ], left_coeff_context: [[0; MIB_SIZE]; MAX_PLANES], blocks, } } pub fn checkpoint( &self, tile_bo: &TileBlockOffset, chroma_sampling: ChromaSampling, ) -> BlockContextCheckpoint { let x = tile_bo.0.x & (COEFF_CONTEXT_MAX_WIDTH - MIB_SIZE); let mut checkpoint = BlockContextCheckpoint { x, chroma_sampling, cdef_coded: self.cdef_coded, above_partition_context: [0; MIB_SIZE >> 1], left_partition_context: self.left_partition_context, above_tx_context: [0; MIB_SIZE], left_tx_context: self.left_tx_context, above_coeff_context: [[0; MIB_SIZE]; MAX_PLANES], 
left_coeff_context: self.left_coeff_context, }; checkpoint.above_partition_context.copy_from_slice( &self.above_partition_context[(x >> 1)..][..(MIB_SIZE >> 1)], ); checkpoint .above_tx_context .copy_from_slice(&self.above_tx_context[x..][..MIB_SIZE]); let num_planes = if chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; for (p, (dst, src)) in checkpoint .above_coeff_context .iter_mut() .zip(self.above_coeff_context.iter()) .enumerate() .take(num_planes) { let xdec = (p > 0 && chroma_sampling != ChromaSampling::Cs444) as usize; dst.copy_from_slice(&src[(x >> xdec)..][..MIB_SIZE]); } checkpoint } pub fn rollback(&mut self, checkpoint: &BlockContextCheckpoint) { let x = checkpoint.x & (COEFF_CONTEXT_MAX_WIDTH - MIB_SIZE); self.cdef_coded = checkpoint.cdef_coded; self.above_partition_context[(x >> 1)..][..(MIB_SIZE >> 1)] .copy_from_slice(&checkpoint.above_partition_context); self.left_partition_context = checkpoint.left_partition_context; self.above_tx_context[x..][..MIB_SIZE] .copy_from_slice(&checkpoint.above_tx_context); self.left_tx_context = checkpoint.left_tx_context; let num_planes = if checkpoint.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; for (p, (dst, src)) in self .above_coeff_context .iter_mut() .zip(checkpoint.above_coeff_context.iter()) .enumerate() .take(num_planes) { let xdec = (p > 0 && checkpoint.chroma_sampling != ChromaSampling::Cs444) as usize; dst[(x >> xdec)..][..MIB_SIZE].copy_from_slice(src); } self.left_coeff_context = checkpoint.left_coeff_context; } #[inline] pub fn set_dc_sign(cul_level: &mut u32, dc_val: i32) { if dc_val < 0 { *cul_level |= 1 << COEFF_CONTEXT_BITS; } else if dc_val > 0 { *cul_level += 2 << COEFF_CONTEXT_BITS; } } pub fn set_coeff_context( &mut self, plane: usize, bo: TileBlockOffset, tx_size: TxSize, xdec: usize, ydec: usize, value: u8, ) { for above in &mut self.above_coeff_context[plane][(bo.0.x >> xdec)..] [..tx_size.width_mi()] { *above = value; } let bo_y = bo.y_in_sb(); for left in &mut self.left_coeff_context[plane][(bo_y >> ydec)..] [..tx_size.height_mi()] { *left = value; } } fn reset_left_coeff_context(&mut self, plane: usize) { for c in &mut self.left_coeff_context[plane] { *c = 0; } } fn reset_left_partition_context(&mut self) { for c in &mut self.left_partition_context { *c = 0; } } pub fn update_tx_size_context( &mut self, bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, skip: bool, ) { let n4_w = bsize.width_mi(); let n4_h = bsize.height_mi(); let (tx_w, tx_h) = if skip { ((n4_w * MI_SIZE) as u8, (n4_h * MI_SIZE) as u8) } else { (tx_size.width() as u8, tx_size.height() as u8) }; let above_ctx = &mut self.above_tx_context[bo.0.x..bo.0.x + n4_w]; let left_ctx = &mut self.left_tx_context[bo.y_in_sb()..bo.y_in_sb() + n4_h]; for v in above_ctx[0..n4_w].iter_mut() { *v = tx_w; } for v in left_ctx[0..n4_h].iter_mut() { *v = tx_h; } } fn reset_left_tx_context(&mut self) { for c in &mut self.left_tx_context { *c = 0; } } pub fn reset_left_contexts(&mut self, planes: usize) { for p in 0..planes { BlockContext::reset_left_coeff_context(self, p); } BlockContext::reset_left_partition_context(self); BlockContext::reset_left_tx_context(self); } // The mode info data structure has a one element border above and to the // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. 
// 0 - inter/inter, inter/--, --/inter, --/-- // 1 - intra/inter, inter/intra // 2 - intra/--, --/intra // 3 - intra/intra pub fn intra_inter_context(&self, bo: TileBlockOffset) -> usize { let has_above = bo.0.y > 0; let has_left = bo.0.x > 0; match (has_above, has_left) { (true, true) => { let above_intra = !self.blocks.above_of(bo).is_inter(); let left_intra = !self.blocks.left_of(bo).is_inter(); if above_intra && left_intra { 3 } else { (above_intra || left_intra) as usize } } (true, false) => { if self.blocks.above_of(bo).is_inter() { 0 } else { 2 } } (false, true) => { if self.blocks.left_of(bo).is_inter() { 0 } else { 2 } } _ => 0, } } pub fn get_txb_ctx( &self, plane_bsize: BlockSize, tx_size: TxSize, plane: usize, bo: TileBlockOffset, xdec: usize, ydec: usize, frame_clipped_txw: usize, frame_clipped_txh: usize, ) -> TXB_CTX { let mut txb_ctx = TXB_CTX { txb_skip_ctx: 0, dc_sign_ctx: 0 }; const MAX_TX_SIZE_UNIT: usize = 16; const signs: [i8; 3] = [0, -1, 1]; const dc_sign_contexts: [usize; 4 * MAX_TX_SIZE_UNIT + 1] = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ]; let mut dc_sign: i16 = 0; let above_ctxs = &self.above_coeff_context[plane][(bo.0.x >> xdec)..] [..frame_clipped_txw >> 2]; let left_ctxs = &self.left_coeff_context[plane][(bo.y_in_sb() >> ydec)..] [..frame_clipped_txh >> 2]; // Decide txb_ctx.dc_sign_ctx for &ctx in above_ctxs { let sign = ctx >> COEFF_CONTEXT_BITS; dc_sign += signs[sign as usize] as i16; } for &ctx in left_ctxs { let sign = ctx >> COEFF_CONTEXT_BITS; dc_sign += signs[sign as usize] as i16; } txb_ctx.dc_sign_ctx = dc_sign_contexts[(dc_sign + 2 * MAX_TX_SIZE_UNIT as i16) as usize]; // Decide txb_ctx.txb_skip_ctx if plane == 0 { if plane_bsize == tx_size.block_size() { txb_ctx.txb_skip_ctx = 0; } else { // This is the algorithm to generate table skip_contexts[min][max]. // if (!max) // txb_skip_ctx = 1; // else if (!min) // txb_skip_ctx = 2 + (max > 3); // else if (max <= 3) // txb_skip_ctx = 4; // else if (min <= 3) // txb_skip_ctx = 5; // else // txb_skip_ctx = 6; const skip_contexts: [[u8; 5]; 5] = [ [1, 2, 2, 2, 3], [1, 4, 4, 4, 5], [1, 4, 4, 4, 5], [1, 4, 4, 4, 5], [1, 4, 4, 4, 6], ]; let top: u8 = above_ctxs.iter().fold(0, |acc, ctx| acc | *ctx) & COEFF_CONTEXT_MASK as u8; let left: u8 = left_ctxs.iter().fold(0, |acc, ctx| acc | *ctx) & COEFF_CONTEXT_MASK as u8; let max = cmp::min(top | left, 4); let min = cmp::min(cmp::min(top, left), 4); txb_ctx.txb_skip_ctx = skip_contexts[min as usize][max as usize] as usize; } } else { let top: u8 = above_ctxs.iter().fold(0, |acc, ctx| acc | *ctx); let left: u8 = left_ctxs.iter().fold(0, |acc, ctx| acc | *ctx); let ctx_base = (top != 0) as usize + (left != 0) as usize; let ctx_offset = if num_pels_log2_lookup[plane_bsize as usize] > num_pels_log2_lookup[tx_size.block_size() as usize] { 10 } else { 7 }; txb_ctx.txb_skip_ctx = ctx_base + ctx_offset; } txb_ctx } } #[derive(Clone, Copy)] #[repr(C)] pub struct NMVComponent { pub sign_cdf: [u16; 2], pub class0_hp_cdf: [u16; 2], pub hp_cdf: [u16; 2], pub class0_cdf: [u16; CLASS0_SIZE], pub bits_cdf: [[u16; 2]; MV_OFFSET_BITS], pub class0_fp_cdf: [[u16; MV_FP_SIZE]; CLASS0_SIZE], pub fp_cdf: [u16; MV_FP_SIZE], pub classes_cdf: [u16; MV_CLASSES], // MV_CLASSES + 5 == 16; pad the last CDF for rollback. 
padding: [u16; 5], } #[derive(Clone, Copy)] #[repr(C)] pub struct NMVContext { pub joints_cdf: [u16; MV_JOINTS], // MV_JOINTS + 12 == 16; pad the last CDF for rollback. padding: [u16; 12], pub comps: [NMVComponent; 2], } // lv_map pub static default_nmv_context: NMVContext = { NMVContext { joints_cdf: cdf([4096, 11264, 19328]), padding: [0; 12], comps: [ NMVComponent { classes_cdf: cdf([ 28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762, 32767, ]), class0_fp_cdf: cdf_2d([[16384, 24576, 26624], [12288, 21248, 24128]]), fp_cdf: cdf([8192, 17408, 21248]), sign_cdf: cdf([128 * 128]), class0_hp_cdf: cdf([160 * 128]), hp_cdf: cdf([128 * 128]), class0_cdf: cdf([216 * 128]), bits_cdf: cdf_2d([ [128 * 136], [128 * 140], [128 * 148], [128 * 160], [128 * 176], [128 * 192], [128 * 224], [128 * 234], [128 * 234], [128 * 240], ]), padding: [0; 5], }, NMVComponent { classes_cdf: cdf([ 28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, 32762, 32767, ]), class0_fp_cdf: cdf_2d([[16384, 24576, 26624], [12288, 21248, 24128]]), fp_cdf: cdf([8192, 17408, 21248]), sign_cdf: cdf([128 * 128]), class0_hp_cdf: cdf([160 * 128]), hp_cdf: cdf([128 * 128]), class0_cdf: cdf([216 * 128]), bits_cdf: cdf_2d([ [128 * 136], [128 * 140], [128 * 148], [128 * 160], [128 * 176], [128 * 192], [128 * 224], [128 * 234], [128 * 234], [128 * 240], ]), padding: [0; 5], }, ], } }; #[derive(Clone)] pub struct CandidateMV { pub this_mv: MotionVector, pub comp_mv: MotionVector, pub weight: u32, } #[derive(Clone)] pub struct FrameBlocks { blocks: Box<[Block]>, pub cols: usize, pub rows: usize, } impl FrameBlocks { pub fn new(cols: usize, rows: usize) -> Self { Self { blocks: vec![Block::default(); cols * rows].into_boxed_slice(), cols, rows, } } #[inline(always)] pub fn as_tile_blocks(&self) -> TileBlocks<'_> { TileBlocks::new(self, 0, 0, self.cols, self.rows) } #[inline(always)] pub fn as_tile_blocks_mut(&mut self) -> TileBlocksMut<'_> { TileBlocksMut::new(self, 0, 0, self.cols, self.rows) } } impl Index for FrameBlocks { type Output = [Block]; #[inline] fn index(&self, index: usize) -> &Self::Output { &self.blocks[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameBlocks { #[inline] fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.blocks[index * self.cols..(index + 1) * self.cols] } } // for convenience, also index by BlockOffset impl Index for FrameBlocks { type Output = Block; #[inline] fn index(&self, bo: PlaneBlockOffset) -> &Self::Output { &self[bo.0.y][bo.0.x] } } impl IndexMut for FrameBlocks { #[inline] fn index_mut(&mut self, bo: PlaneBlockOffset) -> &mut Self::Output { &mut self[bo.0.y][bo.0.x] } } impl<'a> ContextWriter<'a> { pub fn get_cdf_intra_mode_kf( &self, bo: TileBlockOffset, ) -> &[u16; INTRA_MODES] { static intra_mode_context: [usize; INTRA_MODES] = [0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0]; let above_mode = if bo.0.y > 0 { self.bc.blocks.above_of(bo).mode } else { PredictionMode::DC_PRED }; let left_mode = if bo.0.x > 0 { self.bc.blocks.left_of(bo).mode } else { PredictionMode::DC_PRED }; let above_ctx = intra_mode_context[above_mode as usize]; let left_ctx = intra_mode_context[left_mode as usize]; &self.fc.kf_y_cdf[above_ctx][left_ctx] } pub fn write_intra_mode_kf( &mut self, w: &mut W, bo: TileBlockOffset, mode: PredictionMode, ) { static intra_mode_context: [usize; INTRA_MODES] = [0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0]; let above_mode = if bo.0.y > 0 { self.bc.blocks.above_of(bo).mode } else { PredictionMode::DC_PRED }; let left_mode = if bo.0.x > 0 { 
self.bc.blocks.left_of(bo).mode } else { PredictionMode::DC_PRED }; let above_ctx = intra_mode_context[above_mode as usize]; let left_ctx = intra_mode_context[left_mode as usize]; let cdf = &self.fc.kf_y_cdf[above_ctx][left_ctx]; symbol_with_update!(self, w, mode as u32, cdf); } pub fn get_cdf_intra_mode(&self, bsize: BlockSize) -> &[u16; INTRA_MODES] { &self.fc.y_mode_cdf[size_group_lookup[bsize as usize] as usize] } #[inline] pub fn write_intra_mode( &mut self, w: &mut W, bsize: BlockSize, mode: PredictionMode, ) { let cdf = &self.fc.y_mode_cdf[size_group_lookup[bsize as usize] as usize]; symbol_with_update!(self, w, mode as u32, cdf); } #[inline] pub fn write_intra_uv_mode( &mut self, w: &mut W, uv_mode: PredictionMode, y_mode: PredictionMode, bs: BlockSize, ) { if bs.cfl_allowed() { let cdf = &self.fc.uv_mode_cfl_cdf[y_mode as usize]; symbol_with_update!(self, w, uv_mode as u32, cdf); } else { let cdf = &self.fc.uv_mode_cdf[y_mode as usize]; symbol_with_update!(self, w, uv_mode as u32, cdf); } } #[inline] pub fn write_angle_delta( &mut self, w: &mut W, angle: i8, mode: PredictionMode, ) { symbol_with_update!( self, w, (angle + MAX_ANGLE_DELTA as i8) as u32, &self.fc.angle_delta_cdf [mode as usize - PredictionMode::V_PRED as usize] ); } pub fn write_use_filter_intra( &mut self, w: &mut W, enable: bool, block_size: BlockSize, ) { let cdf = &self.fc.filter_intra_cdfs[block_size as usize]; symbol_with_update!(self, w, enable as u32, cdf); } /// # Panics /// /// - If called with `enable: true` (not yet implemented pub fn write_use_palette_mode( &mut self, w: &mut W, enable: bool, bsize: BlockSize, bo: TileBlockOffset, luma_mode: PredictionMode, chroma_mode: PredictionMode, xdec: usize, ydec: usize, cs: ChromaSampling, ) { if enable { unimplemented!(); // TODO } let (ctx_luma, ctx_chroma) = (0, 0); // TODO: increase based on surrounding block info if luma_mode == PredictionMode::DC_PRED { let bsize_ctx = bsize.width_mi_log2() + bsize.height_mi_log2() - 2; let cdf = &self.fc.palette_y_mode_cdfs[bsize_ctx][ctx_luma]; symbol_with_update!(self, w, enable as u32, cdf); } if has_chroma(bo, bsize, xdec, ydec, cs) && chroma_mode == PredictionMode::DC_PRED { let cdf = &self.fc.palette_uv_mode_cdfs[ctx_chroma]; symbol_with_update!(self, w, enable as u32, cdf); } } fn find_valid_row_offs( row_offset: isize, mi_row: usize, mi_rows: usize, ) -> isize { cmp::min( cmp::max(row_offset, -(mi_row as isize)), (mi_rows - mi_row - 1) as isize, ) } fn find_valid_col_offs( col_offset: isize, mi_col: usize, mi_cols: usize, ) -> isize { cmp::min( cmp::max(col_offset, -(mi_col as isize)), (mi_cols - mi_col - 1) as isize, ) } fn find_matching_mv( mv: MotionVector, mv_stack: &mut ArrayVec, ) -> bool { for mv_cand in mv_stack { if mv.row == mv_cand.this_mv.row && mv.col == mv_cand.this_mv.col { return true; } } false } fn find_matching_mv_and_update_weight( mv: MotionVector, mv_stack: &mut ArrayVec, weight: u32, ) -> bool { for mv_cand in mv_stack { if mv.row == mv_cand.this_mv.row && mv.col == mv_cand.this_mv.col { mv_cand.weight += weight; return true; } } false } fn find_matching_comp_mv_and_update_weight( mvs: [MotionVector; 2], mv_stack: &mut ArrayVec, weight: u32, ) -> bool { for mv_cand in mv_stack { if mvs[0].row == mv_cand.this_mv.row && mvs[0].col == mv_cand.this_mv.col && mvs[1].row == mv_cand.comp_mv.row && mvs[1].col == mv_cand.comp_mv.col { mv_cand.weight += weight; return true; } } false } fn add_ref_mv_candidate( ref_frames: [RefType; 2], blk: &Block, mv_stack: &mut ArrayVec, weight: u32, newmv_count: 
&mut usize, is_compound: bool, ) -> bool { if !blk.is_inter() { /* For intrabc */ false } else if is_compound { if blk.ref_frames[0] == ref_frames[0] && blk.ref_frames[1] == ref_frames[1] { let found_match = Self::find_matching_comp_mv_and_update_weight( blk.mv, mv_stack, weight, ); if !found_match && mv_stack.len() < MAX_REF_MV_STACK_SIZE { let mv_cand = CandidateMV { this_mv: blk.mv[0], comp_mv: blk.mv[1], weight }; mv_stack.push(mv_cand); } if blk.mode.has_newmv() { *newmv_count += 1; } true } else { false } } else { let mut found = false; for i in 0..2 { if blk.ref_frames[i] == ref_frames[0] { let found_match = Self::find_matching_mv_and_update_weight( blk.mv[i], mv_stack, weight, ); if !found_match && mv_stack.len() < MAX_REF_MV_STACK_SIZE { let mv_cand = CandidateMV { this_mv: blk.mv[i], comp_mv: MotionVector::default(), weight, }; mv_stack.push(mv_cand); } if blk.mode.has_newmv() { *newmv_count += 1; } found = true; } } found } } fn add_extra_mv_candidate( blk: &Block, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, fi: &FrameInvariants, is_compound: bool, ref_id_count: &mut [usize; 2], ref_id_mvs: &mut [[MotionVector; 2]; 2], ref_diff_count: &mut [usize; 2], ref_diff_mvs: &mut [[MotionVector; 2]; 2], ) { if is_compound { for cand_list in 0..2 { let cand_ref = blk.ref_frames[cand_list]; if cand_ref != INTRA_FRAME && cand_ref != NONE_FRAME { for list in 0..2 { let mut cand_mv = blk.mv[cand_list]; if cand_ref == ref_frames[list] && ref_id_count[list] < 2 { ref_id_mvs[list][ref_id_count[list]] = cand_mv; ref_id_count[list] += 1; } else if ref_diff_count[list] < 2 { if fi.ref_frame_sign_bias[cand_ref.to_index()] != fi.ref_frame_sign_bias[ref_frames[list].to_index()] { cand_mv.row = -cand_mv.row; cand_mv.col = -cand_mv.col; } ref_diff_mvs[list][ref_diff_count[list]] = cand_mv; ref_diff_count[list] += 1; } } } } } else { for cand_list in 0..2 { let cand_ref = blk.ref_frames[cand_list]; if cand_ref != INTRA_FRAME && cand_ref != NONE_FRAME { let mut mv = blk.mv[cand_list]; if fi.ref_frame_sign_bias[cand_ref.to_index()] != fi.ref_frame_sign_bias[ref_frames[0].to_index()] { mv.row = -mv.row; mv.col = -mv.col; } if !Self::find_matching_mv(mv, mv_stack) { let mv_cand = CandidateMV { this_mv: mv, comp_mv: MotionVector::default(), weight: 2, }; mv_stack.push(mv_cand); } } } } } fn scan_row_mbmi( &self, bo: TileBlockOffset, row_offset: isize, max_row_offs: isize, processed_rows: &mut isize, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, newmv_count: &mut usize, bsize: BlockSize, is_compound: bool, ) -> bool { let bc = &self.bc; let target_n4_w = bsize.width_mi(); let end_mi = cmp::min( cmp::min(target_n4_w, bc.blocks.cols() - bo.0.x), BLOCK_64X64.width_mi(), ); let n4_w_8 = BLOCK_8X8.width_mi(); let n4_w_16 = BLOCK_16X16.width_mi(); let mut col_offset = 0; if row_offset.abs() > 1 { col_offset = 1; if ((bo.0.x & 0x01) != 0) && (target_n4_w < n4_w_8) { col_offset -= 1; } } let use_step_16 = target_n4_w >= 16; let mut found_match = false; let mut i = 0; while i < end_mi { let cand = &bc.blocks[bo.with_offset(col_offset + i as isize, row_offset)]; let n4_w = cand.n4_w as usize; let mut len = cmp::min(target_n4_w, n4_w); if use_step_16 { len = cmp::max(n4_w_16, len); } else if row_offset.abs() > 1 { len = cmp::max(len, n4_w_8); } let mut weight: u32 = 2; if target_n4_w >= n4_w_8 && target_n4_w <= n4_w { let inc = cmp::min(-max_row_offs + row_offset + 1, cand.n4_h as isize); assert!(inc >= 0); weight = cmp::max(weight, inc as u32); *processed_rows = inc - row_offset - 1; } if 
Self::add_ref_mv_candidate( ref_frames, cand, mv_stack, len as u32 * weight, newmv_count, is_compound, ) { found_match = true; } i += len; } found_match } fn scan_col_mbmi( &self, bo: TileBlockOffset, col_offset: isize, max_col_offs: isize, processed_cols: &mut isize, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, newmv_count: &mut usize, bsize: BlockSize, is_compound: bool, ) -> bool { let bc = &self.bc; let target_n4_h = bsize.height_mi(); let end_mi = cmp::min( cmp::min(target_n4_h, bc.blocks.rows() - bo.0.y), BLOCK_64X64.height_mi(), ); let n4_h_8 = BLOCK_8X8.height_mi(); let n4_h_16 = BLOCK_16X16.height_mi(); let mut row_offset = 0; if col_offset.abs() > 1 { row_offset = 1; if ((bo.0.y & 0x01) != 0) && (target_n4_h < n4_h_8) { row_offset -= 1; } } let use_step_16 = target_n4_h >= 16; let mut found_match = false; let mut i = 0; while i < end_mi { let cand = &bc.blocks[bo.with_offset(col_offset, row_offset + i as isize)]; let n4_h = cand.n4_h as usize; let mut len = cmp::min(target_n4_h, n4_h); if use_step_16 { len = cmp::max(n4_h_16, len); } else if col_offset.abs() > 1 { len = cmp::max(len, n4_h_8); } let mut weight: u32 = 2; if target_n4_h >= n4_h_8 && target_n4_h <= n4_h { let inc = cmp::min(-max_col_offs + col_offset + 1, cand.n4_w as isize); assert!(inc >= 0); weight = cmp::max(weight, inc as u32); *processed_cols = inc - col_offset - 1; } if Self::add_ref_mv_candidate( ref_frames, cand, mv_stack, len as u32 * weight, newmv_count, is_compound, ) { found_match = true; } i += len; } found_match } fn scan_blk_mbmi( &self, bo: TileBlockOffset, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, newmv_count: &mut usize, is_compound: bool, ) -> bool { if bo.0.x >= self.bc.blocks.cols() || bo.0.y >= self.bc.blocks.rows() { return false; } let weight = 2 * BLOCK_8X8.width_mi() as u32; /* Always assume its within a tile, probably wrong */ Self::add_ref_mv_candidate( ref_frames, &self.bc.blocks[bo], mv_stack, weight, newmv_count, is_compound, ) } fn add_offset(mv_stack: &mut ArrayVec) { for cand_mv in mv_stack { cand_mv.weight += REF_CAT_LEVEL; } } #[profiling::function] fn setup_mvref_list( &self, bo: TileBlockOffset, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, bsize: BlockSize, fi: &FrameInvariants, is_compound: bool, ) -> usize { let (_rf, _rf_num) = (INTRA_FRAME, 1); let target_n4_h = bsize.height_mi(); let target_n4_w = bsize.width_mi(); let mut max_row_offs: isize = 0; let row_adj = (target_n4_h < BLOCK_8X8.height_mi()) && (bo.0.y & 0x01) != 0x0; let mut max_col_offs: isize = 0; let col_adj = (target_n4_w < BLOCK_8X8.width_mi()) && (bo.0.x & 0x01) != 0x0; let mut processed_rows: isize = 0; let mut processed_cols: isize = 0; let up_avail = bo.0.y > 0; let left_avail = bo.0.x > 0; if up_avail { max_row_offs = -2 * MVREF_ROW_COLS as isize + row_adj as isize; // limit max offset for small blocks if target_n4_h < BLOCK_8X8.height_mi() { max_row_offs = -2 * 2 + row_adj as isize; } let rows = self.bc.blocks.rows(); max_row_offs = Self::find_valid_row_offs(max_row_offs, bo.0.y, rows); } if left_avail { max_col_offs = -2 * MVREF_ROW_COLS as isize + col_adj as isize; // limit max offset for small blocks if target_n4_w < BLOCK_8X8.width_mi() { max_col_offs = -2 * 2 + col_adj as isize; } let cols = self.bc.blocks.cols(); max_col_offs = Self::find_valid_col_offs(max_col_offs, bo.0.x, cols); } let mut row_match = false; let mut col_match = false; let mut newmv_count: usize = 0; if max_row_offs.abs() >= 1 { let found_match = self.scan_row_mbmi( bo, -1, max_row_offs, &mut processed_rows, 
ref_frames, mv_stack, &mut newmv_count, bsize, is_compound, ); row_match |= found_match; } if max_col_offs.abs() >= 1 { let found_match = self.scan_col_mbmi( bo, -1, max_col_offs, &mut processed_cols, ref_frames, mv_stack, &mut newmv_count, bsize, is_compound, ); col_match |= found_match; } if has_tr(bo, bsize) && bo.0.y > 0 { let found_match = self.scan_blk_mbmi( bo.with_offset(target_n4_w as isize, -1), ref_frames, mv_stack, &mut newmv_count, is_compound, ); row_match |= found_match; } let nearest_match = usize::from(row_match) + usize::from(col_match); Self::add_offset(mv_stack); /* Scan the second outer area. */ let mut far_newmv_count: usize = 0; // won't be used let found_match = bo.0.x > 0 && bo.0.y > 0 && self.scan_blk_mbmi( bo.with_offset(-1, -1), ref_frames, mv_stack, &mut far_newmv_count, is_compound, ); row_match |= found_match; for idx in 2..=MVREF_ROW_COLS { let row_offset = -2 * idx as isize + 1 + row_adj as isize; let col_offset = -2 * idx as isize + 1 + col_adj as isize; if row_offset.abs() <= max_row_offs.abs() && row_offset.abs() > processed_rows { let found_match = self.scan_row_mbmi( bo, row_offset, max_row_offs, &mut processed_rows, ref_frames, mv_stack, &mut far_newmv_count, bsize, is_compound, ); row_match |= found_match; } if col_offset.abs() <= max_col_offs.abs() && col_offset.abs() > processed_cols { let found_match = self.scan_col_mbmi( bo, col_offset, max_col_offs, &mut processed_cols, ref_frames, mv_stack, &mut far_newmv_count, bsize, is_compound, ); col_match |= found_match; } } let total_match = usize::from(row_match) + usize::from(col_match); assert!(total_match >= nearest_match); // mode_context contains both newmv_context and refmv_context, where newmv_context // lies in the REF_MVOFFSET least significant bits let mode_context = match nearest_match { 0 => cmp::min(total_match, 1) + (total_match << REFMV_OFFSET), 1 => 3 - cmp::min(newmv_count, 1) + ((2 + total_match) << REFMV_OFFSET), _ => 5 - cmp::min(newmv_count, 1) + (5 << REFMV_OFFSET), }; /* TODO: Find nearest match and assign nearest and near mvs */ // 7.10.2.11 Sort MV stack according to weight mv_stack.sort_by(|a, b| b.weight.cmp(&a.weight)); if mv_stack.len() < 2 { // 7.10.2.12 Extra search process let w4 = bsize.width_mi().min(16).min(self.bc.blocks.cols() - bo.0.x); let h4 = bsize.height_mi().min(16).min(self.bc.blocks.rows() - bo.0.y); let num4x4 = w4.min(h4); let passes = i32::from(!up_avail)..=i32::from(left_avail); let mut ref_id_count: [usize; 2] = [0; 2]; let mut ref_diff_count: [usize; 2] = [0; 2]; let mut ref_id_mvs = [[MotionVector::default(); 2]; 2]; let mut ref_diff_mvs = [[MotionVector::default(); 2]; 2]; for pass in passes { let mut idx = 0; while idx < num4x4 && mv_stack.len() < 2 { let rbo = if pass == 0 { bo.with_offset(idx as isize, -1) } else { bo.with_offset(-1, idx as isize) }; let blk = &self.bc.blocks[rbo]; Self::add_extra_mv_candidate( blk, ref_frames, mv_stack, fi, is_compound, &mut ref_id_count, &mut ref_id_mvs, &mut ref_diff_count, &mut ref_diff_mvs, ); idx += if pass == 0 { blk.n4_w } else { blk.n4_h } as usize; } } if is_compound { let mut combined_mvs = [[MotionVector::default(); 2]; 2]; for list in 0..2 { let mut comp_count = 0; for idx in 0..ref_id_count[list] { combined_mvs[comp_count][list] = ref_id_mvs[list][idx]; comp_count += 1; } for idx in 0..ref_diff_count[list] { if comp_count < 2 { combined_mvs[comp_count][list] = ref_diff_mvs[list][idx]; comp_count += 1; } } } if mv_stack.len() == 1 { let mv_cand = if combined_mvs[0][0].row == mv_stack[0].this_mv.row && 
combined_mvs[0][0].col == mv_stack[0].this_mv.col && combined_mvs[0][1].row == mv_stack[0].comp_mv.row && combined_mvs[0][1].col == mv_stack[0].comp_mv.col { CandidateMV { this_mv: combined_mvs[1][0], comp_mv: combined_mvs[1][1], weight: 2, } } else { CandidateMV { this_mv: combined_mvs[0][0], comp_mv: combined_mvs[0][1], weight: 2, } }; mv_stack.push(mv_cand); } else { for idx in 0..2 { let mv_cand = CandidateMV { this_mv: combined_mvs[idx][0], comp_mv: combined_mvs[idx][1], weight: 2, }; mv_stack.push(mv_cand); } } assert!(mv_stack.len() == 2); } } /* TODO: Handle single reference frame extension */ let frame_bo = PlaneBlockOffset(BlockOffset { x: self.bc.blocks.x() + bo.0.x, y: self.bc.blocks.y() + bo.0.y, }); // clamp mvs for mv in mv_stack { let blk_w = bsize.width(); let blk_h = bsize.height(); let border_w = 128 + blk_w as isize * 8; let border_h = 128 + blk_h as isize * 8; let mvx_min = -(frame_bo.0.x as isize) * (8 * MI_SIZE) as isize - border_w; let mvx_max = ((self.bc.blocks.frame_cols() - frame_bo.0.x) as isize - (blk_w / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_w; let mvy_min = -(frame_bo.0.y as isize) * (8 * MI_SIZE) as isize - border_h; let mvy_max = ((self.bc.blocks.frame_rows() - frame_bo.0.y) as isize - (blk_h / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_h; mv.this_mv.row = (mv.this_mv.row as isize).clamp(mvy_min, mvy_max) as i16; mv.this_mv.col = (mv.this_mv.col as isize).clamp(mvx_min, mvx_max) as i16; mv.comp_mv.row = (mv.comp_mv.row as isize).clamp(mvy_min, mvy_max) as i16; mv.comp_mv.col = (mv.comp_mv.col as isize).clamp(mvx_min, mvx_max) as i16; } mode_context } /// # Panics /// /// - If the first ref frame is not set (`NONE_FRAME`) pub fn find_mvrefs( &self, bo: TileBlockOffset, ref_frames: [RefType; 2], mv_stack: &mut ArrayVec, bsize: BlockSize, fi: &FrameInvariants, is_compound: bool, ) -> usize { assert!(ref_frames[0] != NONE_FRAME); if ref_frames[0] != NONE_FRAME { // TODO: If ref_frames[0] != INTRA_FRAME, convert global mv to an mv; // otherwise, set the global mv ref to invalid. 
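    // Editor's note, not upstream prose: the `usize` this function returns is
    // the packed mode context produced by `setup_mvref_list` above. A minimal
    // sketch of how callers such as `write_inter_mode`/`write_compound_mode`
    // in this file unpack it, using the crate's existing mask constants:
    //
    //   let newmv_ctx = mode_context & NEWMV_CTX_MASK;
    //   let zeromv_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
    //   let refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;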
} if ref_frames[0] != INTRA_FRAME { /* TODO: Set zeromv ref to the converted global motion vector */ } else { /* TODO: Set the zeromv ref to 0 */ return 0; } self.setup_mvref_list(bo, ref_frames, mv_stack, bsize, fi, is_compound) } pub fn fill_neighbours_ref_counts(&mut self, bo: TileBlockOffset) { let mut ref_counts = [0; INTER_REFS_PER_FRAME]; if bo.0.y > 0 { let above_b = self.bc.blocks.above_of(bo); if above_b.is_inter() { ref_counts[above_b.ref_frames[0].to_index()] += 1; if above_b.has_second_ref() { ref_counts[above_b.ref_frames[1].to_index()] += 1; } } } if bo.0.x > 0 { let left_b = self.bc.blocks.left_of(bo); if left_b.is_inter() { ref_counts[left_b.ref_frames[0].to_index()] += 1; if left_b.has_second_ref() { ref_counts[left_b.ref_frames[1].to_index()] += 1; } } } self.bc.blocks[bo].neighbors_ref_counts = ref_counts; } #[inline] pub const fn ref_count_ctx(counts0: u8, counts1: u8) -> usize { if counts0 < counts1 { 0 } else if counts0 == counts1 { 1 } else { 2 } } #[inline] pub fn get_pred_ctx_brfarf2_or_arf(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let brfarf2_count = ref_counts[BWDREF_FRAME.to_index()] + ref_counts[ALTREF2_FRAME.to_index()]; let arf_count = ref_counts[ALTREF_FRAME.to_index()]; ContextWriter::ref_count_ctx(brfarf2_count, arf_count) } #[inline] pub fn get_pred_ctx_ll2_or_l3gld(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let l_l2_count = ref_counts[LAST_FRAME.to_index()] + ref_counts[LAST2_FRAME.to_index()]; let l3_gold_count = ref_counts[LAST3_FRAME.to_index()] + ref_counts[GOLDEN_FRAME.to_index()]; ContextWriter::ref_count_ctx(l_l2_count, l3_gold_count) } #[inline] pub fn get_pred_ctx_last_or_last2(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let l_count = ref_counts[LAST_FRAME.to_index()]; let l2_count = ref_counts[LAST2_FRAME.to_index()]; ContextWriter::ref_count_ctx(l_count, l2_count) } #[inline] pub fn get_pred_ctx_last3_or_gold(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let l3_count = ref_counts[LAST3_FRAME.to_index()]; let gold_count = ref_counts[GOLDEN_FRAME.to_index()]; ContextWriter::ref_count_ctx(l3_count, gold_count) } #[inline] pub fn get_pred_ctx_brf_or_arf2(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let brf_count = ref_counts[BWDREF_FRAME.to_index()]; let arf2_count = ref_counts[ALTREF2_FRAME.to_index()]; ContextWriter::ref_count_ctx(brf_count, arf2_count) } pub fn get_comp_mode_ctx(&self, bo: TileBlockOffset) -> usize { let avail_left = bo.0.x > 0; let avail_up = bo.0.y > 0; let (left0, left1) = if avail_left { let bo_left = bo.with_offset(-1, 0); let ref_frames = &self.bc.blocks[bo_left].ref_frames; (ref_frames[0], ref_frames[1]) } else { (INTRA_FRAME, NONE_FRAME) }; let (above0, above1) = if avail_up { let bo_up = bo.with_offset(0, -1); let ref_frames = &self.bc.blocks[bo_up].ref_frames; (ref_frames[0], ref_frames[1]) } else { (INTRA_FRAME, NONE_FRAME) }; let left_single = left1 == NONE_FRAME; let above_single = above1 == NONE_FRAME; let left_intra = left0 == INTRA_FRAME; let above_intra = above0 == INTRA_FRAME; let left_backward = left0.is_bwd_ref(); let above_backward = above0.is_bwd_ref(); if avail_left && avail_up { if above_single && left_single { (above_backward ^ left_backward) as usize } else if above_single { 2 + (above_backward || above_intra) as usize } else if left_single 
{ 2 + (left_backward || left_intra) as usize } else { 4 } } else if avail_up { if above_single { above_backward as usize } else { 3 } } else if avail_left { if left_single { left_backward as usize } else { 3 } } else { 1 } } pub fn get_comp_ref_type_ctx(&self, bo: TileBlockOffset) -> usize { fn is_samedir_ref_pair(ref0: RefType, ref1: RefType) -> bool { (ref0.is_bwd_ref() && ref0 != NONE_FRAME) == (ref1.is_bwd_ref() && ref1 != NONE_FRAME) } let avail_left = bo.0.x > 0; let avail_up = bo.0.y > 0; let (left0, left1) = if avail_left { let bo_left = bo.with_offset(-1, 0); let ref_frames = &self.bc.blocks[bo_left].ref_frames; (ref_frames[0], ref_frames[1]) } else { (INTRA_FRAME, NONE_FRAME) }; let (above0, above1) = if avail_up { let bo_up = bo.with_offset(0, -1); let ref_frames = &self.bc.blocks[bo_up].ref_frames; (ref_frames[0], ref_frames[1]) } else { (INTRA_FRAME, NONE_FRAME) }; let left_single = left1 == NONE_FRAME; let above_single = above1 == NONE_FRAME; let left_intra = left0 == INTRA_FRAME; let above_intra = above0 == INTRA_FRAME; let above_comp_inter = avail_up && !above_intra && !above_single; let left_comp_inter = avail_left && !left_intra && !left_single; let above_uni_comp = above_comp_inter && is_samedir_ref_pair(above0, above1); let left_uni_comp = left_comp_inter && is_samedir_ref_pair(left0, left1); if avail_up && !above_intra && avail_left && !left_intra { let samedir = is_samedir_ref_pair(above0, left0) as usize; if !above_comp_inter && !left_comp_inter { 1 + 2 * samedir } else if !above_comp_inter { if !left_uni_comp { 1 } else { 3 + samedir } } else if !left_comp_inter { if !above_uni_comp { 1 } else { 3 + samedir } } else if !above_uni_comp && !left_uni_comp { 0 } else if !above_uni_comp || !left_uni_comp { 2 } else { 3 + ((above0 == BWDREF_FRAME) == (left0 == BWDREF_FRAME)) as usize } } else if avail_up && avail_left { if above_comp_inter { 1 + 2 * above_uni_comp as usize } else if left_comp_inter { 1 + 2 * left_uni_comp as usize } else { 2 } } else if above_comp_inter { 4 * above_uni_comp as usize } else if left_comp_inter { 4 * left_uni_comp as usize } else { 2 } } /// # Panics /// /// - If `mode` is not an inter mode pub fn write_compound_mode( &mut self, w: &mut W, mode: PredictionMode, ctx: usize, ) { let newmv_ctx = ctx & NEWMV_CTX_MASK; let refmv_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; let ctx = if refmv_ctx < 2 { newmv_ctx.min(1) } else if refmv_ctx < 4 { (newmv_ctx + 1).min(4) } else { (newmv_ctx.max(1) + 3).min(7) }; assert!(mode >= PredictionMode::NEAREST_NEARESTMV); let val = match mode { PredictionMode::NEAREST_NEARESTMV => 0, PredictionMode::NEAR_NEAR0MV | PredictionMode::NEAR_NEAR1MV | PredictionMode::NEAR_NEAR2MV => 1, PredictionMode::NEAREST_NEWMV => 2, PredictionMode::NEW_NEARESTMV => 3, PredictionMode::NEAR_NEW0MV | PredictionMode::NEAR_NEW1MV | PredictionMode::NEAR_NEW2MV => 4, PredictionMode::NEW_NEAR0MV | PredictionMode::NEW_NEAR1MV | PredictionMode::NEW_NEAR2MV => 5, PredictionMode::GLOBAL_GLOBALMV => 6, PredictionMode::NEW_NEWMV => 7, _ => unreachable!(), }; symbol_with_update!(self, w, val, &self.fc.compound_mode_cdf[ctx]); } pub fn write_inter_mode( &mut self, w: &mut W, mode: PredictionMode, ctx: usize, ) { use PredictionMode::{GLOBALMV, NEARESTMV, NEWMV}; let newmv_ctx = ctx & NEWMV_CTX_MASK; let cdf = &self.fc.newmv_cdf[newmv_ctx]; symbol_with_update!(self, w, (mode != NEWMV) as u32, cdf); if mode != NEWMV { let zeromv_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; let cdf = &self.fc.zeromv_cdf[zeromv_ctx]; 
symbol_with_update!(self, w, (mode != GLOBALMV) as u32, cdf); if mode != GLOBALMV { let refmv_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; let cdf = &self.fc.refmv_cdf[refmv_ctx]; symbol_with_update!(self, w, (mode != NEARESTMV) as u32, cdf); } } } #[inline] pub fn write_drl_mode( &mut self, w: &mut W, drl_mode: bool, ctx: usize, ) { let cdf = &self.fc.drl_cdfs[ctx]; symbol_with_update!(self, w, drl_mode as u32, cdf); } /// # Panics /// /// - If the MV is invalid pub fn write_mv( &mut self, w: &mut W, mv: MotionVector, ref_mv: MotionVector, mv_precision: MvSubpelPrecision, ) { // assert!(mv.is_valid()); let diff = MotionVector { row: mv.row - ref_mv.row, col: mv.col - ref_mv.col }; let j: MvJointType = av1_get_mv_joint(diff); let cdf = &self.fc.nmv_context.joints_cdf; symbol_with_update!(self, w, j as u32, cdf); if mv_joint_vertical(j) { self.encode_mv_component(w, diff.row as i32, 0, mv_precision); } if mv_joint_horizontal(j) { self.encode_mv_component(w, diff.col as i32, 1, mv_precision); } } pub fn write_block_deblock_deltas( &mut self, w: &mut W, bo: TileBlockOffset, multi: bool, planes: usize, ) { let block = &self.bc.blocks[bo]; let deltas_count = if multi { FRAME_LF_COUNT + planes - 3 } else { 1 }; let deltas = &block.deblock_deltas[..deltas_count]; for (i, &delta) in deltas.iter().enumerate() { let abs = delta.unsigned_abs() as u32; let cdf = if multi { &self.fc.deblock_delta_multi_cdf[i] } else { &self.fc.deblock_delta_cdf }; symbol_with_update!(self, w, cmp::min(abs, DELTA_LF_SMALL), cdf); if abs >= DELTA_LF_SMALL { let bits = msb(abs as i32 - 1) as u32; w.literal(3, bits - 1); w.literal(bits as u8, abs - (1 << bits) - 1); } if abs > 0 { w.bool(delta < 0, 16384); } } } pub fn write_is_inter( &mut self, w: &mut W, bo: TileBlockOffset, is_inter: bool, ) { let ctx = self.bc.intra_inter_context(bo); let cdf = &self.fc.intra_inter_cdfs[ctx]; symbol_with_update!(self, w, is_inter as u32, cdf); } pub fn write_coeffs_lv_map( &mut self, w: &mut W, plane: usize, bo: TileBlockOffset, coeffs_in: &[T], eob: u16, pred_mode: PredictionMode, tx_size: TxSize, tx_type: TxType, plane_bsize: BlockSize, xdec: usize, ydec: usize, use_reduced_tx_set: bool, frame_clipped_txw: usize, frame_clipped_txh: usize, ) -> bool { debug_assert!(frame_clipped_txw != 0); debug_assert!(frame_clipped_txh != 0); let is_inter = pred_mode >= PredictionMode::NEARESTMV; // Note: Both intra and inter mode uses inter scan order. Surprised? 
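    // Editor's summary of the steps below (added comment, not upstream text):
    // the lv_map coding order is txb_skip, then tx_type for luma only, then the
    // EOB position, base levels written in reverse scan order, "base range"
    // increments for levels above NUM_BASE_LEVELS, and finally the DC sign plus
    // Golomb-coded remainders in `encode_coeff_signs`.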
let scan: &[u16] = &av1_scan_orders[tx_size as usize][tx_type as usize] .scan[..usize::from(eob)]; let height = av1_get_coded_tx_size(tx_size).height(); // Create a slice with coeffs in scan order let mut coeffs_storage: Aligned> = Aligned::new(ArrayVec::new()); let coeffs = &mut coeffs_storage.data; coeffs.extend(scan.iter().map(|&scan_idx| coeffs_in[scan_idx as usize])); let cul_level: u32 = coeffs.iter().map(|c| u32::cast_from(c.abs())).sum(); let txs_ctx = Self::get_txsize_entropy_ctx(tx_size); let txb_ctx = self.bc.get_txb_ctx( plane_bsize, tx_size, plane, bo, xdec, ydec, frame_clipped_txw, frame_clipped_txh, ); { let cdf = &self.fc.txb_skip_cdf[txs_ctx][txb_ctx.txb_skip_ctx]; symbol_with_update!(self, w, (eob == 0) as u32, cdf); } if eob == 0 { self.bc.set_coeff_context(plane, bo, tx_size, xdec, ydec, 0); return false; } let mut levels_buf = [0u8; TX_PAD_2D]; let levels: &mut [u8] = &mut levels_buf[TX_PAD_TOP * (height + TX_PAD_HOR)..]; self.txb_init_levels(coeffs_in, height, levels, height + TX_PAD_HOR); let tx_class = tx_type_to_class[tx_type as usize]; let plane_type = usize::from(plane != 0); // Signal tx_type for luma plane only if plane == 0 { self.write_tx_type( w, tx_size, tx_type, pred_mode, is_inter, use_reduced_tx_set, ); } self.encode_eob(eob, tx_size, tx_class, txs_ctx, plane_type, w); self.encode_coeffs( coeffs, levels, scan, eob, tx_size, tx_class, txs_ctx, plane_type, w, ); let cul_level = self.encode_coeff_signs(coeffs, w, plane_type, txb_ctx, cul_level); self.bc.set_coeff_context(plane, bo, tx_size, xdec, ydec, cul_level as u8); true } fn encode_eob( &mut self, eob: u16, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize, w: &mut W, ) { let (eob_pt, eob_extra) = Self::get_eob_pos_token(eob); let eob_multi_size: usize = tx_size.area_log2() - 4; let eob_multi_ctx: usize = usize::from(tx_class != TX_CLASS_2D); match eob_multi_size { 0 => { let cdf = &self.fc.eob_flag_cdf16[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } 1 => { let cdf = &self.fc.eob_flag_cdf32[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } 2 => { let cdf = &self.fc.eob_flag_cdf64[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } 3 => { let cdf = &self.fc.eob_flag_cdf128[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } 4 => { let cdf = &self.fc.eob_flag_cdf256[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } 5 => { let cdf = &self.fc.eob_flag_cdf512[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } _ => { let cdf = &self.fc.eob_flag_cdf1024[plane_type][eob_multi_ctx]; symbol_with_update!(self, w, eob_pt - 1, cdf); } } let eob_offset_bits = k_eob_offset_bits[eob_pt as usize]; if eob_offset_bits > 0 { let mut eob_shift = eob_offset_bits - 1; let mut bit: u32 = u32::from((eob_extra & (1 << eob_shift)) != 0); let cdf = &self.fc.eob_extra_cdf[txs_ctx][plane_type][(eob_pt - 3) as usize]; symbol_with_update!(self, w, bit, cdf); for i in 1..eob_offset_bits { eob_shift = eob_offset_bits - 1 - i; bit = u32::from((eob_extra & (1 << eob_shift)) != 0); w.bit(bit as u16); } } } fn encode_coeffs( &mut self, coeffs: &[T], levels: &mut [u8], scan: &[u16], eob: u16, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize, w: &mut W, ) { let mut coeff_contexts = Aligned::<[MaybeUninit; MAX_CODED_TX_SQUARE]>::uninit_array(); // get_nz_map_contexts sets coeff_contexts contiguously as a parallel array for scan, not in scan 
order let coeff_contexts = self.get_nz_map_contexts( levels, scan, eob, tx_size, tx_class, &mut coeff_contexts.data, ); let bhl = Self::get_txb_bhl(tx_size); let scan_with_ctx = scan.iter().copied().zip(coeff_contexts.iter().copied()); for (c, ((pos, coeff_ctx), v)) in scan_with_ctx.zip(coeffs.iter().copied()).enumerate().rev() { let pos = pos as usize; let coeff_ctx = coeff_ctx as usize; let level = v.abs(); if c == usize::from(eob) - 1 { symbol_with_update!( self, w, cmp::min(u32::cast_from(level), 3) - 1, &self.fc.coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx] ); } else { symbol_with_update!( self, w, cmp::min(u32::cast_from(level), 3), &self.fc.coeff_base_cdf[txs_ctx][plane_type][coeff_ctx] ); } if level > T::cast_from(NUM_BASE_LEVELS) { let base_range = level - T::cast_from(1 + NUM_BASE_LEVELS); let br_ctx = Self::get_br_ctx(levels, pos, bhl, tx_class); let mut idx: T = T::cast_from(0); loop { if idx >= T::cast_from(COEFF_BASE_RANGE) { break; } let k = cmp::min(base_range - idx, T::cast_from(BR_CDF_SIZE - 1)); let cdf = &self.fc.coeff_br_cdf [txs_ctx.min(TxSize::TX_32X32 as usize)][plane_type][br_ctx]; symbol_with_update!(self, w, u32::cast_from(k), cdf); if k < T::cast_from(BR_CDF_SIZE - 1) { break; } idx += T::cast_from(BR_CDF_SIZE - 1); } } } } fn encode_coeff_signs( &mut self, coeffs: &[T], w: &mut W, plane_type: usize, txb_ctx: TXB_CTX, orig_cul_level: u32, ) -> u32 { // Loop to code all signs in the transform block, // starting with the sign of DC (if applicable) for (c, &v) in coeffs.iter().enumerate() { if v == T::cast_from(0) { continue; } let level = v.abs(); let sign = u32::from(v < T::cast_from(0)); if c == 0 { let cdf = &self.fc.dc_sign_cdf[plane_type][txb_ctx.dc_sign_ctx]; symbol_with_update!(self, w, sign, cdf); } else { w.bit(sign as u16); } // save extra golomb codes for separate loop if level > T::cast_from(COEFF_BASE_RANGE + NUM_BASE_LEVELS) { w.write_golomb(u32::cast_from( level - T::cast_from(COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1), )); } } let mut new_cul_level = cmp::min(COEFF_CONTEXT_MASK as u32, orig_cul_level); BlockContext::set_dc_sign(&mut new_cul_level, i32::cast_from(coeffs[0])); new_cul_level } } rav1e-0.7.1/src/context/cdf_context.rs000064400000000000000000000714421046102023000160000ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
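// Editor's illustrative sketch, not part of the upstream rav1e sources: a
// minimal example of how the quantizer-bucketed `CDFContext`, the
// `CDFContextLog` undo log and its checkpoint/rollback pair defined below fit
// together. The exact flow is an assumption made for illustration; the real
// call sites live in the encoder and in `symbol_with_update!`.
#[cfg(test)]
mod editor_usage_sketch {
  use super::*;

  #[test]
  fn checkpoint_then_rollback_restores_a_cdf() {
    // Default CDF tables are chosen from the quantizer bucket (100 -> 61..=120).
    let mut fc = CDFContext::new(100);
    let mut log = CDFContextLog::default();
    let snapshot = log.checkpoint();

    // Log one small CDF, mutate it as adaptation would, then roll back.
    let before = fc.lrf_wiener_cdf;
    let offset = fc.offset(&fc.lrf_wiener_cdf);
    let slot = log.push(&mut fc, offset);
    slot[0] = slot[0].wrapping_add(1);
    assert_ne!(fc.lrf_wiener_cdf, before);

    log.rollback(&mut fc, &snapshot);
    assert_eq!(fc.lrf_wiener_cdf, before);
  }
}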
use super::*; use std::marker::PhantomData; pub const CDF_LEN_MAX: usize = 16; #[derive(Clone)] pub struct CDFContextCheckpoint { small: usize, large: usize, } #[derive(Clone, Copy)] #[repr(C)] pub struct CDFContext { pub comp_bwd_ref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS], pub comp_mode_cdf: [[u16; 2]; COMP_INTER_CONTEXTS], pub comp_ref_cdf: [[[u16; 2]; FWD_REFS - 1]; REF_CONTEXTS], pub comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS], pub dc_sign_cdf: [[[u16; 2]; DC_SIGN_CONTEXTS]; PLANE_TYPES], pub drl_cdfs: [[u16; 2]; DRL_MODE_CONTEXTS], pub eob_extra_cdf: [[[[u16; 2]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], pub filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL], pub intra_inter_cdfs: [[u16; 2]; INTRA_INTER_CONTEXTS], pub lrf_sgrproj_cdf: [u16; 2], pub lrf_wiener_cdf: [u16; 2], pub newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS], pub palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS], pub palette_y_mode_cdfs: [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS], pub refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS], pub single_ref_cdfs: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS], pub skip_cdfs: [[u16; 2]; SKIP_CONTEXTS], pub txb_skip_cdf: [[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES], pub txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS], pub zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS], pub tx_size_8x8_cdf: [[u16; MAX_TX_DEPTH]; TX_SIZE_CONTEXTS], pub inter_tx_3_cdf: [[u16; 2]; TX_SIZE_SQR_CONTEXTS], pub coeff_base_eob_cdf: [[[[u16; 3]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES], pub lrf_switchable_cdf: [u16; 3], pub tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; BIG_TX_CATS], pub coeff_base_cdf: [[[[u16; 4]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], pub coeff_br_cdf: [[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES], pub deblock_delta_cdf: [u16; DELTA_LF_PROBS + 1], pub deblock_delta_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT], pub partition_w8_cdf: [[u16; 4]; PARTITION_TYPES], pub eob_flag_cdf16: [[[u16; 5]; 2]; PLANE_TYPES], pub intra_tx_2_cdf: [[[u16; 5]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], pub eob_flag_cdf32: [[[u16; 6]; 2]; PLANE_TYPES], pub angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES], pub eob_flag_cdf64: [[[u16; 7]; 2]; PLANE_TYPES], pub intra_tx_1_cdf: [[[u16; 7]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS], pub cfl_sign_cdf: [u16; CFL_JOINT_SIGNS], pub compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS], pub eob_flag_cdf128: [[[u16; 8]; 2]; PLANE_TYPES], pub spatial_segmentation_cdfs: [[u16; 8]; 3], pub partition_w128_cdf: [[u16; 8]; PARTITION_TYPES], pub eob_flag_cdf256: [[[u16; 9]; 2]; PLANE_TYPES], pub eob_flag_cdf512: [[[u16; 10]; 2]; PLANE_TYPES], pub partition_cdf: [[u16; EXT_PARTITION_TYPES]; 3 * PARTITION_TYPES], pub eob_flag_cdf1024: [[[u16; 11]; 2]; PLANE_TYPES], pub inter_tx_2_cdf: [[u16; 12]; TX_SIZE_SQR_CONTEXTS], pub kf_y_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS], pub y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS], pub uv_mode_cdf: [[u16; INTRA_MODES]; INTRA_MODES], pub uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES], pub cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS], pub inter_tx_1_cdf: [[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS], pub nmv_context: NMVContext, } pub struct CDFOffset { offset: usize, phantom: PhantomData<[u16; CDF_LEN]>, } impl CDFContext { pub fn new(quantizer: u8) -> CDFContext { let qctx = match quantizer { 0..=20 => 0, 21..=60 => 1, 61..=120 => 2, _ 
=> 3, }; CDFContext { partition_w8_cdf: default_partition_w8_cdf, partition_w128_cdf: default_partition_w128_cdf, partition_cdf: default_partition_cdf, kf_y_cdf: default_kf_y_mode_cdf, y_mode_cdf: default_if_y_mode_cdf, uv_mode_cdf: default_uv_mode_cdf, uv_mode_cfl_cdf: default_uv_mode_cfl_cdf, cfl_sign_cdf: default_cfl_sign_cdf, cfl_alpha_cdf: default_cfl_alpha_cdf, newmv_cdf: default_newmv_cdf, zeromv_cdf: default_zeromv_cdf, refmv_cdf: default_refmv_cdf, intra_tx_2_cdf: default_intra_tx_2_cdf, intra_tx_1_cdf: default_intra_tx_1_cdf, inter_tx_3_cdf: default_inter_tx_3_cdf, inter_tx_2_cdf: default_inter_tx_2_cdf, inter_tx_1_cdf: default_inter_tx_1_cdf, tx_size_8x8_cdf: default_tx_size_8x8_cdf, tx_size_cdf: default_tx_size_cdf, txfm_partition_cdf: default_txfm_partition_cdf, skip_cdfs: default_skip_cdfs, intra_inter_cdfs: default_intra_inter_cdf, angle_delta_cdf: default_angle_delta_cdf, filter_intra_cdfs: default_filter_intra_cdfs, palette_y_mode_cdfs: default_palette_y_mode_cdfs, palette_uv_mode_cdfs: default_palette_uv_mode_cdfs, comp_mode_cdf: default_comp_mode_cdf, comp_ref_type_cdf: default_comp_ref_type_cdf, comp_ref_cdf: default_comp_ref_cdf, comp_bwd_ref_cdf: default_comp_bwdref_cdf, single_ref_cdfs: default_single_ref_cdf, drl_cdfs: default_drl_cdf, compound_mode_cdf: default_compound_mode_cdf, nmv_context: default_nmv_context, deblock_delta_multi_cdf: default_delta_lf_multi_cdf, deblock_delta_cdf: default_delta_lf_cdf, spatial_segmentation_cdfs: default_spatial_pred_seg_tree_cdf, lrf_switchable_cdf: default_switchable_restore_cdf, lrf_sgrproj_cdf: default_sgrproj_restore_cdf, lrf_wiener_cdf: default_wiener_restore_cdf, // lv_map txb_skip_cdf: av1_default_txb_skip_cdfs[qctx], dc_sign_cdf: av1_default_dc_sign_cdfs[qctx], eob_extra_cdf: av1_default_eob_extra_cdfs[qctx], eob_flag_cdf16: av1_default_eob_multi16_cdfs[qctx], eob_flag_cdf32: av1_default_eob_multi32_cdfs[qctx], eob_flag_cdf64: av1_default_eob_multi64_cdfs[qctx], eob_flag_cdf128: av1_default_eob_multi128_cdfs[qctx], eob_flag_cdf256: av1_default_eob_multi256_cdfs[qctx], eob_flag_cdf512: av1_default_eob_multi512_cdfs[qctx], eob_flag_cdf1024: av1_default_eob_multi1024_cdfs[qctx], coeff_base_eob_cdf: av1_default_coeff_base_eob_multi_cdfs[qctx], coeff_base_cdf: av1_default_coeff_base_multi_cdfs[qctx], coeff_br_cdf: av1_default_coeff_lps_multi_cdfs[qctx], } } pub fn reset_counts(&mut self) { macro_rules! reset_1d { ($field:expr) => { let r = $field.last_mut().unwrap(); *r = 0; }; } macro_rules! reset_2d { ($field:expr) => { for x in $field.iter_mut() { reset_1d!(x); } }; } macro_rules! reset_3d { ($field:expr) => { for x in $field.iter_mut() { reset_2d!(x); } }; } macro_rules! 
reset_4d { ($field:expr) => { for x in $field.iter_mut() { reset_3d!(x); } }; } reset_2d!(self.partition_w8_cdf); reset_2d!(self.partition_w128_cdf); reset_2d!(self.partition_cdf); reset_3d!(self.kf_y_cdf); reset_2d!(self.y_mode_cdf); reset_2d!(self.uv_mode_cdf); reset_2d!(self.uv_mode_cfl_cdf); reset_1d!(self.cfl_sign_cdf); reset_2d!(self.cfl_alpha_cdf); reset_2d!(self.newmv_cdf); reset_2d!(self.zeromv_cdf); reset_2d!(self.refmv_cdf); reset_3d!(self.intra_tx_2_cdf); reset_3d!(self.intra_tx_1_cdf); reset_2d!(self.inter_tx_3_cdf); reset_2d!(self.inter_tx_2_cdf); reset_2d!(self.inter_tx_1_cdf); reset_2d!(self.tx_size_8x8_cdf); reset_3d!(self.tx_size_cdf); reset_2d!(self.txfm_partition_cdf); reset_2d!(self.skip_cdfs); reset_2d!(self.intra_inter_cdfs); reset_2d!(self.angle_delta_cdf); reset_2d!(self.filter_intra_cdfs); reset_3d!(self.palette_y_mode_cdfs); reset_2d!(self.palette_uv_mode_cdfs); reset_2d!(self.comp_mode_cdf); reset_2d!(self.comp_ref_type_cdf); reset_3d!(self.comp_ref_cdf); reset_3d!(self.comp_bwd_ref_cdf); reset_3d!(self.single_ref_cdfs); reset_2d!(self.drl_cdfs); reset_2d!(self.compound_mode_cdf); reset_2d!(self.deblock_delta_multi_cdf); reset_1d!(self.deblock_delta_cdf); reset_2d!(self.spatial_segmentation_cdfs); reset_1d!(self.lrf_switchable_cdf); reset_1d!(self.lrf_sgrproj_cdf); reset_1d!(self.lrf_wiener_cdf); reset_1d!(self.nmv_context.joints_cdf); for i in 0..2 { reset_1d!(self.nmv_context.comps[i].classes_cdf); reset_2d!(self.nmv_context.comps[i].class0_fp_cdf); reset_1d!(self.nmv_context.comps[i].fp_cdf); reset_1d!(self.nmv_context.comps[i].sign_cdf); reset_1d!(self.nmv_context.comps[i].class0_hp_cdf); reset_1d!(self.nmv_context.comps[i].hp_cdf); reset_1d!(self.nmv_context.comps[i].class0_cdf); reset_2d!(self.nmv_context.comps[i].bits_cdf); } // lv_map reset_3d!(self.txb_skip_cdf); reset_3d!(self.dc_sign_cdf); reset_4d!(self.eob_extra_cdf); reset_3d!(self.eob_flag_cdf16); reset_3d!(self.eob_flag_cdf32); reset_3d!(self.eob_flag_cdf64); reset_3d!(self.eob_flag_cdf128); reset_3d!(self.eob_flag_cdf256); reset_3d!(self.eob_flag_cdf512); reset_3d!(self.eob_flag_cdf1024); reset_4d!(self.coeff_base_eob_cdf); reset_4d!(self.coeff_base_cdf); reset_4d!(self.coeff_br_cdf); } /// # Panics /// /// - If any of the CDF arrays are uninitialized. /// This should never happen and indicates a development error. 
pub fn build_map(&self) -> Vec<(&'static str, usize, usize)> { use std::mem::size_of_val; let partition_w8_cdf_start = self.partition_w8_cdf.first().unwrap().as_ptr() as usize; let partition_w8_cdf_end = partition_w8_cdf_start + size_of_val(&self.partition_w8_cdf); let partition_w128_cdf_start = self.partition_w128_cdf.first().unwrap().as_ptr() as usize; let partition_w128_cdf_end = partition_w128_cdf_start + size_of_val(&self.partition_w128_cdf); let partition_cdf_start = self.partition_cdf.first().unwrap().as_ptr() as usize; let partition_cdf_end = partition_cdf_start + size_of_val(&self.partition_cdf); let kf_y_cdf_start = self.kf_y_cdf.first().unwrap().as_ptr() as usize; let kf_y_cdf_end = kf_y_cdf_start + size_of_val(&self.kf_y_cdf); let y_mode_cdf_start = self.y_mode_cdf.first().unwrap().as_ptr() as usize; let y_mode_cdf_end = y_mode_cdf_start + size_of_val(&self.y_mode_cdf); let uv_mode_cdf_start = self.uv_mode_cdf.first().unwrap().as_ptr() as usize; let uv_mode_cdf_end = uv_mode_cdf_start + size_of_val(&self.uv_mode_cdf); let uv_mode_cfl_cdf_start = self.uv_mode_cfl_cdf.first().unwrap().as_ptr() as usize; let uv_mode_cfl_cdf_end = uv_mode_cfl_cdf_start + size_of_val(&self.uv_mode_cfl_cdf); let cfl_sign_cdf_start = self.cfl_sign_cdf.as_ptr() as usize; let cfl_sign_cdf_end = cfl_sign_cdf_start + size_of_val(&self.cfl_sign_cdf); let cfl_alpha_cdf_start = self.cfl_alpha_cdf.first().unwrap().as_ptr() as usize; let cfl_alpha_cdf_end = cfl_alpha_cdf_start + size_of_val(&self.cfl_alpha_cdf); let newmv_cdf_start = self.newmv_cdf.first().unwrap().as_ptr() as usize; let newmv_cdf_end = newmv_cdf_start + size_of_val(&self.newmv_cdf); let zeromv_cdf_start = self.zeromv_cdf.first().unwrap().as_ptr() as usize; let zeromv_cdf_end = zeromv_cdf_start + size_of_val(&self.zeromv_cdf); let refmv_cdf_start = self.refmv_cdf.first().unwrap().as_ptr() as usize; let refmv_cdf_end = refmv_cdf_start + size_of_val(&self.refmv_cdf); let intra_tx_2_cdf_start = self.intra_tx_2_cdf.first().unwrap().as_ptr() as usize; let intra_tx_2_cdf_end = intra_tx_2_cdf_start + size_of_val(&self.intra_tx_2_cdf); let intra_tx_1_cdf_start = self.intra_tx_1_cdf.first().unwrap().as_ptr() as usize; let intra_tx_1_cdf_end = intra_tx_1_cdf_start + size_of_val(&self.intra_tx_1_cdf); let inter_tx_3_cdf_start = self.inter_tx_3_cdf.first().unwrap().as_ptr() as usize; let inter_tx_3_cdf_end = inter_tx_3_cdf_start + size_of_val(&self.inter_tx_3_cdf); let inter_tx_2_cdf_start = self.inter_tx_2_cdf.first().unwrap().as_ptr() as usize; let inter_tx_2_cdf_end = inter_tx_2_cdf_start + size_of_val(&self.inter_tx_2_cdf); let inter_tx_1_cdf_start = self.inter_tx_1_cdf.first().unwrap().as_ptr() as usize; let inter_tx_1_cdf_end = inter_tx_1_cdf_start + size_of_val(&self.inter_tx_1_cdf); let tx_size_8x8_cdf_start = self.tx_size_8x8_cdf.first().unwrap().as_ptr() as usize; let tx_size_8x8_cdf_end = tx_size_8x8_cdf_start + size_of_val(&self.tx_size_8x8_cdf); let tx_size_cdf_start = self.tx_size_cdf.first().unwrap().as_ptr() as usize; let tx_size_cdf_end = tx_size_cdf_start + size_of_val(&self.tx_size_cdf); let txfm_partition_cdf_start = self.txfm_partition_cdf.first().unwrap().as_ptr() as usize; let txfm_partition_cdf_end = txfm_partition_cdf_start + size_of_val(&self.txfm_partition_cdf); let skip_cdfs_start = self.skip_cdfs.first().unwrap().as_ptr() as usize; let skip_cdfs_end = skip_cdfs_start + size_of_val(&self.skip_cdfs); let intra_inter_cdfs_start = self.intra_inter_cdfs.first().unwrap().as_ptr() as usize; let intra_inter_cdfs_end = 
intra_inter_cdfs_start + size_of_val(&self.intra_inter_cdfs); let angle_delta_cdf_start = self.angle_delta_cdf.first().unwrap().as_ptr() as usize; let angle_delta_cdf_end = angle_delta_cdf_start + size_of_val(&self.angle_delta_cdf); let filter_intra_cdfs_start = self.filter_intra_cdfs.first().unwrap().as_ptr() as usize; let filter_intra_cdfs_end = filter_intra_cdfs_start + size_of_val(&self.filter_intra_cdfs); let palette_y_mode_cdfs_start = self.palette_y_mode_cdfs.first().unwrap().as_ptr() as usize; let palette_y_mode_cdfs_end = palette_y_mode_cdfs_start + size_of_val(&self.palette_y_mode_cdfs); let palette_uv_mode_cdfs_start = self.palette_uv_mode_cdfs.first().unwrap().as_ptr() as usize; let palette_uv_mode_cdfs_end = palette_uv_mode_cdfs_start + size_of_val(&self.palette_uv_mode_cdfs); let comp_mode_cdf_start = self.comp_mode_cdf.first().unwrap().as_ptr() as usize; let comp_mode_cdf_end = comp_mode_cdf_start + size_of_val(&self.comp_mode_cdf); let comp_ref_type_cdf_start = self.comp_ref_type_cdf.first().unwrap().as_ptr() as usize; let comp_ref_type_cdf_end = comp_ref_type_cdf_start + size_of_val(&self.comp_ref_type_cdf); let comp_ref_cdf_start = self.comp_ref_cdf.first().unwrap().as_ptr() as usize; let comp_ref_cdf_end = comp_ref_cdf_start + size_of_val(&self.comp_ref_cdf); let comp_bwd_ref_cdf_start = self.comp_bwd_ref_cdf.first().unwrap().as_ptr() as usize; let comp_bwd_ref_cdf_end = comp_bwd_ref_cdf_start + size_of_val(&self.comp_bwd_ref_cdf); let single_ref_cdfs_start = self.single_ref_cdfs.first().unwrap().as_ptr() as usize; let single_ref_cdfs_end = single_ref_cdfs_start + size_of_val(&self.single_ref_cdfs); let drl_cdfs_start = self.drl_cdfs.first().unwrap().as_ptr() as usize; let drl_cdfs_end = drl_cdfs_start + size_of_val(&self.drl_cdfs); let compound_mode_cdf_start = self.compound_mode_cdf.first().unwrap().as_ptr() as usize; let compound_mode_cdf_end = compound_mode_cdf_start + size_of_val(&self.compound_mode_cdf); let nmv_context_start = &self.nmv_context as *const NMVContext as usize; let nmv_context_end = nmv_context_start + size_of_val(&self.nmv_context); let deblock_delta_multi_cdf_start = self.deblock_delta_multi_cdf.first().unwrap().as_ptr() as usize; let deblock_delta_multi_cdf_end = deblock_delta_multi_cdf_start + size_of_val(&self.deblock_delta_multi_cdf); let deblock_delta_cdf_start = self.deblock_delta_cdf.as_ptr() as usize; let deblock_delta_cdf_end = deblock_delta_cdf_start + size_of_val(&self.deblock_delta_cdf); let spatial_segmentation_cdfs_start = self.spatial_segmentation_cdfs.first().unwrap().as_ptr() as usize; let spatial_segmentation_cdfs_end = spatial_segmentation_cdfs_start + size_of_val(&self.spatial_segmentation_cdfs); let lrf_switchable_cdf_start = self.lrf_switchable_cdf.as_ptr() as usize; let lrf_switchable_cdf_end = lrf_switchable_cdf_start + size_of_val(&self.lrf_switchable_cdf); let lrf_sgrproj_cdf_start = self.lrf_sgrproj_cdf.as_ptr() as usize; let lrf_sgrproj_cdf_end = lrf_sgrproj_cdf_start + size_of_val(&self.lrf_sgrproj_cdf); let lrf_wiener_cdf_start = self.lrf_wiener_cdf.as_ptr() as usize; let lrf_wiener_cdf_end = lrf_wiener_cdf_start + size_of_val(&self.lrf_wiener_cdf); let txb_skip_cdf_start = self.txb_skip_cdf.first().unwrap().as_ptr() as usize; let txb_skip_cdf_end = txb_skip_cdf_start + size_of_val(&self.txb_skip_cdf); let dc_sign_cdf_start = self.dc_sign_cdf.first().unwrap().as_ptr() as usize; let dc_sign_cdf_end = dc_sign_cdf_start + size_of_val(&self.dc_sign_cdf); let eob_extra_cdf_start = 
self.eob_extra_cdf.first().unwrap().as_ptr() as usize; let eob_extra_cdf_end = eob_extra_cdf_start + size_of_val(&self.eob_extra_cdf); let eob_flag_cdf16_start = self.eob_flag_cdf16.first().unwrap().as_ptr() as usize; let eob_flag_cdf16_end = eob_flag_cdf16_start + size_of_val(&self.eob_flag_cdf16); let eob_flag_cdf32_start = self.eob_flag_cdf32.first().unwrap().as_ptr() as usize; let eob_flag_cdf32_end = eob_flag_cdf32_start + size_of_val(&self.eob_flag_cdf32); let eob_flag_cdf64_start = self.eob_flag_cdf64.first().unwrap().as_ptr() as usize; let eob_flag_cdf64_end = eob_flag_cdf64_start + size_of_val(&self.eob_flag_cdf64); let eob_flag_cdf128_start = self.eob_flag_cdf128.first().unwrap().as_ptr() as usize; let eob_flag_cdf128_end = eob_flag_cdf128_start + size_of_val(&self.eob_flag_cdf128); let eob_flag_cdf256_start = self.eob_flag_cdf256.first().unwrap().as_ptr() as usize; let eob_flag_cdf256_end = eob_flag_cdf256_start + size_of_val(&self.eob_flag_cdf256); let eob_flag_cdf512_start = self.eob_flag_cdf512.first().unwrap().as_ptr() as usize; let eob_flag_cdf512_end = eob_flag_cdf512_start + size_of_val(&self.eob_flag_cdf512); let eob_flag_cdf1024_start = self.eob_flag_cdf1024.first().unwrap().as_ptr() as usize; let eob_flag_cdf1024_end = eob_flag_cdf1024_start + size_of_val(&self.eob_flag_cdf1024); let coeff_base_eob_cdf_start = self.coeff_base_eob_cdf.first().unwrap().as_ptr() as usize; let coeff_base_eob_cdf_end = coeff_base_eob_cdf_start + size_of_val(&self.coeff_base_eob_cdf); let coeff_base_cdf_start = self.coeff_base_cdf.first().unwrap().as_ptr() as usize; let coeff_base_cdf_end = coeff_base_cdf_start + size_of_val(&self.coeff_base_cdf); let coeff_br_cdf_start = self.coeff_br_cdf.first().unwrap().as_ptr() as usize; let coeff_br_cdf_end = coeff_br_cdf_start + size_of_val(&self.coeff_br_cdf); vec![ ("partition_w8_cdf", partition_w8_cdf_start, partition_w8_cdf_end), ("partition_w128_cdf", partition_w128_cdf_start, partition_w128_cdf_end), ("partition_cdf", partition_cdf_start, partition_cdf_end), ("kf_y_cdf", kf_y_cdf_start, kf_y_cdf_end), ("y_mode_cdf", y_mode_cdf_start, y_mode_cdf_end), ("uv_mode_cdf", uv_mode_cdf_start, uv_mode_cdf_end), ("uv_mode_cfl_cdf", uv_mode_cfl_cdf_start, uv_mode_cfl_cdf_end), ("cfl_sign_cdf", cfl_sign_cdf_start, cfl_sign_cdf_end), ("cfl_alpha_cdf", cfl_alpha_cdf_start, cfl_alpha_cdf_end), ("newmv_cdf", newmv_cdf_start, newmv_cdf_end), ("zeromv_cdf", zeromv_cdf_start, zeromv_cdf_end), ("refmv_cdf", refmv_cdf_start, refmv_cdf_end), ("intra_tx_2_cdf", intra_tx_2_cdf_start, intra_tx_2_cdf_end), ("intra_tx_1_cdf", intra_tx_1_cdf_start, intra_tx_1_cdf_end), ("inter_tx_3_cdf", inter_tx_3_cdf_start, inter_tx_3_cdf_end), ("inter_tx_2_cdf", inter_tx_2_cdf_start, inter_tx_2_cdf_end), ("inter_tx_1_cdf", inter_tx_1_cdf_start, inter_tx_1_cdf_end), ("tx_size_8x8_cdf", tx_size_8x8_cdf_start, tx_size_8x8_cdf_end), ("tx_size_cdf", tx_size_cdf_start, tx_size_cdf_end), ("txfm_partition_cdf", txfm_partition_cdf_start, txfm_partition_cdf_end), ("skip_cdfs", skip_cdfs_start, skip_cdfs_end), ("intra_inter_cdfs", intra_inter_cdfs_start, intra_inter_cdfs_end), ("angle_delta_cdf", angle_delta_cdf_start, angle_delta_cdf_end), ("filter_intra_cdfs", filter_intra_cdfs_start, filter_intra_cdfs_end), ( "palette_y_mode_cdfs", palette_y_mode_cdfs_start, palette_y_mode_cdfs_end, ), ( "palette_uv_mode_cdfs", palette_uv_mode_cdfs_start, palette_uv_mode_cdfs_end, ), ("comp_mode_cdf", comp_mode_cdf_start, comp_mode_cdf_end), ("comp_ref_type_cdf", comp_ref_type_cdf_start, comp_ref_type_cdf_end), 
("comp_ref_cdf", comp_ref_cdf_start, comp_ref_cdf_end), ("comp_bwd_ref_cdf", comp_bwd_ref_cdf_start, comp_bwd_ref_cdf_end), ("single_ref_cdfs", single_ref_cdfs_start, single_ref_cdfs_end), ("drl_cdfs", drl_cdfs_start, drl_cdfs_end), ("compound_mode_cdf", compound_mode_cdf_start, compound_mode_cdf_end), ("nmv_context", nmv_context_start, nmv_context_end), ( "deblock_delta_multi_cdf", deblock_delta_multi_cdf_start, deblock_delta_multi_cdf_end, ), ("deblock_delta_cdf", deblock_delta_cdf_start, deblock_delta_cdf_end), ( "spatial_segmentation_cdfs", spatial_segmentation_cdfs_start, spatial_segmentation_cdfs_end, ), ("lrf_switchable_cdf", lrf_switchable_cdf_start, lrf_switchable_cdf_end), ("lrf_sgrproj_cdf", lrf_sgrproj_cdf_start, lrf_sgrproj_cdf_end), ("lrf_wiener_cdf", lrf_wiener_cdf_start, lrf_wiener_cdf_end), ("txb_skip_cdf", txb_skip_cdf_start, txb_skip_cdf_end), ("dc_sign_cdf", dc_sign_cdf_start, dc_sign_cdf_end), ("eob_extra_cdf", eob_extra_cdf_start, eob_extra_cdf_end), ("eob_flag_cdf16", eob_flag_cdf16_start, eob_flag_cdf16_end), ("eob_flag_cdf32", eob_flag_cdf32_start, eob_flag_cdf32_end), ("eob_flag_cdf64", eob_flag_cdf64_start, eob_flag_cdf64_end), ("eob_flag_cdf128", eob_flag_cdf128_start, eob_flag_cdf128_end), ("eob_flag_cdf256", eob_flag_cdf256_start, eob_flag_cdf256_end), ("eob_flag_cdf512", eob_flag_cdf512_start, eob_flag_cdf512_end), ("eob_flag_cdf1024", eob_flag_cdf1024_start, eob_flag_cdf1024_end), ("coeff_base_eob_cdf", coeff_base_eob_cdf_start, coeff_base_eob_cdf_end), ("coeff_base_cdf", coeff_base_cdf_start, coeff_base_cdf_end), ("coeff_br_cdf", coeff_br_cdf_start, coeff_br_cdf_end), ] } pub fn offset( &self, cdf: *const [u16; CDF_LEN], ) -> CDFOffset { CDFOffset { offset: cdf as usize - self as *const _ as usize, phantom: PhantomData, } } } impl fmt::Debug for CDFContext { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "CDFContext contains too many numbers to print :-(") } } macro_rules! symbol_with_update { ($self:ident, $w:ident, $s:expr, $cdf:expr) => { let cdf = $self.fc.offset($cdf); $w.symbol_with_update($s, cdf, &mut $self.fc_log, &mut $self.fc); symbol_with_update!($self, $cdf); }; ($self:ident, $cdf:expr) => { #[cfg(feature = "desync_finder")] { let cdf: &[_] = $cdf; if let Some(map) = $self.fc_map.as_ref() { map.lookup(cdf.as_ptr() as usize); } } }; } #[derive(Clone)] pub struct ContextWriterCheckpoint { pub fc: CDFContextCheckpoint, pub bc: BlockContextCheckpoint, } struct CDFContextLogPartition { pub data: Vec<[u16; CDF_LEN_MAX_PLUS_1]>, } impl CDFContextLogPartition { fn new(capacity: usize) -> Self { Self { data: Vec::with_capacity(capacity) } } #[inline(always)] fn push( &mut self, fc: &mut CDFContext, cdf: CDFOffset, ) -> &mut [u16; CDF_LEN] { debug_assert!(CDF_LEN < CDF_LEN_MAX_PLUS_1); debug_assert!(cdf.offset <= u16::MAX.into()); // SAFETY: Maintain an invariant of non-zero spare capacity, so that // branching may be deferred until writes are issued. Benchmarks indicate // this is faster than first testing capacity and possibly reallocating. 
unsafe { let len = self.data.len(); let new_len = len + 1; let capacity = self.data.capacity(); debug_assert!(new_len <= capacity); let dst = self.data.as_mut_ptr().add(len) as *mut u16; let base = fc as *mut _ as *mut u8; let src = base.add(cdf.offset) as *const u16; dst.copy_from_nonoverlapping(src, CDF_LEN_MAX_PLUS_1 - 1); *dst.add(CDF_LEN_MAX_PLUS_1 - 1) = cdf.offset as u16; self.data.set_len(new_len); if CDF_LEN_MAX_PLUS_1 > capacity.wrapping_sub(new_len) { self.data.reserve(CDF_LEN_MAX_PLUS_1); } let cdf = base.add(cdf.offset) as *mut [u16; CDF_LEN]; &mut *cdf } } #[inline(always)] fn rollback(&mut self, fc: &mut CDFContext, checkpoint: usize) { let base = fc as *mut _ as *mut u8; let mut len = self.data.len(); // SAFETY: We use unchecked pointers here for performance. // Since we know the length, we can ensure not to go OOB. unsafe { let mut src = self.data.as_mut_ptr().add(len); while len > checkpoint { len -= 1; src = src.sub(1); let src = src as *mut u16; let offset = *src.add(CDF_LEN_MAX_PLUS_1 - 1) as usize; let dst = base.add(offset) as *mut u16; dst.copy_from_nonoverlapping(src, CDF_LEN_MAX_PLUS_1 - 1); } self.data.set_len(len); } } } const CDF_LEN_SMALL: usize = 4; pub struct CDFContextLog { small: CDFContextLogPartition<{ CDF_LEN_SMALL + 1 }>, large: CDFContextLogPartition<{ CDF_LEN_MAX + 1 }>, } impl Default for CDFContextLog { fn default() -> Self { Self { small: CDFContextLogPartition::new(1 << 16), large: CDFContextLogPartition::new(1 << 9), } } } impl CDFContextLog { fn checkpoint(&self) -> CDFContextCheckpoint { CDFContextCheckpoint { small: self.small.data.len(), large: self.large.data.len(), } } #[inline(always)] pub fn push( &mut self, fc: &mut CDFContext, cdf: CDFOffset, ) -> &mut [u16; CDF_LEN] { if CDF_LEN <= CDF_LEN_SMALL { self.small.push(fc, cdf) } else { self.large.push(fc, cdf) } } #[inline(always)] pub fn rollback( &mut self, fc: &mut CDFContext, checkpoint: &CDFContextCheckpoint, ) { self.small.rollback(fc, checkpoint.small); self.large.rollback(fc, checkpoint.large); } pub fn clear(&mut self) { self.small.data.clear(); self.large.data.clear(); } } pub struct ContextWriter<'a> { pub bc: BlockContext<'a>, pub fc: &'a mut CDFContext, pub fc_log: CDFContextLog, #[cfg(feature = "desync_finder")] pub fc_map: Option, // For debugging purposes } impl<'a> ContextWriter<'a> { #[allow(clippy::let_and_return)] pub fn new(fc: &'a mut CDFContext, bc: BlockContext<'a>) -> Self { let fc_log = CDFContextLog::default(); #[allow(unused_mut)] let mut cw = ContextWriter { bc, fc, fc_log, #[cfg(feature = "desync_finder")] fc_map: Default::default(), }; #[cfg(feature = "desync_finder")] { if std::env::var_os("RAV1E_DEBUG").is_some() { cw.fc_map = Some(FieldMap { map: cw.fc.build_map() }); } } cw } pub const fn cdf_element_prob(cdf: &[u16], element: usize) -> u16 { (if element > 0 { cdf[element - 1] } else { 32768 }) - (if element + 1 < cdf.len() { cdf[element] } else { 0 }) } pub fn checkpoint( &self, tile_bo: &TileBlockOffset, chroma_sampling: ChromaSampling, ) -> ContextWriterCheckpoint { ContextWriterCheckpoint { fc: self.fc_log.checkpoint(), bc: self.bc.checkpoint(tile_bo, chroma_sampling), } } pub fn rollback(&mut self, checkpoint: &ContextWriterCheckpoint) { self.fc_log.rollback(self.fc, &checkpoint.fc); self.bc.rollback(&checkpoint.bc); #[cfg(feature = "desync_finder")] { if self.fc_map.is_some() { self.fc_map = Some(FieldMap { map: self.fc.build_map() }); } } } } rav1e-0.7.1/src/context/frame_header.rs000064400000000000000000000227141046102023000161000ustar 
00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::*; impl CDFContext { // rather than test writing and rolling back the cdf, we just count Q8 bits using the current cdf pub fn count_lrf_switchable( &self, w: &W, rs: &TileRestorationState, filter: RestorationFilter, pli: usize, ) -> u32 { match filter { RestorationFilter::None => w.symbol_bits(0, &self.lrf_switchable_cdf), RestorationFilter::Wiener { .. } => { unreachable!() // for now, not permanently } RestorationFilter::Sgrproj { set, xqd } => { // Does *not* use 'RESTORE_SGRPROJ' but rather just '2' let rp = &rs.planes[pli]; let mut bits = w.symbol_bits(2, &self.lrf_switchable_cdf) + ((SGRPROJ_PARAMS_BITS as u32) << OD_BITRES); for i in 0..2 { let s = SGRPROJ_PARAMS_S[set as usize][i]; let min = SGRPROJ_XQD_MIN[i] as i32; let max = SGRPROJ_XQD_MAX[i] as i32; if s > 0 { bits += w.count_signed_subexp_with_ref( xqd[i] as i32, min, max + 1, SGRPROJ_PRJ_SUBEXP_K, rp.sgrproj_ref[i] as i32, ); } } bits } } } } impl<'a> ContextWriter<'a> { fn get_ref_frame_ctx_b0(&self, bo: TileBlockOffset) -> usize { let ref_counts = self.bc.blocks[bo].neighbors_ref_counts; let fwd_cnt = ref_counts[LAST_FRAME.to_index()] + ref_counts[LAST2_FRAME.to_index()] + ref_counts[LAST3_FRAME.to_index()] + ref_counts[GOLDEN_FRAME.to_index()]; let bwd_cnt = ref_counts[BWDREF_FRAME.to_index()] + ref_counts[ALTREF2_FRAME.to_index()] + ref_counts[ALTREF_FRAME.to_index()]; ContextWriter::ref_count_ctx(fwd_cnt, bwd_cnt) } /// # Panics /// /// - If the `comp_mode` setting does not match the reference mode and size. 
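  // Editor's note (added comment derived from the branches below, not
  // upstream prose): single reference frames are coded as a small bit tree
  // over `single_ref_cdfs`: b0 separates forward from backward references;
  // on the backward side b1 picks ALTREF and b5 then splits ALTREF2 from
  // BWDREF; on the forward side b2 splits {LAST, LAST2} from {LAST3, GOLDEN},
  // with b3 and b4 resolving each pair.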
pub fn write_ref_frames( &mut self, w: &mut W, fi: &FrameInvariants, bo: TileBlockOffset, ) { let rf = self.bc.blocks[bo].ref_frames; let sz = self.bc.blocks[bo].n4_w.min(self.bc.blocks[bo].n4_h); /* TODO: Handle multiple references */ let comp_mode = self.bc.blocks[bo].has_second_ref(); if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 { let ctx = self.get_comp_mode_ctx(bo); let cdf = &self.fc.comp_mode_cdf[ctx]; symbol_with_update!(self, w, comp_mode as u32, cdf); } else { assert!(!comp_mode); } if comp_mode { let comp_ref_type: u32 = 1; // bidir let ctx = self.get_comp_ref_type_ctx(bo); let cdf = &self.fc.comp_ref_type_cdf[ctx]; symbol_with_update!(self, w, comp_ref_type, cdf); if comp_ref_type == 0 { unimplemented!(); } else { let compref = rf[0] == GOLDEN_FRAME || rf[0] == LAST3_FRAME; let ctx = self.get_pred_ctx_ll2_or_l3gld(bo); let cdf = &self.fc.comp_ref_cdf[ctx][0]; symbol_with_update!(self, w, compref as u32, cdf); if !compref { let compref_p1 = rf[0] == LAST2_FRAME; let ctx = self.get_pred_ctx_last_or_last2(bo); let cdf = &self.fc.comp_ref_cdf[ctx][1]; symbol_with_update!(self, w, compref_p1 as u32, cdf); } else { let compref_p2 = rf[0] == GOLDEN_FRAME; let ctx = self.get_pred_ctx_last3_or_gold(bo); let cdf = &self.fc.comp_ref_cdf[ctx][2]; symbol_with_update!(self, w, compref_p2 as u32, cdf); } let comp_bwdref = rf[1] == ALTREF_FRAME; let ctx = self.get_pred_ctx_brfarf2_or_arf(bo); let cdf = &self.fc.comp_bwd_ref_cdf[ctx][0]; symbol_with_update!(self, w, comp_bwdref as u32, cdf); if !comp_bwdref { let comp_bwdref_p1 = rf[1] == ALTREF2_FRAME; let ctx = self.get_pred_ctx_brf_or_arf2(bo); let cdf = &self.fc.comp_bwd_ref_cdf[ctx][1]; symbol_with_update!(self, w, comp_bwdref_p1 as u32, cdf); } } } else { let b0_ctx = self.get_ref_frame_ctx_b0(bo); let b0 = rf[0] != NONE_FRAME && rf[0].is_bwd_ref(); let cdf = &self.fc.single_ref_cdfs[b0_ctx][0]; symbol_with_update!(self, w, b0 as u32, cdf); if b0 { let b1_ctx = self.get_pred_ctx_brfarf2_or_arf(bo); let b1 = rf[0] == ALTREF_FRAME; let cdf = &self.fc.single_ref_cdfs[b1_ctx][1]; symbol_with_update!(self, w, b1 as u32, cdf); if !b1 { let b5_ctx = self.get_pred_ctx_brf_or_arf2(bo); let b5 = rf[0] == ALTREF2_FRAME; let cdf = &self.fc.single_ref_cdfs[b5_ctx][5]; symbol_with_update!(self, w, b5 as u32, cdf); } } else { let b2_ctx = self.get_pred_ctx_ll2_or_l3gld(bo); let b2 = rf[0] == LAST3_FRAME || rf[0] == GOLDEN_FRAME; let cdf = &self.fc.single_ref_cdfs[b2_ctx][2]; symbol_with_update!(self, w, b2 as u32, cdf); if !b2 { let b3_ctx = self.get_pred_ctx_last_or_last2(bo); let b3 = rf[0] != LAST_FRAME; let cdf = &self.fc.single_ref_cdfs[b3_ctx][3]; symbol_with_update!(self, w, b3 as u32, cdf); } else { let b4_ctx = self.get_pred_ctx_last3_or_gold(bo); let b4 = rf[0] != LAST3_FRAME; let cdf = &self.fc.single_ref_cdfs[b4_ctx][4]; symbol_with_update!(self, w, b4 as u32, cdf); } } } } pub fn count_lrf_switchable( &self, w: &W, rs: &TileRestorationState, filter: RestorationFilter, pli: usize, ) -> u32 { self.fc.count_lrf_switchable(w, rs, filter, pli) } /// # Panics /// /// - If the LRF has already been written for this superblock pub fn write_lrf( &mut self, w: &mut W, rs: &mut TileRestorationStateMut, sbo: TileSuperBlockOffset, pli: usize, ) { let rp = &mut rs.planes[pli]; if let Some(filter) = rp.restoration_unit(sbo, true).map(|ru| ru.filter) { match filter { RestorationFilter::None => match rp.rp_cfg.lrf_type { RESTORE_WIENER => { let cdf = &self.fc.lrf_wiener_cdf; symbol_with_update!(self, w, 0, cdf); } RESTORE_SGRPROJ => { let cdf = 
&self.fc.lrf_sgrproj_cdf; symbol_with_update!(self, w, 0, cdf); } RESTORE_SWITCHABLE => { let cdf = &self.fc.lrf_switchable_cdf; symbol_with_update!(self, w, 0, cdf); } RESTORE_NONE => {} _ => unreachable!(), }, RestorationFilter::Sgrproj { set, xqd } => { match rp.rp_cfg.lrf_type { RESTORE_SGRPROJ => { let cdf = &self.fc.lrf_sgrproj_cdf; symbol_with_update!(self, w, 1, cdf); } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_SGRPROJ' let cdf = &self.fc.lrf_switchable_cdf; symbol_with_update!(self, w, 2, cdf); } _ => unreachable!(), } w.literal(SGRPROJ_PARAMS_BITS, set as u32); for i in 0..2 { let s = SGRPROJ_PARAMS_S[set as usize][i]; let min = SGRPROJ_XQD_MIN[i] as i32; let max = SGRPROJ_XQD_MAX[i] as i32; if s > 0 { w.write_signed_subexp_with_ref( xqd[i] as i32, min, max + 1, SGRPROJ_PRJ_SUBEXP_K, rp.sgrproj_ref[i] as i32, ); rp.sgrproj_ref[i] = xqd[i]; } else { // Nothing written, just update the reference if i == 0 { assert!(xqd[i] == 0); rp.sgrproj_ref[0] = 0; } else { rp.sgrproj_ref[1] = 95; // LOL at spec. The result is always 95. } } } } RestorationFilter::Wiener { coeffs } => { match rp.rp_cfg.lrf_type { RESTORE_WIENER => { let cdf = &self.fc.lrf_wiener_cdf; symbol_with_update!(self, w, 1, cdf); } RESTORE_SWITCHABLE => { // Does *not* write 'RESTORE_WIENER' let cdf = &self.fc.lrf_switchable_cdf; symbol_with_update!(self, w, 1, cdf); } _ => unreachable!(), } for pass in 0..2 { let first_coeff = if pli == 0 { 0 } else { assert!(coeffs[pass][0] == 0); 1 }; for i in first_coeff..3 { let min = WIENER_TAPS_MIN[i] as i32; let max = WIENER_TAPS_MAX[i] as i32; w.write_signed_subexp_with_ref( coeffs[pass][i] as i32, min, max + 1, (i + 1) as u8, rp.wiener_ref[pass][i] as i32, ); rp.wiener_ref[pass][i] = coeffs[pass][i]; } } } } } } pub fn write_cdef( &mut self, w: &mut W, strength_index: u8, bits: u8, ) { w.literal(bits, strength_index as u32); } } rav1e-0.7.1/src/context/mod.rs000064400000000000000000000162251046102023000142550ustar 00000000000000// Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
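// Editor's worked example (added for illustration; `av1_get_mv_joint`,
// `get_mv_class` and `encode_mv_component` are defined further down in this
// module): a motion-vector difference of (row: 0, col: 74) maps to
// MV_JOINT_HNZVZ, and `encode_mv_component` passes `mag - 1 = 73` to
// `get_mv_class`, which returns (MV_CLASS_3, 9) because
// `log_in_base_2(73 >> 3) == 3` and `mv_class_base(3) == 64`. The offset 9
// then splits into integer bits `d = 9 >> 3 == 1`, fractional bits
// `fr = (9 >> 1) & 3 == 0` and the high-precision bit `hp = 9 & 1 == 1`.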
#![allow(non_upper_case_globals)] #![allow(dead_code)] #![allow(non_camel_case_types)] use crate::color::ChromaSampling; use crate::ec::{Writer, OD_BITRES}; use crate::encoder::FrameInvariants; use crate::entropymode::*; use crate::frame::*; use crate::header::ReferenceMode; use crate::lrf::*; use crate::mc::MotionVector; use crate::partition::BlockSize::*; use crate::partition::RefType::*; use crate::partition::*; use crate::scan_order::*; use crate::tiling::*; use crate::token_cdfs::*; use crate::transform::TxSize::*; use crate::transform::*; use crate::util::*; use arrayvec::*; use std::default::Default; use std::ops::{Add, Index, IndexMut}; use std::*; const MAX_REF_MV_STACK_SIZE: usize = 8; pub const REF_CAT_LEVEL: u32 = 640; pub const FRAME_LF_COUNT: usize = 4; pub const MAX_LOOP_FILTER: usize = 63; const DELTA_LF_SMALL: u32 = 3; pub const DELTA_LF_PROBS: usize = DELTA_LF_SMALL as usize; const DELTA_Q_SMALL: u32 = 3; pub const DELTA_Q_PROBS: usize = DELTA_Q_SMALL as usize; static size_group_lookup: [u8; BlockSize::BLOCK_SIZES_ALL] = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2]; static num_pels_log2_lookup: [u8; BlockSize::BLOCK_SIZES_ALL] = [4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10]; #[macro_use] mod cdf_context; pub use cdf_context::*; mod partition_unit; pub use partition_unit::*; mod superblock_unit; pub use superblock_unit::*; mod transform_unit; pub use transform_unit::TxClass::*; pub use transform_unit::*; mod block_unit; pub use block_unit::*; mod frame_header; #[derive(Debug, Default)] pub struct FieldMap { map: Vec<(&'static str, usize, usize)>, } impl FieldMap { /// Print the field the address belong to fn lookup(&self, addr: usize) { for (name, start, end) in &self.map { if addr >= *start && addr < *end { println!(" CDF {name}"); println!(); return; } } println!(" CDF address not found: {addr}"); } } #[inline] pub const fn av1_get_coded_tx_size(tx_size: TxSize) -> TxSize { match tx_size { TX_64X64 | TX_64X32 | TX_32X64 => TX_32X32, TX_16X64 => TX_16X32, TX_64X16 => TX_32X16, _ => tx_size, } } /* Symbols for coding magnitude class of nonzero components */ const MV_CLASSES: usize = 11; // MV Class Types const MV_CLASS_0: usize = 0; /* (0, 2] integer pel */ const MV_CLASS_1: usize = 1; /* (2, 4] integer pel */ const MV_CLASS_2: usize = 2; /* (4, 8] integer pel */ const MV_CLASS_3: usize = 3; /* (8, 16] integer pel */ const MV_CLASS_4: usize = 4; /* (16, 32] integer pel */ const MV_CLASS_5: usize = 5; /* (32, 64] integer pel */ const MV_CLASS_6: usize = 6; /* (64, 128] integer pel */ const MV_CLASS_7: usize = 7; /* (128, 256] integer pel */ const MV_CLASS_8: usize = 8; /* (256, 512] integer pel */ const MV_CLASS_9: usize = 9; /* (512, 1024] integer pel */ const MV_CLASS_10: usize = 10; /* (1024,2048] integer pel */ const CLASS0_BITS: usize = 1; /* bits at integer precision for class 0 */ const CLASS0_SIZE: usize = 1 << CLASS0_BITS; const MV_OFFSET_BITS: usize = MV_CLASSES + CLASS0_BITS - 2; const MV_BITS_CONTEXTS: usize = 6; const MV_FP_SIZE: usize = 4; const MV_MAX_BITS: usize = MV_CLASSES + CLASS0_BITS + 2; const MV_MAX: usize = (1 << MV_MAX_BITS) - 1; const MV_VALS: usize = (MV_MAX << 1) + 1; const MV_IN_USE_BITS: usize = 14; pub const MV_UPP: i32 = 1 << MV_IN_USE_BITS; pub const MV_LOW: i32 = -(1 << MV_IN_USE_BITS); #[inline(always)] pub const fn av1_get_mv_joint(mv: MotionVector) -> MvJointType { match (mv.row, mv.col) { (0, 0) => MvJointType::MV_JOINT_ZERO, (0, _) => MvJointType::MV_JOINT_HNZVZ, (_, 0) => 
MvJointType::MV_JOINT_HZVNZ, (_, _) => MvJointType::MV_JOINT_HNZVNZ, } } #[inline(always)] pub fn mv_joint_vertical(joint_type: MvJointType) -> bool { joint_type == MvJointType::MV_JOINT_HZVNZ || joint_type == MvJointType::MV_JOINT_HNZVNZ } #[inline(always)] pub fn mv_joint_horizontal(joint_type: MvJointType) -> bool { joint_type == MvJointType::MV_JOINT_HNZVZ || joint_type == MvJointType::MV_JOINT_HNZVNZ } #[inline(always)] pub const fn mv_class_base(mv_class: usize) -> u32 { if mv_class != MV_CLASS_0 { (CLASS0_SIZE << (mv_class + 2)) as u32 } else { 0 } } #[inline(always)] // If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. pub fn log_in_base_2(n: u32) -> u8 { 31 - cmp::min(31, n.leading_zeros() as u8) } /// Returns `(mv_class, offset)` #[inline(always)] pub fn get_mv_class(z: u32) -> (usize, u32) { let c = if z >= CLASS0_SIZE as u32 * 4096 { MV_CLASS_10 } else { log_in_base_2(z >> 3) as usize }; let offset = z - mv_class_base(c); (c, offset) } impl<'a> ContextWriter<'a> { /// # Panics /// /// - If the `comp` is 0 /// - If the `comp` is outside the bounds of `MV_LOW` and `MV_UPP` pub fn encode_mv_component( &mut self, w: &mut W, comp: i32, axis: usize, precision: MvSubpelPrecision, ) { assert!(comp != 0); assert!((MV_LOW..=MV_UPP).contains(&comp)); let sign: u32 = u32::from(comp < 0); let mag: u32 = if sign == 1 { -comp as u32 } else { comp as u32 }; let (mv_class, offset) = get_mv_class(mag - 1); let d = offset >> 3; // int mv data let fr = (offset >> 1) & 3; // fractional mv data let hp = offset & 1; // high precision mv data // Sign { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = &mvcomp.sign_cdf; symbol_with_update!(self, w, sign, cdf); } // Class { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = &mvcomp.classes_cdf; symbol_with_update!(self, w, mv_class as u32, cdf); } // Integer bits if mv_class == MV_CLASS_0 { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = &mvcomp.class0_cdf; symbol_with_update!(self, w, d, cdf); } else { let n = mv_class + CLASS0_BITS - 1; // number of bits for i in 0..n { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = &mvcomp.bits_cdf[i]; symbol_with_update!(self, w, (d >> i) & 1, cdf); } } // Fractional bits if precision > MvSubpelPrecision::MV_SUBPEL_NONE { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = if mv_class == MV_CLASS_0 { &mvcomp.class0_fp_cdf[d as usize] } else { &mvcomp.fp_cdf }; symbol_with_update!(self, w, fr, cdf); } // High precision bit if precision > MvSubpelPrecision::MV_SUBPEL_LOW_PRECISION { let mvcomp = &self.fc.nmv_context.comps[axis]; let cdf = if mv_class == MV_CLASS_0 { &mvcomp.class0_hp_cdf } else { &mvcomp.hp_cdf }; symbol_with_update!(self, w, hp, cdf); } } } rav1e-0.7.1/src/context/partition_unit.rs000064400000000000000000000375501046102023000165520ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
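// `partition_plane_context` later in this file picks a partition CDF by
// combining, per edge, one entry of `partition_context_lookup` (stored when
// the neighbour was coded) with the current block-size level. A minimal
// self-contained sketch of that combination; the neighbour sizes chosen here
// are arbitrary examples, not encoder state:
#[cfg(test)]
mod partition_ctx_sketch {
  #[test]
  fn combine_neighbour_contexts() {
    // Mirrors BlockContext::partition_plane_context for square blocks.
    fn ctx(above_ctx: u8, left_ctx: u8, width_log2: usize) -> usize {
      const PARTITION_PLOFFSET: usize = 4;
      let bsl = width_log2 - 3; // block-size level relative to 8x8
      let above = (above_ctx >> bsl) & 1;
      let left = (left_ctx >> bsl) & 1;
      (left * 2 + above) as usize + bsl * PARTITION_PLOFFSET
    }
    // From partition_context_lookup below: a 16x16 neighbour stores
    // 0b11100 (28), a 32x32 neighbour stores 0b11000 (24).
    let above_ctx = 28u8;
    let left_ctx = 24u8;
    // Coding a 32x32 block (width_log2 = 5): only the above neighbour was
    // split below 32x32, so above = 1, left = 0, and the context lands in
    // the third group of four (bsl = 2).
    assert_eq!(ctx(above_ctx, left_ctx, 5), 9);
  }
}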
use super::cdf_context::ContextWriter; use super::*; // Generates 4 bit field in which each bit set to 1 represents // a blocksize partition 1111 means we split 64x64, 32x32, 16x16 // and 8x8. 1000 means we just split the 64x64 to 32x32 pub static partition_context_lookup: [[u8; 2]; BlockSize::BLOCK_SIZES_ALL] = [ [31, 31], // 4X4 - {0b11111, 0b11111} [31, 30], // 4X8 - {0b11111, 0b11110} [30, 31], // 8X4 - {0b11110, 0b11111} [30, 30], // 8X8 - {0b11110, 0b11110} [30, 28], // 8X16 - {0b11110, 0b11100} [28, 30], // 16X8 - {0b11100, 0b11110} [28, 28], // 16X16 - {0b11100, 0b11100} [28, 24], // 16X32 - {0b11100, 0b11000} [24, 28], // 32X16 - {0b11000, 0b11100} [24, 24], // 32X32 - {0b11000, 0b11000} [24, 16], // 32X64 - {0b11000, 0b10000} [16, 24], // 64X32 - {0b10000, 0b11000} [16, 16], // 64X64 - {0b10000, 0b10000} [16, 0], // 64X128- {0b10000, 0b00000} [0, 16], // 128X64- {0b00000, 0b10000} [0, 0], // 128X128-{0b00000, 0b00000} [31, 28], // 4X16 - {0b11111, 0b11100} [28, 31], // 16X4 - {0b11100, 0b11111} [30, 24], // 8X32 - {0b11110, 0b11000} [24, 30], // 32X8 - {0b11000, 0b11110} [28, 16], // 16X64 - {0b11100, 0b10000} [16, 28], // 64X16 - {0b10000, 0b11100} ]; pub const CFL_JOINT_SIGNS: usize = 8; pub const CFL_ALPHA_CONTEXTS: usize = 6; pub const CFL_ALPHABET_SIZE: usize = 16; pub const PARTITION_PLOFFSET: usize = 4; pub const PARTITION_BLOCK_SIZES: usize = 4 + 1; const PARTITION_CONTEXTS_PRIMARY: usize = PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET; pub const PARTITION_CONTEXTS: usize = PARTITION_CONTEXTS_PRIMARY; pub const PARTITION_TYPES: usize = 4; pub const EXT_PARTITION_TYPES: usize = 10; pub const SKIP_CONTEXTS: usize = 3; pub const SKIP_MODE_CONTEXTS: usize = 3; // partition contexts are at 8x8 granularity, as it is not possible to // split 4x4 blocks any further than that pub const PARTITION_CONTEXT_GRANULARITY: usize = 8; pub const PARTITION_CONTEXT_MAX_WIDTH: usize = MAX_TILE_WIDTH / PARTITION_CONTEXT_GRANULARITY; #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum CFLSign { CFL_SIGN_ZERO = 0, CFL_SIGN_NEG = 1, CFL_SIGN_POS = 2, } impl CFLSign { pub const fn from_alpha(a: i16) -> CFLSign { [CFL_SIGN_NEG, CFL_SIGN_ZERO, CFL_SIGN_POS][(a.signum() + 1) as usize] } } use crate::context::CFLSign::*; const CFL_SIGNS: usize = 3; static cfl_sign_value: [i16; CFL_SIGNS] = [0, -1, 1]; #[derive(Copy, Clone, Debug)] pub struct CFLParams { pub sign: [CFLSign; 2], pub scale: [u8; 2], } impl Default for CFLParams { #[inline] fn default() -> Self { Self { sign: [CFL_SIGN_NEG, CFL_SIGN_ZERO], scale: [1, 0] } } } impl CFLParams { /// # Panics /// /// - If either current sign is zero #[inline] pub fn joint_sign(self) -> u32 { assert!(self.sign[0] != CFL_SIGN_ZERO || self.sign[1] != CFL_SIGN_ZERO); (self.sign[0] as u32) * (CFL_SIGNS as u32) + (self.sign[1] as u32) - 1 } /// # Panics /// /// - If the sign at index `uv` is zero #[inline] pub fn context(self, uv: usize) -> usize { assert!(self.sign[uv] != CFL_SIGN_ZERO); (self.sign[uv] as usize - 1) * CFL_SIGNS + (self.sign[1 - uv] as usize) } /// # Panics /// /// - If the sign at index `uv` is zero #[inline] pub fn index(self, uv: usize) -> u32 { assert!(self.sign[uv] != CFL_SIGN_ZERO && self.scale[uv] != 0); (self.scale[uv] - 1) as u32 } #[inline] pub fn alpha(self, uv: usize) -> i16 { cfl_sign_value[self.sign[uv] as usize] * (self.scale[uv] as i16) } #[inline] pub const fn from_alpha(u: i16, v: i16) -> CFLParams { CFLParams { sign: [CFLSign::from_alpha(u), CFLSign::from_alpha(v)], scale: [u.unsigned_abs() as u8, v.unsigned_abs() as u8], } } } 
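// A minimal sketch of the CFL signalling above: the joint sign packs the two
// per-plane signs in base 3 (minus 1, since the both-zero pair is never
// coded), the coded magnitude index drops the implicit +1, and the
// reconstructed alpha is sign * scale. The alpha values used here
// (u = -2, v = +3) are arbitrary examples, not encoder output.
#[cfg(test)]
mod cfl_sketch {
  use super::{CFLParams, CFLSign};

  #[test]
  fn joint_sign_and_alpha_round_trip() {
    let cfl = CFLParams::from_alpha(-2, 3);
    assert_eq!(cfl.sign, [CFLSign::CFL_SIGN_NEG, CFLSign::CFL_SIGN_POS]);
    // joint_sign = sign_u * 3 + sign_v - 1 = 1 * 3 + 2 - 1
    assert_eq!(cfl.joint_sign(), 4);
    // The coded magnitude index is scale - 1.
    assert_eq!(cfl.index(0), 1);
    assert_eq!(cfl.index(1), 2);
    // alpha() recombines sign and scale.
    assert_eq!(cfl.alpha(0), -2);
    assert_eq!(cfl.alpha(1), 3);
  }
}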
#[cfg(test)] mod test { #[test] fn cdf_map() { use super::*; let cdf = CDFContext::new(8); let cdf_map = FieldMap { map: cdf.build_map() }; let f = &cdf.partition_cdf[2]; cdf_map.lookup(f.as_ptr() as usize); } use super::CFLSign; use super::CFLSign::*; static cfl_alpha_signs: [[CFLSign; 2]; 8] = [ [CFL_SIGN_ZERO, CFL_SIGN_NEG], [CFL_SIGN_ZERO, CFL_SIGN_POS], [CFL_SIGN_NEG, CFL_SIGN_ZERO], [CFL_SIGN_NEG, CFL_SIGN_NEG], [CFL_SIGN_NEG, CFL_SIGN_POS], [CFL_SIGN_POS, CFL_SIGN_ZERO], [CFL_SIGN_POS, CFL_SIGN_NEG], [CFL_SIGN_POS, CFL_SIGN_POS], ]; static cfl_context: [[usize; 8]; 2] = [[0, 0, 0, 1, 2, 3, 4, 5], [0, 3, 0, 1, 4, 0, 2, 5]]; #[test] fn cfl_joint_sign() { use super::*; let mut cfl = CFLParams::default(); for (joint_sign, &signs) in cfl_alpha_signs.iter().enumerate() { cfl.sign = signs; assert!(cfl.joint_sign() as usize == joint_sign); for uv in 0..2 { if signs[uv] != CFL_SIGN_ZERO { assert!(cfl.context(uv) == cfl_context[uv][joint_sign]); } } } } } impl<'a> ContextWriter<'a> { fn partition_gather_horz_alike( out: &mut [u16; 2], cdf_in: &[u16], _bsize: BlockSize, ) { out[0] = 32768; out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_HORZ as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_SPLIT as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_HORZ_A as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_HORZ_B as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_VERT_A as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_HORZ_4 as usize, ); out[0] = 32768 - out[0]; out[1] = 0; } fn partition_gather_vert_alike( out: &mut [u16; 2], cdf_in: &[u16], _bsize: BlockSize, ) { out[0] = 32768; out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_VERT as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_SPLIT as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_HORZ_A as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_VERT_A as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_VERT_B as usize, ); out[0] -= ContextWriter::cdf_element_prob( cdf_in, PartitionType::PARTITION_VERT_4 as usize, ); out[0] = 32768 - out[0]; out[1] = 0; } #[inline] pub fn write_skip( &mut self, w: &mut W, bo: TileBlockOffset, skip: bool, ) { let ctx = self.bc.skip_context(bo); let cdf = &self.fc.skip_cdfs[ctx]; symbol_with_update!(self, w, skip as u32, cdf); } pub fn get_segment_pred( &self, bo: TileBlockOffset, last_active_segid: u8, ) -> (u8, u8) { let mut prev_ul = -1; let mut prev_u = -1; let mut prev_l = -1; if bo.0.x > 0 && bo.0.y > 0 { prev_ul = self.bc.blocks.above_left_of(bo).segmentation_idx as i8; } if bo.0.y > 0 { prev_u = self.bc.blocks.above_of(bo).segmentation_idx as i8; } if bo.0.x > 0 { prev_l = self.bc.blocks.left_of(bo).segmentation_idx as i8; } /* Pick CDF index based on number of matching/out-of-bounds segment IDs. */ let cdf_index: u8; if prev_ul < 0 || prev_u < 0 || prev_l < 0 { /* Edge case */ cdf_index = 0; } else if (prev_ul == prev_u) && (prev_ul == prev_l) { cdf_index = 2; } else if (prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l) { cdf_index = 1; } else { cdf_index = 0; } /* If 2 or more are identical returns that as predictor, otherwise prev_l. 
*/ let r: i8; if prev_u == -1 { /* edge case */ r = if prev_l == -1 { 0 } else { prev_l }; } else if prev_l == -1 { /* edge case */ r = prev_u; } else { r = if prev_ul == prev_u { prev_u } else { prev_l }; } ((r as u8).min(last_active_segid), cdf_index) } pub fn write_cfl_alphas(&mut self, w: &mut W, cfl: CFLParams) { symbol_with_update!(self, w, cfl.joint_sign(), &self.fc.cfl_sign_cdf); for uv in 0..2 { if cfl.sign[uv] != CFL_SIGN_ZERO { symbol_with_update!( self, w, cfl.index(uv), &self.fc.cfl_alpha_cdf[cfl.context(uv)] ); } } } /// # Panics /// /// - If called with an 8x8 or larger `bsize` /// - If called with a `PartitionType` incompatible with the current block. pub fn write_partition( &mut self, w: &mut impl Writer, bo: TileBlockOffset, p: PartitionType, bsize: BlockSize, ) { debug_assert!(bsize.is_sqr()); assert!(bsize >= BlockSize::BLOCK_8X8); let hbs = bsize.width_mi() / 2; let has_cols = (bo.0.x + hbs) < self.bc.blocks.cols(); let has_rows = (bo.0.y + hbs) < self.bc.blocks.rows(); let ctx = self.bc.partition_plane_context(bo, bsize); assert!(ctx < PARTITION_CONTEXTS); if !has_rows && !has_cols { return; } if has_rows && has_cols { if ctx < PARTITION_TYPES { let cdf = &self.fc.partition_w8_cdf[ctx]; symbol_with_update!(self, w, p as u32, cdf); } else if ctx < 4 * PARTITION_TYPES { let cdf = &self.fc.partition_cdf[ctx - PARTITION_TYPES]; symbol_with_update!(self, w, p as u32, cdf); } else { let cdf = &self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; symbol_with_update!(self, w, p as u32, cdf); } } else if !has_rows && has_cols { assert!( p == PartitionType::PARTITION_SPLIT || p == PartitionType::PARTITION_HORZ ); assert!(bsize > BlockSize::BLOCK_8X8); let mut cdf = [0u16; 2]; if ctx < PARTITION_TYPES { let partition_cdf = &self.fc.partition_w8_cdf[ctx]; ContextWriter::partition_gather_vert_alike( &mut cdf, partition_cdf, bsize, ); } else if ctx < 4 * PARTITION_TYPES { let partition_cdf = &self.fc.partition_cdf[ctx - PARTITION_TYPES]; ContextWriter::partition_gather_vert_alike( &mut cdf, partition_cdf, bsize, ); } else { let partition_cdf = &self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; ContextWriter::partition_gather_vert_alike( &mut cdf, partition_cdf, bsize, ); } w.symbol((p == PartitionType::PARTITION_SPLIT) as u32, &cdf); } else { assert!( p == PartitionType::PARTITION_SPLIT || p == PartitionType::PARTITION_VERT ); assert!(bsize > BlockSize::BLOCK_8X8); let mut cdf = [0u16; 2]; if ctx < PARTITION_TYPES { let partition_cdf = &self.fc.partition_w8_cdf[ctx]; ContextWriter::partition_gather_horz_alike( &mut cdf, partition_cdf, bsize, ); } else if ctx < 4 * PARTITION_TYPES { let partition_cdf = &self.fc.partition_cdf[ctx - PARTITION_TYPES]; ContextWriter::partition_gather_horz_alike( &mut cdf, partition_cdf, bsize, ); } else { let partition_cdf = &self.fc.partition_w128_cdf[ctx - 4 * PARTITION_TYPES]; ContextWriter::partition_gather_horz_alike( &mut cdf, partition_cdf, bsize, ); } w.symbol((p == PartitionType::PARTITION_SPLIT) as u32, &cdf); } } fn neg_interleave(x: i32, r: i32, max: i32) -> i32 { assert!(x < max); if r == 0 { return x; } else if r >= (max - 1) { return -x + max - 1; } let diff = x - r; if 2 * r < max { if diff.abs() <= r { if diff > 0 { return (diff << 1) - 1; } else { return (-diff) << 1; } } x } else { if diff.abs() < (max - r) { if diff > 0 { return (diff << 1) - 1; } else { return (-diff) << 1; } } (max - x) - 1 } } pub fn write_segmentation( &mut self, w: &mut W, bo: TileBlockOffset, bsize: BlockSize, skip: bool, last_active_segid: u8, ) { let 
(pred, cdf_index) = self.get_segment_pred(bo, last_active_segid); if skip { self.bc.blocks.set_segmentation_idx(bo, bsize, pred); return; } let seg_idx = self.bc.blocks[bo].segmentation_idx; let coded_id = Self::neg_interleave( seg_idx as i32, pred as i32, (last_active_segid + 1) as i32, ); symbol_with_update!( self, w, coded_id as u32, &self.fc.spatial_segmentation_cdfs[cdf_index as usize] ); } } impl<'a> BlockContext<'a> { /// # Panics /// /// - If called with a non-square `bsize` pub fn partition_plane_context( &self, bo: TileBlockOffset, bsize: BlockSize, ) -> usize { // TODO: this should be way simpler without sub8x8 let above_ctx = self.above_partition_context[bo.0.x >> 1]; let left_ctx = self.left_partition_context[bo.y_in_sb() >> 1]; let bsl = bsize.width_log2() - BLOCK_8X8.width_log2(); let above = (above_ctx >> bsl) & 1; let left = (left_ctx >> bsl) & 1; assert!(bsize.is_sqr()); (left * 2 + above) as usize + bsl * PARTITION_PLOFFSET } /// # Panics /// /// - If the block size is invalid for subsampling pub fn reset_skip_context( &mut self, bo: TileBlockOffset, bsize: BlockSize, xdec: usize, ydec: usize, cs: ChromaSampling, ) { let num_planes = if cs == ChromaSampling::Cs400 { 1 } else { 3 }; let nplanes = if bsize >= BLOCK_8X8 { num_planes } else { 1 + (num_planes - 1) * has_chroma(bo, bsize, xdec, ydec, cs) as usize }; for plane in 0..nplanes { let xdec2 = if plane == 0 { 0 } else { xdec }; let ydec2 = if plane == 0 { 0 } else { ydec }; let plane_bsize = if plane == 0 { bsize } else { bsize.subsampled_size(xdec2, ydec2).unwrap() }; let bw = plane_bsize.width_mi(); let bh = plane_bsize.height_mi(); for above in &mut self.above_coeff_context[plane][(bo.0.x >> xdec2)..][..bw] { *above = 0; } let bo_y = bo.y_in_sb(); for left in &mut self.left_coeff_context[plane][(bo_y >> ydec2)..][..bh] { *left = 0; } } } pub fn skip_context(&self, bo: TileBlockOffset) -> usize { let above_skip = bo.0.y > 0 && self.blocks.above_of(bo).skip; let left_skip = bo.0.x > 0 && self.blocks.left_of(bo).skip; above_skip as usize + left_skip as usize } /// # Panics /// /// - If called with a non-square `bsize` pub fn update_partition_context( &mut self, bo: TileBlockOffset, subsize: BlockSize, bsize: BlockSize, ) { assert!(bsize.is_sqr()); let bw = bsize.width_mi(); let bh = bsize.height_mi(); let above_ctx = &mut self.above_partition_context[bo.0.x >> 1..(bo.0.x + bw) >> 1]; let left_ctx = &mut self.left_partition_context [bo.y_in_sb() >> 1..(bo.y_in_sb() + bh) >> 1]; // update the partition context at the end notes. set partition bits // of block sizes larger than the current one to be one, and partition // bits of smaller block sizes to be zero. for above in &mut above_ctx[..bw >> 1] { *above = partition_context_lookup[subsize as usize][0]; } for left in &mut left_ctx[..bh >> 1] { *left = partition_context_lookup[subsize as usize][1]; } } } rav1e-0.7.1/src/context/superblock_unit.rs000064400000000000000000000101051046102023000166750ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
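// The offsets in this file are pure shifts: with the default 64x64 superblock
// and 4x4 mode-info units, a superblock spans 1 << 4 blocks and 1 << 6 luma
// pixels per side, and chroma subsampling just reduces the pixel shift.
// A standalone sketch of that arithmetic, mirroring SUPERBLOCK_TO_BLOCK_SHIFT
// and SUPERBLOCK_TO_PLANE_SHIFT below; the superblock coordinates are
// arbitrary examples (the real helpers operate on the *Offset types):
#[cfg(test)]
mod sb_offset_sketch {
  const SB_SIZE_LOG2: usize = 6; // 64x64 superblocks
  const MI_SIZE_LOG2: usize = 2; // 4x4 mode-info blocks
  const SB_TO_BLOCK_SHIFT: usize = SB_SIZE_LOG2 - MI_SIZE_LOG2; // 4

  #[test]
  fn superblock_to_block_and_plane() {
    let (sb_x, sb_y) = (2usize, 1usize);
    // Top-left 4x4 block of superblock (2, 1) in block units.
    assert_eq!((sb_x << SB_TO_BLOCK_SHIFT, sb_y << SB_TO_BLOCK_SHIFT), (32, 16));
    // The same superblock in luma pixels (no subsampling)…
    assert_eq!((sb_x << SB_SIZE_LOG2, sb_y << SB_SIZE_LOG2), (128, 64));
    // …and in 4:2:0 chroma pixels (xdec = ydec = 1 halves each axis).
    let (xdec, ydec) = (1usize, 1usize);
    assert_eq!(
      (sb_x << (SB_SIZE_LOG2 - xdec), sb_y << (SB_SIZE_LOG2 - ydec)),
      (64, 32)
    );
  }
}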
use super::*; pub const MAX_SB_SIZE_LOG2: usize = 7; const SB_SIZE_LOG2: usize = 6; pub const SB_SIZE: usize = 1 << SB_SIZE_LOG2; const SB_SQUARE: usize = SB_SIZE * SB_SIZE; pub const MI_SIZE_LOG2: usize = 2; pub const MI_SIZE: usize = 1 << MI_SIZE_LOG2; pub const MAX_MIB_SIZE_LOG2: usize = MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2; pub const MIB_SIZE_LOG2: usize = SB_SIZE_LOG2 - MI_SIZE_LOG2; pub const MIB_SIZE: usize = 1 << MIB_SIZE_LOG2; pub const MIB_MASK: usize = MIB_SIZE - 1; pub const SUPERBLOCK_TO_PLANE_SHIFT: usize = SB_SIZE_LOG2; pub const SUPERBLOCK_TO_BLOCK_SHIFT: usize = MIB_SIZE_LOG2; pub const BLOCK_TO_PLANE_SHIFT: usize = MI_SIZE_LOG2; pub const IMPORTANCE_BLOCK_TO_BLOCK_SHIFT: usize = 1; pub const LOCAL_BLOCK_MASK: usize = (1 << SUPERBLOCK_TO_BLOCK_SHIFT) - 1; pub const MAX_SB_IN_IMP_B: usize = 1 << (MAX_SB_SIZE_LOG2 - IMPORTANCE_BLOCK_TO_BLOCK_SHIFT - BLOCK_TO_PLANE_SHIFT); /// Absolute offset in superblocks, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct SuperBlockOffset { pub x: usize, pub y: usize, } /// Absolute offset in superblocks inside a plane, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct PlaneSuperBlockOffset(pub SuperBlockOffset); /// Absolute offset in superblocks inside a tile, where a superblock is defined /// to be an `N*N` square where `N == (1 << SUPERBLOCK_TO_PLANE_SHIFT)`. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct TileSuperBlockOffset(pub SuperBlockOffset); impl SuperBlockOffset { /// Offset of a block inside the current superblock. #[inline] const fn block_offset(self, block_x: usize, block_y: usize) -> BlockOffset { BlockOffset { x: (self.x << SUPERBLOCK_TO_BLOCK_SHIFT) + block_x, y: (self.y << SUPERBLOCK_TO_BLOCK_SHIFT) + block_y, } } /// Offset of the top-left pixel of this block. #[inline] const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { PlaneOffset { x: (self.x as isize) << (SUPERBLOCK_TO_PLANE_SHIFT - plane.xdec), y: (self.y as isize) << (SUPERBLOCK_TO_PLANE_SHIFT - plane.ydec), } } } impl Add for SuperBlockOffset { type Output = Self; #[inline] fn add(self, rhs: Self) -> Self::Output { Self { x: self.x + rhs.x, y: self.y + rhs.y } } } impl PlaneSuperBlockOffset { /// Offset of a block inside the current superblock. #[inline] pub const fn block_offset( self, block_x: usize, block_y: usize, ) -> PlaneBlockOffset { PlaneBlockOffset(self.0.block_offset(block_x, block_y)) } /// Offset of the top-left pixel of this block. #[inline] pub const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { self.0.plane_offset(plane) } } impl Add for PlaneSuperBlockOffset { type Output = Self; #[inline] fn add(self, rhs: Self) -> Self::Output { PlaneSuperBlockOffset(self.0 + rhs.0) } } impl TileSuperBlockOffset { /// Offset of a block inside the current superblock. #[inline] pub const fn block_offset( self, block_x: usize, block_y: usize, ) -> TileBlockOffset { TileBlockOffset(self.0.block_offset(block_x, block_y)) } /// Offset of the top-left pixel of this block. 
#[inline] pub const fn plane_offset(self, plane: &PlaneConfig) -> PlaneOffset { self.0.plane_offset(plane) } } impl Add for TileSuperBlockOffset { type Output = Self; #[inline] fn add(self, rhs: Self) -> Self::Output { TileSuperBlockOffset(self.0 + rhs.0) } } rav1e-0.7.1/src/context/transform_unit.rs000064400000000000000000000671121046102023000165510ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::*; use crate::predict::PredictionMode; use crate::predict::PredictionMode::*; use crate::transform::TxType::*; use std::mem::MaybeUninit; pub const MAX_TX_SIZE: usize = 64; pub const MAX_CODED_TX_SIZE: usize = 32; pub const MAX_CODED_TX_SQUARE: usize = MAX_CODED_TX_SIZE * MAX_CODED_TX_SIZE; pub const TX_SIZE_SQR_CONTEXTS: usize = 4; // Coded tx_size <= 32x32, so is the # of CDF contexts from tx sizes pub const TX_SETS: usize = 6; pub const TX_SETS_INTRA: usize = 3; pub const TX_SETS_INTER: usize = 4; pub const INTRA_MODES: usize = 13; pub const UV_INTRA_MODES: usize = 14; const MAX_VARTX_DEPTH: usize = 2; pub const TXFM_PARTITION_CONTEXTS: usize = (TxSize::TX_SIZES - TxSize::TX_8X8 as usize) * 6 - 3; // Number of transform types in each set type pub static num_tx_set: [usize; TX_SETS] = [1, 2, 5, 7, 12, 16]; pub static av1_tx_used: [[usize; TX_TYPES]; TX_SETS] = [ [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], ]; // Maps set types above to the indices used for intra static tx_set_index_intra: [i8; TX_SETS] = [0, -1, 2, 1, -1, -1]; // Maps set types above to the indices used for inter static tx_set_index_inter: [i8; TX_SETS] = [0, 3, -1, -1, 2, 1]; pub static av1_tx_ind: [[usize; TX_TYPES]; TX_SETS] = [ [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0], [3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0], [7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6], ]; pub static max_txsize_rect_lookup: [TxSize; BlockSize::BLOCK_SIZES_ALL] = [ TX_4X4, // 4x4 TX_4X8, // 4x8 TX_8X4, // 8x4 TX_8X8, // 8x8 TX_8X16, // 8x16 TX_16X8, // 16x8 TX_16X16, // 16x16 TX_16X32, // 16x32 TX_32X16, // 32x16 TX_32X32, // 32x32 TX_32X64, // 32x64 TX_64X32, // 64x32 TX_64X64, // 64x64 TX_64X64, // 64x128 TX_64X64, // 128x64 TX_64X64, // 128x128 TX_4X16, // 4x16 TX_16X4, // 16x4 TX_8X32, // 8x32 TX_32X8, // 32x8 TX_16X64, // 16x64 TX_64X16, // 64x16 ]; pub static sub_tx_size_map: [TxSize; TxSize::TX_SIZES_ALL] = [ TX_4X4, // TX_4X4 TX_4X4, // TX_8X8 TX_8X8, // TX_16X16 TX_16X16, // TX_32X32 TX_32X32, // TX_64X64 TX_4X4, // TX_4X8 TX_4X4, // TX_8X4 TX_8X8, // TX_8X16 TX_8X8, // TX_16X8 TX_16X16, // TX_16X32 TX_16X16, // TX_32X16 TX_32X32, // TX_32X64 TX_32X32, // TX_64X32 TX_4X8, // TX_4X16 TX_8X4, // TX_16X4 TX_8X16, // TX_8X32 
TX_16X8, // TX_32X8 TX_16X32, // TX_16X64 TX_32X16, // TX_64X16 ]; #[inline] pub fn has_chroma( bo: TileBlockOffset, bsize: BlockSize, subsampling_x: usize, subsampling_y: usize, chroma_sampling: ChromaSampling, ) -> bool { if chroma_sampling == ChromaSampling::Cs400 { return false; }; let bw = bsize.width_mi(); let bh = bsize.height_mi(); ((bo.0.x & 0x01) == 1 || (bw & 0x01) == 0 || subsampling_x == 0) && ((bo.0.y & 0x01) == 1 || (bh & 0x01) == 0 || subsampling_y == 0) } pub fn get_tx_set( tx_size: TxSize, is_inter: bool, use_reduced_set: bool, ) -> TxSet { let tx_size_sqr_up = tx_size.sqr_up(); let tx_size_sqr = tx_size.sqr(); if tx_size_sqr_up.block_size() > BlockSize::BLOCK_32X32 { return TxSet::TX_SET_DCTONLY; } if is_inter { if use_reduced_set || tx_size_sqr_up == TxSize::TX_32X32 { TxSet::TX_SET_INTER_3 } else if tx_size_sqr == TxSize::TX_16X16 { TxSet::TX_SET_INTER_2 } else { TxSet::TX_SET_INTER_1 } } else if tx_size_sqr_up == TxSize::TX_32X32 { TxSet::TX_SET_DCTONLY } else if use_reduced_set || tx_size_sqr == TxSize::TX_16X16 { TxSet::TX_SET_INTRA_2 } else { TxSet::TX_SET_INTRA_1 } } pub fn get_tx_set_index( tx_size: TxSize, is_inter: bool, use_reduced_set: bool, ) -> i8 { let set_type = get_tx_set(tx_size, is_inter, use_reduced_set); if is_inter { tx_set_index_inter[set_type as usize] } else { tx_set_index_intra[set_type as usize] } } static intra_mode_to_tx_type_context: [TxType; INTRA_MODES] = [ DCT_DCT, // DC ADST_DCT, // V DCT_ADST, // H DCT_DCT, // D45 ADST_ADST, // D135 ADST_DCT, // D113 DCT_ADST, // D157 DCT_ADST, // D203 ADST_DCT, // D67 ADST_ADST, // SMOOTH ADST_DCT, // SMOOTH_V DCT_ADST, // SMOOTH_H ADST_ADST, // PAETH ]; static uv2y: [PredictionMode; UV_INTRA_MODES] = [ DC_PRED, // UV_DC_PRED V_PRED, // UV_V_PRED H_PRED, // UV_H_PRED D45_PRED, // UV_D45_PRED D135_PRED, // UV_D135_PRED D113_PRED, // UV_D113_PRED D157_PRED, // UV_D157_PRED D203_PRED, // UV_D203_PRED D67_PRED, // UV_D67_PRED SMOOTH_PRED, // UV_SMOOTH_PRED SMOOTH_V_PRED, // UV_SMOOTH_V_PRED SMOOTH_H_PRED, // UV_SMOOTH_H_PRED PAETH_PRED, // UV_PAETH_PRED DC_PRED, // CFL_PRED ]; pub fn uv_intra_mode_to_tx_type_context(pred: PredictionMode) -> TxType { intra_mode_to_tx_type_context[uv2y[pred as usize] as usize] } // Level Map pub const TXB_SKIP_CONTEXTS: usize = 13; pub const EOB_COEF_CONTEXTS: usize = 9; const SIG_COEF_CONTEXTS_2D: usize = 26; const SIG_COEF_CONTEXTS_1D: usize = 16; pub const SIG_COEF_CONTEXTS_EOB: usize = 4; pub const SIG_COEF_CONTEXTS: usize = SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D; const COEFF_BASE_CONTEXTS: usize = SIG_COEF_CONTEXTS; pub const DC_SIGN_CONTEXTS: usize = 3; const BR_TMP_OFFSET: usize = 12; const BR_REF_CAT: usize = 4; pub const LEVEL_CONTEXTS: usize = 21; pub const NUM_BASE_LEVELS: usize = 2; pub const BR_CDF_SIZE: usize = 4; pub const COEFF_BASE_RANGE: usize = 4 * (BR_CDF_SIZE - 1); pub const COEFF_CONTEXT_BITS: usize = 6; pub const COEFF_CONTEXT_MASK: usize = (1 << COEFF_CONTEXT_BITS) - 1; const MAX_BASE_BR_RANGE: usize = COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1; const BASE_CONTEXT_POSITION_NUM: usize = 12; // Pad 4 extra columns to remove horizontal availability check. pub const TX_PAD_HOR_LOG2: usize = 2; pub const TX_PAD_HOR: usize = 4; // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. pub const TX_PAD_TOP: usize = 2; pub const TX_PAD_BOTTOM: usize = 4; pub const TX_PAD_VER: usize = TX_PAD_TOP + TX_PAD_BOTTOM; // Pad 16 extra bytes to avoid reading overflow in SIMD optimization. 
const TX_PAD_END: usize = 16; pub const TX_PAD_2D: usize = (MAX_CODED_TX_SIZE + TX_PAD_HOR) * (MAX_CODED_TX_SIZE + TX_PAD_VER) + TX_PAD_END; const TX_CLASSES: usize = 3; #[derive(Copy, Clone, PartialEq, Eq)] pub enum TxClass { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, TX_CLASS_VERT = 2, } #[derive(Copy, Clone, PartialEq, Eq)] pub enum SegLvl { SEG_LVL_ALT_Q = 0, /* Use alternate Quantizer .... */ SEG_LVL_ALT_LF_Y_V = 1, /* Use alternate loop filter value on y plane vertical */ SEG_LVL_ALT_LF_Y_H = 2, /* Use alternate loop filter value on y plane horizontal */ SEG_LVL_ALT_LF_U = 3, /* Use alternate loop filter value on u plane */ SEG_LVL_ALT_LF_V = 4, /* Use alternate loop filter value on v plane */ SEG_LVL_REF_FRAME = 5, /* Optional Segment reference frame */ SEG_LVL_SKIP = 6, /* Optional Segment (0,0) + skip mode */ SEG_LVL_GLOBALMV = 7, SEG_LVL_MAX = 8, } pub const seg_feature_bits: [u32; SegLvl::SEG_LVL_MAX as usize] = [8, 6, 6, 6, 6, 3, 0, 0]; pub const seg_feature_is_signed: [bool; SegLvl::SEG_LVL_MAX as usize] = [true, true, true, true, true, false, false, false]; use crate::context::TxClass::*; pub static tx_type_to_class: [TxClass; TX_TYPES] = [ TX_CLASS_2D, // DCT_DCT TX_CLASS_2D, // ADST_DCT TX_CLASS_2D, // DCT_ADST TX_CLASS_2D, // ADST_ADST TX_CLASS_2D, // FLIPADST_DCT TX_CLASS_2D, // DCT_FLIPADST TX_CLASS_2D, // FLIPADST_FLIPADST TX_CLASS_2D, // ADST_FLIPADST TX_CLASS_2D, // FLIPADST_ADST TX_CLASS_2D, // IDTX TX_CLASS_VERT, // V_DCT TX_CLASS_HORIZ, // H_DCT TX_CLASS_VERT, // V_ADST TX_CLASS_HORIZ, // H_ADST TX_CLASS_VERT, // V_FLIPADST TX_CLASS_HORIZ, // H_FLIPADST ]; pub static eob_to_pos_small: [u8; 33] = [ 0, 1, 2, // 0-2 3, 3, // 3-4 4, 4, 4, 4, // 5-8 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, // 17-32 ]; pub static eob_to_pos_large: [u8; 17] = [ 6, // place holder 7, // 33-64 8, 8, // 65-128 9, 9, 9, 9, // 129-256 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 11, // 513- ]; pub static k_eob_group_start: [u16; 12] = [0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513]; pub static k_eob_offset_bits: [u16; 12] = [0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; // The ctx offset table when TX is TX_CLASS_2D. 
// TX col and row indices are clamped to 4 #[rustfmt::skip] pub static av1_nz_map_ctx_offset: [[[i8; 5]; 5]; TxSize::TX_SIZES_ALL] = [ // TX_4X4 [ [ 0, 1, 6, 6, 0], [ 1, 6, 6, 21, 0], [ 6, 6, 21, 21, 0], [ 6, 21, 21, 21, 0], [ 0, 0, 0, 0, 0] ], // TX_8X8 [ [ 0, 1, 6, 6, 21], [ 1, 6, 6, 21, 21], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_16X16 [ [ 0, 1, 6, 6, 21], [ 1, 6, 6, 21, 21], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_32X32 [ [ 0, 1, 6, 6, 21], [ 1, 6, 6, 21, 21], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_64X64 [ [ 0, 1, 6, 6, 21], [ 1, 6, 6, 21, 21], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_4X8 [ [ 0, 11, 11, 11, 0], [11, 11, 11, 11, 0], [ 6, 6, 21, 21, 0], [ 6, 21, 21, 21, 0], [21, 21, 21, 21, 0] ], // TX_8X4 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [ 0, 0, 0, 0, 0] ], // TX_8X16 [ [ 0, 11, 11, 11, 11], [11, 11, 11, 11, 11], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_16X8 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21] ], // TX_16X32 [ [ 0, 11, 11, 11, 11], [11, 11, 11, 11, 11], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_32X16 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21] ], // TX_32X64 [ [ 0, 11, 11, 11, 11], [11, 11, 11, 11, 11], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_64X32 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21] ], // TX_4X16 [ [ 0, 11, 11, 11, 0], [11, 11, 11, 11, 0], [ 6, 6, 21, 21, 0], [ 6, 21, 21, 21, 0], [21, 21, 21, 21, 0] ], // TX_16X4 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [ 0, 0, 0, 0, 0] ], // TX_8X32 [ [ 0, 11, 11, 11, 11], [11, 11, 11, 11, 11], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_32X8 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21] ], // TX_16X64 [ [ 0, 11, 11, 11, 11], [11, 11, 11, 11, 11], [ 6, 6, 21, 21, 21], [ 6, 21, 21, 21, 21], [21, 21, 21, 21, 21] ], // TX_64X16 [ [ 0, 16, 6, 6, 21], [16, 16, 6, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21], [16, 16, 21, 21, 21] ] ]; const NZ_MAP_CTX_0: usize = SIG_COEF_CONTEXTS_2D; const NZ_MAP_CTX_5: usize = NZ_MAP_CTX_0 + 5; const NZ_MAP_CTX_10: usize = NZ_MAP_CTX_0 + 10; pub static nz_map_ctx_offset_1d: [usize; 32] = [ NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, ]; const CONTEXT_MAG_POSITION_NUM: usize = 3; static mag_ref_offset_with_txclass: [[[usize; 2]; CONTEXT_MAG_POSITION_NUM]; 3] = [ [[0, 1], [1, 0], [1, 1]], [[0, 1], [1, 0], [0, 2]], [[0, 1], [1, 0], [2, 0]], ]; // End of Level Map pub struct TXB_CTX { pub txb_skip_ctx: usize, pub dc_sign_ctx: usize, } impl<'a> ContextWriter<'a> { /// # Panics /// /// - If an invalid combination of `tx_type` and `tx_size` is passed pub fn write_tx_type( &mut self, 
w: &mut W, tx_size: TxSize, tx_type: TxType, y_mode: PredictionMode, is_inter: bool, use_reduced_tx_set: bool, ) { let square_tx_size = tx_size.sqr(); let tx_set = get_tx_set(tx_size, is_inter, use_reduced_tx_set); let num_tx_types = num_tx_set[tx_set as usize]; if num_tx_types > 1 { let tx_set_index = get_tx_set_index(tx_size, is_inter, use_reduced_tx_set); assert!(tx_set_index > 0); assert!(av1_tx_used[tx_set as usize][tx_type as usize] != 0); if is_inter { let s = av1_tx_ind[tx_set as usize][tx_type as usize] as u32; if tx_set_index == 1 { let cdf = &self.fc.inter_tx_1_cdf[square_tx_size as usize]; symbol_with_update!(self, w, s, cdf); } else if tx_set_index == 2 { let cdf = &self.fc.inter_tx_2_cdf[square_tx_size as usize]; symbol_with_update!(self, w, s, cdf); } else { let cdf = &self.fc.inter_tx_3_cdf[square_tx_size as usize]; symbol_with_update!(self, w, s, cdf); } } else { let intra_dir = y_mode; // TODO: Once use_filter_intra is enabled, // intra_dir = // fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; let s = av1_tx_ind[tx_set as usize][tx_type as usize] as u32; if tx_set_index == 1 { let cdf = &self.fc.intra_tx_1_cdf[square_tx_size as usize] [intra_dir as usize]; symbol_with_update!(self, w, s, cdf); } else { let cdf = &self.fc.intra_tx_2_cdf[square_tx_size as usize] [intra_dir as usize]; symbol_with_update!(self, w, s, cdf); } } } } fn get_tx_size_context( &self, bo: TileBlockOffset, bsize: BlockSize, ) -> usize { let max_tx_size = max_txsize_rect_lookup[bsize as usize]; let max_tx_wide = max_tx_size.width() as u8; let max_tx_high = max_tx_size.height() as u8; let has_above = bo.0.y > 0; let has_left = bo.0.x > 0; let mut above = self.bc.above_tx_context[bo.0.x] >= max_tx_wide; let mut left = self.bc.left_tx_context[bo.y_in_sb()] >= max_tx_high; if has_above { let above_blk = self.bc.blocks.above_of(bo); if above_blk.is_inter() { above = (above_blk.n4_w << MI_SIZE_LOG2) >= max_tx_wide; }; } if has_left { let left_blk = self.bc.blocks.left_of(bo); if left_blk.is_inter() { left = (left_blk.n4_h << MI_SIZE_LOG2) >= max_tx_high; }; } if has_above && has_left { return above as usize + left as usize; }; if has_above { return above as usize; }; if has_left { return left as usize; }; 0 } pub fn write_tx_size_intra( &mut self, w: &mut W, bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, ) { fn tx_size_to_depth(tx_size: TxSize, bsize: BlockSize) -> usize { let mut ctx_size = max_txsize_rect_lookup[bsize as usize]; let mut depth: usize = 0; while tx_size != ctx_size { depth += 1; ctx_size = sub_tx_size_map[ctx_size as usize]; debug_assert!(depth <= MAX_TX_DEPTH); } depth } fn bsize_to_max_depth(bsize: BlockSize) -> usize { let mut tx_size: TxSize = max_txsize_rect_lookup[bsize as usize]; let mut depth = 0; while depth < MAX_TX_DEPTH && tx_size != TX_4X4 { depth += 1; tx_size = sub_tx_size_map[tx_size as usize]; debug_assert!(depth <= MAX_TX_DEPTH); } depth } fn bsize_to_tx_size_cat(bsize: BlockSize) -> usize { let mut tx_size: TxSize = max_txsize_rect_lookup[bsize as usize]; debug_assert!(tx_size != TX_4X4); let mut depth = 0; while tx_size != TX_4X4 { depth += 1; tx_size = sub_tx_size_map[tx_size as usize]; } debug_assert!(depth <= MAX_TX_CATS); depth - 1 } debug_assert!(!self.bc.blocks[bo].is_inter()); debug_assert!(bsize > BlockSize::BLOCK_4X4); let tx_size_ctx = self.get_tx_size_context(bo, bsize); let depth = tx_size_to_depth(tx_size, bsize); let max_depths = bsize_to_max_depth(bsize); let tx_size_cat = bsize_to_tx_size_cat(bsize); debug_assert!(depth <= 
max_depths); debug_assert!(!tx_size.is_rect() || bsize.is_rect_tx_allowed()); if tx_size_cat > 0 { let cdf = &self.fc.tx_size_cdf[tx_size_cat - 1][tx_size_ctx]; symbol_with_update!(self, w, depth as u32, cdf); } else { let cdf = &self.fc.tx_size_8x8_cdf[tx_size_ctx]; symbol_with_update!(self, w, depth as u32, cdf); } } // Based on https://aomediacodec.github.io/av1-spec/#cdf-selection-process // Used to decide the cdf (context) for txfm_split fn get_above_tx_width( &self, bo: TileBlockOffset, _bsize: BlockSize, _tx_size: TxSize, first_tx: bool, ) -> usize { let has_above = bo.0.y > 0; if first_tx { if !has_above { return 64; } let above_blk = self.bc.blocks.above_of(bo); if above_blk.skip && above_blk.is_inter() { return above_blk.bsize.width(); } } self.bc.above_tx_context[bo.0.x] as usize } fn get_left_tx_height( &self, bo: TileBlockOffset, _bsize: BlockSize, _tx_size: TxSize, first_tx: bool, ) -> usize { let has_left = bo.0.x > 0; if first_tx { if !has_left { return 64; } let left_blk = self.bc.blocks.left_of(bo); if left_blk.skip && left_blk.is_inter() { return left_blk.bsize.height(); } } self.bc.left_tx_context[bo.y_in_sb()] as usize } fn txfm_partition_context( &self, bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, tbx: usize, tby: usize, ) -> usize { debug_assert!(tx_size > TX_4X4); debug_assert!(bsize > BlockSize::BLOCK_4X4); // TODO: from 2nd level partition, must know whether the tx block is the topmost(or leftmost) within a partition let above = (self.get_above_tx_width(bo, bsize, tx_size, tby == 0) < tx_size.width()) as usize; let left = (self.get_left_tx_height(bo, bsize, tx_size, tbx == 0) < tx_size.height()) as usize; let max_tx_size: TxSize = bsize.tx_size().sqr_up(); let category: usize = (tx_size.sqr_up() != max_tx_size) as usize + (TxSize::TX_SIZES - 1 - max_tx_size as usize) * 2; debug_assert!(category < TXFM_PARTITION_CONTEXTS); category * 3 + above + left } pub fn write_tx_size_inter( &mut self, w: &mut W, bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, txfm_split: bool, tbx: usize, tby: usize, depth: usize, ) { if bo.0.x >= self.bc.blocks.cols() || bo.0.y >= self.bc.blocks.rows() { return; } debug_assert!(self.bc.blocks[bo].is_inter()); debug_assert!(bsize > BlockSize::BLOCK_4X4); debug_assert!(!tx_size.is_rect() || bsize.is_rect_tx_allowed()); if tx_size != TX_4X4 && depth < MAX_VARTX_DEPTH { let ctx = self.txfm_partition_context(bo, bsize, tx_size, tbx, tby); let cdf = &self.fc.txfm_partition_cdf[ctx]; symbol_with_update!(self, w, txfm_split as u32, cdf); } else { debug_assert!(!txfm_split); } if !txfm_split { self.bc.update_tx_size_context(bo, tx_size.block_size(), tx_size, false); } else { // if txfm_split == true, split one level only let split_tx_size = sub_tx_size_map[tx_size as usize]; let bw = bsize.width_mi() / split_tx_size.width_mi(); let bh = bsize.height_mi() / split_tx_size.height_mi(); for by in 0..bh { for bx in 0..bw { let tx_bo = TileBlockOffset(BlockOffset { x: bo.0.x + bx * split_tx_size.width_mi(), y: bo.0.y + by * split_tx_size.height_mi(), }); self.write_tx_size_inter( w, tx_bo, bsize, split_tx_size, false, bx, by, depth + 1, ); } } } } #[inline] pub const fn get_txsize_entropy_ctx(tx_size: TxSize) -> usize { (tx_size.sqr() as usize + tx_size.sqr_up() as usize + 1) >> 1 } pub fn txb_init_levels( &self, coeffs: &[T], height: usize, levels: &mut [u8], levels_stride: usize, ) { // Coefficients and levels are transposed from how they work in the spec for (coeffs_col, levels_col) in 
coeffs.chunks_exact(height).zip(levels.chunks_exact_mut(levels_stride)) { for (coeff, level) in coeffs_col.iter().zip(levels_col) { *level = coeff.abs().min(T::cast_from(127)).as_(); } } } // Since the coefficients and levels are transposed in relation to how they // work in the spec, use the log of block height in our calculations instead // of block width. #[inline] pub const fn get_txb_bhl(tx_size: TxSize) -> usize { av1_get_coded_tx_size(tx_size).height_log2() } /// Returns `(eob_pt, eob_extra)` /// /// # Panics /// /// - If `eob` is prior to the start of the group #[inline] pub fn get_eob_pos_token(eob: u16) -> (u32, u32) { let t = if eob < 33 { eob_to_pos_small[usize::from(eob)] as u32 } else { let e = usize::from(cmp::min((eob - 1) >> 5, 16)); eob_to_pos_large[e] as u32 }; assert!(eob as i32 >= k_eob_group_start[t as usize] as i32); let extra = eob as u32 - k_eob_group_start[t as usize] as u32; (t, extra) } pub fn get_nz_mag(levels: &[u8], bhl: usize, tx_class: TxClass) -> usize { // Levels are transposed from how they work in the spec // May version. // Note: AOMMIN(level, 3) is useless for decoder since level < 3. let mut mag = cmp::min(3, levels[1]); // { 1, 0 } mag += cmp::min(3, levels[(1 << bhl) + TX_PAD_HOR]); // { 0, 1 } if tx_class == TX_CLASS_2D { mag += cmp::min(3, levels[(1 << bhl) + TX_PAD_HOR + 1]); // { 1, 1 } mag += cmp::min(3, levels[2]); // { 2, 0 } mag += cmp::min(3, levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]); // { 0, 2 } } else if tx_class == TX_CLASS_VERT { mag += cmp::min(3, levels[2]); // { 2, 0 } mag += cmp::min(3, levels[3]); // { 3, 0 } mag += cmp::min(3, levels[4]); // { 4, 0 } } else { mag += cmp::min(3, levels[(2 << bhl) + (2 << TX_PAD_HOR_LOG2)]); // { 0, 2 } mag += cmp::min(3, levels[(3 << bhl) + (3 << TX_PAD_HOR_LOG2)]); // { 0, 3 } mag += cmp::min(3, levels[(4 << bhl) + (4 << TX_PAD_HOR_LOG2)]); // { 0, 4 } } mag as usize } fn get_nz_map_ctx_from_stats( stats: usize, coeff_idx: usize, // raster order bhl: usize, tx_size: TxSize, tx_class: TxClass, ) -> usize { if (tx_class as u32 | coeff_idx as u32) == 0 { return 0; }; // Coefficients are transposed from how they work in the spec let col: usize = coeff_idx >> bhl; let row: usize = coeff_idx - (col << bhl); let ctx = ((stats + 1) >> 1).min(4); ctx + match tx_class { TX_CLASS_2D => { // This is the algorithm to generate table av1_nz_map_ctx_offset[]. // const int width = tx_size_wide[tx_size]; // const int height = tx_size_high[tx_size]; // if (width < height) { // if (row < 2) return 11 + ctx; // } else if (width > height) { // if (col < 2) return 16 + ctx; // } // if (row + col < 2) return ctx + 1; // if (row + col < 4) return 5 + ctx + 1; // return 21 + ctx; av1_nz_map_ctx_offset[tx_size as usize][cmp::min(row, 4)] [cmp::min(col, 4)] as usize } TX_CLASS_HORIZ => nz_map_ctx_offset_1d[col], TX_CLASS_VERT => nz_map_ctx_offset_1d[row], } } fn get_nz_map_ctx( levels: &[u8], coeff_idx: usize, bhl: usize, area: usize, scan_idx: usize, is_eob: bool, tx_size: TxSize, tx_class: TxClass, ) -> usize { if is_eob { if scan_idx == 0 { return 0; } if scan_idx <= area / 8 { return 1; } if scan_idx <= area / 4 { return 2; } return 3; } // Levels are transposed from how they work in the spec let padded_idx = coeff_idx + ((coeff_idx >> bhl) << TX_PAD_HOR_LOG2); let stats = Self::get_nz_mag(&levels[padded_idx..], bhl, tx_class); Self::get_nz_map_ctx_from_stats(stats, coeff_idx, bhl, tx_size, tx_class) } /// `coeff_contexts_no_scan` is not in the scan order. 
/// Value for `pos = scan[i]` is at `coeff[i]`, not at `coeff[pos]`. pub fn get_nz_map_contexts<'c>( &self, levels: &mut [u8], scan: &[u16], eob: u16, tx_size: TxSize, tx_class: TxClass, coeff_contexts_no_scan: &'c mut [MaybeUninit], ) -> &'c mut [i8] { let bhl = Self::get_txb_bhl(tx_size); let area = av1_get_coded_tx_size(tx_size).area(); let scan = &scan[..usize::from(eob)]; let coeffs = &mut coeff_contexts_no_scan[..usize::from(eob)]; for (i, (coeff, pos)) in coeffs.iter_mut().zip(scan.iter().copied()).enumerate() { coeff.write(Self::get_nz_map_ctx( levels, pos as usize, bhl, area, i, i == usize::from(eob) - 1, tx_size, tx_class, ) as i8); } // SAFETY: every element has been initialized unsafe { slice_assume_init_mut(coeffs) } } pub fn get_br_ctx( levels: &[u8], coeff_idx: usize, // raster order bhl: usize, tx_class: TxClass, ) -> usize { // Coefficients and levels are transposed from how they work in the spec let col: usize = coeff_idx >> bhl; let row: usize = coeff_idx - (col << bhl); let stride: usize = (1 << bhl) + TX_PAD_HOR; let pos: usize = col * stride + row; let mut mag: usize = (levels[pos + 1] + levels[pos + stride]) as usize; match tx_class { TX_CLASS_2D => { mag += levels[pos + stride + 1] as usize; mag = cmp::min((mag + 1) >> 1, 6); if coeff_idx == 0 { return mag; } if (row < 2) && (col < 2) { return mag + 7; } } TX_CLASS_HORIZ => { mag += levels[pos + (stride << 1)] as usize; mag = cmp::min((mag + 1) >> 1, 6); if coeff_idx == 0 { return mag; } if col == 0 { return mag + 7; } } TX_CLASS_VERT => { mag += levels[pos + 2] as usize; mag = cmp::min((mag + 1) >> 1, 6); if coeff_idx == 0 { return mag; } if row == 0 { return mag + 7; } } } mag + 14 } } rav1e-0.7.1/src/cpu_features/aarch64.rs000064400000000000000000000075301046102023000157260ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use arg_enum_proc_macro::ArgEnum; use std::env; use std::str::FromStr; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum)] pub enum CpuFeatureLevel { RUST, NEON, } impl CpuFeatureLevel { #[cfg(test)] pub(crate) const fn all() -> &'static [Self] { use CpuFeatureLevel::*; &[RUST, NEON] } pub const fn len() -> usize { CpuFeatureLevel::NEON as usize + 1 } #[inline(always)] pub fn as_index(self) -> usize { self as usize } } impl Default for CpuFeatureLevel { fn default() -> CpuFeatureLevel { let detected = CpuFeatureLevel::NEON; let manual: CpuFeatureLevel = match env::var("RAV1E_CPU_TARGET") { Ok(feature) => CpuFeatureLevel::from_str(&feature).unwrap_or(detected), Err(_e) => detected, }; if manual > detected { detected } else { manual } } } // Create a static lookup table for CPUFeatureLevel enums // Note: keys are CpuFeatureLevels without any prefix (no CpuFeatureLevel::) macro_rules! 
cpu_function_lookup_table { // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { static $name: [$type; crate::cpu_features::CpuFeatureLevel::len()] = { use crate::cpu_features::CpuFeatureLevel; #[allow(unused_mut)] let mut out: [$type; CpuFeatureLevel::len()] = [$empty; CpuFeatureLevel::len()]; // Can't use out[0][.] == $empty in static as of rust 1.40 #[allow(unused_mut)] let mut set: [bool; CpuFeatureLevel::len()] = [false; CpuFeatureLevel::len()]; #[allow(unused_imports)] use CpuFeatureLevel::*; $( out[$key as usize] = $value; set[$key as usize] = true; )* cpu_function_lookup_table!(waterfall_cpu_features(out, set, [NEON])); out }; }; ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { $pub cpu_function_lookup_table!($name: [$type], default: $empty, [$(($key, $value)),*]); }; // Fill empty output functions with the existent functions they support. // cpus should be in order of lowest cpu level to highest // Used like an internal function // Put in here to avoid adding more public macros (waterfall_cpu_features($out:ident, $set:ident, [$($cpu:ident),*])) => { // Use an array to emulate if statements (not supported in static as of // rust 1.40). Setting best[0] (false) and best[1] (true) is equivalent to // doing nothing and overriding our value respectively. #[allow(unused_assignments)] let mut best = [$out[0], $out[0]]; $( // If the current entry has a function, update out best function. best[$set[$cpu as usize] as usize] = $out[$cpu as usize]; // Update our current entry. Does nothing if it already had a function. $out[$cpu as usize] = best[1]; )* }; // use $name_$key as our values ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { paste::item!{ cpu_function_lookup_table!( $pub, $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { paste::item!{ cpu_function_lookup_table!( $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; } rav1e-0.7.1/src/cpu_features/mod.rs000064400000000000000000000013731046102023000152540ustar 00000000000000// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { #[macro_use] mod x86; pub use x86::*; } else if #[cfg(asm_neon)] { #[macro_use] mod aarch64; pub use aarch64::*; } else { mod rust; pub use rust::*; } } rav1e-0.7.1/src/cpu_features/rust.rs000064400000000000000000000015101046102023000154630ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. 
If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use arg_enum_proc_macro::ArgEnum; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum, Default)] pub enum CpuFeatureLevel { #[default] RUST, } impl CpuFeatureLevel { #[cfg(test)] #[allow(unused)] pub(crate) const fn all() -> &'static [Self] { use CpuFeatureLevel::*; &[RUST] } } rav1e-0.7.1/src/cpu_features/x86.rs000064400000000000000000000126271046102023000151260ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use arg_enum_proc_macro::ArgEnum; use std::env; use std::str::FromStr; #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, ArgEnum)] pub enum CpuFeatureLevel { RUST, SSE2, SSSE3, #[arg_enum(alias = "sse4.1")] SSE4_1, AVX2, AVX512, #[arg_enum(alias = "avx512vpclmulqdq")] AVX512ICL, } impl CpuFeatureLevel { #[cfg(test)] pub(crate) const fn all() -> &'static [Self] { use CpuFeatureLevel::*; &[RUST, SSE2, SSSE3, SSE4_1, AVX2, AVX512, AVX512ICL] } pub const fn len() -> usize { CpuFeatureLevel::AVX512ICL as usize + 1 } #[inline(always)] pub const fn as_index(self) -> usize { self as usize } } impl Default for CpuFeatureLevel { fn default() -> CpuFeatureLevel { fn avx512_detected() -> bool { is_x86_feature_detected!("avx512bw") && is_x86_feature_detected!("avx512cd") && is_x86_feature_detected!("avx512dq") && is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") } #[allow(deprecated)] // Until MSRV >= 1.69.0 fn avx512icl_detected() -> bool { // Per dav1d, these are the flags needed. avx512_detected() && is_x86_feature_detected!("avx512vnni") && is_x86_feature_detected!("avx512ifma") && is_x86_feature_detected!("avx512vbmi") && is_x86_feature_detected!("avx512vbmi2") && is_x86_feature_detected!("avx512vpopcntdq") && is_x86_feature_detected!("avx512bitalg") && is_x86_feature_detected!("avx512gfni") && is_x86_feature_detected!("avx512vaes") && is_x86_feature_detected!("avx512vpclmulqdq") } let detected: CpuFeatureLevel = if avx512icl_detected() { CpuFeatureLevel::AVX512ICL } else if avx512_detected() { CpuFeatureLevel::AVX512 } else if is_x86_feature_detected!("avx2") { CpuFeatureLevel::AVX2 } else if is_x86_feature_detected!("sse4.1") { CpuFeatureLevel::SSE4_1 } else if is_x86_feature_detected!("ssse3") { CpuFeatureLevel::SSSE3 } else if is_x86_feature_detected!("sse2") { CpuFeatureLevel::SSE2 } else { CpuFeatureLevel::RUST }; let manual: CpuFeatureLevel = match env::var("RAV1E_CPU_TARGET") { Ok(feature) => CpuFeatureLevel::from_str(&feature).unwrap_or(detected), Err(_e) => detected, }; if manual > detected { detected } else { manual } } } // Create a static lookup table for CPUFeatureLevel enums // Note: keys are CpuFeatureLevels without any prefix (no CpuFeatureLevel::) macro_rules! 
cpu_function_lookup_table { // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { static $name: [$type; crate::cpu_features::CpuFeatureLevel::len()] = { use crate::cpu_features::CpuFeatureLevel; #[allow(unused_mut)] let mut out: [$type; CpuFeatureLevel::len()] = [$empty; CpuFeatureLevel::len()]; // Can't use out[0][.] == $empty in static as of rust 1.40 #[allow(unused_mut)] let mut set: [bool; CpuFeatureLevel::len()] = [false; CpuFeatureLevel::len()]; #[allow(unused_imports)] use CpuFeatureLevel::*; $( out[$key as usize] = $value; set[$key as usize] = true; )* cpu_function_lookup_table!(waterfall_cpu_features(out, set, [SSE2, SSSE3, SSE4_1, AVX2, AVX512, AVX512ICL])); out }; }; ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$(($key:ident, $value:expr)),*]) => { $pub cpu_function_lookup_table!($name: [$type], default: $empty, [$(($key, $value)),*]); }; // Fill empty output functions with the existent functions they support. // cpus should be in order of lowest cpu level to highest // Used like an internal function // Put in here to avoid adding more public macros (waterfall_cpu_features($out:ident, $set:ident, [$($cpu:ident),*])) => { // Use an array to emulate if statements (not supported in static as of // rust 1.40). Setting best[0] (false) and best[1] (true) is equivalent to // doing nothing and overriding our value respectively. #[allow(unused_assignments)] let mut best = [$out[0], $out[0]]; $( // If the current entry has a function, update out best function. best[$set[$cpu as usize] as usize] = $out[$cpu as usize]; // Update our current entry. Does nothing if it already had a function. $out[$cpu as usize] = best[1]; )* }; // use $name_$key as our values ($pub:vis, $name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { paste::item!{ cpu_function_lookup_table!( $pub, $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; // version for default visibility ($name:ident: [$type:ty], default: $empty:expr, [$($key:ident),*]) => { paste::item!{ cpu_function_lookup_table!( $name: [$type], default: $empty, [$(($key, [<$name _$key>])),*] ); } }; } rav1e-0.7.1/src/deblock.rs000064400000000000000000001340311046102023000134110ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::api::FrameType; use crate::color::ChromaSampling::Cs400; use crate::context::*; use crate::encoder::FrameInvariants; use crate::partition::RefType::*; use crate::predict::PredictionMode::*; use crate::quantize::*; use crate::tiling::*; use crate::util::{clamp, ILog, Pixel}; use crate::DeblockState; use rayon::iter::*; use std::cmp; fn deblock_adjusted_level( deblock: &DeblockState, block: &Block, pli: usize, vertical: bool, ) -> usize { let idx = if pli == 0 { usize::from(!vertical) } else { pli + 1 }; let level = if deblock.block_deltas_enabled { // By-block filter strength delta, if the feature is active. 
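// Illustrative arithmetic (values chosen for this comment, not taken from the
// source): with levels[idx] = 32, a per-block delta of -2 and
// block_delta_shift = 1, block_delta = -2 << 1 = -4 and the adjusted level is
// clamp(-4 + 32, 0, MAX_LOOP_FILTER) = 28.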
let block_delta = if deblock.block_delta_multi { block.deblock_deltas[idx] << deblock.block_delta_shift } else { block.deblock_deltas[0] << deblock.block_delta_shift }; // Add to frame-specified filter strength (Y-vertical, Y-horizontal, U, V) clamp(block_delta + deblock.levels[idx] as i8, 0, MAX_LOOP_FILTER as i8) as u8 } else { deblock.levels[idx] }; // if fi.seg_feaure_active { // rav1e does not yet support segments or segment features // } // Are delta modifiers for specific references and modes active? If so, add them too. if deblock.deltas_enabled { let mode = block.mode; let reference = block.ref_frames[0]; let mode_type = usize::from( mode >= NEARESTMV && mode != GLOBALMV && mode != GLOBAL_GLOBALMV, ); let l5 = level >> 5; clamp( level as i32 + ((deblock.ref_deltas[reference.to_index()] as i32) << l5) + if reference == INTRA_FRAME { 0 } else { (deblock.mode_deltas[mode_type] as i32) << l5 }, 0, MAX_LOOP_FILTER as i32, ) as usize } else { level as usize } } #[inline] fn deblock_left<'a, T: Pixel>( blocks: &'a TileBlocks, in_bo: TileBlockOffset, p: &PlaneRegion, ) -> &'a Block { let xdec = p.plane_cfg.xdec; let ydec = p.plane_cfg.ydec; // subsampled chroma uses odd mi row/col // We already know we're not at the upper/left corner, so prev_block is in frame &blocks[in_bo.0.y | ydec][(in_bo.0.x | xdec) - (1 << xdec)] } #[inline] fn deblock_up<'a, T: Pixel>( blocks: &'a TileBlocks, in_bo: TileBlockOffset, p: &PlaneRegion, ) -> &'a Block { let xdec = p.plane_cfg.xdec; let ydec = p.plane_cfg.ydec; // subsampled chroma uses odd mi row/col &blocks[(in_bo.0.y | ydec) - (1 << ydec)][in_bo.0.x | xdec] } // Must be called on a tx edge, and not on a frame edge. This is enforced above the call. fn deblock_size( block: &Block, prev_block: &Block, p: &PlaneRegion, pli: usize, vertical: bool, block_edge: bool, ) -> usize { let xdec = p.plane_cfg.xdec; let ydec = p.plane_cfg.ydec; // filter application is conditional on skip and block edge if !(block_edge || !block.skip || !prev_block.skip || block.ref_frames[0] == INTRA_FRAME || prev_block.ref_frames[0] == INTRA_FRAME) { 0 } else { let (txsize, prev_txsize) = if pli == 0 { (block.txsize, prev_block.txsize) } else { ( block.bsize.largest_chroma_tx_size(xdec, ydec), prev_block.bsize.largest_chroma_tx_size(xdec, ydec), ) }; let (tx_n, prev_tx_n) = if vertical { (cmp::max(txsize.width_mi(), 1), cmp::max(prev_txsize.width_mi(), 1)) } else { (cmp::max(txsize.height_mi(), 1), cmp::max(prev_txsize.height_mi(), 1)) }; cmp::min( if pli == 0 { 14 } else { 6 }, cmp::min(tx_n, prev_tx_n) << MI_SIZE_LOG2, ) } } // Must be called on a tx edge #[inline] fn deblock_level( deblock: &DeblockState, block: &Block, prev_block: &Block, pli: usize, vertical: bool, ) -> usize { let level = deblock_adjusted_level(deblock, block, pli, vertical); if level == 0 { deblock_adjusted_level(deblock, prev_block, pli, vertical) } else { level } } // four taps, 4 outputs (two are trivial) #[inline] fn filter_narrow2_4( p1: i32, p0: i32, q0: i32, q1: i32, shift: usize, ) -> [i32; 4] { let filter0 = clamp(p1 - q1, -128 << shift, (128 << shift) - 1); let filter1 = clamp(filter0 + 3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; // be certain our optimization removing a clamp is sound debug_assert!({ let base = clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); let test = clamp(base + 4, -128 << shift, (128 << shift) - 1) >> 3; filter1 == test }); let filter2 = clamp(filter0 + 3 * (q0 - p0) + 3, -128 << shift, (128 << shift) - 1) >> 3; // be certain our 
optimization removing a clamp is sound debug_assert!({ let base = clamp(filter0 + 3 * (q0 - p0), -128 << shift, (128 << shift) - 1); let test = clamp(base + 3, -128 << shift, (128 << shift) - 1) >> 3; filter2 == test }); [ p1, clamp(p0 + filter2, 0, (256 << shift) - 1), clamp(q0 - filter1, 0, (256 << shift) - 1), q1, ] } // six taps, 6 outputs (four are trivial) #[inline] fn filter_narrow2_6( p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, ) -> [i32; 6] { let x = filter_narrow2_4(p1, p0, q0, q1, shift); [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (ten are trivial) #[inline] fn filter_narrow2_12( p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, shift: usize, ) -> [i32; 12] { let x = filter_narrow2_4(p1, p0, q0, q1, shift); [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } // four taps, 4 outputs #[inline] fn filter_narrow4_4( p1: i32, p0: i32, q0: i32, q1: i32, shift: usize, ) -> [i32; 4] { let filter1 = clamp(3 * (q0 - p0) + 4, -128 << shift, (128 << shift) - 1) >> 3; // be certain our optimization removing a clamp is sound debug_assert!({ let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); let test = clamp(base + 4, -128 << shift, (128 << shift) - 1) >> 3; filter1 == test }); let filter2 = clamp(3 * (q0 - p0) + 3, -128 << shift, (128 << shift) - 1) >> 3; // be certain our optimization removing a clamp is sound debug_assert!({ let base = clamp(3 * (q0 - p0), -128 << shift, (128 << shift) - 1); let test = clamp(base + 3, -128 << shift, (128 << shift) - 1) >> 3; filter2 == test }); let filter3 = (filter1 + 1) >> 1; [ clamp(p1 + filter3, 0, (256 << shift) - 1), clamp(p0 + filter2, 0, (256 << shift) - 1), clamp(q0 - filter1, 0, (256 << shift) - 1), clamp(q1 - filter3, 0, (256 << shift) - 1), ] } // six taps, 6 outputs (two are trivial) #[inline] fn filter_narrow4_6( p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, ) -> [i32; 6] { let x = filter_narrow4_4(p1, p0, q0, q1, shift); [p2, x[0], x[1], x[2], x[3], q2] } // 12 taps, 12 outputs (eight are trivial) #[inline] fn filter_narrow4_12( p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, shift: usize, ) -> [i32; 12] { let x = filter_narrow4_4(p1, p0, q0, q1, shift); [p5, p4, p3, p2, x[0], x[1], x[2], x[3], q2, q3, q4, q5] } // six taps, 4 outputs #[rustfmt::skip] #[inline] const fn filter_wide6_4( p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32 ) -> [i32; 4] { [ (p2*3 + p1*2 + p0*2 + q0 + (1<<2)) >> 3, (p2 + p1*2 + p0*2 + q0*2 + q1 + (1<<2)) >> 3, (p1 + p0*2 + q0*2 + q1*2 + q2 + (1<<2)) >> 3, (p0 + q0*2 + q1*2 + q2*3 + (1<<2)) >> 3 ] } // eight taps, 6 outputs #[rustfmt::skip] #[inline] const fn filter_wide8_6( p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32 ) -> [i32; 6] { [ (p3*3 + p2*2 + p1 + p0 + q0 + (1<<2)) >> 3, (p3*2 + p2 + p1*2 + p0 + q0 + q1 + (1<<2)) >> 3, (p3 + p2 + p1 + p0*2 + q0 + q1 + q2 +(1<<2)) >> 3, (p2 + p1 + p0 + q0*2 + q1 + q2 + q3 + (1<<2)) >> 3, (p1 + p0 + q0 + q1*2 + q2 + q3*2 + (1<<2)) >> 3, (p0 + q0 + q1 + q2*2 + q3*3 + (1<<2)) >> 3 ] } // 12 taps, 12 outputs (six are trivial) #[inline] const fn filter_wide8_12( p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, ) -> [i32; 12] { let x = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); [p5, p4, p3, x[0], x[1], x[2], x[3], x[4], x[5], q3, q4, q5] } // fourteen taps, 12 outputs #[rustfmt::skip] 
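// The coefficients of every row below sum to 16 (e.g. 7+2+2+1+1+1+1+1 for the
// first output), so adding (1 << 3) before shifting right by 4 rounds each
// weighted average to the nearest integer.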
#[inline] const fn filter_wide14_12( p6: i32, p5: i32, p4: i32, p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, q4: i32, q5: i32, q6: i32 ) -> [i32; 12] { [ (p6*7 + p5*2 + p4*2 + p3 + p2 + p1 + p0 + q0 + (1<<3)) >> 4, (p6*5 + p5*2 + p4*2 + p3*2 + p2 + p1 + p0 + q0 + q1 + (1<<3)) >> 4, (p6*4 + p5 + p4*2 + p3*2 + p2*2 + p1 + p0 + q0 + q1 + q2 + (1<<3)) >> 4, (p6*3 + p5 + p4 + p3*2 + p2*2 + p1*2 + p0 + q0 + q1 + q2 + q3 + (1<<3)) >> 4, (p6*2 + p5 + p4 + p3 + p2*2 + p1*2 + p0*2 + q0 + q1 + q2 + q3 + q4 + (1<<3)) >> 4, (p6 + p5 + p4 + p3 + p2 + p1*2 + p0*2 + q0*2 + q1 + q2 + q3 + q4 + q5 + (1<<3)) >> 4, (p5 + p4 + p3 + p2 + p1 + p0*2 + q0*2 + q1*2 + q2 + q3 + q4 + q5 + q6 + (1<<3)) >> 4, (p4 + p3 + p2 + p1 + p0 + q0*2 + q1*2 + q2*2 + q3 + q4 + q5 + q6*2 + (1<<3)) >> 4, (p3 + p2 + p1 + p0 + q0 + q1*2 + q2*2 + q3*2 + q4 + q5 + q6*3 + (1<<3)) >> 4, (p2 + p1 + p0 + q0 + q1 + q2*2 + q3*2 + q4*2 + q5 + q6*4 + (1<<3)) >> 4, (p1 + p0 + q0 + q1 + q2 + q3*2 + q4*2 + q5*2 + q6*5 + (1<<3)) >> 4, (p0 + q0 + q1 + q2 + q3 + q4*2 + q5*2 + q6*7 + (1<<3)) >> 4 ] } #[inline] fn copy_horizontal( dst: &mut PlaneRegionMut<'_, T>, x: usize, y: usize, src: &[i32], ) { let row = &mut dst[y][x..]; for (dst, src) in row.iter_mut().take(src.len()).zip(src) { *dst = T::cast_from(*src); } } #[inline] fn copy_vertical( dst: &mut PlaneRegionMut<'_, T>, x: usize, y: usize, src: &[i32], ) { for (i, v) in src.iter().enumerate() { let p = &mut dst[y + i][x]; *p = T::cast_from(*v); } } #[inline] fn stride_sse(a: &[i32; LEN], b: &[i32; LEN]) -> i64 { a.iter().zip(b).map(|(a, b)| (a - b) * (a - b)).sum::() as i64 } #[inline] const fn _level_to_limit(level: i32, shift: usize) -> i32 { level << shift } #[inline] const fn limit_to_level(limit: i32, shift: usize) -> i32 { (limit + (1 << shift) - 1) >> shift } #[inline] const fn _level_to_blimit(level: i32, shift: usize) -> i32 { (3 * level + 4) << shift } #[inline] const fn blimit_to_level(blimit: i32, shift: usize) -> i32 { (((blimit + (1 << shift) - 1) >> shift) - 2) / 3 } #[inline] const fn _level_to_thresh(level: i32, shift: usize) -> i32 { level >> 4 << shift } #[inline] const fn thresh_to_level(thresh: i32, shift: usize) -> i32 { (thresh + (1 << shift) - 1) >> shift << 4 } #[inline] fn nhev4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { thresh_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift) as usize } #[inline] fn mask4(p1: i32, p0: i32, q0: i32, q1: i32, shift: usize) -> usize { cmp::max( limit_to_level(cmp::max((p1 - p0).abs(), (q1 - q0).abs()), shift), blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), ) as usize } #[inline] fn deblock_size4_inner( [p1, p0, q0, q1]: [i32; 4], level: usize, bd: usize, ) -> Option<[i32; 4]> { if mask4(p1, p0, q0, q1, bd - 8) <= level { let x = if nhev4(p1, p0, q0, q1, bd - 8) <= level { filter_narrow4_4(p1, p0, q0, q1, bd - 8) } else { filter_narrow2_4(p1, p0, q0, q1, bd - 8) }; Some(x) } else { None } } // Assumes rec[0] is set 2 taps back from the edge fn deblock_v_size4( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_()]; if let Some(data) = deblock_size4_inner(vals, level, bd) { copy_horizontal(rec, 0, y, &data); } } } // Assumes rec[0] is set 2 taps back from the edge fn deblock_h_size4( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for x in 0..4 { let vals = [rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_()]; if let Some(data) = 
deblock_size4_inner(vals, level, bd) { copy_vertical(rec, x, 0, &data); } } } // Assumes rec[0] and src[0] are set 2 taps back from the edge. // Accesses four taps, accumulates four pixels into the tally fn sse_size4( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { for i in 0..4 { let (p1, p0, q0, q1, a) = if horizontal_p { ( rec[0][i].as_(), rec[1][i].as_(), rec[2][i].as_(), rec[3][i].as_(), [src[0][i].as_(), src[1][i].as_(), src[2][i].as_(), src[3][i].as_()], ) } else { ( rec[i][0].as_(), rec[i][1].as_(), rec[i][2].as_(), rec[i][3].as_(), [src[i][0].as_(), src[i][1].as_(), src[i][2].as_(), src[i][3].as_()], ) }; // three possibilities: no filter, narrow2 and narrow4 // All possibilities produce four outputs let none: [_; 4] = [p1, p0, q0, q1]; let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); // mask4 sets the dividing line for filter vs no filter // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp(mask4(p1, p0, q0, q1, bd - 8), 1, MAX_LOOP_FILTER + 1); let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); let sse_narrow2 = if nhev != mask { stride_sse(&a, &narrow2) } else { sse_none }; let sse_narrow4 = if nhev <= MAX_LOOP_FILTER { stride_sse(&a, &narrow4) } else { sse_none }; // accumulate possible filter values into the tally // level 0 is a special case tally[0] += sse_none; tally[mask] -= sse_none; tally[mask] += sse_narrow2; tally[nhev] -= sse_narrow2; tally[nhev] += sse_narrow4; } } #[inline] fn mask6( p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, shift: usize, ) -> usize { cmp::max( limit_to_level( cmp::max( (p2 - p1).abs(), cmp::max((p1 - p0).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs())), ), shift, ), blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), ) as usize } #[inline] fn flat6(p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32) -> usize { cmp::max( (p1 - p0).abs(), cmp::max((q1 - q0).abs(), cmp::max((p2 - p0).abs(), (q2 - q0).abs())), ) as usize } #[inline] fn deblock_size6_inner( [p2, p1, p0, q0, q1, q2]: [i32; 6], level: usize, bd: usize, ) -> Option<[i32; 4]> { if mask6(p2, p1, p0, q0, q1, q2, bd - 8) <= level { let flat = 1 << (bd - 8); let x = if flat6(p2, p1, p0, q0, q1, q2) <= flat { filter_wide6_4(p2, p1, p0, q0, q1, q2) } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { filter_narrow4_4(p1, p0, q0, q1, bd - 8) } else { filter_narrow2_4(p1, p0, q0, q1, bd - 8) }; Some(x) } else { None } } // Assumes slice[0] is set 3 taps back from the edge fn deblock_v_size6( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_()]; if let Some(data) = deblock_size6_inner(vals, level, bd) { copy_horizontal(rec, 1, y, &data); } } } // Assumes slice[0] is set 3 taps back from the edge fn deblock_h_size6( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for x in 0..4 { let vals = [ rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_(), rec[4][x].as_(), rec[5][x].as_(), ]; if let Some(data) = deblock_size6_inner(vals, level, bd) { copy_vertical(rec, x, 1, &data); } } } // Assumes rec[0] and src[0] are set 3 taps back from the edge. 
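// As in sse_size4 above, tally[l] accumulates the *change* in SSE that takes
// effect once the filter level reaches l; sse_optimize later prefix-sums the
// array so each entry holds the total SSE for that level.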
// Accesses six taps, accumulates four pixels into the tally fn sse_size6( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); for i in 0..4 { let (p2, p1, p0, q0, q1, q2, a) = if horizontal_p { // six taps ( rec[0][i].as_(), rec[1][i].as_(), rec[2][i].as_(), rec[3][i].as_(), rec[4][i].as_(), rec[5][i].as_(), // four pixels to compare so offset one forward [src[1][i].as_(), src[2][i].as_(), src[3][i].as_(), src[4][i].as_()], ) } else { // six taps ( rec[i][0].as_(), rec[i][1].as_(), rec[i][2].as_(), rec[i][3].as_(), rec[i][4].as_(), rec[i][5].as_(), // four pixels to compare so offset one forward [src[i][1].as_(), src[i][2].as_(), src[i][3].as_(), src[i][4].as_()], ) }; // Four possibilities: no filter, wide6, narrow2 and narrow4 // All possibilities produce four outputs let none: [_; 4] = [p1, p0, q0, q1]; let wide6 = filter_wide6_4(p2, p1, p0, q0, q1, q2); let narrow2 = filter_narrow2_4(p1, p0, q0, q1, bd - 8); let narrow4 = filter_narrow4_4(p1, p0, q0, q1, bd - 8); // mask6 sets the dividing line for filter vs no filter // flat6 decides between wide and narrow filters (unrelated to level) // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp(mask6(p2, p1, p0, q0, q1, q2, bd - 8), 1, MAX_LOOP_FILTER + 1); let flatp = flat6(p2, p1, p0, q0, q1, q2) <= flat; let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. let sse_none = stride_sse(&a, &none); let sse_wide6 = if flatp && mask <= MAX_LOOP_FILTER { stride_sse(&a, &wide6) } else { sse_none }; let sse_narrow2 = if !flatp && nhev != mask { stride_sse(&a, &narrow2) } else { sse_none }; let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { stride_sse(&a, &narrow4) } else { sse_none }; // accumulate possible filter values into the tally tally[0] += sse_none; tally[mask] -= sse_none; if flatp { tally[mask] += sse_wide6; } else { tally[mask] += sse_narrow2; tally[nhev] -= sse_narrow2; tally[nhev] += sse_narrow4; } } } #[inline] fn mask8( p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, shift: usize, ) -> usize { cmp::max( limit_to_level( cmp::max( (p3 - p2).abs(), cmp::max( (p2 - p1).abs(), cmp::max( (p1 - p0).abs(), cmp::max( (q3 - q2).abs(), cmp::max((q2 - q1).abs(), (q1 - q0).abs()), ), ), ), ), shift, ), blimit_to_level((p0 - q0).abs() * 2 + (p1 - q1).abs() / 2, shift), ) as usize } #[inline] fn flat8( p3: i32, p2: i32, p1: i32, p0: i32, q0: i32, q1: i32, q2: i32, q3: i32, ) -> usize { cmp::max( (p1 - p0).abs(), cmp::max( (q1 - q0).abs(), cmp::max( (p2 - p0).abs(), cmp::max((q2 - q0).abs(), cmp::max((p3 - p0).abs(), (q3 - q0).abs())), ), ), ) as usize } #[inline] fn deblock_size8_inner( [p3, p2, p1, p0, q0, q1, q2, q3]: [i32; 8], level: usize, bd: usize, ) -> Option<[i32; 6]> { if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { let flat = 1 << (bd - 8); let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3) } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8) } else { filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8) }; Some(x) } else { None } } // Assumes rec[0] is set 4 taps back from the edge fn deblock_v_size8( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [ p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_(), p[6].as_(), 
p[7].as_(), ]; if let Some(data) = deblock_size8_inner(vals, level, bd) { copy_horizontal(rec, 1, y, &data); } } } // Assumes rec[0] is set 4 taps back from the edge fn deblock_h_size8( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for x in 0..4 { let vals = [ rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_(), rec[4][x].as_(), rec[5][x].as_(), rec[6][x].as_(), rec[7][x].as_(), ]; if let Some(data) = deblock_size8_inner(vals, level, bd) { copy_vertical(rec, x, 1, &data); } } } // Assumes rec[0] and src[0] are set 4 taps back from the edge. // Accesses eight taps, accumulates six pixels into the tally fn sse_size8( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); for i in 0..4 { let (p3, p2, p1, p0, q0, q1, q2, q3, a) = if horizontal_p { // eight taps ( rec[0][i].as_(), rec[1][i].as_(), rec[2][i].as_(), rec[3][i].as_(), rec[4][i].as_(), rec[5][i].as_(), rec[6][i].as_(), rec[7][i].as_(), // six pixels to compare so offset one forward [ src[1][i].as_(), src[2][i].as_(), src[3][i].as_(), src[4][i].as_(), src[5][i].as_(), src[6][i].as_(), ], ) } else { // eight taps ( rec[i][0].as_(), rec[i][1].as_(), rec[i][2].as_(), rec[i][3].as_(), rec[i][4].as_(), rec[i][5].as_(), rec[i][6].as_(), rec[i][7].as_(), // six pixels to compare so offset one forward [ src[i][1].as_(), src[i][2].as_(), src[i][3].as_(), src[i][4].as_(), src[i][5].as_(), src[i][6].as_(), ], ) }; // Four possibilities: no filter, wide8, narrow2 and narrow4 let none: [_; 6] = [p2, p1, p0, q0, q1, q2]; let wide8: [_; 6] = filter_wide8_6(p3, p2, p1, p0, q0, q1, q2, q3); let narrow2: [_; 6] = filter_narrow2_6(p2, p1, p0, q0, q1, q2, bd - 8); let narrow4: [_; 6] = filter_narrow4_6(p2, p1, p0, q0, q1, q2, bd - 8); // mask8 sets the dividing line for filter vs no filter // flat8 decides between wide and narrow filters (unrelated to level) // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp( mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), 1, MAX_LOOP_FILTER + 1, ); let flatp = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. 
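// When a variant cannot be selected at any legal level, its slot is filled
// with sse_none so that the differential tally updates below either cancel
// out or land in the final, never-selected tally entry, avoiding an extra
// stride_sse call.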
let sse_none = stride_sse(&a, &none); let sse_wide8 = if flatp && mask <= MAX_LOOP_FILTER { stride_sse(&a, &wide8) } else { sse_none }; let sse_narrow2 = if !flatp && nhev != mask { stride_sse(&a, &narrow2) } else { sse_none }; let sse_narrow4 = if !flatp && nhev <= MAX_LOOP_FILTER { stride_sse(&a, &narrow4) } else { sse_none }; // accumulate possible filter values into the tally tally[0] += sse_none; tally[mask] -= sse_none; if flatp { tally[mask] += sse_wide8; } else { tally[mask] += sse_narrow2; tally[nhev] -= sse_narrow2; tally[nhev] += sse_narrow4; } } } #[inline] fn flat14_outer( p6: i32, p5: i32, p4: i32, p0: i32, q0: i32, q4: i32, q5: i32, q6: i32, ) -> usize { cmp::max( (p4 - p0).abs(), cmp::max( (q4 - q0).abs(), cmp::max( (p5 - p0).abs(), cmp::max((q5 - q0).abs(), cmp::max((p6 - p0).abs(), (q6 - q0).abs())), ), ), ) as usize } #[inline] fn deblock_size14_inner( [p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6]: [i32; 14], level: usize, bd: usize, ) -> Option<[i32; 12]> { // 'mask' test if mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8) <= level { let flat = 1 << (bd - 8); // inner flatness test let x = if flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat { // outer flatness test if flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat { // sufficient flatness across 14 pixel width; run full-width filter filter_wide14_12( p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, ) } else { // only flat in inner area, run 8-tap filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5) } } else if nhev4(p1, p0, q0, q1, bd - 8) <= level { // not flat, run narrow filter filter_narrow4_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8) } else { filter_narrow2_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8) }; Some(x) } else { None } } // Assumes rec[0] is set 7 taps back from the edge fn deblock_v_size14( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for y in 0..4 { let p = &rec[y]; let vals = [ p[0].as_(), p[1].as_(), p[2].as_(), p[3].as_(), p[4].as_(), p[5].as_(), p[6].as_(), p[7].as_(), p[8].as_(), p[9].as_(), p[10].as_(), p[11].as_(), p[12].as_(), p[13].as_(), ]; if let Some(data) = deblock_size14_inner(vals, level, bd) { copy_horizontal(rec, 1, y, &data); } } } // Assumes rec[0] is set 7 taps back from the edge fn deblock_h_size14( rec: &mut PlaneRegionMut<'_, T>, level: usize, bd: usize, ) { for x in 0..4 { let vals = [ rec[0][x].as_(), rec[1][x].as_(), rec[2][x].as_(), rec[3][x].as_(), rec[4][x].as_(), rec[5][x].as_(), rec[6][x].as_(), rec[7][x].as_(), rec[8][x].as_(), rec[9][x].as_(), rec[10][x].as_(), rec[11][x].as_(), rec[12][x].as_(), rec[13][x].as_(), ]; if let Some(data) = deblock_size14_inner(vals, level, bd) { copy_vertical(rec, x, 1, &data); } } } // Assumes rec[0] and src[0] are set 7 taps back from the edge. 
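// filter_wide14_12 never modifies the outermost taps (p6/q6), so only the
// inner twelve pixels can differ from the source and need to enter the tally.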
// Accesses fourteen taps, accumulates twelve pixels into the tally fn sse_size14( rec: &PlaneRegion<'_, T>, src: &PlaneRegion<'_, T>, tally: &mut [i64; MAX_LOOP_FILTER + 2], horizontal_p: bool, bd: usize, ) { let flat = 1 << (bd - 8); for i in 0..4 { let (p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, a) = if horizontal_p { // 14 taps ( rec[0][i].as_(), rec[1][i].as_(), rec[2][i].as_(), rec[3][i].as_(), rec[4][i].as_(), rec[5][i].as_(), rec[6][i].as_(), rec[7][i].as_(), rec[8][i].as_(), rec[9][i].as_(), rec[10][i].as_(), rec[11][i].as_(), rec[12][i].as_(), rec[13][i].as_(), // 12 pixels to compare so offset one forward [ src[1][i].as_(), src[2][i].as_(), src[3][i].as_(), src[4][i].as_(), src[5][i].as_(), src[6][i].as_(), src[7][i].as_(), src[8][i].as_(), src[9][i].as_(), src[10][i].as_(), src[11][i].as_(), src[12][i].as_(), ], ) } else { // 14 taps ( rec[i][0].as_(), rec[i][1].as_(), rec[i][2].as_(), rec[i][3].as_(), rec[i][4].as_(), rec[i][5].as_(), rec[i][6].as_(), rec[i][7].as_(), rec[i][8].as_(), rec[i][9].as_(), rec[i][10].as_(), rec[i][11].as_(), rec[i][12].as_(), rec[i][13].as_(), // 12 pixels to compare so offset one forward [ src[i][1].as_(), src[i][2].as_(), src[i][3].as_(), src[i][4].as_(), src[i][5].as_(), src[i][6].as_(), src[i][7].as_(), src[i][8].as_(), src[i][9].as_(), src[i][10].as_(), src[i][11].as_(), src[i][12].as_(), ], ) }; // Five possibilities: no filter, wide14, wide8, narrow2 and narrow4 let none: [i32; 12] = [p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5]; let wide14 = filter_wide14_12(p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6); let wide8 = filter_wide8_12(p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5); let narrow2 = filter_narrow2_12( p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8, ); let narrow4 = filter_narrow4_12( p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, bd - 8, ); // mask8 sets the dividing line for filter vs no filter // flat8 decides between wide and narrow filters (unrelated to level) // flat14 decides between wide14 and wide8 filters // nhev4 sets the dividing line between narrow2 and narrow4 let mask = clamp( mask8(p3, p2, p1, p0, q0, q1, q2, q3, bd - 8), 1, MAX_LOOP_FILTER + 1, ); let flat8p = flat8(p3, p2, p1, p0, q0, q1, q2, q3) <= flat; let flat14p = flat14_outer(p6, p5, p4, p0, q0, q4, q5, q6) <= flat; let nhev = clamp(nhev4(p1, p0, q0, q1, bd - 8), mask, MAX_LOOP_FILTER + 1); // sse for each; short-circuit the 'special' no-op cases. 
let sse_none = stride_sse(&a, &none); let sse_wide8 = if flat8p && !flat14p && mask <= MAX_LOOP_FILTER { stride_sse(&a, &wide8) } else { sse_none }; let sse_wide14 = if flat8p && flat14p && mask <= MAX_LOOP_FILTER { stride_sse(&a, &wide14) } else { sse_none }; let sse_narrow2 = if !flat8p && nhev != mask { stride_sse(&a, &narrow2) } else { sse_none }; let sse_narrow4 = if !flat8p && nhev <= MAX_LOOP_FILTER { stride_sse(&a, &narrow4) } else { sse_none }; // accumulate possible filter values into the tally tally[0] += sse_none; tally[mask] -= sse_none; if flat8p { if flat14p { tally[mask] += sse_wide14; } else { tally[mask] += sse_wide8; } } else { tally[mask] += sse_narrow2; tally[nhev] -= sse_narrow2; tally[nhev] += sse_narrow4; } } } fn filter_v_edge( deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset, p: &mut PlaneRegionMut, pli: usize, bd: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { block.txsize } else { block.bsize.largest_chroma_tx_size(xdec, ydec) }; let tx_edge = bo.0.x >> xdec & (txsize.width_mi() - 1) == 0; if tx_edge { let prev_block = deblock_left(blocks, bo, &p.as_const()); let block_edge = bo.0.x & (block.n4_w as usize - 1) == 0; let filter_size = deblock_size(block, prev_block, &p.as_const(), pli, true, block_edge); if filter_size > 0 { let level = deblock_level(deblock, block, prev_block, pli, true); if level > 0 { let po = bo.plane_offset(p.plane_cfg); let mut plane_region = p.subregion_mut(Area::Rect { x: po.x - (filter_size >> 1) as isize, y: po.y, width: filter_size, height: 4, }); match filter_size { 4 => { deblock_v_size4(&mut plane_region, level, bd); } 6 => { deblock_v_size6(&mut plane_region, level, bd); } 8 => { deblock_v_size8(&mut plane_region, level, bd); } 14 => { deblock_v_size14(&mut plane_region, level, bd); } _ => unreachable!(), } } } } } fn sse_v_edge( blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { block.txsize } else { block.bsize.largest_chroma_tx_size(xdec, ydec) }; let tx_edge = bo.0.x >> xdec & (txsize.width_mi() - 1) == 0; if tx_edge { let prev_block = deblock_left(blocks, bo, rec_plane); let block_edge = bo.0.x & (block.n4_w as usize - 1) == 0; let filter_size = deblock_size(block, prev_block, rec_plane, pli, true, block_edge); if filter_size > 0 { let po = bo.plane_offset(rec_plane.plane_cfg); // rec and src have identical subsampling let rec_region = rec_plane.subregion(Area::Rect { x: po.x - (filter_size >> 1) as isize, y: po.y, width: filter_size, height: 4, }); let src_region = src_plane.subregion(Area::Rect { x: po.x - (filter_size >> 1) as isize, y: po.y, width: filter_size, height: 4, }); match filter_size { 4 => { sse_size4(&rec_region, &src_region, tally, false, bd); } 6 => { sse_size6(&rec_region, &src_region, tally, false, bd); } 8 => { sse_size8(&rec_region, &src_region, tally, false, bd); } 14 => { sse_size14(&rec_region, &src_region, tally, false, bd); } _ => unreachable!(), } } } } fn filter_h_edge( deblock: &DeblockState, blocks: &TileBlocks, bo: TileBlockOffset, p: &mut PlaneRegionMut, pli: usize, bd: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { block.txsize } else { block.bsize.largest_chroma_tx_size(xdec, ydec) }; let tx_edge = bo.0.y >> ydec & (txsize.height_mi() - 1) == 0; if tx_edge { let prev_block = deblock_up(blocks, bo, 
&p.as_const()); let block_edge = bo.0.y & (block.n4_h as usize - 1) == 0; let filter_size = deblock_size(block, prev_block, &p.as_const(), pli, false, block_edge); if filter_size > 0 { let level = deblock_level(deblock, block, prev_block, pli, false); if level > 0 { let po = bo.plane_offset(p.plane_cfg); let mut plane_region = p.subregion_mut(Area::Rect { x: po.x, y: po.y - (filter_size >> 1) as isize, width: 4, height: filter_size, }); match filter_size { 4 => { deblock_h_size4(&mut plane_region, level, bd); } 6 => { deblock_h_size6(&mut plane_region, level, bd); } 8 => { deblock_h_size8(&mut plane_region, level, bd); } 14 => { deblock_h_size14(&mut plane_region, level, bd); } _ => unreachable!(), } } } } } fn sse_h_edge( blocks: &TileBlocks, bo: TileBlockOffset, rec_plane: &PlaneRegion, src_plane: &PlaneRegion, tally: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, bd: usize, xdec: usize, ydec: usize, ) { let block = &blocks[bo]; let txsize = if pli == 0 { block.txsize } else { block.bsize.largest_chroma_tx_size(xdec, ydec) }; let tx_edge = bo.0.y >> ydec & (txsize.height_mi() - 1) == 0; if tx_edge { let prev_block = deblock_up(blocks, bo, rec_plane); let block_edge = bo.0.y & (block.n4_h as usize - 1) == 0; let filter_size = deblock_size(block, prev_block, rec_plane, pli, true, block_edge); if filter_size > 0 { let po = bo.plane_offset(rec_plane.plane_cfg); // rec and src have identical subsampling let rec_region = rec_plane.subregion(Area::Rect { x: po.x, y: po.y - (filter_size >> 1) as isize, width: 4, height: filter_size, }); let src_region = src_plane.subregion(Area::Rect { x: po.x, y: po.y - (filter_size >> 1) as isize, width: 4, height: filter_size, }); match filter_size { 4 => { sse_size4(&rec_region, &src_region, tally, true, bd); } 6 => { sse_size6(&rec_region, &src_region, tally, true, bd); } 8 => { sse_size8(&rec_region, &src_region, tally, true, bd); } 14 => { sse_size14(&rec_region, &src_region, tally, true, bd); } _ => unreachable!(), } } } } // Deblocks all edges, vertical and horizontal, in a single plane #[profiling::function] pub fn deblock_plane( deblock: &DeblockState, p: &mut PlaneRegionMut, pli: usize, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, ) { let xdec = p.plane_cfg.xdec; let ydec = p.plane_cfg.ydec; assert!(xdec <= 1 && ydec <= 1); match pli { 0 => { if deblock.levels[0] == 0 && deblock.levels[1] == 0 { return; } } 1 => { if deblock.levels[2] == 0 { return; } } 2 => { if deblock.levels[3] == 0 { return; } } _ => return, } let rect = p.rect(); let cols = (cmp::min( blocks.cols(), ((crop_w - rect.x as usize) + MI_SIZE - 1) >> MI_SIZE_LOG2, ) + (1 << xdec >> 1)) >> xdec << xdec; // Clippy can go suck an egg let rows = (cmp::min( blocks.rows(), ((crop_h - rect.y as usize) + MI_SIZE - 1) >> MI_SIZE_LOG2, ) + (1 << ydec >> 1)) >> ydec << ydec; // Clippy can go suck an egg // vertical edge filtering leads horizontal by one full MI-sized // row (and horizontal filtering doesn't happen along the upper // edge). Unroll to avoid corner-cases. if rows > 0 { for x in (1 << xdec..cols).step_by(1 << xdec) { filter_v_edge( deblock, blocks, TileBlockOffset(BlockOffset { x, y: 0 }), p, pli, bd, xdec, ydec, ); } if rows > 1 << ydec { for x in (1 << xdec..cols).step_by(1 << xdec) { filter_v_edge( deblock, blocks, TileBlockOffset(BlockOffset { x, y: 1 << ydec }), p, pli, bd, xdec, ydec, ); } } } // filter rows where vertical and horizontal edge filtering both // happen (horizontal edge filtering lags vertical by one row). 
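// Horizontal filtering operates on the output of vertical filtering, so the
// loop below keeps the horizontal pass one MI row (and two MI columns) behind
// the vertical pass instead of making two full passes over the plane.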
for y in ((2 << ydec)..rows).step_by(1 << ydec) { // Check for vertical edge at first MI block boundary on this row if cols > 1 << xdec { filter_v_edge( deblock, blocks, TileBlockOffset(BlockOffset { x: 1 << xdec, y }), p, pli, bd, xdec, ydec, ); } // run the rest of the row with both vertical and horizontal edge filtering. // Horizontal lags vertical edge by one row and two columns. for x in (2 << xdec..cols).step_by(1 << xdec) { filter_v_edge( deblock, blocks, TileBlockOffset(BlockOffset { x, y }), p, pli, bd, xdec, ydec, ); filter_h_edge( deblock, blocks, TileBlockOffset(BlockOffset { x: x - (2 << xdec), y: y - (1 << ydec), }), p, pli, bd, xdec, ydec, ); } // ..and the last two horizontal edges for the row if cols >= 2 << xdec { filter_h_edge( deblock, blocks, TileBlockOffset(BlockOffset { x: cols - (2 << xdec), y: y - (1 << ydec), }), p, pli, bd, xdec, ydec, ); } if cols >= 1 << xdec { filter_h_edge( deblock, blocks, TileBlockOffset(BlockOffset { x: cols - (1 << xdec), y: y - (1 << ydec), }), p, pli, bd, xdec, ydec, ); } } // Last horizontal row, vertical is already complete if rows > 1 << ydec { for x in (0..cols).step_by(1 << xdec) { filter_h_edge( deblock, blocks, TileBlockOffset(BlockOffset { x, y: rows - (1 << ydec) }), p, pli, bd, xdec, ydec, ); } } } // sse count of all edges in a single plane, accumulates into vertical and horizontal counts fn sse_plane( rec: &PlaneRegion, src: &PlaneRegion, v_sse: &mut [i64; MAX_LOOP_FILTER + 2], h_sse: &mut [i64; MAX_LOOP_FILTER + 2], pli: usize, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, ) { let xdec = rec.plane_cfg.xdec; let ydec = rec.plane_cfg.ydec; assert!(xdec <= 1 && ydec <= 1); let rect = rec.rect(); let cols = (cmp::min( blocks.cols(), (crop_w - rect.x as usize + MI_SIZE - 1) >> MI_SIZE_LOG2, ) + (1 << xdec >> 1)) >> xdec << xdec; // Clippy can go suck an egg let rows = (cmp::min( blocks.rows(), (crop_h - rect.y as usize + MI_SIZE - 1) >> MI_SIZE_LOG2, ) + (1 << ydec >> 1)) >> ydec << ydec; // Clippy can go suck an egg // No horizontal edge filtering along top of frame for x in (1 << xdec..cols).step_by(1 << xdec) { sse_v_edge( blocks, TileBlockOffset(BlockOffset { x, y: 0 }), rec, src, v_sse, pli, bd, xdec, ydec, ); } // Unlike actual filtering, we're counting horizontal and vertical // as separable cases. No need to lag the horizontal processing // behind vertical. 
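// Nothing is written back in this pass; edges are only measured against the
// source, so the two directions can be visited in any order.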
for y in (1 << ydec..rows).step_by(1 << ydec) { // No vertical filtering along left edge of frame sse_h_edge( blocks, TileBlockOffset(BlockOffset { x: 0, y }), rec, src, h_sse, pli, bd, xdec, ydec, ); for x in (1 << xdec..cols).step_by(1 << xdec) { sse_v_edge( blocks, TileBlockOffset(BlockOffset { x, y }), rec, src, v_sse, pli, bd, xdec, ydec, ); sse_h_edge( blocks, TileBlockOffset(BlockOffset { x, y }), rec, src, h_sse, pli, bd, xdec, ydec, ); } } } // Deblocks all edges in all planes of a frame #[profiling::function] pub fn deblock_filter_frame( deblock: &DeblockState, tile: &mut TileMut, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, planes: usize, ) { tile.planes[..planes].par_iter_mut().enumerate().for_each(|(pli, plane)| { deblock_plane(deblock, plane, pli, blocks, crop_w, crop_h, bd); }); } fn sse_optimize( rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, crop_h: usize, bd: usize, monochrome: bool, ) -> [u8; 4] { // i64 allows us to accumulate a total of ~ 35 bits worth of pixels assert!( ILog::ilog(input.planes[0].plane_cfg.width) + ILog::ilog(input.planes[0].plane_cfg.height) < 35 ); let mut level = [0; 4]; let planes = if monochrome { 1 } else { MAX_PLANES }; for pli in 0..planes { let mut v_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; let mut h_tally: [i64; MAX_LOOP_FILTER + 2] = [0; MAX_LOOP_FILTER + 2]; sse_plane( &rec.planes[pli], &input.planes[pli], &mut v_tally, &mut h_tally, pli, blocks, crop_w, crop_h, bd, ); for i in 1..=MAX_LOOP_FILTER { v_tally[i] += v_tally[i - 1]; h_tally[i] += h_tally[i - 1]; } match pli { 0 => { let mut best_v = 999; let mut best_h = 999; for i in 0..=MAX_LOOP_FILTER { if best_v == 999 || v_tally[best_v] > v_tally[i] { best_v = i; }; if best_h == 999 || h_tally[best_h] > h_tally[i] { best_h = i; }; } level[0] = best_v as u8; level[1] = best_h as u8; } 1 | 2 => { let mut best = 999; for i in 0..=MAX_LOOP_FILTER { if best == 999 || v_tally[best] + h_tally[best] > v_tally[i] + h_tally[i] { best = i; }; } level[pli + 1] = best as u8; } _ => unreachable!(), } } level } #[profiling::function] pub fn deblock_filter_optimize( fi: &FrameInvariants, rec: &Tile, input: &Tile, blocks: &TileBlocks, crop_w: usize, crop_h: usize, ) -> [u8; 4] { if fi.config.speed_settings.fast_deblock { let q = ac_q(fi.base_q_idx, 0, fi.sequence.bit_depth).get() as i32; let level = clamp( match fi.sequence.bit_depth { 8 => { if fi.frame_type == FrameType::KEY { (q * 17563 - 421_574 + (1 << 18 >> 1)) >> 18 } else { (q * 6017 + 650_707 + (1 << 18 >> 1)) >> 18 } } 10 => { if fi.frame_type == FrameType::KEY { ((q * 20723 + 4_060_632 + (1 << 20 >> 1)) >> 20) - 4 } else { (q * 20723 + 4_060_632 + (1 << 20 >> 1)) >> 20 } } 12 => { if fi.frame_type == FrameType::KEY { ((q * 20723 + 16_242_526 + (1 << 22 >> 1)) >> 22) - 4 } else { (q * 20723 + 16_242_526 + (1 << 22 >> 1)) >> 22 } } _ => unreachable!(), }, 0, MAX_LOOP_FILTER as i32, ) as u8; [level; 4] } else { // Deblocking happens in 4x4 (luma) units; luma x,y are clipped to // the *crop frame* of the entire frame by 4x4 block. sse_optimize( rec, input, blocks, crop_w, crop_h, fi.sequence.bit_depth, fi.sequence.chroma_sampling == Cs400, ) } } rav1e-0.7.1/src/dist.rs000064400000000000000000000405171046102023000127560ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::dist::*; } else if #[cfg(asm_neon)] { pub use crate::asm::aarch64::dist::*; } else { pub use self::rust::*; } } pub(crate) mod rust { use crate::activity::apply_ssim_boost; use crate::cpu_features::CpuFeatureLevel; use crate::tiling::*; use crate::util::*; use crate::encoder::IMPORTANCE_BLOCK_SIZE; use crate::rdo::DistortionScale; /// Compute the sum of absolute differences over a block. /// w and h can be at most 128, the size of the largest block. pub fn get_sad( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, ) -> u32 { debug_assert!(w <= 128 && h <= 128); let plane_org = plane_org.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); let plane_ref = plane_ref.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); plane_org .rows_iter() .zip(plane_ref.rows_iter()) .map(|(src, dst)| { src .iter() .zip(dst) .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) .sum::() }) .sum() } #[inline(always)] const fn butterfly(a: i32, b: i32) -> (i32, i32) { ((a + b), (a - b)) } #[inline(always)] #[allow(clippy::identity_op, clippy::erasing_op)] fn hadamard4_1d< const LEN: usize, const N: usize, const STRIDE0: usize, const STRIDE1: usize, >( data: &mut [i32; LEN], ) { for i in 0..N { let sub: &mut [i32] = &mut data[i * STRIDE0..]; let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); let (b0, b2) = butterfly(a0, a2); let (b1, b3) = butterfly(a1, a3); sub[0 * STRIDE1] = b0; sub[1 * STRIDE1] = b1; sub[2 * STRIDE1] = b2; sub[3 * STRIDE1] = b3; } } #[inline(always)] #[allow(clippy::identity_op, clippy::erasing_op)] fn hadamard8_1d< const LEN: usize, const N: usize, const STRIDE0: usize, const STRIDE1: usize, >( data: &mut [i32; LEN], ) { for i in 0..N { let sub: &mut [i32] = &mut data[i * STRIDE0..]; let (a0, a1) = butterfly(sub[0 * STRIDE1], sub[1 * STRIDE1]); let (a2, a3) = butterfly(sub[2 * STRIDE1], sub[3 * STRIDE1]); let (a4, a5) = butterfly(sub[4 * STRIDE1], sub[5 * STRIDE1]); let (a6, a7) = butterfly(sub[6 * STRIDE1], sub[7 * STRIDE1]); let (b0, b2) = butterfly(a0, a2); let (b1, b3) = butterfly(a1, a3); let (b4, b6) = butterfly(a4, a6); let (b5, b7) = butterfly(a5, a7); let (c0, c4) = butterfly(b0, b4); let (c1, c5) = butterfly(b1, b5); let (c2, c6) = butterfly(b2, b6); let (c3, c7) = butterfly(b3, b7); sub[0 * STRIDE1] = c0; sub[1 * STRIDE1] = c1; sub[2 * STRIDE1] = c2; sub[3 * STRIDE1] = c3; sub[4 * STRIDE1] = c4; sub[5 * STRIDE1] = c5; sub[6 * STRIDE1] = c6; sub[7 * STRIDE1] = c7; } } #[inline(always)] fn hadamard2d( data: &mut [i32; LEN], ) { /*Vertical transform.*/ let vert_func = if H == 4 { hadamard4_1d:: } else { hadamard8_1d:: }; vert_func(data); /*Horizontal transform.*/ let horz_func = if W == 4 { hadamard4_1d:: } else { hadamard8_1d:: }; horz_func(data); } // SAFETY: The length of data must be 16. unsafe fn hadamard4x4(data: &mut [i32]) { hadamard2d::<{ 4 * 4 }, 4, 4>(&mut *(data.as_mut_ptr() as *mut [i32; 16])); } // SAFETY: The length of data must be 64. 
unsafe fn hadamard8x8(data: &mut [i32]) { hadamard2d::<{ 8 * 8 }, 8, 8>(&mut *(data.as_mut_ptr() as *mut [i32; 64])); } /// Sum of absolute transformed differences over a block. /// w and h can be at most 128, the size of the largest block. /// Use the sum of 4x4 and 8x8 hadamard transforms for the transform, but /// revert to sad on edges when these transforms do not fit into w and h. /// 4x4 transforms instead of 8x8 transforms when width or height < 8. pub fn get_satd( plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, w: usize, h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, ) -> u32 { assert!(w <= 128 && h <= 128); assert!(plane_org.rect().width >= w && plane_org.rect().height >= h); assert!(plane_ref.rect().width >= w && plane_ref.rect().height >= h); // Size of hadamard transform should be 4x4 or 8x8 // 4x* and *x4 use 4x4 and all other use 8x8 let size: usize = w.min(h).min(8); let tx2d = if size == 4 { hadamard4x4 } else { hadamard8x8 }; let mut sum: u64 = 0; // Loop over chunks the size of the chosen transform for chunk_y in (0..h).step_by(size) { let chunk_h = (h - chunk_y).min(size); for chunk_x in (0..w).step_by(size) { let chunk_w = (w - chunk_x).min(size); let chunk_area: Area = Area::Rect { x: chunk_x as isize, y: chunk_y as isize, width: chunk_w, height: chunk_h, }; let chunk_org = plane_org.subregion(chunk_area); let chunk_ref = plane_ref.subregion(chunk_area); // Revert to sad on edge blocks (frame edges) if chunk_w != size || chunk_h != size { sum += get_sad( &chunk_org, &chunk_ref, chunk_w, chunk_h, _bit_depth, _cpu, ) as u64; continue; } let buf: &mut [i32] = &mut [0; 8 * 8][..size * size]; // Move the difference of the transforms to a buffer for (row_diff, (row_org, row_ref)) in buf .chunks_mut(size) .zip(chunk_org.rows_iter().zip(chunk_ref.rows_iter())) { for (diff, (a, b)) in row_diff.iter_mut().zip(row_org.iter().zip(row_ref.iter())) { *diff = i32::cast_from(*a) - i32::cast_from(*b); } } // Perform the hadamard transform on the differences // SAFETY: A sufficient number elements exist for the size of the transform. unsafe { tx2d(buf); } // Sum the absolute values of the transformed differences sum += buf.iter().map(|a| a.unsigned_abs() as u64).sum::(); } } // Normalize the results let ln = msb(size as i32) as u64; ((sum + (1 << ln >> 1)) >> ln) as u32 } /// Number of bits rounded off before summing in `get_weighted_sse` pub const GET_WEIGHTED_SSE_SHIFT: u8 = 8; /// Computes weighted sum of squared error. /// /// Each scale is applied to a 4x4 region in the provided inputs. Each scale /// value is a fixed point number, currently [`DistortionScale`]. /// /// Implementations can require alignment (`bw` (block width) for [`src1`] and /// [`src2`] and `bw/4` for `scale`). #[inline(never)] pub fn get_weighted_sse( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, scale: &[u32], scale_stride: usize, w: usize, h: usize, _bit_depth: usize, _cpu: CpuFeatureLevel, ) -> u64 { let src1 = src1.subregion(Area::Rect { x: 0, y: 0, width: w, height: h }); // Always chunk and apply scaling on the sse of squares the size of // decimated/sub-sampled importance block sizes. // Warning: Changing this will require changing/disabling assembly. 
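// Each `scale` entry is a DistortionScale fixed-point weight for one chunk's
// SSE; GET_WEIGHTED_SSE_SHIFT bits of each product are rounded away here and
// the remaining fixed-point factor is divided back out at the end.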
let chunk_size: usize = IMPORTANCE_BLOCK_SIZE >> 1; // Iterator of a row of scales, stretched out to be per row let scales = scale.chunks_exact(scale_stride); let sse = src1 .vert_windows(chunk_size) .step_by(chunk_size) .zip(src2.vert_windows(chunk_size).step_by(chunk_size)) .zip(scales) .map(|((row1, row2), scales)| { row1 .horz_windows(chunk_size) .step_by(chunk_size) .zip(row2.horz_windows(chunk_size).step_by(chunk_size)) .zip(scales) .map(|((chunk1, chunk2), &scale)| { let sum = chunk1 .rows_iter() .zip(chunk2.rows_iter()) .map(|(chunk_row1, chunk_row2)| { chunk_row1 .iter() .zip(chunk_row2) .map(|(&a, &b)| { let c = i32::cast_from(a) - i32::cast_from(b); (c * c) as u32 }) .sum::() }) .sum::(); (sum as u64 * scale as u64 + (1 << GET_WEIGHTED_SSE_SHIFT >> 1)) >> GET_WEIGHTED_SSE_SHIFT }) .sum::() }) .sum::(); let den = DistortionScale::new(1, 1 << GET_WEIGHTED_SSE_SHIFT).0 as u64; (sse + (den >> 1)) / den } /// Number of bits of precision used in `AREA_DIVISORS` const AREA_DIVISOR_BITS: u8 = 14; /// Lookup table for 2^`AREA_DIVISOR_BITS` / (1 + x) #[rustfmt::skip] const AREA_DIVISORS: [u16; 64] = [ 16384, 8192, 5461, 4096, 3277, 2731, 2341, 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, 1024, 964, 910, 862, 819, 780, 745, 712, 683, 655, 630, 607, 585, 565, 546, 529, 512, 496, 482, 468, 455, 443, 431, 420, 410, 400, 390, 381, 372, 364, 356, 349, 341, 334, 328, 321, 315, 309, 303, 298, 293, 287, 282, 278, 273, 269, 264, 260, 256, ]; /// Computes a distortion metric of the sum of squares weighted by activity. /// w and h should be <= 8. #[inline(never)] pub fn cdef_dist_kernel( src: &PlaneRegion<'_, T>, dst: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, _cpu: CpuFeatureLevel, ) -> u32 { // TODO: Investigate using different constants in ssim boost for block sizes // smaller than 8x8. debug_assert!(src.plane_cfg.xdec == 0); debug_assert!(src.plane_cfg.ydec == 0); debug_assert!(dst.plane_cfg.xdec == 0); debug_assert!(dst.plane_cfg.ydec == 0); // Limit kernel to 8x8 debug_assert!(w <= 8); debug_assert!(h <= 8); // Compute the following summations. let mut sum_s: u32 = 0; // sum(src_{i,j}) let mut sum_d: u32 = 0; // sum(dst_{i,j}) let mut sum_s2: u32 = 0; // sum(src_{i,j}^2) let mut sum_d2: u32 = 0; // sum(dst_{i,j}^2) let mut sum_sd: u32 = 0; // sum(src_{i,j} * dst_{i,j}) for (row1, row2) in src.rows_iter().take(h).zip(dst.rows_iter()) { for (s, d) in row1[..w].iter().zip(row2) { let s: u32 = u32::cast_from(*s); let d: u32 = u32::cast_from(*d); sum_s += s; sum_d += d; sum_s2 += s * s; sum_d2 += d * d; sum_sd += s * d; } } // To get the distortion, compute sum of squared error and apply a weight // based on the variance of the two planes. let sse = sum_d2 + sum_s2 - 2 * sum_sd; // Convert to 64-bits to avoid overflow when squaring let sum_s = sum_s as u64; let sum_d = sum_d as u64; // Calculate the variance (more accurately variance*area) of each plane. // var[iance] = avg(X^2) - avg(X)^2 = sum(X^2) / n - sum(X)^2 / n^2 // (n = # samples i.e. area) // var * n = sum(X^2) - sum(X)^2 / n // When w and h are powers of two, this can be done via shifting. let div = AREA_DIVISORS[w * h - 1] as u64; let div_shift = AREA_DIVISOR_BITS; // Due to rounding, negative values can occur when w or h aren't powers of // two. Saturate to avoid underflow. 
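// e.g. for a full 8x8 block, div = AREA_DIVISORS[63] = 256 and
// (sum * sum * 256) >> 14 == sum * sum / 64, i.e. sum(X)^2 / n.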
let mut svar = sum_s2.saturating_sub( ((sum_s * sum_s * div + (1 << div_shift >> 1)) >> div_shift) as u32, ); let mut dvar = sum_d2.saturating_sub( ((sum_d * sum_d * div + (1 << div_shift >> 1)) >> div_shift) as u32, ); // Scale variances up to 8x8 size. // scaled variance = var * (8x8) / wxh // For 8x8, this is a nop. For powers of 2, this is doable with shifting. // TODO: It should be possible and faster to do this adjustment in ssim boost let scale_shift = AREA_DIVISOR_BITS - 6; svar = ((svar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32; dvar = ((dvar as u64 * div + (1 << scale_shift >> 1)) >> scale_shift) as u32; apply_ssim_boost(sse, svar, dvar, bit_depth) } } #[cfg(test)] pub mod test { use super::*; use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::tiling::Area; use crate::util::Pixel; // Generate plane data for get_sad_same() fn setup_planes() -> (Plane, Plane) { // Two planes with different strides let mut input_plane = Plane::new(640, 480, 0, 0, 128 + 8, 128 + 8); let mut rec_plane = Plane::new(640, 480, 0, 0, 2 * 128 + 8, 2 * 128 + 8); // Make the test pattern robust to data alignment let xpad_off = (input_plane.cfg.xorigin - input_plane.cfg.xpad) as i32 - 8i32; for (i, row) in input_plane.data.chunks_mut(input_plane.cfg.stride).enumerate() { for (j, pixel) in row.iter_mut().enumerate() { let val = ((j + i) as i32 - xpad_off) & 255i32; assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); *pixel = T::cast_from(val); } } for (i, row) in rec_plane.data.chunks_mut(rec_plane.cfg.stride).enumerate() { for (j, pixel) in row.iter_mut().enumerate() { let val = (j as i32 - i as i32 - xpad_off) & 255i32; assert!(val >= u8::MIN.into() && val <= u8::MAX.into()); *pixel = T::cast_from(val); } } (input_plane, rec_plane) } // Regression and validation test for SAD computation fn get_sad_same_inner() { // dynamic allocation: test let blocks: Vec<(usize, usize, u32)> = vec![ (4, 4, 1912), (4, 8, 4296), (8, 4, 3496), (8, 8, 7824), (8, 16, 16592), (16, 8, 14416), (16, 16, 31136), (16, 32, 60064), (32, 16, 59552), (32, 32, 120128), (32, 64, 186688), (64, 32, 250176), (64, 64, 438912), (64, 128, 654272), (128, 64, 1016768), (128, 128, 1689792), (4, 16, 8680), (16, 4, 6664), (8, 32, 31056), (32, 8, 27600), (16, 64, 93344), (64, 16, 116384), ]; let bit_depth: usize = 8; let (input_plane, rec_plane) = setup_planes::(); for (w, h, distortion) in blocks { let area = Area::StartingAt { x: 32, y: 40 }; let input_region = input_plane.region(area); let rec_region = rec_plane.region(area); assert_eq!( distortion, get_sad( &input_region, &rec_region, w, h, bit_depth, CpuFeatureLevel::default() ) ); } } #[test] fn get_sad_same_u8() { get_sad_same_inner::(); } #[test] fn get_sad_same_u16() { get_sad_same_inner::(); } fn get_satd_same_inner() { let blocks: Vec<(usize, usize, u32)> = vec![ (4, 4, 1408), (4, 8, 2016), (8, 4, 1816), (8, 8, 3984), (8, 16, 5136), (16, 8, 4864), (16, 16, 9984), (16, 32, 13824), (32, 16, 13760), (32, 32, 27952), (32, 64, 37168), (64, 32, 45104), (64, 64, 84176), (64, 128, 127920), (128, 64, 173680), (128, 128, 321456), (4, 16, 3136), (16, 4, 2632), (8, 32, 7056), (32, 8, 6624), (16, 64, 18432), (64, 16, 21312), ]; let bit_depth: usize = 8; let (input_plane, rec_plane) = setup_planes::(); for (w, h, distortion) in blocks { let area = Area::StartingAt { x: 32, y: 40 }; let input_region = input_plane.region(area); let rec_region = rec_plane.region(area); assert_eq!( distortion, get_satd( &input_region, &rec_region, w, h, bit_depth, 
CpuFeatureLevel::default() ) ); } } #[test] fn get_satd_same_u8() { get_satd_same_inner::(); } #[test] fn get_satd_same_u16() { get_satd_same_inner::(); } } rav1e-0.7.1/src/ec.rs000064400000000000000000001063721046102023000124040ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_camel_case_types)] cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::ec::*; } else { pub use self::rust::*; } } use crate::context::{CDFContext, CDFContextLog, CDFOffset}; use bitstream_io::{BigEndian, BitWrite, BitWriter}; use std::io; pub const OD_BITRES: u8 = 3; const EC_PROB_SHIFT: u32 = 6; const EC_MIN_PROB: u32 = 4; type ec_window = u32; /// Public trait interface to a bitstream `Writer`: a `Counter` can be /// used to count bits for cost analysis without actually storing /// anything (using a new `WriterCounter` as a `Writer`), to record /// tokens for later writing (using a new `WriterRecorder` as a /// `Writer`) to write actual final bits out using a range encoder /// (using a new `WriterEncoder` as a `Writer`). A `WriterRecorder`'s /// contents can be replayed into a `WriterEncoder`. pub trait Writer { /// Write a symbol `s`, using the passed in cdf reference; leaves `cdf` unchanged fn symbol(&mut self, s: u32, cdf: &[u16; CDF_LEN]); /// return approximate number of fractional bits in `OD_BITRES` /// precision to write a symbol `s` using the passed in cdf reference; /// leaves `cdf` unchanged fn symbol_bits(&self, s: u32, cdf: &[u16]) -> u32; /// Write a symbol `s`, using the passed in cdf reference; updates the referenced cdf. fn symbol_with_update( &mut self, s: u32, cdf: CDFOffset, log: &mut CDFContextLog, fc: &mut CDFContext, ); /// Write a bool using passed in probability fn bool(&mut self, val: bool, f: u16); /// Write a single bit with flat probability fn bit(&mut self, bit: u16); /// Write literal `bits` with flat probability fn literal(&mut self, bits: u8, s: u32); /// Write passed `level` as a golomb code fn write_golomb(&mut self, level: u32); /// Write a value `v` in `[0, n-1]` quasi-uniformly fn write_quniform(&mut self, n: u32, v: u32); /// Return fractional bits needed to write a value `v` in `[0, n-1]` /// quasi-uniformly fn count_quniform(&self, n: u32, v: u32) -> u32; /// Write symbol `v` in `[0, n-1]` with parameter `k` as finite subexponential fn write_subexp(&mut self, n: u32, k: u8, v: u32); /// Return fractional bits needed to write symbol v in `[0, n-1]` with /// parameter k as finite subexponential fn count_subexp(&self, n: u32, k: u8, v: u32) -> u32; /// Write symbol `v` in `[0, n-1]` with parameter `k` as finite /// subexponential based on a reference `r` also in `[0, n-1]`. fn write_unsigned_subexp_with_ref(&mut self, v: u32, mx: u32, k: u8, r: u32); /// Return fractional bits needed to write symbol `v` in `[0, n-1]` with /// parameter `k` as finite subexponential based on a reference `r` /// also in `[0, n-1]`. 
fn count_unsigned_subexp_with_ref( &self, v: u32, mx: u32, k: u8, r: u32, ) -> u32; /// Write symbol v in `[-(n-1), n-1]` with parameter k as finite /// subexponential based on a reference ref also in `[-(n-1), n-1]`. fn write_signed_subexp_with_ref( &mut self, v: i32, low: i32, high: i32, k: u8, r: i32, ); /// Return fractional bits needed to write symbol `v` in `[-(n-1), n-1]` /// with parameter `k` as finite subexponential based on a reference /// `r` also in `[-(n-1), n-1]`. fn count_signed_subexp_with_ref( &self, v: i32, low: i32, high: i32, k: u8, r: i32, ) -> u32; /// Return current length of range-coded bitstream in integer bits fn tell(&mut self) -> u32; /// Return current length of range-coded bitstream in fractional /// bits with `OD_BITRES` decimal precision fn tell_frac(&mut self) -> u32; /// Save current point in coding/recording to a checkpoint fn checkpoint(&mut self) -> WriterCheckpoint; /// Restore saved position in coding/recording from a checkpoint fn rollback(&mut self, _: &WriterCheckpoint); /// Add additional bits from rate estimators without coding a real symbol fn add_bits_frac(&mut self, bits_frac: u32); } /// `StorageBackend` is an internal trait used to tie a specific `Writer` /// implementation's storage to the generic `Writer`. It would be /// private, but Rust is deprecating 'private trait in a public /// interface' support. pub trait StorageBackend { /// Store partially-computed range code into given storage backend fn store(&mut self, fl: u16, fh: u16, nms: u16); /// Return bit-length of encoded stream to date fn stream_bits(&mut self) -> usize; /// Backend implementation of checkpoint to pass through Writer interface fn checkpoint(&mut self) -> WriterCheckpoint; /// Backend implementation of rollback to pass through Writer interface fn rollback(&mut self, _: &WriterCheckpoint); } #[derive(Debug, Clone)] pub struct WriterBase { /// The number of values in the current range. rng: u16, /// The number of bits of data in the current value. cnt: i16, #[cfg(feature = "desync_finder")] /// Debug enable flag debug: bool, /// Extra offset added to tell() and tell_frac() to approximate costs /// of actually coding a symbol fake_bits_frac: u32, /// Use-specific storage s: S, } #[derive(Debug, Clone)] pub struct WriterCounter { /// Bits that would be shifted out to date bits: usize, } #[derive(Debug, Clone)] pub struct WriterRecorder { /// Storage for tokens storage: Vec<(u16, u16, u16)>, /// Bits that would be shifted out to date bits: usize, } #[derive(Debug, Clone)] pub struct WriterEncoder { /// A buffer for output bytes with their associated carry flags. precarry: Vec, /// The low end of the current range. low: ec_window, } #[derive(Clone)] pub struct WriterCheckpoint { /// Stream length coded/recorded to date, in the unit used by the Writer, /// which may be bytes or bits. This depends on the assumption /// that a Writer will only ever restore its own Checkpoint. stream_size: usize, /// To be defined by backend backend_var: usize, /// Saved number of values in the current range. rng: u16, /// Saved number of bits of data in the current value. 
cnt: i16, } /// Constructor for a counting Writer impl WriterCounter { #[inline] pub const fn new() -> WriterBase { WriterBase::new(WriterCounter { bits: 0 }) } } /// Constructor for a recording Writer impl WriterRecorder { #[inline] pub const fn new() -> WriterBase { WriterBase::new(WriterRecorder { storage: Vec::new(), bits: 0 }) } } /// Constructor for a encoding Writer impl WriterEncoder { #[inline] pub const fn new() -> WriterBase { WriterBase::new(WriterEncoder { precarry: Vec::new(), low: 0 }) } } /// The Counter stores nothing we write to it, it merely counts the /// bit usage like in an Encoder for cost analysis. impl StorageBackend for WriterBase { #[inline] fn store(&mut self, fl: u16, fh: u16, nms: u16) { let (_l, r) = self.lr_compute(fl, fh, nms); let d = r.leading_zeros() as usize; self.s.bits += d; self.rng = r << d; } #[inline] fn stream_bits(&mut self) -> usize { self.s.bits } #[inline] fn checkpoint(&mut self) -> WriterCheckpoint { WriterCheckpoint { stream_size: self.s.bits, backend_var: 0, rng: self.rng, // We do not use `cnt` within Counter, but setting it here allows the compiler // to do a 32-bit merged load/store. cnt: self.cnt, } } #[inline] fn rollback(&mut self, checkpoint: &WriterCheckpoint) { self.rng = checkpoint.rng; self.s.bits = checkpoint.stream_size; } } /// The Recorder does not produce a range-coded bitstream, but it /// still tracks the range coding progress like in an Encoder, as it /// neds to be able to report bit costs for RDO decisions. It stores a /// pair of mostly-computed range coding values per token recorded. impl StorageBackend for WriterBase { #[inline] fn store(&mut self, fl: u16, fh: u16, nms: u16) { let (_l, r) = self.lr_compute(fl, fh, nms); let d = r.leading_zeros() as usize; self.s.bits += d; self.rng = r << d; self.s.storage.push((fl, fh, nms)); } #[inline] fn stream_bits(&mut self) -> usize { self.s.bits } #[inline] fn checkpoint(&mut self) -> WriterCheckpoint { WriterCheckpoint { stream_size: self.s.bits, backend_var: self.s.storage.len(), rng: self.rng, cnt: self.cnt, } } #[inline] fn rollback(&mut self, checkpoint: &WriterCheckpoint) { self.rng = checkpoint.rng; self.cnt = checkpoint.cnt; self.s.bits = checkpoint.stream_size; self.s.storage.truncate(checkpoint.backend_var); } } /// An Encoder produces an actual range-coded bitstream from passed in /// tokens. It does not retain any information about the coded /// tokens, only the resulting bitstream, and so it cannot be replayed /// (only checkpointed and rolled back). 
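// Each value pushed to `precarry` by `store()` is an output byte plus any
// carry that has not yet been resolved, which is why it is stored as u16.
// The carries are folded in later by `done()`, which walks the buffer from
// the end: for example, a precarry buffer of [0x0012, 0x0134] becomes the
// bytes [0x13, 0x34], the ninth bit of the second entry rippling into the
// first output byte.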
impl StorageBackend for WriterBase { fn store(&mut self, fl: u16, fh: u16, nms: u16) { let (l, r) = self.lr_compute(fl, fh, nms); let mut low = l + self.s.low; let mut c = self.cnt; let d = r.leading_zeros() as usize; let mut s = c + (d as i16); if s >= 0 { c += 16; let mut m = (1 << c) - 1; if s >= 8 { self.s.precarry.push((low >> c) as u16); low &= m; c -= 8; m >>= 8; } self.s.precarry.push((low >> c) as u16); s = c + (d as i16) - 24; low &= m; } self.s.low = low << d; self.rng = r << d; self.cnt = s; } #[inline] fn stream_bits(&mut self) -> usize { self.s.precarry.len() * 8 } #[inline] fn checkpoint(&mut self) -> WriterCheckpoint { WriterCheckpoint { stream_size: self.s.precarry.len(), backend_var: self.s.low as usize, rng: self.rng, cnt: self.cnt, } } fn rollback(&mut self, checkpoint: &WriterCheckpoint) { self.rng = checkpoint.rng; self.cnt = checkpoint.cnt; self.s.low = checkpoint.backend_var as ec_window; self.s.precarry.truncate(checkpoint.stream_size); } } /// A few local helper functions needed by the Writer that are not /// part of the public interface. impl WriterBase { /// Internal constructor called by the subtypes that implement the /// actual encoder and Recorder. #[inline] #[cfg(not(feature = "desync_finder"))] const fn new(storage: S) -> Self { WriterBase { rng: 0x8000, cnt: -9, fake_bits_frac: 0, s: storage } } #[inline] #[cfg(feature = "desync_finder")] fn new(storage: S) -> Self { WriterBase { rng: 0x8000, cnt: -9, debug: std::env::var_os("RAV1E_DEBUG").is_some(), fake_bits_frac: 0, s: storage, } } /// Compute low and range values from token cdf values and local state const fn lr_compute(&self, fl: u16, fh: u16, nms: u16) -> (ec_window, u16) { let r = self.rng as u32; debug_assert!(32768 <= r); let mut u = (((r >> 8) * (fl as u32 >> EC_PROB_SHIFT)) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB * nms as u32; if fl >= 32768 { u = r; } let v = (((r >> 8) * (fh as u32 >> EC_PROB_SHIFT)) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB * (nms - 1) as u32; (r - u, (u - v) as u16) } /// Given the current total integer number of bits used and the current value of /// rng, computes the fraction number of bits used to `OD_BITRES` precision. /// This is used by `od_ec_enc_tell_frac()` and `od_ec_dec_tell_frac()`. /// `nbits_total`: The number of whole bits currently used, i.e., the value /// returned by `od_ec_enc_tell()` or `od_ec_dec_tell()`. /// `rng`: The current value of rng from either the encoder or decoder state. /// Return: The number of bits scaled by `2**OD_BITRES`. /// This will always be slightly larger than the exact value (e.g., all /// rounding error is in the positive direction). fn frac_compute(nbits_total: u32, mut rng: u32) -> u32 { // To handle the non-integral number of bits still left in the encoder/decoder // state, we compute the worst-case number of bits of val that must be // encoded to ensure that the value is inside the range for any possible // subsequent bits. // The computation here is independent of val itself (the decoder does not // even track that value), even though the real number of bits used after // od_ec_enc_done() may be 1 smaller if rng is a power of two and the // corresponding trailing bits of val are all zeros. // If we did try to track that special case, then coding a value with a // probability of 1/(1 << n) might sometimes appear to use more than n bits. // This may help explain the surprising result that a newly initialized // encoder or decoder claims to have used 1 bit. 
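// As a concrete illustration of that last point: a freshly created
// WriterEncoder has stream_bits() == 0, cnt == -9 and rng == 0x8000, so
// tell() reports 0 - 9 + 10 == 1 whole bit. Passing that through this
// function with rng == 0x8000 extracts no fractional bits in the loop
// below (0x8000 * 0x8000 >> 15 == 0x8000, so b stays 0), and the result
// is 1 << OD_BITRES == 8, i.e. exactly one bit in Q3 precision.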
let nbits = nbits_total << OD_BITRES; let mut l = 0; for _ in 0..OD_BITRES { rng = (rng * rng) >> 15; let b = rng >> 16; l = (l << 1) | b; rng >>= b; } nbits - l } const fn recenter(r: u32, v: u32) -> u32 { if v > (r << 1) { v } else if v >= r { (v - r) << 1 } else { ((r - v) << 1) - 1 } } #[cfg(feature = "desync_finder")] fn print_backtrace(&self, s: u32) { let mut depth = 3; backtrace::trace(|frame| { let ip = frame.ip(); depth -= 1; if depth == 0 { backtrace::resolve(ip, |symbol| { if let Some(name) = symbol.name() { println!("Writing symbol {} from {}", s, name); } }); false } else { true } }); } } /// Replay implementation specific to the Recorder impl WriterBase { /// Replays the partially-computed range tokens out of the Recorder's /// storage and into the passed in Writer, which may be an Encoder /// or another Recorder. Clears the Recorder after replay. pub fn replay(&mut self, dest: &mut dyn StorageBackend) { for &(fl, fh, nms) in &self.s.storage { dest.store(fl, fh, nms); } self.rng = 0x8000; self.cnt = -9; self.s.storage.truncate(0); self.s.bits = 0; } } /// Done implementation specific to the Encoder impl WriterBase { /// Indicates that there are no more symbols to encode. Flushes /// remaining state into coding and returns a vector containing the /// final bitstream. pub fn done(&mut self) -> Vec { // We output the minimum number of bits that ensures that the symbols encoded // thus far will be decoded correctly regardless of the bits that follow. let l = self.s.low; let mut c = self.cnt; let mut s = 10; let m = 0x3FFF; let mut e = ((l + m) & !m) | (m + 1); s += c; if s > 0 { let mut n = (1 << (c + 16)) - 1; loop { self.s.precarry.push((e >> (c + 16)) as u16); e &= n; s -= 8; c -= 8; n >>= 8; if s <= 0 { break; } } } let mut c = 0; let mut offs = self.s.precarry.len(); // dynamic allocation: grows during encode let mut out = vec![0_u8; offs]; while offs > 0 { offs -= 1; c += self.s.precarry[offs]; out[offs] = c as u8; c >>= 8; } out } } /// Generic/shared implementation for `Writer`s with `StorageBackend`s /// (ie, `Encoder`s and `Recorder`s) impl Writer for WriterBase where WriterBase: StorageBackend, { /// Encode a single binary value. /// `val`: The value to encode (0 or 1). /// `f`: The probability that the val is one, scaled by 32768. fn bool(&mut self, val: bool, f: u16) { debug_assert!(0 < f); debug_assert!(f < 32768); self.symbol(u32::from(val), &[f, 0]); } /// Encode a single boolean value. /// /// - `val`: The value to encode (`false` or `true`). /// - `f`: The probability that the `val` is `true`, scaled by `32768`. fn bit(&mut self, bit: u16) { self.bool(bit == 1, 16384); } // fake add bits fn add_bits_frac(&mut self, bits_frac: u32) { self.fake_bits_frac += bits_frac } /// Encode a literal bitstring, bit by bit in MSB order, with flat /// probability. /// /// - 'bits': Length of bitstring /// - 's': Bit string to encode fn literal(&mut self, bits: u8, s: u32) { for bit in (0..bits).rev() { self.bit((1 & (s >> bit)) as u16); } } /// Encodes a symbol given a cumulative distribution function (CDF) table in Q15. /// /// - `s`: The index of the symbol to encode. /// - `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value /// must be greater than 32704. There should be at most 16 values. /// The lower 6 bits of the last value hold the count. 
#[inline(always)] fn symbol(&mut self, s: u32, cdf: &[u16; CDF_LEN]) { debug_assert!(cdf[cdf.len() - 1] < (1 << EC_PROB_SHIFT)); let s = s as usize; debug_assert!(s < cdf.len()); // The above is stricter than the following overflow check: s <= cdf.len() let nms = cdf.len() - s; let fl = if s > 0 { // SAFETY: We asserted that s is less than the length of the cdf unsafe { *cdf.get_unchecked(s - 1) } } else { 32768 }; // SAFETY: We asserted that s is less than the length of the cdf let fh = unsafe { *cdf.get_unchecked(s) }; debug_assert!((fh >> EC_PROB_SHIFT) <= (fl >> EC_PROB_SHIFT)); debug_assert!(fl <= 32768); self.store(fl, fh, nms as u16); } /// Encodes a symbol given a cumulative distribution function (CDF) /// table in Q15, then updates the CDF probabilities to reflect we've /// written one more symbol 's'. /// /// - `s`: The index of the symbol to encode. /// - `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value /// must be greater 32704. There should be at most 16 values. /// The lower 6 bits of the last value hold the count. fn symbol_with_update( &mut self, s: u32, cdf: CDFOffset, log: &mut CDFContextLog, fc: &mut CDFContext, ) { #[cfg(feature = "desync_finder")] { if self.debug { self.print_backtrace(s); } } let cdf = log.push(fc, cdf); self.symbol(s, cdf); update_cdf(cdf, s); } /// Returns approximate cost for a symbol given a cumulative /// distribution function (CDF) table and current write state. /// /// - `s`: The index of the symbol to encode. /// - `cdf`: The CDF, such that symbol s falls in the range /// `[s > 0 ? cdf[s - 1] : 0, cdf[s])`. /// The values must be monotonically non-decreasing, and the last value /// must be greater than 32704. There should be at most 16 values. /// The lower 6 bits of the last value hold the count. fn symbol_bits(&self, s: u32, cdf: &[u16]) -> u32 { let mut bits = 0; debug_assert!(cdf[cdf.len() - 1] < (1 << EC_PROB_SHIFT)); debug_assert!(32768 <= self.rng); let rng = (self.rng >> 8) as u32; let fh = cdf[s as usize] as u32 >> EC_PROB_SHIFT; let r: u32 = if s > 0 { let fl = cdf[s as usize - 1] as u32 >> EC_PROB_SHIFT; ((rng * fl) >> (7 - EC_PROB_SHIFT)) - ((rng * fh) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB } else { let nms1 = cdf.len() as u32 - s - 1; self.rng as u32 - ((rng * fh) >> (7 - EC_PROB_SHIFT)) - nms1 * EC_MIN_PROB }; // The 9 here counteracts the offset of -9 baked into cnt. Don't include a termination bit. let pre = Self::frac_compute((self.cnt + 9) as u32, self.rng as u32); let d = r.leading_zeros() - 16; let mut c = self.cnt; let mut sh = c + (d as i16); if sh >= 0 { c += 16; if sh >= 8 { bits += 8; c -= 8; } bits += 8; sh = c + (d as i16) - 24; } // The 9 here counteracts the offset of -9 baked into cnt. Don't include a termination bit. Self::frac_compute((bits + sh + 9) as u32, r << d) - pre } /// Encode a golomb to the bitstream. 
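/// This is an exponential-Golomb code: `level + 1` is written MSB-first,
/// preceded by one fewer zero bits than its bit length. For example,
/// `level == 3` gives `x == 0b100`, which is emitted as `0b00100`.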
/// /// - 'level': passed in value to encode fn write_golomb(&mut self, level: u32) { let x = level + 1; let length = 32 - x.leading_zeros(); for _ in 0..length - 1 { self.bit(0); } for i in (0..length).rev() { self.bit(((x >> i) & 0x01) as u16); } } /// Write a value `v` in `[0, n-1]` quasi-uniformly /// - `n`: size of interval /// - `v`: value to encode fn write_quniform(&mut self, n: u32, v: u32) { if n > 1 { let l = 32 - n.leading_zeros() as u8; let m = (1 << l) - n; if v < m { self.literal(l - 1, v); } else { self.literal(l - 1, m + ((v - m) >> 1)); self.literal(1, (v - m) & 1); } } } /// Returns `QOD_BITRES` bits for a value `v` in `[0, n-1]` quasi-uniformly /// - `n`: size of interval /// - `v`: value to encode fn count_quniform(&self, n: u32, v: u32) -> u32 { let mut bits = 0; if n > 1 { let l = 32 - n.leading_zeros(); let m = (1 << l) - n; bits += (l - 1) << OD_BITRES; if v >= m { bits += 1 << OD_BITRES; } } bits } /// Write symbol `v` in `[0, n-1]` with parameter `k` as finite subexponential /// /// - `n`: size of interval /// - `k`: "parameter" /// - `v`: value to encode fn write_subexp(&mut self, n: u32, k: u8, v: u32) { let mut i = 0; let mut mk = 0; loop { let b = if i != 0 { k + i - 1 } else { k }; let a = 1 << b; if n <= mk + 3 * a { self.write_quniform(n - mk, v - mk); break; } else { let t = v >= mk + a; self.bool(t, 16384); if t { i += 1; mk += a; } else { self.literal(b, v - mk); break; } } } } /// Returns `QOD_BITRES` bits for symbol `v` in `[0, n-1]` with parameter `k` /// as finite subexponential /// /// - `n`: size of interval /// - `k`: "parameter" /// - `v`: value to encode fn count_subexp(&self, n: u32, k: u8, v: u32) -> u32 { let mut i = 0; let mut mk = 0; let mut bits = 0; loop { let b = if i != 0 { k + i - 1 } else { k }; let a = 1 << b; if n <= mk + 3 * a { bits += self.count_quniform(n - mk, v - mk); break; } else { let t = v >= mk + a; bits += 1 << OD_BITRES; if t { i += 1; mk += a; } else { bits += (b as u32) << OD_BITRES; break; } } } bits } /// Write symbol `v` in `[0, n-1]` with parameter `k` as finite /// subexponential based on a reference `r` also in `[0, n-1]`. /// /// - `v`: value to encode /// - `n`: size of interval /// - `k`: "parameter" /// - `r`: reference fn write_unsigned_subexp_with_ref(&mut self, v: u32, n: u32, k: u8, r: u32) { if (r << 1) <= n { self.write_subexp(n, k, Self::recenter(r, v)); } else { self.write_subexp(n, k, Self::recenter(n - 1 - r, n - 1 - v)); } } /// Returns `QOD_BITRES` bits for symbol `v` in `[0, n-1]` /// with parameter `k` as finite subexponential based on a /// reference `r` also in `[0, n-1]`. /// /// - `v`: value to encode /// - `n`: size of interval /// - `k`: "parameter" /// - `r`: reference fn count_unsigned_subexp_with_ref( &self, v: u32, n: u32, k: u8, r: u32, ) -> u32 { if (r << 1) <= n { self.count_subexp(n, k, Self::recenter(r, v)) } else { self.count_subexp(n, k, Self::recenter(n - 1 - r, n - 1 - v)) } } /// Write symbol `v` in `[-(n-1), n-1]` with parameter `k` as finite /// subexponential based on a reference `r` also in `[-(n-1), n-1]`. /// /// - `v`: value to encode /// - `n`: size of interval /// - `k`: "parameter" /// - `r`: reference fn write_signed_subexp_with_ref( &mut self, v: i32, low: i32, high: i32, k: u8, r: i32, ) { self.write_unsigned_subexp_with_ref( (v - low) as u32, (high - low) as u32, k, (r - low) as u32, ); } /// Returns `QOD_BITRES` bits for symbol `v` in `[-(n-1), n-1]` /// with parameter `k` as finite subexponential based on a /// reference `r` also in `[-(n-1), n-1]`. 
/// /// - `v`: value to encode /// - `n`: size of interval /// - `k`: "parameter" /// - `r`: reference fn count_signed_subexp_with_ref( &self, v: i32, low: i32, high: i32, k: u8, r: i32, ) -> u32 { self.count_unsigned_subexp_with_ref( (v - low) as u32, (high - low) as u32, k, (r - low) as u32, ) } /// Returns the number of bits "used" by the encoded symbols so far. /// This same number can be computed in either the encoder or the /// decoder, and is suitable for making coding decisions. The value /// will be the same whether using an `Encoder` or `Recorder`. /// /// Return: The integer number of bits. /// This will always be slightly larger than the exact value (e.g., all /// rounding error is in the positive direction). fn tell(&mut self) -> u32 { // The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra // bit, which we reserve for terminating the stream. (((self.stream_bits()) as i32) + (self.cnt as i32) + 10) as u32 + (self.fake_bits_frac >> 8) } /// Returns the number of bits "used" by the encoded symbols so far. /// This same number can be computed in either the encoder or the /// decoder, and is suitable for making coding decisions. The value /// will be the same whether using an `Encoder` or `Recorder`. /// /// Return: The number of bits scaled by `2**OD_BITRES`. /// This will always be slightly larger than the exact value (e.g., all /// rounding error is in the positive direction). fn tell_frac(&mut self) -> u32 { Self::frac_compute(self.tell(), self.rng as u32) + self.fake_bits_frac } /// Save current point in coding/recording to a checkpoint that can /// be restored later. A `WriterCheckpoint` can be generated for an /// `Encoder` or `Recorder`, but can only be used to rollback the `Writer` /// instance from which it was generated. fn checkpoint(&mut self) -> WriterCheckpoint { StorageBackend::checkpoint(self) } /// Roll back a given `Writer` to the state saved in the `WriterCheckpoint` /// /// - 'wc': Saved `Writer` state/posiiton to restore fn rollback(&mut self, wc: &WriterCheckpoint) { StorageBackend::rollback(self, wc) } } pub trait BCodeWriter { fn recenter_nonneg(&mut self, r: u16, v: u16) -> u16; fn recenter_finite_nonneg(&mut self, n: u16, r: u16, v: u16) -> u16; /// # Errors /// /// - Returns `std::io::Error` if the writer cannot be written to. fn write_quniform(&mut self, n: u16, v: u16) -> Result<(), std::io::Error>; /// # Errors /// /// - Returns `std::io::Error` if the writer cannot be written to. fn write_subexpfin( &mut self, n: u16, k: u16, v: u16, ) -> Result<(), std::io::Error>; /// # Errors /// /// - Returns `std::io::Error` if the writer cannot be written to. fn write_refsubexpfin( &mut self, n: u16, k: u16, r: i16, v: i16, ) -> Result<(), std::io::Error>; /// # Errors /// /// - Returns `std::io::Error` if the writer cannot be written to. 
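// Note that write_s_refsubexpfin below is a plain range shift: both v and r
// are offset by n - 1 and handed to write_refsubexpfin over 2n - 1 values.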
fn write_s_refsubexpfin( &mut self, n: u16, k: u16, r: i16, v: i16, ) -> Result<(), std::io::Error>; } impl BCodeWriter for BitWriter { fn recenter_nonneg(&mut self, r: u16, v: u16) -> u16 { /* Recenters a non-negative literal v around a reference r */ if v > (r << 1) { v } else if v >= r { (v - r) << 1 } else { ((r - v) << 1) - 1 } } fn recenter_finite_nonneg(&mut self, n: u16, r: u16, v: u16) -> u16 { /* Recenters a non-negative literal v in [0, n-1] around a reference r also in [0, n-1] */ if (r << 1) <= n { self.recenter_nonneg(r, v) } else { self.recenter_nonneg(n - 1 - r, n - 1 - v) } } fn write_quniform(&mut self, n: u16, v: u16) -> Result<(), std::io::Error> { if n > 1 { let l = 16 - n.leading_zeros() as u8; let m = (1 << l) - n; if v < m { self.write(l as u32 - 1, v) } else { self.write(l as u32 - 1, m + ((v - m) >> 1))?; self.write(1, (v - m) & 1) } } else { Ok(()) } } fn write_subexpfin( &mut self, n: u16, k: u16, v: u16, ) -> Result<(), std::io::Error> { /* Finite subexponential code that codes a symbol v in [0, n-1] with parameter k */ let mut i = 0; let mut mk = 0; loop { let b = if i > 0 { k + i - 1 } else { k }; let a = 1 << b; if n <= mk + 3 * a { return self.write_quniform(n - mk, v - mk); } else { let t = v >= mk + a; self.write_bit(t)?; if t { i += 1; mk += a; } else { return self.write(b as u32, v - mk); } } } } fn write_refsubexpfin( &mut self, n: u16, k: u16, r: i16, v: i16, ) -> Result<(), std::io::Error> { /* Finite subexponential code that codes a symbol v in [0, n-1] with parameter k based on a reference ref also in [0, n-1]. Recenters symbol around r first and then uses a finite subexponential code. */ let recentered_v = self.recenter_finite_nonneg(n, r as u16, v as u16); self.write_subexpfin(n, k, recentered_v) } fn write_s_refsubexpfin( &mut self, n: u16, k: u16, r: i16, v: i16, ) -> Result<(), std::io::Error> { /* Signed version of the above function */ self.write_refsubexpfin( (n << 1) - 1, k, r + (n - 1) as i16, v + (n - 1) as i16, ) } } pub(crate) fn cdf_to_pdf( cdf: &[u16; CDF_LEN], ) -> [u16; CDF_LEN] { let mut pdf = [0; CDF_LEN]; let mut z = 32768u16 >> EC_PROB_SHIFT; for (d, &a) in pdf.iter_mut().zip(cdf.iter()) { *d = z - (a >> EC_PROB_SHIFT); z = a >> EC_PROB_SHIFT; } pdf } pub(crate) mod rust { // Function to update the CDF for Writer calls that do so. 
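// The adaptation rate grows with the alphabet size and with how many symbols
// have been coded so far: the low bits of the trailing CDF entry double as a
// saturating counter (it stops at 32 because of `count += 1 - (count >> 5)`),
// and each of the remaining entries moves 1/2^rate of the way toward its
// target of 0 or 32768. For a 4-entry CDF the rate starts at 3 + 2 + 0 = 5,
// so the first update shifts each entry by 1/32 of its remaining distance.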
#[inline] pub fn update_cdf(cdf: &mut [u16; N], val: u32) { use crate::context::CDF_LEN_MAX; let nsymbs = cdf.len(); let mut rate = 3 + (nsymbs >> 1).min(2); if let Some(count) = cdf.last_mut() { rate += (*count >> 4) as usize; *count += 1 - (*count >> 5); } else { return; } // Single loop (faster) for (i, v) in cdf[..nsymbs - 1].iter_mut().enumerate().take(CDF_LEN_MAX - 1) { if i as u32 >= val { *v -= *v >> rate; } else { *v += (32768 - *v) >> rate; } } } } #[cfg(test)] mod test { use super::*; const WINDOW_SIZE: i16 = 32; const LOTS_OF_BITS: i16 = 0x4000; #[derive(Debug)] struct Reader<'a> { buf: &'a [u8], bptr: usize, dif: ec_window, rng: u16, cnt: i16, } impl<'a> Reader<'a> { fn new(buf: &'a [u8]) -> Self { let mut r = Reader { buf, bptr: 0, dif: (1 << (WINDOW_SIZE - 1)) - 1, rng: 0x8000, cnt: -15, }; r.refill(); r } fn refill(&mut self) { let mut s = WINDOW_SIZE - 9 - (self.cnt + 15); while s >= 0 && self.bptr < self.buf.len() { assert!(s <= WINDOW_SIZE - 8); self.dif ^= (self.buf[self.bptr] as ec_window) << s; self.cnt += 8; s -= 8; self.bptr += 1; } if self.bptr >= self.buf.len() { self.cnt = LOTS_OF_BITS; } } fn normalize(&mut self, dif: ec_window, rng: u32) { assert!(rng <= 65536); let d = rng.leading_zeros() - 16; //let d = 16 - (32-rng.leading_zeros()); self.cnt -= d as i16; /*This is equivalent to shifting in 1's instead of 0's.*/ self.dif = ((dif + 1) << d) - 1; self.rng = (rng << d) as u16; if self.cnt < 0 { self.refill() } } fn bool(&mut self, f: u32) -> bool { assert!(f < 32768); let r = self.rng as u32; assert!(self.dif >> (WINDOW_SIZE - 16) < r); assert!(32768 <= r); let v = (((r >> 8) * (f >> EC_PROB_SHIFT)) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; let vw = v << (WINDOW_SIZE - 16); let (dif, rng, ret) = if self.dif >= vw { (self.dif - vw, r - v, false) } else { (self.dif, v, true) }; self.normalize(dif, rng); ret } fn symbol(&mut self, icdf: &[u16]) -> i32 { let r = self.rng as u32; assert!(self.dif >> (WINDOW_SIZE - 16) < r); assert!(32768 <= r); let n = icdf.len() as u32 - 1; let c = self.dif >> (WINDOW_SIZE - 16); let mut v = self.rng as u32; let mut ret = 0i32; let mut u = v; v = ((r >> 8) * (icdf[ret as usize] as u32 >> EC_PROB_SHIFT)) >> (7 - EC_PROB_SHIFT); v += EC_MIN_PROB * (n - ret as u32); while c < v { u = v; ret += 1; v = ((r >> 8) * (icdf[ret as usize] as u32 >> EC_PROB_SHIFT)) >> (7 - EC_PROB_SHIFT); v += EC_MIN_PROB * (n - ret as u32); } assert!(v < u); assert!(u <= r); let new_dif = self.dif - (v << (WINDOW_SIZE - 16)); self.normalize(new_dif, u - v); ret } } #[test] fn booleans() { let mut w = WriterEncoder::new(); w.bool(false, 1); w.bool(true, 2); w.bool(false, 3); w.bool(true, 1); w.bool(true, 2); w.bool(false, 3); let b = w.done(); let mut r = Reader::new(&b); assert!(!r.bool(1)); assert!(r.bool(2)); assert!(!r.bool(3)); assert!(r.bool(1)); assert!(r.bool(2)); assert!(!r.bool(3)); } #[test] fn cdf() { let cdf = [7296, 3819, 1716, 0]; let mut w = WriterEncoder::new(); w.symbol(0, &cdf); w.symbol(0, &cdf); w.symbol(0, &cdf); w.symbol(1, &cdf); w.symbol(1, &cdf); w.symbol(1, &cdf); w.symbol(2, &cdf); w.symbol(2, &cdf); w.symbol(2, &cdf); let b = w.done(); let mut r = Reader::new(&b); assert_eq!(r.symbol(&cdf), 0); assert_eq!(r.symbol(&cdf), 0); assert_eq!(r.symbol(&cdf), 0); assert_eq!(r.symbol(&cdf), 1); assert_eq!(r.symbol(&cdf), 1); assert_eq!(r.symbol(&cdf), 1); assert_eq!(r.symbol(&cdf), 2); assert_eq!(r.symbol(&cdf), 2); assert_eq!(r.symbol(&cdf), 2); } #[test] fn mixed() { let cdf = [7296, 3819, 1716, 0]; let mut w = WriterEncoder::new(); 
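// This CDF is stored "inverted" in Q15: entry s holds 32768 minus the
// cumulative frequency through symbol s, so the values decrease to 0 and the
// width given to symbol s is (s == 0 ? 32768 : cdf[s - 1]) - cdf[s]. Here
// that is roughly P(0) = 25472/32768, P(1) = 3477/32768, P(2) = 2103/32768
// and P(3) = 1716/32768, though symbol 3 is never written in this test.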
w.symbol(0, &cdf); w.bool(true, 2); w.symbol(0, &cdf); w.bool(true, 2); w.symbol(0, &cdf); w.bool(true, 2); w.symbol(1, &cdf); w.bool(true, 1); w.symbol(1, &cdf); w.bool(false, 2); w.symbol(1, &cdf); w.symbol(2, &cdf); w.symbol(2, &cdf); w.symbol(2, &cdf); let b = w.done(); let mut r = Reader::new(&b); assert_eq!(r.symbol(&cdf), 0); assert!(r.bool(2)); assert_eq!(r.symbol(&cdf), 0); assert!(r.bool(2)); assert_eq!(r.symbol(&cdf), 0); assert!(r.bool(2)); assert_eq!(r.symbol(&cdf), 1); assert!(r.bool(1)); assert_eq!(r.symbol(&cdf), 1); assert!(!r.bool(2)); assert_eq!(r.symbol(&cdf), 1); assert_eq!(r.symbol(&cdf), 2); assert_eq!(r.symbol(&cdf), 2); assert_eq!(r.symbol(&cdf), 2); } } rav1e-0.7.1/src/encoder.rs000064400000000000000000003461641046102023000134410ustar 00000000000000// Copyright (c) 2018-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::activity::*; use crate::api::config::GrainTableSegment; use crate::api::*; use crate::cdef::*; use crate::context::*; use crate::deblock::*; use crate::ec::*; use crate::frame::*; use crate::header::*; use crate::lrf::*; use crate::mc::{FilterMode, MotionVector}; use crate::me::*; use crate::partition::PartitionType::*; use crate::partition::RefType::*; use crate::partition::*; use crate::predict::{ luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode, }; use crate::quantize::*; use crate::rate::{ QuantizerParameters, FRAME_SUBTYPE_I, FRAME_SUBTYPE_P, QSCALE, }; use crate::rdo::*; use crate::segmentation::*; use crate::serialize::{Deserialize, Serialize}; use crate::stats::EncoderStats; use crate::tiling::*; use crate::transform::*; use crate::util::*; use crate::wasm_bindgen::*; use arg_enum_proc_macro::ArgEnum; use arrayvec::*; use bitstream_io::{BigEndian, BitWrite, BitWriter}; use rayon::iter::*; use std::collections::VecDeque; use std::io::Write; use std::mem::MaybeUninit; use std::sync::Arc; use std::{fmt, io, mem}; #[allow(dead_code)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CDEFSearchMethod { PickFromQ, FastSearch, FullSearch, } #[inline(always)] fn poly2(q: f32, a: f32, b: f32, c: f32, max: i32) -> i32 { clamp((q * q).mul_add(a, q.mul_add(b, c)).round() as i32, 0, max) } pub static TEMPORAL_DELIMITER: [u8; 2] = [0x12, 0x00]; const MAX_NUM_TEMPORAL_LAYERS: usize = 8; const MAX_NUM_SPATIAL_LAYERS: usize = 4; const MAX_NUM_OPERATING_POINTS: usize = MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS; /// Size of blocks for the importance computation, in pixels. 
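// Since the lookahead works on 8x8 importance blocks (CodedFrameData::new
// below halves w_in_b and h_in_b, which count 4x4 blocks), the two shifts
// here sum to 3 and this constant evaluates to 8.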
pub const IMPORTANCE_BLOCK_SIZE: usize = 1 << (IMPORTANCE_BLOCK_TO_BLOCK_SHIFT + BLOCK_TO_PLANE_SHIFT); #[derive(Debug, Clone)] pub struct ReferenceFrame { pub order_hint: u32, pub width: u32, pub height: u32, pub render_width: u32, pub render_height: u32, pub frame: Arc>, pub input_hres: Arc>, pub input_qres: Arc>, pub cdfs: CDFContext, pub frame_me_stats: RefMEStats, pub output_frameno: u64, pub segmentation: SegmentationState, } #[derive(Debug, Clone, Default)] pub struct ReferenceFramesSet { pub frames: [Option>>; REF_FRAMES], pub deblock: [DeblockState; REF_FRAMES], } impl ReferenceFramesSet { pub fn new() -> Self { Self { frames: Default::default(), deblock: Default::default() } } } #[wasm_bindgen] #[derive( ArgEnum, Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default, )] #[repr(C)] pub enum Tune { Psnr, #[default] Psychovisual, } const FRAME_ID_LENGTH: u32 = 15; const DELTA_FRAME_ID_LENGTH: u32 = 14; #[derive(Copy, Clone, Debug)] pub struct Sequence { /// OBU Sequence header of AV1 pub profile: u8, pub num_bits_width: u32, pub num_bits_height: u32, pub bit_depth: usize, pub chroma_sampling: ChromaSampling, pub chroma_sample_position: ChromaSamplePosition, pub pixel_range: PixelRange, pub color_description: Option, pub mastering_display: Option, pub content_light: Option, pub max_frame_width: u32, pub max_frame_height: u32, pub frame_id_numbers_present_flag: bool, pub frame_id_length: u32, pub delta_frame_id_length: u32, pub use_128x128_superblock: bool, pub order_hint_bits_minus_1: u32, /// 0 - force off /// 1 - force on /// 2 - adaptive pub force_screen_content_tools: u32, /// 0 - Not to force. MV can be in 1/4 or 1/8 /// 1 - force to integer /// 2 - adaptive pub force_integer_mv: u32, /// Video is a single frame still picture pub still_picture: bool, /// Use reduced header for still picture pub reduced_still_picture_hdr: bool, /// enables/disables filter_intra pub enable_filter_intra: bool, /// enables/disables corner/edge filtering and upsampling pub enable_intra_edge_filter: bool, /// enables/disables interintra_compound pub enable_interintra_compound: bool, /// enables/disables masked compound pub enable_masked_compound: bool, /// 0 - disable dual interpolation filter /// 1 - enable vert/horiz filter selection pub enable_dual_filter: bool, /// 0 - disable order hint, and related tools /// jnt_comp, ref_frame_mvs, frame_sign_bias /// if 0, enable_jnt_comp and /// enable_ref_frame_mvs must be set zs 0. pub enable_order_hint: bool, /// 0 - disable joint compound modes /// 1 - enable it pub enable_jnt_comp: bool, /// 0 - disable ref frame mvs /// 1 - enable it pub enable_ref_frame_mvs: bool, /// 0 - disable warped motion for sequence /// 1 - enable it for the sequence pub enable_warped_motion: bool, /// 0 - Disable superres for the sequence, and disable /// transmitting per-frame superres enabled flag. /// 1 - Enable superres for the sequence, and also /// enable per-frame flag to denote if superres is /// enabled for that frame. 
pub enable_superres: bool, /// To turn on/off CDEF pub enable_cdef: bool, /// To turn on/off loop restoration pub enable_restoration: bool, /// To turn on/off larger-than-superblock loop restoration units pub enable_large_lru: bool, /// allow encoder to delay loop filter RDO/coding until after frame reconstruciton is complete pub enable_delayed_loopfilter_rdo: bool, pub operating_points_cnt_minus_1: usize, pub operating_point_idc: [u16; MAX_NUM_OPERATING_POINTS], pub display_model_info_present_flag: bool, pub decoder_model_info_present_flag: bool, pub level_idx: [u8; MAX_NUM_OPERATING_POINTS], /// seq_tier in the spec. One bit: 0 or 1. pub tier: [usize; MAX_NUM_OPERATING_POINTS], pub film_grain_params_present: bool, pub timing_info_present: bool, pub tiling: TilingInfo, pub time_base: Rational, } impl Sequence { /// # Panics /// /// Panics if the resulting tile sizes would be too large. pub fn new(config: &EncoderConfig) -> Sequence { let width_bits = 32 - (config.width as u32).leading_zeros(); let height_bits = 32 - (config.height as u32).leading_zeros(); assert!(width_bits <= 16); assert!(height_bits <= 16); let profile = if config.bit_depth == 12 || config.chroma_sampling == ChromaSampling::Cs422 { 2 } else { u8::from(config.chroma_sampling == ChromaSampling::Cs444) }; let operating_point_idc: [u16; MAX_NUM_OPERATING_POINTS] = [0; MAX_NUM_OPERATING_POINTS]; let level_idx: [u8; MAX_NUM_OPERATING_POINTS] = if let Some(level_idx) = config.level_idx { [level_idx; MAX_NUM_OPERATING_POINTS] } else { [31; MAX_NUM_OPERATING_POINTS] }; let tier: [usize; MAX_NUM_OPERATING_POINTS] = [0; MAX_NUM_OPERATING_POINTS]; // Restoration filters are not useful for very small frame sizes, // so disable them in that case. let enable_restoration_filters = config.width >= 32 && config.height >= 32; let use_128x128_superblock = false; let frame_rate = config.frame_rate(); let sb_size_log2 = Self::sb_size_log2(use_128x128_superblock); let mut tiling = TilingInfo::from_target_tiles( sb_size_log2, config.width, config.height, frame_rate, TilingInfo::tile_log2(1, config.tile_cols).unwrap(), TilingInfo::tile_log2(1, config.tile_rows).unwrap(), config.chroma_sampling == ChromaSampling::Cs422, ); if config.tiles > 0 { let mut tile_rows_log2 = 0; let mut tile_cols_log2 = 0; while (tile_rows_log2 < tiling.max_tile_rows_log2) || (tile_cols_log2 < tiling.max_tile_cols_log2) { tiling = TilingInfo::from_target_tiles( sb_size_log2, config.width, config.height, frame_rate, tile_cols_log2, tile_rows_log2, config.chroma_sampling == ChromaSampling::Cs422, ); if tiling.rows * tiling.cols >= config.tiles { break; }; if ((tiling.tile_height_sb >= tiling.tile_width_sb) && (tiling.tile_rows_log2 < tiling.max_tile_rows_log2)) || (tile_cols_log2 >= tiling.max_tile_cols_log2) { tile_rows_log2 += 1; } else { tile_cols_log2 += 1; } } } Sequence { tiling, profile, num_bits_width: width_bits, num_bits_height: height_bits, bit_depth: config.bit_depth, chroma_sampling: config.chroma_sampling, chroma_sample_position: config.chroma_sample_position, pixel_range: config.pixel_range, color_description: config.color_description, mastering_display: config.mastering_display, content_light: config.content_light, max_frame_width: config.width as u32, max_frame_height: config.height as u32, frame_id_numbers_present_flag: false, frame_id_length: FRAME_ID_LENGTH, delta_frame_id_length: DELTA_FRAME_ID_LENGTH, use_128x128_superblock, order_hint_bits_minus_1: 5, force_screen_content_tools: if config.still_picture { 2 } else { 0 }, force_integer_mv: 2, 
still_picture: config.still_picture, reduced_still_picture_hdr: config.still_picture, enable_filter_intra: false, enable_intra_edge_filter: true, enable_interintra_compound: false, enable_masked_compound: false, enable_dual_filter: false, enable_order_hint: !config.still_picture, enable_jnt_comp: false, enable_ref_frame_mvs: false, enable_warped_motion: false, enable_superres: false, enable_cdef: config.speed_settings.cdef && enable_restoration_filters, enable_restoration: config.speed_settings.lrf && enable_restoration_filters, enable_large_lru: true, enable_delayed_loopfilter_rdo: true, operating_points_cnt_minus_1: 0, operating_point_idc, display_model_info_present_flag: false, decoder_model_info_present_flag: false, level_idx, tier, film_grain_params_present: config .film_grain_params .as_ref() .map(|entries| !entries.is_empty()) .unwrap_or(false), timing_info_present: config.enable_timing_info, time_base: config.time_base, } } pub const fn get_relative_dist(&self, a: u32, b: u32) -> i32 { let diff = a as i32 - b as i32; let m = 1 << self.order_hint_bits_minus_1; (diff & (m - 1)) - (diff & m) } pub fn get_skip_mode_allowed( &self, fi: &FrameInvariants, inter_cfg: &InterConfig, reference_select: bool, ) -> bool { if fi.intra_only || !reference_select || !self.enable_order_hint { return false; } let mut forward_idx: isize = -1; let mut backward_idx: isize = -1; let mut forward_hint = 0; let mut backward_hint = 0; for i in inter_cfg.allowed_ref_frames().iter().map(|rf| rf.to_index()) { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[i] as usize] { let ref_hint = rec.order_hint; if self.get_relative_dist(ref_hint, fi.order_hint) < 0 { if forward_idx < 0 || self.get_relative_dist(ref_hint, forward_hint) > 0 { forward_idx = i as isize; forward_hint = ref_hint; } } else if self.get_relative_dist(ref_hint, fi.order_hint) > 0 && (backward_idx < 0 || self.get_relative_dist(ref_hint, backward_hint) > 0) { backward_idx = i as isize; backward_hint = ref_hint; } } } if forward_idx < 0 { false } else if backward_idx >= 0 { // set skip_mode_frame true } else { let mut second_forward_idx: isize = -1; let mut second_forward_hint = 0; for i in inter_cfg.allowed_ref_frames().iter().map(|rf| rf.to_index()) { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[i] as usize] { let ref_hint = rec.order_hint; if self.get_relative_dist(ref_hint, forward_hint) < 0 && (second_forward_idx < 0 || self.get_relative_dist(ref_hint, second_forward_hint) > 0) { second_forward_idx = i as isize; second_forward_hint = ref_hint; } } } // TODO: Set skip_mode_frame, when second_forward_idx is not less than 0. second_forward_idx >= 0 } } #[inline(always)] const fn sb_size_log2(use_128x128_superblock: bool) -> usize { 6 + (use_128x128_superblock as usize) } } #[derive(Debug, Clone)] pub struct FrameState { pub sb_size_log2: usize, pub input: Arc>, pub input_hres: Arc>, // half-resolution version of input luma pub input_qres: Arc>, // quarter-resolution version of input luma pub rec: Arc>, pub cdfs: CDFContext, pub context_update_tile_id: usize, // tile id used for the CDFontext pub max_tile_size_bytes: u32, pub deblock: DeblockState, pub segmentation: SegmentationState, pub restoration: RestorationState, // Because we only reference these within a tile context, // these are stored per-tile for easier access. 
pub frame_me_stats: RefMEStats, pub enc_stats: EncoderStats, } impl FrameState { pub fn new(fi: &FrameInvariants) -> Self { // TODO(negge): Use fi.cfg.chroma_sampling when we store VideoDetails in FrameInvariants FrameState::new_with_frame( fi, Arc::new(Frame::new(fi.width, fi.height, fi.sequence.chroma_sampling)), ) } /// Similar to [`FrameState::new_with_frame`], but takes an `me_stats` /// and `rec` to enable reusing the same underlying allocations to create /// a `FrameState` /// /// This function primarily exists for [`estimate_inter_costs`], and so /// it does not create hres or qres versions of `frame` as downscaling is /// somewhat expensive and are not needed for [`estimate_inter_costs`]. pub fn new_with_frame_and_me_stats_and_rec( fi: &FrameInvariants, frame: Arc>, me_stats: RefMEStats, rec: Arc>, ) -> Self { let rs = RestorationState::new(fi, &frame); let hres = Plane::new(0, 0, 0, 0, 0, 0); let qres = Plane::new(0, 0, 0, 0, 0, 0); Self { sb_size_log2: fi.sb_size_log2(), input: frame, input_hres: Arc::new(hres), input_qres: Arc::new(qres), rec, cdfs: CDFContext::new(0), context_update_tile_id: 0, max_tile_size_bytes: 0, deblock: Default::default(), segmentation: Default::default(), restoration: rs, frame_me_stats: me_stats, enc_stats: Default::default(), } } pub fn new_with_frame( fi: &FrameInvariants, frame: Arc>, ) -> Self { let rs = RestorationState::new(fi, &frame); let luma_width = frame.planes[0].cfg.width; let luma_height = frame.planes[0].cfg.height; let hres = frame.planes[0].downsampled(fi.width, fi.height); let qres = hres.downsampled(fi.width, fi.height); Self { sb_size_log2: fi.sb_size_log2(), input: frame, input_hres: Arc::new(hres), input_qres: Arc::new(qres), rec: Arc::new(Frame::new( luma_width, luma_height, fi.sequence.chroma_sampling, )), cdfs: CDFContext::new(0), context_update_tile_id: 0, max_tile_size_bytes: 0, deblock: Default::default(), segmentation: Default::default(), restoration: rs, frame_me_stats: FrameMEStats::new_arc_array(fi.w_in_b, fi.h_in_b), enc_stats: Default::default(), } } pub fn apply_tile_state_mut(&mut self, f: F) -> R where F: FnOnce(&mut TileStateMut<'_, T>) -> R, { let PlaneConfig { width, height, .. 
} = self.rec.planes[0].cfg; let sbo_0 = PlaneSuperBlockOffset(SuperBlockOffset { x: 0, y: 0 }); let frame_me_stats = self.frame_me_stats.clone(); let frame_me_stats = &mut *frame_me_stats.write().expect("poisoned lock"); let ts = &mut TileStateMut::new( self, sbo_0, self.sb_size_log2, width, height, frame_me_stats, ); f(ts) } } #[derive(Copy, Clone, Debug)] pub struct DeblockState { pub levels: [u8; MAX_PLANES + 1], // Y vertical edges, Y horizontal, U, V pub sharpness: u8, pub deltas_enabled: bool, pub delta_updates_enabled: bool, pub ref_deltas: [i8; REF_FRAMES], pub mode_deltas: [i8; 2], pub block_deltas_enabled: bool, pub block_delta_shift: u8, pub block_delta_multi: bool, } impl Default for DeblockState { fn default() -> Self { DeblockState { levels: [8, 8, 4, 4], sharpness: 0, deltas_enabled: false, // requires delta_q_enabled delta_updates_enabled: false, ref_deltas: [1, 0, 0, 0, 0, -1, -1, -1], mode_deltas: [0, 0], block_deltas_enabled: false, block_delta_shift: 0, block_delta_multi: false, } } } #[derive(Copy, Clone, Debug, Default)] pub struct SegmentationState { pub enabled: bool, pub update_data: bool, pub update_map: bool, pub preskip: bool, pub last_active_segid: u8, pub features: [[bool; SegLvl::SEG_LVL_MAX as usize]; 8], pub data: [[i16; SegLvl::SEG_LVL_MAX as usize]; 8], pub threshold: [DistortionScale; 7], pub min_segment: u8, pub max_segment: u8, } impl SegmentationState { #[profiling::function] pub fn update_threshold(&mut self, base_q_idx: u8, bd: usize) { let base_ac_q = ac_q(base_q_idx, 0, bd).get() as u64; let real_ac_q = ArrayVec::<_, MAX_SEGMENTS>::from_iter( self.data[..=self.max_segment as usize].iter().map(|data| { ac_q(base_q_idx, data[SegLvl::SEG_LVL_ALT_Q as usize] as i8, bd).get() as u64 }), ); self.threshold.fill(DistortionScale(0)); for ((q1, q2), threshold) in real_ac_q.iter().skip(1).zip(&real_ac_q).zip(&mut self.threshold) { *threshold = DistortionScale::new(base_ac_q.pow(2), q1 * q2); } } #[cfg(feature = "dump_lookahead_data")] pub fn dump_threshold( &self, data_location: std::path::PathBuf, input_frameno: u64, ) { use byteorder::{NativeEndian, WriteBytesExt}; let file_name = format!("{:010}-thresholds", input_frameno); let max_segment = self.max_segment; // dynamic allocation: debugging only let mut buf = vec![]; buf.write_u64::(max_segment as u64).unwrap(); for &v in &self.threshold[..max_segment as usize] { buf.write_u32::(v.0).unwrap(); } ::std::fs::write(data_location.join(file_name).with_extension("bin"), buf) .unwrap(); } } // Frame Invariants are invariant inside a frame #[allow(dead_code)] #[derive(Debug, Clone)] pub struct FrameInvariants { pub sequence: Arc, pub config: Arc, pub width: usize, pub height: usize, pub render_width: u32, pub render_height: u32, pub frame_size_override_flag: bool, pub render_and_frame_size_different: bool, pub sb_width: usize, pub sb_height: usize, pub w_in_b: usize, pub h_in_b: usize, pub input_frameno: u64, pub order_hint: u32, pub show_frame: bool, pub showable_frame: bool, pub error_resilient: bool, pub intra_only: bool, pub allow_high_precision_mv: bool, pub frame_type: FrameType, pub frame_to_show_map_idx: u32, pub use_reduced_tx_set: bool, pub reference_mode: ReferenceMode, pub use_prev_frame_mvs: bool, pub partition_range: PartitionRange, pub globalmv_transformation_type: [GlobalMVMode; INTER_REFS_PER_FRAME], pub num_tg: usize, pub large_scale_tile: bool, pub disable_cdf_update: bool, pub allow_screen_content_tools: u32, pub force_integer_mv: u32, pub primary_ref_frame: u32, pub refresh_frame_flags: u32, 
// a bitmask that specifies which // reference frame slots will be updated with the current frame // after it is decoded. pub allow_intrabc: bool, pub use_ref_frame_mvs: bool, pub is_filter_switchable: bool, pub is_motion_mode_switchable: bool, pub disable_frame_end_update_cdf: bool, pub allow_warped_motion: bool, pub cdef_search_method: CDEFSearchMethod, pub cdef_damping: u8, pub cdef_bits: u8, pub cdef_y_strengths: [u8; 8], pub cdef_uv_strengths: [u8; 8], pub delta_q_present: bool, pub ref_frames: [u8; INTER_REFS_PER_FRAME], pub ref_frame_sign_bias: [bool; INTER_REFS_PER_FRAME], pub rec_buffer: ReferenceFramesSet, pub base_q_idx: u8, pub dc_delta_q: [i8; 3], pub ac_delta_q: [i8; 3], pub lambda: f64, pub me_lambda: f64, pub dist_scale: [DistortionScale; 3], pub me_range_scale: u8, pub use_tx_domain_distortion: bool, pub use_tx_domain_rate: bool, pub idx_in_group_output: u64, pub pyramid_level: u64, pub enable_early_exit: bool, pub tx_mode_select: bool, pub enable_inter_txfm_split: bool, pub default_filter: FilterMode, pub enable_segmentation: bool, pub t35_metadata: Box<[T35]>, /// Target CPU feature level. pub cpu_feature_level: crate::cpu_features::CpuFeatureLevel, // These will be set if this is a coded (non-SEF) frame. // We do not need them for SEFs. pub coded_frame_data: Option>, } /// These frame invariants are only used on coded frames, i.e. non-SEFs. /// They are stored separately to avoid useless allocations /// when we do not need them. /// /// Currently this consists only of lookahaed data. /// This may change in the future. #[derive(Debug, Clone)] pub struct CodedFrameData { /// The lookahead version of `rec_buffer`, used for storing and propagating /// the original reference frames (rather than reconstructed ones). The /// lookahead uses both `rec_buffer` and `lookahead_rec_buffer`, where /// `rec_buffer` contains the current frame's reference frames and /// `lookahead_rec_buffer` contains the next frame's reference frames. pub lookahead_rec_buffer: ReferenceFramesSet, /// Frame width in importance blocks. pub w_in_imp_b: usize, /// Frame height in importance blocks. pub h_in_imp_b: usize, /// Intra prediction cost estimations for each importance block. pub lookahead_intra_costs: Box<[u32]>, /// Future importance values for each importance block. That is, a value /// indicating how much future frames depend on the block (for example, via /// inter-prediction). pub block_importances: Box<[f32]>, /// Pre-computed distortion_scale. pub distortion_scales: Box<[DistortionScale]>, /// Pre-computed activity_scale. pub activity_scales: Box<[DistortionScale]>, pub activity_mask: ActivityMask, /// Combined metric of activity and distortion pub spatiotemporal_scores: Box<[DistortionScale]>, } impl CodedFrameData { pub fn new(fi: &FrameInvariants) -> CodedFrameData { // Width and height are padded to 8×8 block size. 
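// w_in_b and h_in_b count 4x4 luma blocks (MI units), so halving them gives
// the frame size in 8x8 importance blocks. For example, a 1920x1080 frame
// has w_in_b = 480 and h_in_b = 270, yielding a 240x135 importance grid.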
let w_in_imp_b = fi.w_in_b / 2; let h_in_imp_b = fi.h_in_b / 2; CodedFrameData { lookahead_rec_buffer: ReferenceFramesSet::new(), w_in_imp_b, h_in_imp_b, // This is never used before it is assigned lookahead_intra_costs: Box::new([]), // dynamic allocation: once per frame block_importances: vec![0.; w_in_imp_b * h_in_imp_b].into_boxed_slice(), distortion_scales: vec![ DistortionScale::default(); w_in_imp_b * h_in_imp_b ] .into_boxed_slice(), activity_scales: vec![ DistortionScale::default(); w_in_imp_b * h_in_imp_b ] .into_boxed_slice(), activity_mask: Default::default(), spatiotemporal_scores: Default::default(), } } // Assumes that we have already computed activity scales and distortion scales // Returns -0.5 log2(mean(scale)) #[profiling::function] pub fn compute_spatiotemporal_scores(&mut self) -> i64 { let mut scores = self .distortion_scales .iter() .zip(self.activity_scales.iter()) .map(|(&d, &a)| d * a) .collect::>(); let inv_mean = DistortionScale::inv_mean(&scores); for score in scores.iter_mut() { *score *= inv_mean; } for scale in self.distortion_scales.iter_mut() { *scale *= inv_mean; } self.spatiotemporal_scores = scores; inv_mean.blog64() >> 1 } // Assumes that we have already computed distortion_scales // Returns -0.5 log2(mean(scale)) #[profiling::function] pub fn compute_temporal_scores(&mut self) -> i64 { let inv_mean = DistortionScale::inv_mean(&self.distortion_scales); for scale in self.distortion_scales.iter_mut() { *scale *= inv_mean; } self.spatiotemporal_scores = self.distortion_scales.clone(); inv_mean.blog64() >> 1 } #[cfg(feature = "dump_lookahead_data")] pub fn dump_scales( &self, data_location: std::path::PathBuf, scales: Scales, input_frameno: u64, ) { use byteorder::{NativeEndian, WriteBytesExt}; let file_name = format!( "{:010}-{}", input_frameno, match scales { Scales::ActivityScales => "activity_scales", Scales::DistortionScales => "distortion_scales", Scales::SpatiotemporalScales => "spatiotemporal_scales", } ); // dynamic allocation: debugging only let mut buf = vec![]; buf.write_u64::(self.w_in_imp_b as u64).unwrap(); buf.write_u64::(self.h_in_imp_b as u64).unwrap(); for &v in match scales { Scales::ActivityScales => &self.activity_scales[..], Scales::DistortionScales => &self.distortion_scales[..], Scales::SpatiotemporalScales => &self.spatiotemporal_scores[..], } { buf.write_u32::(v.0).unwrap(); } ::std::fs::write(data_location.join(file_name).with_extension("bin"), buf) .unwrap(); } } #[cfg(feature = "dump_lookahead_data")] pub enum Scales { ActivityScales, DistortionScales, SpatiotemporalScales, } pub(crate) const fn pos_to_lvl(pos: u64, pyramid_depth: u64) -> u64 { // Derive level within pyramid for a frame with a given coding order position // For example, with a pyramid of depth 2, the 2 least significant bits of the // position determine the level: // 00 -> 0 // 01 -> 2 // 10 -> 1 // 11 -> 2 pyramid_depth - (pos | (1 << pyramid_depth)).trailing_zeros() as u64 } impl FrameInvariants { #[allow(clippy::erasing_op, clippy::identity_op)] /// # Panics /// /// - If the size of `T` does not match the sequence's bit depth pub fn new(config: Arc, sequence: Arc) -> Self { assert!( sequence.bit_depth <= mem::size_of::() * 8, "bit depth cannot fit into u8" ); let (width, height) = (config.width, config.height); let frame_size_override_flag = width as u32 != sequence.max_frame_width || height as u32 != sequence.max_frame_height; let (render_width, render_height) = config.render_size(); let render_and_frame_size_different = render_width != width || 
render_height != height; let use_reduced_tx_set = config.speed_settings.transform.reduced_tx_set; let use_tx_domain_distortion = config.tune == Tune::Psnr && config.speed_settings.transform.tx_domain_distortion; let use_tx_domain_rate = config.speed_settings.transform.tx_domain_rate; let w_in_b = 2 * config.width.align_power_of_two_and_shift(3); // MiCols, ((width+7)/8)<<3 >> MI_SIZE_LOG2 let h_in_b = 2 * config.height.align_power_of_two_and_shift(3); // MiRows, ((height+7)/8)<<3 >> MI_SIZE_LOG2 Self { width, height, render_width: render_width as u32, render_height: render_height as u32, frame_size_override_flag, render_and_frame_size_different, sb_width: width.align_power_of_two_and_shift(6), sb_height: height.align_power_of_two_and_shift(6), w_in_b, h_in_b, input_frameno: 0, order_hint: 0, show_frame: true, showable_frame: !sequence.reduced_still_picture_hdr, error_resilient: false, intra_only: true, allow_high_precision_mv: false, frame_type: FrameType::KEY, frame_to_show_map_idx: 0, use_reduced_tx_set, reference_mode: ReferenceMode::SINGLE, use_prev_frame_mvs: false, partition_range: config.speed_settings.partition.partition_range, globalmv_transformation_type: [GlobalMVMode::IDENTITY; INTER_REFS_PER_FRAME], num_tg: 1, large_scale_tile: false, disable_cdf_update: false, allow_screen_content_tools: sequence.force_screen_content_tools, force_integer_mv: 1, primary_ref_frame: PRIMARY_REF_NONE, refresh_frame_flags: ALL_REF_FRAMES_MASK, allow_intrabc: false, use_ref_frame_mvs: false, is_filter_switchable: false, is_motion_mode_switchable: false, // 0: only the SIMPLE motion mode will be used. disable_frame_end_update_cdf: sequence.reduced_still_picture_hdr, allow_warped_motion: false, cdef_search_method: CDEFSearchMethod::PickFromQ, cdef_damping: 3, cdef_bits: 0, cdef_y_strengths: [ 0 * 4 + 0, 1 * 4 + 0, 2 * 4 + 1, 3 * 4 + 1, 5 * 4 + 2, 7 * 4 + 3, 10 * 4 + 3, 13 * 4 + 3, ], cdef_uv_strengths: [ 0 * 4 + 0, 1 * 4 + 0, 2 * 4 + 1, 3 * 4 + 1, 5 * 4 + 2, 7 * 4 + 3, 10 * 4 + 3, 13 * 4 + 3, ], delta_q_present: false, ref_frames: [0; INTER_REFS_PER_FRAME], ref_frame_sign_bias: [false; INTER_REFS_PER_FRAME], rec_buffer: ReferenceFramesSet::new(), base_q_idx: config.quantizer as u8, dc_delta_q: [0; 3], ac_delta_q: [0; 3], lambda: 0.0, dist_scale: Default::default(), me_lambda: 0.0, me_range_scale: 1, use_tx_domain_distortion, use_tx_domain_rate, idx_in_group_output: 0, pyramid_level: 0, enable_early_exit: true, tx_mode_select: false, default_filter: FilterMode::REGULAR, cpu_feature_level: Default::default(), enable_segmentation: config.speed_settings.segmentation != SegmentationLevel::Disabled, enable_inter_txfm_split: config .speed_settings .transform .enable_inter_tx_split, t35_metadata: Box::new([]), sequence, config, coded_frame_data: None, } } pub fn new_key_frame( config: Arc, sequence: Arc, gop_input_frameno_start: u64, t35_metadata: Box<[T35]>, ) -> Self { let tx_mode_select = config.speed_settings.transform.rdo_tx_decision; let mut fi = Self::new(config, sequence); fi.input_frameno = gop_input_frameno_start; fi.tx_mode_select = tx_mode_select; fi.coded_frame_data = Some(CodedFrameData::new(&fi)); fi.t35_metadata = t35_metadata; fi } /// Returns the created `FrameInvariants`, or `None` if this should be /// a placeholder frame. 
pub(crate) fn new_inter_frame( previous_coded_fi: &Self, inter_cfg: &InterConfig, gop_input_frameno_start: u64, output_frameno_in_gop: u64, next_keyframe_input_frameno: u64, error_resilient: bool, t35_metadata: Box<[T35]>, ) -> Option { let input_frameno = inter_cfg .get_input_frameno(output_frameno_in_gop, gop_input_frameno_start); if input_frameno >= next_keyframe_input_frameno { // This is an invalid frame. We set it as a placeholder in the FI list. return None; } // We have this special thin clone method to avoid cloning the // quite large lookahead data for SEFs, when it is not needed. let mut fi = previous_coded_fi.clone_without_coded_data(); fi.intra_only = false; fi.force_integer_mv = 0; // note: should be 1 if fi.intra_only is true fi.idx_in_group_output = inter_cfg.get_idx_in_group_output(output_frameno_in_gop); fi.tx_mode_select = fi.enable_inter_txfm_split; let show_existing_frame = inter_cfg.get_show_existing_frame(fi.idx_in_group_output); if !show_existing_frame { fi.coded_frame_data = previous_coded_fi.coded_frame_data.clone(); } fi.order_hint = inter_cfg.get_order_hint(output_frameno_in_gop, fi.idx_in_group_output); fi.pyramid_level = inter_cfg.get_level(fi.idx_in_group_output); fi.frame_type = if (inter_cfg.switch_frame_interval > 0) && (output_frameno_in_gop % inter_cfg.switch_frame_interval == 0) && (fi.pyramid_level == 0) { FrameType::SWITCH } else { FrameType::INTER }; fi.error_resilient = if fi.frame_type == FrameType::SWITCH { true } else { error_resilient }; fi.frame_size_override_flag = if fi.frame_type == FrameType::SWITCH { true } else if fi.sequence.reduced_still_picture_hdr { false } else if fi.frame_type == FrameType::INTER && !fi.error_resilient && fi.render_and_frame_size_different { // force frame_size_with_refs() code path if render size != frame size true } else { fi.width as u32 != fi.sequence.max_frame_width || fi.height as u32 != fi.sequence.max_frame_height }; // this is the slot that the current frame is going to be saved into let slot_idx = inter_cfg.get_slot_idx(fi.pyramid_level, fi.order_hint); fi.show_frame = inter_cfg.get_show_frame(fi.idx_in_group_output); fi.t35_metadata = if fi.show_frame { t35_metadata } else { Box::new([]) }; fi.frame_to_show_map_idx = slot_idx; fi.refresh_frame_flags = if fi.frame_type == FrameType::SWITCH { ALL_REF_FRAMES_MASK } else if fi.is_show_existing_frame() { 0 } else { 1 << slot_idx }; let second_ref_frame = if fi.idx_in_group_output == 0 { LAST2_FRAME } else { ALTREF_FRAME }; let ref_in_previous_group = LAST3_FRAME; // reuse probability estimates from previous frames only in top level frames fi.primary_ref_frame = if fi.error_resilient || (fi.pyramid_level > 2) { PRIMARY_REF_NONE } else { (ref_in_previous_group.to_index()) as u32 }; if fi.pyramid_level == 0 { // level 0 has no forward references // default to last P frame fi.ref_frames = [ // calculations done relative to the slot_idx for this frame. // the last four frames can be found by subtracting from the current slot_idx // add 4 to prevent underflow // TODO: maybe use order_hint here like in get_slot_idx? 
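// Example: a frame headed for slot_idx == 2 uses slot (2 + 4 - 1) % 4 == 1
// (the previous P frame) everywhere, and with multiref enabled points its
// second reference at slot (2 + 4 - 2) % 4 == 0 (the P frame before that).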
// this is the previous P frame (slot_idx + 4 - 1) as u8 % 4 ; INTER_REFS_PER_FRAME]; if inter_cfg.multiref { // use the second-previous p frame as a second reference frame fi.ref_frames[second_ref_frame.to_index()] = (slot_idx + 4 - 2) as u8 % 4; } } else { debug_assert!(inter_cfg.multiref); // fill in defaults // default to backwards reference in lower level fi.ref_frames = [{ let oh = fi.order_hint - (inter_cfg.group_input_len as u32 >> fi.pyramid_level); let lvl1 = pos_to_lvl(oh as u64, inter_cfg.pyramid_depth); if lvl1 == 0 { ((oh >> inter_cfg.pyramid_depth) % 4) as u8 } else { 3 + lvl1 as u8 } }; INTER_REFS_PER_FRAME]; // use forward reference in lower level as a second reference frame fi.ref_frames[second_ref_frame.to_index()] = { let oh = fi.order_hint + (inter_cfg.group_input_len as u32 >> fi.pyramid_level); let lvl2 = pos_to_lvl(oh as u64, inter_cfg.pyramid_depth); if lvl2 == 0 { ((oh >> inter_cfg.pyramid_depth) % 4) as u8 } else { 3 + lvl2 as u8 } }; // use a reference to the previous frame in the same level // (horizontally) as a third reference fi.ref_frames[ref_in_previous_group.to_index()] = slot_idx as u8; } fi.set_ref_frame_sign_bias(); fi.reference_mode = if inter_cfg.multiref && fi.idx_in_group_output != 0 { ReferenceMode::SELECT } else { ReferenceMode::SINGLE }; fi.input_frameno = input_frameno; fi.me_range_scale = (inter_cfg.group_input_len >> fi.pyramid_level) as u8; if fi.show_frame || fi.showable_frame { let cur_frame_time = fi.frame_timestamp(); // Increment the film grain seed for the next frame if let Some(params) = Arc::make_mut(&mut fi.config).get_film_grain_mut_at(cur_frame_time) { params.random_seed = params.random_seed.wrapping_add(3248); if params.random_seed == 0 { params.random_seed = DEFAULT_GRAIN_SEED; } } } Some(fi) } pub fn is_show_existing_frame(&self) -> bool { self.coded_frame_data.is_none() } pub fn clone_without_coded_data(&self) -> Self { Self { coded_frame_data: None, sequence: self.sequence.clone(), config: self.config.clone(), width: self.width, height: self.height, render_width: self.render_width, render_height: self.render_height, frame_size_override_flag: self.frame_size_override_flag, render_and_frame_size_different: self.render_and_frame_size_different, sb_width: self.sb_width, sb_height: self.sb_height, w_in_b: self.w_in_b, h_in_b: self.h_in_b, input_frameno: self.input_frameno, order_hint: self.order_hint, show_frame: self.show_frame, showable_frame: self.showable_frame, error_resilient: self.error_resilient, intra_only: self.intra_only, allow_high_precision_mv: self.allow_high_precision_mv, frame_type: self.frame_type, frame_to_show_map_idx: self.frame_to_show_map_idx, use_reduced_tx_set: self.use_reduced_tx_set, reference_mode: self.reference_mode, use_prev_frame_mvs: self.use_prev_frame_mvs, partition_range: self.partition_range, globalmv_transformation_type: self.globalmv_transformation_type, num_tg: self.num_tg, large_scale_tile: self.large_scale_tile, disable_cdf_update: self.disable_cdf_update, allow_screen_content_tools: self.allow_screen_content_tools, force_integer_mv: self.force_integer_mv, primary_ref_frame: self.primary_ref_frame, refresh_frame_flags: self.refresh_frame_flags, allow_intrabc: self.allow_intrabc, use_ref_frame_mvs: self.use_ref_frame_mvs, is_filter_switchable: self.is_filter_switchable, is_motion_mode_switchable: self.is_motion_mode_switchable, disable_frame_end_update_cdf: self.disable_frame_end_update_cdf, allow_warped_motion: self.allow_warped_motion, cdef_search_method: self.cdef_search_method, 
cdef_damping: self.cdef_damping, cdef_bits: self.cdef_bits, cdef_y_strengths: self.cdef_y_strengths, cdef_uv_strengths: self.cdef_uv_strengths, delta_q_present: self.delta_q_present, ref_frames: self.ref_frames, ref_frame_sign_bias: self.ref_frame_sign_bias, rec_buffer: self.rec_buffer.clone(), base_q_idx: self.base_q_idx, dc_delta_q: self.dc_delta_q, ac_delta_q: self.ac_delta_q, lambda: self.lambda, me_lambda: self.me_lambda, dist_scale: self.dist_scale, me_range_scale: self.me_range_scale, use_tx_domain_distortion: self.use_tx_domain_distortion, use_tx_domain_rate: self.use_tx_domain_rate, idx_in_group_output: self.idx_in_group_output, pyramid_level: self.pyramid_level, enable_early_exit: self.enable_early_exit, tx_mode_select: self.tx_mode_select, enable_inter_txfm_split: self.enable_inter_txfm_split, default_filter: self.default_filter, enable_segmentation: self.enable_segmentation, t35_metadata: self.t35_metadata.clone(), cpu_feature_level: self.cpu_feature_level, } } pub fn set_ref_frame_sign_bias(&mut self) { for i in 0..INTER_REFS_PER_FRAME { self.ref_frame_sign_bias[i] = if !self.sequence.enable_order_hint { false } else if let Some(ref rec) = self.rec_buffer.frames[self.ref_frames[i] as usize] { let hint = rec.order_hint; self.sequence.get_relative_dist(hint, self.order_hint) > 0 } else { false }; } } pub fn get_frame_subtype(&self) -> usize { if self.frame_type == FrameType::KEY { FRAME_SUBTYPE_I } else { FRAME_SUBTYPE_P + (self.pyramid_level as usize) } } fn pick_strength_from_q(&mut self, qps: &QuantizerParameters) { self.cdef_damping = 3 + (self.base_q_idx >> 6); let q = bexp64(qps.log_target_q + q57(QSCALE)) as f32; /* These coefficients were trained on libaom. */ let (y_f1, y_f2, uv_f1, uv_f2) = if !self.intra_only { ( poly2(q, -0.0000023593946_f32, 0.0068615186_f32, 0.02709886_f32, 15), poly2(q, -0.00000057629734_f32, 0.0013993345_f32, 0.03831067_f32, 3), poly2(q, -0.0000007095069_f32, 0.0034628846_f32, 0.00887099_f32, 15), poly2(q, 0.00000023874085_f32, 0.00028223585_f32, 0.05576307_f32, 3), ) } else { ( poly2(q, 0.0000033731974_f32, 0.008070594_f32, 0.0187634_f32, 15), poly2(q, 0.0000029167343_f32, 0.0027798624_f32, 0.0079405_f32, 3), poly2(q, -0.0000130790995_f32, 0.012892405_f32, -0.00748388_f32, 15), poly2(q, 0.0000032651783_f32, 0.00035520183_f32, 0.00228092_f32, 3), ) }; self.cdef_y_strengths[0] = (y_f1 * CDEF_SEC_STRENGTHS as i32 + y_f2) as u8; self.cdef_uv_strengths[0] = (uv_f1 * CDEF_SEC_STRENGTHS as i32 + uv_f2) as u8; } pub fn set_quantizers(&mut self, qps: &QuantizerParameters) { self.base_q_idx = qps.ac_qi[0]; let base_q_idx = self.base_q_idx as i32; for pi in 0..3 { self.dc_delta_q[pi] = (qps.dc_qi[pi] as i32 - base_q_idx) as i8; self.ac_delta_q[pi] = (qps.ac_qi[pi] as i32 - base_q_idx) as i8; } self.lambda = qps.lambda * ((1 << (2 * (self.sequence.bit_depth - 8))) as f64); self.me_lambda = self.lambda.sqrt(); self.dist_scale = qps.dist_scale.map(DistortionScale::from); match self.cdef_search_method { CDEFSearchMethod::PickFromQ => { self.pick_strength_from_q(qps); } // TODO: implement FastSearch and FullSearch _ => unreachable!(), } } #[inline(always)] pub fn sb_size_log2(&self) -> usize { self.sequence.tiling.sb_size_log2 } pub fn film_grain_params(&self) -> Option<&GrainTableSegment> { if !(self.show_frame || self.showable_frame) { return None; } let cur_frame_time = self.frame_timestamp(); self.config.get_film_grain_at(cur_frame_time) } pub fn frame_timestamp(&self) -> u64 { // I don't know why this is the base unit for a timestamp but it is. 
1/10000000 of a second. const TIMESTAMP_BASE_UNIT: u64 = 10_000_000; self.input_frameno * TIMESTAMP_BASE_UNIT * self.sequence.time_base.num / self.sequence.time_base.den } } impl fmt::Display for FrameInvariants { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Input Frame {} - {}", self.input_frameno, self.frame_type) } } /// # Errors /// /// - If the frame packet cannot be written to pub fn write_temporal_delimiter(packet: &mut dyn io::Write) -> io::Result<()> { packet.write_all(&TEMPORAL_DELIMITER)?; Ok(()) } fn write_key_frame_obus( packet: &mut dyn io::Write, fi: &FrameInvariants, obu_extension: u32, ) -> io::Result<()> { let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); { let mut bw2 = BitWriter::endian(&mut buf2, BigEndian); bw2.write_sequence_header_obu(fi)?; bw2.write_bit(true)?; // trailing bit bw2.byte_align()?; } { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_obu_header(ObuType::OBU_SEQUENCE_HEADER, obu_extension)?; } packet.write_all(&buf1).unwrap(); buf1.clear(); { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_uleb128(buf2.len() as u64)?; } packet.write_all(&buf1).unwrap(); buf1.clear(); packet.write_all(&buf2).unwrap(); buf2.clear(); if fi.sequence.content_light.is_some() { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_sequence_metadata_obu( ObuMetaType::OBU_META_HDR_CLL, &fi.sequence, )?; packet.write_all(&buf1).unwrap(); buf1.clear(); } if fi.sequence.mastering_display.is_some() { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_sequence_metadata_obu( ObuMetaType::OBU_META_HDR_MDCV, &fi.sequence, )?; packet.write_all(&buf1).unwrap(); buf1.clear(); } Ok(()) } /// Write into `dst` the difference between the blocks at `src1` and `src2` fn diff( dst: &mut [MaybeUninit], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, ) { debug_assert!(dst.len() % src1.rect().width == 0); debug_assert_eq!(src1.rows_iter().count(), src1.rect().height); let width = src1.rect().width; let height = src1.rect().height; if width == 0 || width != src2.rect().width || height == 0 || src1.rows_iter().len() != src2.rows_iter().len() { debug_assert!(false); return; } for ((l, s1), s2) in dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter()) { for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) { r.write(i16::cast_from(*v1) - i16::cast_from(*v2)); } } } fn get_qidx( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, cw: &ContextWriter, tile_bo: TileBlockOffset, ) -> u8 { let mut qidx = fi.base_q_idx; let sidx = cw.bc.blocks[tile_bo].segmentation_idx as usize; if ts.segmentation.features[sidx][SegLvl::SEG_LVL_ALT_Q as usize] { let delta = ts.segmentation.data[sidx][SegLvl::SEG_LVL_ALT_Q as usize]; qidx = clamp((qidx as i16) + delta, 0, 255) as u8; } qidx } /// For a transform block, /// predict, transform, quantize, write coefficients to a bitstream, /// dequantize, inverse-transform. /// /// # Panics /// /// - If the block size is invalid for subsampling /// - If a tx type other than DCT is used for 64x64 blocks pub fn encode_tx_block( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, p: usize, // Offset in the luma plane of the partition enclosing this block. tile_partition_bo: TileBlockOffset, // tx block position within a partition, unit: tx block number bx: usize, by: usize, // Offset in the luma plane where this tx block is colocated. Note that for // a chroma block, this offset might be outside of the current partition. 
// For example in 4:2:0, four 4x4 luma partitions share one 4x4 chroma block, // this block is part of the last 4x4 partition, but its `tx_bo` offset // matches the offset of the first 4x4 partition. tx_bo: TileBlockOffset, mode: PredictionMode, tx_size: TxSize, tx_type: TxType, bsize: BlockSize, po: PlaneOffset, skip: bool, qidx: u8, ac: &[i16], pred_intra_param: IntraParam, rdo_type: RDOType, need_recon_pixel: bool, ) -> (bool, ScaledDistortion) { let PlaneConfig { xdec, ydec, .. } = ts.input.planes[p].cfg; let tile_rect = ts.tile_rect().decimated(xdec, ydec); let area = Area::BlockRect { bo: tx_bo.0, width: tx_size.width(), height: tx_size.height(), }; if tx_bo.0.x >= ts.mi_width || tx_bo.0.y >= ts.mi_height { return (false, ScaledDistortion::zero()); } debug_assert!(tx_bo.0.x < ts.mi_width); debug_assert!(tx_bo.0.y < ts.mi_height); debug_assert!( tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT ); let plane_bsize = bsize.subsampled_size(xdec, ydec).unwrap(); debug_assert!(p != 0 || !mode.is_intra() || tx_size.block_size() == plane_bsize || need_recon_pixel, "mode.is_intra()={:#?}, plane={:#?}, tx_size.block_size()={:#?}, plane_bsize={:#?}, need_recon_pixel={:#?}", mode.is_intra(), p, tx_size.block_size(), plane_bsize, need_recon_pixel); let ief_params = if mode.is_directional() && fi.sequence.enable_intra_edge_filter { let (plane_xdec, plane_ydec) = if p == 0 { (0, 0) } else { (xdec, ydec) }; let above_block_info = ts.above_block_info(tile_partition_bo, plane_xdec, plane_ydec); let left_block_info = ts.left_block_info(tile_partition_bo, plane_xdec, plane_ydec); Some(IntraEdgeFilterParameters::new(p, above_block_info, left_block_info)) } else { None }; let frame_bo = ts.to_frame_block_offset(tx_bo); let rec = &mut ts.rec.planes[p]; if mode.is_intra() { let bit_depth = fi.sequence.bit_depth; let mut edge_buf = Aligned::uninit_array(); let edge_buf = get_intra_edges( &mut edge_buf, &rec.as_const(), tile_partition_bo, bx, by, bsize, po, tx_size, bit_depth, Some(mode), fi.sequence.enable_intra_edge_filter, pred_intra_param, ); mode.predict_intra( tile_rect, &mut rec.subregion_mut(area), tx_size, bit_depth, ac, pred_intra_param, ief_params, &edge_buf, fi.cpu_feature_level, ); } if skip { return (false, ScaledDistortion::zero()); } let coded_tx_area = av1_get_coded_tx_size(tx_size).area(); let mut residual = Aligned::<[MaybeUninit; 64 * 64]>::uninit_array(); let mut coeffs = Aligned::<[MaybeUninit; 64 * 64]>::uninit_array(); let mut qcoeffs = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); let mut rcoeffs = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); let residual = &mut residual.data[..tx_size.area()]; let coeffs = &mut coeffs.data[..tx_size.area()]; let qcoeffs = init_slice_repeat_mut( &mut qcoeffs.data[..coded_tx_area], T::Coeff::cast_from(0), ); let rcoeffs = &mut rcoeffs.data[..coded_tx_area]; let (visible_tx_w, visible_tx_h) = clip_visible_bsize( (fi.width + xdec) >> xdec, (fi.height + ydec) >> ydec, tx_size.block_size(), (frame_bo.0.x << MI_SIZE_LOG2) >> xdec, (frame_bo.0.y << MI_SIZE_LOG2) >> ydec, ); if visible_tx_w != 0 && visible_tx_h != 0 { diff( residual, &ts.input_tile.planes[p].subregion(area), &rec.subregion(area), ); } else { residual.fill(MaybeUninit::new(0)); } // SAFETY: `diff()` inits `tx_size.area()` elements when it matches size of `subregion(area)` let residual = unsafe { slice_assume_init_mut(residual) }; forward_transform( residual, coeffs, tx_size.width(), tx_size, tx_type, fi.sequence.bit_depth, fi.cpu_feature_level, ); // SAFETY: 
forward_transform initialized coeffs let coeffs = unsafe { slice_assume_init_mut(coeffs) }; let eob = ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type); let has_coeff = if need_recon_pixel || rdo_type.needs_coeff_rate() { debug_assert!((((fi.w_in_b - frame_bo.0.x) << MI_SIZE_LOG2) >> xdec) >= 4); debug_assert!((((fi.h_in_b - frame_bo.0.y) << MI_SIZE_LOG2) >> ydec) >= 4); let frame_clipped_txw: usize = (((fi.w_in_b - frame_bo.0.x) << MI_SIZE_LOG2) >> xdec) .min(tx_size.width()); let frame_clipped_txh: usize = (((fi.h_in_b - frame_bo.0.y) << MI_SIZE_LOG2) >> ydec) .min(tx_size.height()); cw.write_coeffs_lv_map( w, p, tx_bo, qcoeffs, eob, mode, tx_size, tx_type, plane_bsize, xdec, ydec, fi.use_reduced_tx_set, frame_clipped_txw, frame_clipped_txh, ) } else { true }; // Reconstruct dequantize( qidx, qcoeffs, eob, rcoeffs, tx_size, fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], fi.cpu_feature_level, ); // SAFETY: dequantize initialized rcoeffs let rcoeffs = unsafe { slice_assume_init_mut(rcoeffs) }; if eob == 0 { // All zero coefficients is a no-op } else if !fi.use_tx_domain_distortion || need_recon_pixel { inverse_transform_add( rcoeffs, &mut rec.subregion_mut(area), eob, tx_size, tx_type, fi.sequence.bit_depth, fi.cpu_feature_level, ); } let tx_dist = if rdo_type.needs_tx_dist() && visible_tx_w != 0 && visible_tx_h != 0 { // Store tx-domain distortion of this block // rcoeffs above 32 rows/cols aren't held in the array, because they are // always 0. The first 32x32 is stored first in coeffs so we can iterate // over coeffs and rcoeffs for the first 32 rows/cols. For the // coefficients above 32 rows/cols, we iterate over the rest of coeffs // with the assumption that rcoeff coefficients are zero. let mut raw_tx_dist = coeffs .iter() .zip(rcoeffs.iter()) .map(|(&a, &b)| { let c = i32::cast_from(a) - i32::cast_from(b); (c * c) as u64 }) .sum::() + coeffs[rcoeffs.len()..] .iter() .map(|&a| { let c = i32::cast_from(a); (c * c) as u64 }) .sum::(); let tx_dist_scale_bits = 2 * (3 - get_log_tx_scale(tx_size)); let tx_dist_scale_rounding_offset = 1 << (tx_dist_scale_bits - 1); raw_tx_dist = (raw_tx_dist + tx_dist_scale_rounding_offset) >> tx_dist_scale_bits; if rdo_type == RDOType::TxDistEstRate { // look up rate and distortion in table let estimated_rate = estimate_rate(fi.base_q_idx, tx_size, raw_tx_dist); w.add_bits_frac(estimated_rate as u32); } let bias = distortion_scale(fi, ts.to_frame_block_offset(tx_bo), bsize); RawDistortion::new(raw_tx_dist) * bias * fi.dist_scale[p] } else { ScaledDistortion::zero() }; (has_coeff, tx_dist) } /// # Panics /// /// - If the block size is invalid for subsampling #[profiling::function] pub fn motion_compensate( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, luma_only: bool, ) { debug_assert!(!luma_mode.is_intra()); let PlaneConfig { xdec: u_xdec, ydec: u_ydec, .. } = ts.input.planes[1].cfg; // Inter mode prediction can take place once for a whole partition, // instead of each tx-block. 
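  // The chroma planes are only predicted when this block position actually
  // carries chroma for the given subsampling and the caller did not request
  // luma only.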
let num_planes = 1 + if !luma_only && has_chroma( tile_bo, bsize, u_xdec, u_ydec, fi.sequence.chroma_sampling, ) { 2 } else { 0 }; let luma_tile_rect = ts.tile_rect(); let compound_buffer = &mut ts.inter_compound_buffers; for p in 0..num_planes { let plane_bsize = if p == 0 { bsize } else { bsize.subsampled_size(u_xdec, u_ydec).unwrap() }; let rec = &mut ts.rec.planes[p]; let po = tile_bo.plane_offset(rec.plane_cfg); let &PlaneConfig { xdec, ydec, .. } = rec.plane_cfg; let tile_rect = luma_tile_rect.decimated(xdec, ydec); let area = Area::BlockStartingAt { bo: tile_bo.0 }; if p > 0 && bsize < BlockSize::BLOCK_8X8 { let mut some_use_intra = false; if bsize == BlockSize::BLOCK_4X4 || bsize == BlockSize::BLOCK_4X8 { some_use_intra |= cw.bc.blocks[tile_bo.with_offset(-1, 0)].mode.is_intra(); }; if !some_use_intra && bsize == BlockSize::BLOCK_4X4 || bsize == BlockSize::BLOCK_8X4 { some_use_intra |= cw.bc.blocks[tile_bo.with_offset(0, -1)].mode.is_intra(); }; if !some_use_intra && bsize == BlockSize::BLOCK_4X4 { some_use_intra |= cw.bc.blocks[tile_bo.with_offset(-1, -1)].mode.is_intra(); }; if some_use_intra { luma_mode.predict_inter( fi, tile_rect, p, po, &mut rec.subregion_mut(area), plane_bsize.width(), plane_bsize.height(), ref_frames, mvs, compound_buffer, ); } else { assert!(u_xdec == 1 && u_ydec == 1); // TODO: these are absolutely only valid for 4:2:0 if bsize == BlockSize::BLOCK_4X4 { let mv0 = cw.bc.blocks[tile_bo.with_offset(-1, -1)].mv; let rf0 = cw.bc.blocks[tile_bo.with_offset(-1, -1)].ref_frames; let mv1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].mv; let rf1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].ref_frames; let po1 = PlaneOffset { x: po.x + 2, y: po.y }; let area1 = Area::StartingAt { x: po1.x, y: po1.y }; let mv2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].mv; let rf2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].ref_frames; let po2 = PlaneOffset { x: po.x, y: po.y + 2 }; let area2 = Area::StartingAt { x: po2.x, y: po2.y }; let po3 = PlaneOffset { x: po.x + 2, y: po.y + 2 }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; luma_mode.predict_inter( fi, tile_rect, p, po, &mut rec.subregion_mut(area), 2, 2, rf0, mv0, compound_buffer, ); luma_mode.predict_inter( fi, tile_rect, p, po1, &mut rec.subregion_mut(area1), 2, 2, rf1, mv1, compound_buffer, ); luma_mode.predict_inter( fi, tile_rect, p, po2, &mut rec.subregion_mut(area2), 2, 2, rf2, mv2, compound_buffer, ); luma_mode.predict_inter( fi, tile_rect, p, po3, &mut rec.subregion_mut(area3), 2, 2, ref_frames, mvs, compound_buffer, ); } if bsize == BlockSize::BLOCK_8X4 { let mv1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].mv; let rf1 = cw.bc.blocks[tile_bo.with_offset(0, -1)].ref_frames; luma_mode.predict_inter( fi, tile_rect, p, po, &mut rec.subregion_mut(area), 4, 2, rf1, mv1, compound_buffer, ); let po3 = PlaneOffset { x: po.x, y: po.y + 2 }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; luma_mode.predict_inter( fi, tile_rect, p, po3, &mut rec.subregion_mut(area3), 4, 2, ref_frames, mvs, compound_buffer, ); } if bsize == BlockSize::BLOCK_4X8 { let mv2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].mv; let rf2 = cw.bc.blocks[tile_bo.with_offset(-1, 0)].ref_frames; luma_mode.predict_inter( fi, tile_rect, p, po, &mut rec.subregion_mut(area), 2, 4, rf2, mv2, compound_buffer, ); let po3 = PlaneOffset { x: po.x + 2, y: po.y }; let area3 = Area::StartingAt { x: po3.x, y: po3.y }; luma_mode.predict_inter( fi, tile_rect, p, po3, &mut rec.subregion_mut(area3), 2, 4, ref_frames, mvs, compound_buffer, ); } } } else { 
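        // Either this is the luma plane or the (possibly subsampled) block is
        // at least 4x4, so a single call predicts the whole plane-sized block.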
luma_mode.predict_inter( fi, tile_rect, p, po, &mut rec.subregion_mut(area), plane_bsize.width(), plane_bsize.height(), ref_frames, mvs, compound_buffer, ); } } } pub fn save_block_motion( ts: &mut TileStateMut<'_, T>, bsize: BlockSize, tile_bo: TileBlockOffset, ref_frame: usize, mv: MotionVector, ) { let tile_me_stats = &mut ts.me_stats[ref_frame]; let tile_bo_x_end = (tile_bo.0.x + bsize.width_mi()).min(ts.mi_width); let tile_bo_y_end = (tile_bo.0.y + bsize.height_mi()).min(ts.mi_height); for mi_y in tile_bo.0.y..tile_bo_y_end { for mi_x in tile_bo.0.x..tile_bo_x_end { tile_me_stats[mi_y][mi_x].mv = mv; } } } #[profiling::function] pub fn encode_block_pre_cdef( seq: &Sequence, ts: &TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, skip: bool, ) -> bool { cw.bc.blocks.set_skip(tile_bo, bsize, skip); if ts.segmentation.enabled && ts.segmentation.update_map && ts.segmentation.preskip { cw.write_segmentation( w, tile_bo, bsize, false, ts.segmentation.last_active_segid, ); } cw.write_skip(w, tile_bo, skip); if ts.segmentation.enabled && ts.segmentation.update_map && !ts.segmentation.preskip { cw.write_segmentation( w, tile_bo, bsize, skip, ts.segmentation.last_active_segid, ); } if !skip && seq.enable_cdef { cw.bc.cdef_coded = true; } cw.bc.cdef_coded } /// # Panics /// /// - If chroma and luma do not match for inter modes /// - If an invalid motion vector is found #[profiling::function] pub fn encode_block_post_cdef( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, chroma_mode: PredictionMode, angle_delta: AngleDelta, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, skip: bool, cfl: CFLParams, tx_size: TxSize, tx_type: TxType, mode_context: usize, mv_stack: &[CandidateMV], rdo_type: RDOType, need_recon_pixel: bool, enc_stats: Option<&mut EncoderStats>, ) -> (bool, ScaledDistortion) { let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; let is_inter = !luma_mode.is_intra(); if is_inter { assert!(luma_mode == chroma_mode); }; let sb_size = if fi.sequence.use_128x128_superblock { BlockSize::BLOCK_128X128 } else { BlockSize::BLOCK_64X64 }; let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; if skip { cw.bc.reset_skip_context( tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling, ); } cw.bc.blocks.set_block_size(tile_bo, bsize); cw.bc.blocks.set_mode(tile_bo, bsize, luma_mode); cw.bc.blocks.set_tx_size(tile_bo, bsize, tx_size); cw.bc.blocks.set_ref_frames(tile_bo, bsize, ref_frames); cw.bc.blocks.set_motion_vectors(tile_bo, bsize, mvs); //write_q_deltas(); if cw.bc.code_deltas && ts.deblock.block_deltas_enabled && (bsize < sb_size || !skip) { cw.write_block_deblock_deltas( w, tile_bo, ts.deblock.block_delta_multi, planes, ); } cw.bc.code_deltas = false; if fi.frame_type.has_inter() { cw.write_is_inter(w, tile_bo, is_inter); if is_inter { cw.fill_neighbours_ref_counts(tile_bo); cw.write_ref_frames(w, fi, tile_bo); if luma_mode.is_compound() { cw.write_compound_mode(w, luma_mode, mode_context); } else { cw.write_inter_mode(w, luma_mode, mode_context); } let ref_mv_idx = 0; let num_mv_found = mv_stack.len(); if luma_mode == PredictionMode::NEWMV || luma_mode == PredictionMode::NEW_NEWMV { if luma_mode == PredictionMode::NEW_NEWMV { assert!(num_mv_found >= 2); } for idx in 0..2 { if num_mv_found > idx + 1 { let drl_mode = ref_mv_idx > idx; let ctx: usize = (mv_stack[idx].weight < REF_CAT_LEVEL) as usize + (mv_stack[idx + 1].weight < REF_CAT_LEVEL) as usize; cw.write_drl_mode(w, drl_mode, ctx); if !drl_mode { break; } } } } let ref_mvs = if num_mv_found > 0 { [mv_stack[ref_mv_idx].this_mv, mv_stack[ref_mv_idx].comp_mv] } else { [MotionVector::default(); 2] }; let mv_precision = if fi.force_integer_mv != 0 { MvSubpelPrecision::MV_SUBPEL_NONE } else if fi.allow_high_precision_mv { MvSubpelPrecision::MV_SUBPEL_HIGH_PRECISION } else { MvSubpelPrecision::MV_SUBPEL_LOW_PRECISION }; if luma_mode == PredictionMode::NEWMV || luma_mode == PredictionMode::NEW_NEWMV || luma_mode == PredictionMode::NEW_NEARESTMV { cw.write_mv(w, mvs[0], ref_mvs[0], mv_precision); } if luma_mode == PredictionMode::NEW_NEWMV || luma_mode == PredictionMode::NEAREST_NEWMV { cw.write_mv(w, mvs[1], ref_mvs[1], mv_precision); } if luma_mode.has_nearmv() { let ref_mv_idx = luma_mode.ref_mv_idx(); if luma_mode != PredictionMode::NEAR0MV { assert!(num_mv_found > ref_mv_idx); } for idx in 1..3 { if num_mv_found > idx + 1 { let drl_mode = ref_mv_idx > idx; let ctx: usize = (mv_stack[idx].weight < REF_CAT_LEVEL) as usize + (mv_stack[idx + 1].weight < REF_CAT_LEVEL) as usize; cw.write_drl_mode(w, drl_mode, ctx); if !drl_mode { break; } } } if mv_stack.len() > 1 { assert!(mv_stack[ref_mv_idx].this_mv.row == mvs[0].row); assert!(mv_stack[ref_mv_idx].this_mv.col == mvs[0].col); } else { assert!(0 == mvs[0].row); assert!(0 == mvs[0].col); } } else if luma_mode == PredictionMode::NEARESTMV { if mv_stack.is_empty() { assert_eq!(mvs[0].row, 0); assert_eq!(mvs[0].col, 0); } else { assert_eq!(mvs[0].row, mv_stack[0].this_mv.row); assert_eq!(mvs[0].col, mv_stack[0].this_mv.col); } } } else { cw.write_intra_mode(w, bsize, luma_mode); } } else { cw.write_intra_mode_kf(w, tile_bo, luma_mode); } if !is_inter { if luma_mode.is_directional() && bsize >= BlockSize::BLOCK_8X8 { cw.write_angle_delta(w, angle_delta.y, luma_mode); } if has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling) { cw.write_intra_uv_mode(w, chroma_mode, luma_mode, bsize); if chroma_mode.is_cfl() { assert!(bsize.cfl_allowed()); cw.write_cfl_alphas(w, cfl); } if chroma_mode.is_directional() && bsize >= BlockSize::BLOCK_8X8 { cw.write_angle_delta(w, angle_delta.uv, chroma_mode); } } if fi.allow_screen_content_tools > 0 && bsize 
>= BlockSize::BLOCK_8X8 && bsize.width() <= 64 && bsize.height() <= 64 { cw.write_use_palette_mode( w, false, bsize, tile_bo, luma_mode, chroma_mode, xdec, ydec, fi.sequence.chroma_sampling, ); } if fi.sequence.enable_filter_intra && luma_mode == PredictionMode::DC_PRED && bsize.width() <= 32 && bsize.height() <= 32 { cw.write_use_filter_intra(w, false, bsize); // turn off FILTER_INTRA } } // write tx_size here if fi.tx_mode_select { if bsize > BlockSize::BLOCK_4X4 && (!is_inter || !skip) { if !is_inter { cw.write_tx_size_intra(w, tile_bo, bsize, tx_size); cw.bc.update_tx_size_context(tile_bo, bsize, tx_size, false); } else { // write var_tx_size // if here, bsize > BLOCK_4X4 && is_inter && !skip && !Lossless debug_assert!(fi.tx_mode_select); debug_assert!(bsize > BlockSize::BLOCK_4X4); debug_assert!(is_inter); debug_assert!(!skip); let max_tx_size = max_txsize_rect_lookup[bsize as usize]; debug_assert!(max_tx_size.block_size() <= BlockSize::BLOCK_64X64); //TODO: "&& tx_size.block_size() < bsize" will be replaced with tx-split info for a partition // once it is available. let txfm_split = fi.enable_inter_txfm_split && tx_size.block_size() < bsize; // TODO: Revise write_tx_size_inter() for txfm_split = true cw.write_tx_size_inter( w, tile_bo, bsize, max_tx_size, txfm_split, 0, 0, 0, ); } } else { debug_assert!(bsize == BlockSize::BLOCK_4X4 || (is_inter && skip)); cw.bc.update_tx_size_context(tile_bo, bsize, tx_size, is_inter && skip); } } if let Some(enc_stats) = enc_stats { let pixels = tx_size.area(); enc_stats.block_size_counts[bsize as usize] += pixels; enc_stats.tx_type_counts[tx_type as usize] += pixels; enc_stats.luma_pred_mode_counts[luma_mode as usize] += pixels; enc_stats.chroma_pred_mode_counts[chroma_mode as usize] += pixels; if skip { enc_stats.skip_block_count += pixels; } } if fi.sequence.enable_intra_edge_filter { for y in 0..bsize.height_mi() { if tile_bo.0.y + y >= ts.mi_height { continue; } for x in 0..bsize.width_mi() { if tile_bo.0.x + x >= ts.mi_width { continue; } let bi = &mut ts.coded_block_info[tile_bo.0.y + y][tile_bo.0.x + x]; bi.luma_mode = luma_mode; bi.chroma_mode = chroma_mode; bi.reference_types = ref_frames; } } } if is_inter { motion_compensate( fi, ts, cw, luma_mode, ref_frames, mvs, bsize, tile_bo, false, ); write_tx_tree( fi, ts, cw, w, luma_mode, angle_delta.y, tile_bo, bsize, tx_size, tx_type, skip, false, rdo_type, need_recon_pixel, ) } else { write_tx_blocks( fi, ts, cw, w, luma_mode, chroma_mode, angle_delta, tile_bo, bsize, tx_size, tx_type, skip, cfl, false, rdo_type, need_recon_pixel, ) } } /// # Panics /// /// - If attempting to encode a lossless block (not yet supported) pub fn write_tx_blocks( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, chroma_mode: PredictionMode, angle_delta: AngleDelta, tile_bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, tx_type: TxType, skip: bool, cfl: CFLParams, luma_only: bool, rdo_type: RDOType, need_recon_pixel: bool, ) -> (bool, ScaledDistortion) { let bw = bsize.width_mi() / tx_size.width_mi(); let bh = bsize.height_mi() / tx_size.height_mi(); let qidx = get_qidx(fi, ts, cw, tile_bo); // TODO: Lossless is not yet supported. if !skip { assert_ne!(qidx, 0); } let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; let mut ac = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); let mut partition_has_coeff: bool = false; let mut tx_dist = ScaledDistortion::zero(); let do_chroma = has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); ts.qc.update( qidx, tx_size, luma_mode.is_intra(), fi.sequence.bit_depth, fi.dc_delta_q[0], 0, ); for by in 0..bh { for bx in 0..bw { let tx_bo = TileBlockOffset(BlockOffset { x: tile_bo.0.x + bx * tx_size.width_mi(), y: tile_bo.0.y + by * tx_size.height_mi(), }); if tx_bo.0.x >= ts.mi_width || tx_bo.0.y >= ts.mi_height { continue; } let po = tx_bo.plane_offset(&ts.input.planes[0].cfg); let (has_coeff, dist) = encode_tx_block( fi, ts, cw, w, 0, tile_bo, bx, by, tx_bo, luma_mode, tx_size, tx_type, bsize, po, skip, qidx, &[], IntraParam::AngleDelta(angle_delta.y), rdo_type, need_recon_pixel, ); partition_has_coeff |= has_coeff; tx_dist += dist; } } if !do_chroma || luma_only || fi.sequence.chroma_sampling == ChromaSampling::Cs400 { return (partition_has_coeff, tx_dist); }; debug_assert!(has_chroma( tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling )); let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec); let mut bw_uv = (bw * tx_size.width_mi()) >> xdec; let mut bh_uv = (bh * tx_size.height_mi()) >> ydec; if bw_uv == 0 || bh_uv == 0 { bw_uv = 1; bh_uv = 1; } bw_uv /= uv_tx_size.width_mi(); bh_uv /= uv_tx_size.height_mi(); let ac_data = if chroma_mode.is_cfl() { luma_ac(&mut ac.data, ts, tile_bo, bsize, tx_size, fi) } else { [].as_slice() }; let uv_tx_type = if uv_tx_size.width() >= 32 || uv_tx_size.height() >= 32 { TxType::DCT_DCT } else { uv_intra_mode_to_tx_type_context(chroma_mode) }; for p in 1..3 { ts.qc.update( qidx, uv_tx_size, true, fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], ); let alpha = cfl.alpha(p - 1); for by in 0..bh_uv { for bx in 0..bw_uv { let tx_bo = TileBlockOffset(BlockOffset { x: tile_bo.0.x + ((bx * uv_tx_size.width_mi()) << xdec) - ((bw * tx_size.width_mi() == 1) as usize) * xdec, y: tile_bo.0.y + ((by * uv_tx_size.height_mi()) << ydec) - ((bh * tx_size.height_mi() == 1) as usize) * ydec, }); let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg); po.x += (bx * uv_tx_size.width()) as isize; po.y += (by * uv_tx_size.height()) as isize; let (has_coeff, dist) = encode_tx_block( fi, ts, cw, w, p, tile_bo, bx, by, tx_bo, chroma_mode, uv_tx_size, uv_tx_type, bsize, po, skip, qidx, ac_data, if chroma_mode.is_cfl() { IntraParam::Alpha(alpha) } else { IntraParam::AngleDelta(angle_delta.uv) }, rdo_type, need_recon_pixel, ); partition_has_coeff |= has_coeff; tx_dist += dist; } } } (partition_has_coeff, tx_dist) } pub fn write_tx_tree( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, luma_mode: PredictionMode, angle_delta_y: i8, tile_bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, tx_type: TxType, skip: bool, luma_only: bool, rdo_type: RDOType, need_recon_pixel: bool, ) -> (bool, ScaledDistortion) { if skip { return (false, ScaledDistortion::zero()); } let bw = bsize.width_mi() / tx_size.width_mi(); let bh = bsize.height_mi() / tx_size.height_mi(); let qidx = get_qidx(fi, ts, cw, tile_bo); let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; let ac = &[0i16; 0]; let mut partition_has_coeff: bool = false; let mut tx_dist = ScaledDistortion::zero(); ts.qc.update( qidx, tx_size, luma_mode.is_intra(), fi.sequence.bit_depth, fi.dc_delta_q[0], 0, ); // TODO: If tx-parition more than only 1-level, this code does not work. 
// It should recursively traverse the tx block that are split recursivelty by calling write_tx_tree(), // as defined in https://aomediacodec.github.io/av1-spec/#transform-tree-syntax for by in 0..bh { for bx in 0..bw { let tx_bo = TileBlockOffset(BlockOffset { x: tile_bo.0.x + bx * tx_size.width_mi(), y: tile_bo.0.y + by * tx_size.height_mi(), }); if tx_bo.0.x >= ts.mi_width || tx_bo.0.y >= ts.mi_height { continue; } let po = tx_bo.plane_offset(&ts.input.planes[0].cfg); let (has_coeff, dist) = encode_tx_block( fi, ts, cw, w, 0, tile_bo, 0, 0, tx_bo, luma_mode, tx_size, tx_type, bsize, po, skip, qidx, ac, IntraParam::AngleDelta(angle_delta_y), rdo_type, need_recon_pixel, ); partition_has_coeff |= has_coeff; tx_dist += dist; } } if !has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling) || luma_only || fi.sequence.chroma_sampling == ChromaSampling::Cs400 { return (partition_has_coeff, tx_dist); }; debug_assert!(has_chroma( tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling )); let max_tx_size = max_txsize_rect_lookup[bsize as usize]; debug_assert!(max_tx_size.block_size() <= BlockSize::BLOCK_64X64); let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec); let mut bw_uv = max_tx_size.width_mi() >> xdec; let mut bh_uv = max_tx_size.height_mi() >> ydec; if bw_uv == 0 || bh_uv == 0 { bw_uv = 1; bh_uv = 1; } bw_uv /= uv_tx_size.width_mi(); bh_uv /= uv_tx_size.height_mi(); let uv_tx_type = if partition_has_coeff { tx_type.uv_inter(uv_tx_size) } else { TxType::DCT_DCT }; for p in 1..3 { ts.qc.update( qidx, uv_tx_size, false, fi.sequence.bit_depth, fi.dc_delta_q[p], fi.ac_delta_q[p], ); for by in 0..bh_uv { for bx in 0..bw_uv { let tx_bo = TileBlockOffset(BlockOffset { x: tile_bo.0.x + ((bx * uv_tx_size.width_mi()) << xdec) - (max_tx_size.width_mi() == 1) as usize * xdec, y: tile_bo.0.y + ((by * uv_tx_size.height_mi()) << ydec) - (max_tx_size.height_mi() == 1) as usize * ydec, }); let mut po = tile_bo.plane_offset(&ts.input.planes[p].cfg); po.x += (bx * uv_tx_size.width()) as isize; po.y += (by * uv_tx_size.height()) as isize; let (has_coeff, dist) = encode_tx_block( fi, ts, cw, w, p, tile_bo, bx, by, tx_bo, luma_mode, uv_tx_size, uv_tx_type, bsize, po, skip, qidx, ac, IntraParam::AngleDelta(angle_delta_y), rdo_type, need_recon_pixel, ); partition_has_coeff |= has_coeff; tx_dist += dist; } } } (partition_has_coeff, tx_dist) } #[profiling::function] pub fn encode_block_with_modes( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, mode_decision: &PartitionParameters, rdo_type: RDOType, enc_stats: Option<&mut EncoderStats>, ) { let (mode_luma, mode_chroma) = (mode_decision.pred_mode_luma, mode_decision.pred_mode_chroma); let cfl = mode_decision.pred_cfl_params; let ref_frames = mode_decision.ref_frames; let mvs = mode_decision.mvs; let mut skip = mode_decision.skip; let mut cdef_coded = cw.bc.cdef_coded; // Set correct segmentation ID before encoding and before // rdo_tx_size_type(). 
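  // get_qidx() reads this segment index back out of the block map, so it must
  // be in place before rdo_tx_size_type() and the encode_block_* calls below.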
cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, mode_decision.sidx); let mut mv_stack = ArrayVec::::new(); let is_compound = ref_frames[1] != NONE_FRAME; let mode_context = cw.find_mvrefs(tile_bo, ref_frames, &mut mv_stack, bsize, fi, is_compound); let (tx_size, tx_type) = if !mode_decision.skip && !mode_decision.has_coeff { skip = true; rdo_tx_size_type( fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip, ) } else { (mode_decision.tx_size, mode_decision.tx_type) }; cdef_coded = encode_block_pre_cdef( &fi.sequence, ts, cw, if cdef_coded { w_post_cdef } else { w_pre_cdef }, bsize, tile_bo, skip, ); encode_block_post_cdef( fi, ts, cw, if cdef_coded { w_post_cdef } else { w_pre_cdef }, mode_luma, mode_chroma, mode_decision.angle_delta, ref_frames, mvs, bsize, tile_bo, skip, cfl, tx_size, tx_type, mode_context, &mv_stack, rdo_type, true, enc_stats, ); } #[profiling::function] fn encode_partition_bottomup( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, ref_rd_cost: f64, inter_cfg: &InterConfig, enc_stats: &mut EncoderStats, ) -> PartitionGroupParameters { let rdo_type = RDOType::PixelDistRealRate; let mut rd_cost = std::f64::MAX; let mut best_rd = std::f64::MAX; let mut rdo_output = PartitionGroupParameters { rd_cost, part_type: PartitionType::PARTITION_INVALID, part_modes: ArrayVec::new(), }; if tile_bo.0.x >= ts.mi_width || tile_bo.0.y >= ts.mi_height { return rdo_output; } let is_square = bsize.is_sqr(); let hbs = bsize.width_mi() / 2; let has_cols = tile_bo.0.x + hbs < ts.mi_width; let has_rows = tile_bo.0.y + hbs < ts.mi_height; let is_straddle_x = tile_bo.0.x + bsize.width_mi() > ts.mi_width; let is_straddle_y = tile_bo.0.y + bsize.height_mi() > ts.mi_height; // TODO: Update for 128x128 superblocks assert!(fi.partition_range.max <= BlockSize::BLOCK_64X64); let must_split = is_square && (bsize > fi.partition_range.max || !has_cols || !has_rows); let can_split = // FIXME: sub-8x8 inter blocks not supported for non-4:2:0 sampling if fi.frame_type.has_inter() && fi.sequence.chroma_sampling != ChromaSampling::Cs420 && bsize <= BlockSize::BLOCK_8X8 { false } else { (bsize > fi.partition_range.min && is_square) || must_split }; assert!(bsize >= BlockSize::BLOCK_8X8 || !can_split); let mut best_partition = PartitionType::PARTITION_INVALID; let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let w_pre_checkpoint = w_pre_cdef.checkpoint(); let w_post_checkpoint = w_post_cdef.checkpoint(); // Code the whole block if !must_split { let cost = if bsize >= BlockSize::BLOCK_8X8 && is_square { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; let tell = w.tell_frac(); cw.write_partition(w, tile_bo, PartitionType::PARTITION_NONE, bsize); compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero()) } else { 0.0 }; let mode_decision = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); if !mode_decision.pred_mode_luma.is_intra() { // Fill the saved motion structure save_block_motion( ts, mode_decision.bsize, mode_decision.bo, mode_decision.ref_frames[0].to_index(), mode_decision.mvs[0], ); } rd_cost = mode_decision.rd_cost + cost; best_partition = PartitionType::PARTITION_NONE; best_rd = rd_cost; rdo_output.part_modes.push(mode_decision.clone()); if !can_split { encode_block_with_modes( fi, ts, cw, w_pre_cdef, w_post_cdef, bsize, tile_bo, &mode_decision, rdo_type, Some(enc_stats), ); } } // if !must_split let mut early_exit = 
false; // Test all partition types other than PARTITION_NONE by comparing their RD costs if can_split { debug_assert!(is_square); let mut partition_types = ArrayVec::::new(); if bsize <= fi.config.speed_settings.partition.non_square_partition_max_threshold || is_straddle_x || is_straddle_y { if has_cols { partition_types.push(PartitionType::PARTITION_HORZ); } if !(fi.sequence.chroma_sampling == ChromaSampling::Cs422) && has_rows { partition_types.push(PartitionType::PARTITION_VERT); } } partition_types.push(PartitionType::PARTITION_SPLIT); for partition in partition_types { // (!has_rows || !has_cols) --> must_split debug_assert!((has_rows && has_cols) || must_split); // (!has_rows && has_cols) --> partition != PartitionType::PARTITION_VERT debug_assert!( has_rows || !has_cols || (partition != PartitionType::PARTITION_VERT) ); // (has_rows && !has_cols) --> partition != PartitionType::PARTITION_HORZ debug_assert!( !has_rows || has_cols || (partition != PartitionType::PARTITION_HORZ) ); // (!has_rows && !has_cols) --> partition == PartitionType::PARTITION_SPLIT debug_assert!( has_rows || has_cols || (partition == PartitionType::PARTITION_SPLIT) ); cw.rollback(&cw_checkpoint); w_pre_cdef.rollback(&w_pre_checkpoint); w_post_cdef.rollback(&w_post_checkpoint); let subsize = bsize.subsize(partition).unwrap(); let hbsw = subsize.width_mi(); // Half the block size width in blocks let hbsh = subsize.height_mi(); // Half the block size height in blocks let mut child_modes = ArrayVec::::new(); rd_cost = 0.0; if bsize >= BlockSize::BLOCK_8X8 { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; let tell = w.tell_frac(); cw.write_partition(w, tile_bo, partition, bsize); rd_cost = compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero()); } let four_partitions = [ tile_bo, TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }), TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }), TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y + hbsh, }), ]; let partitions = get_sub_partitions(&four_partitions, partition); early_exit = false; // If either of horz or vert partition types is being tested, // two partitioned rectangles, defined in 'partitions', of the current block // is passed to encode_partition_bottomup() for offset in partitions { if offset.0.x >= ts.mi_width || offset.0.y >= ts.mi_height { continue; } let child_rdo_output = encode_partition_bottomup( fi, ts, cw, w_pre_cdef, w_post_cdef, subsize, offset, best_rd, inter_cfg, enc_stats, ); let cost = child_rdo_output.rd_cost; assert!(cost >= 0.0); if cost != std::f64::MAX { rd_cost += cost; if !must_split && fi.enable_early_exit && (rd_cost >= best_rd || rd_cost >= ref_rd_cost) { assert!(cost != std::f64::MAX); early_exit = true; break; } else if partition != PartitionType::PARTITION_SPLIT { child_modes.push(child_rdo_output.part_modes[0].clone()); } } } if !early_exit && rd_cost < best_rd { best_rd = rd_cost; best_partition = partition; if partition != PartitionType::PARTITION_SPLIT { assert!(!child_modes.is_empty()); rdo_output.part_modes = child_modes; } } } debug_assert!( early_exit || best_partition != PartitionType::PARTITION_INVALID ); // If the best partition is not PARTITION_SPLIT, recode it if best_partition != PartitionType::PARTITION_SPLIT { assert!(!rdo_output.part_modes.is_empty()); cw.rollback(&cw_checkpoint); w_pre_cdef.rollback(&w_pre_checkpoint); w_post_cdef.rollback(&w_post_checkpoint); assert!(best_partition != PartitionType::PARTITION_NONE || 
!must_split); let subsize = bsize.subsize(best_partition).unwrap(); if bsize >= BlockSize::BLOCK_8X8 { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; cw.write_partition(w, tile_bo, best_partition, bsize); } for mode in rdo_output.part_modes.clone() { assert!(subsize == mode.bsize); if !mode.pred_mode_luma.is_intra() { save_block_motion( ts, mode.bsize, mode.bo, mode.ref_frames[0].to_index(), mode.mvs[0], ); } // FIXME: redundant block re-encode encode_block_with_modes( fi, ts, cw, w_pre_cdef, w_post_cdef, mode.bsize, mode.bo, &mode, rdo_type, Some(enc_stats), ); } } } // if can_split { assert!(best_partition != PartitionType::PARTITION_INVALID); if is_square && bsize >= BlockSize::BLOCK_8X8 && (bsize == BlockSize::BLOCK_8X8 || best_partition != PartitionType::PARTITION_SPLIT) { cw.bc.update_partition_context( tile_bo, bsize.subsize(best_partition).unwrap(), bsize, ); } rdo_output.rd_cost = best_rd; rdo_output.part_type = best_partition; if best_partition != PartitionType::PARTITION_NONE { rdo_output.part_modes.clear(); } rdo_output } fn encode_partition_topdown( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, block_output: &Option, inter_cfg: &InterConfig, enc_stats: &mut EncoderStats, ) { if tile_bo.0.x >= ts.mi_width || tile_bo.0.y >= ts.mi_height { return; } let is_square = bsize.is_sqr(); let rdo_type = RDOType::PixelDistRealRate; let hbs = bsize.width_mi() / 2; let has_cols = tile_bo.0.x + hbs < ts.mi_width; let has_rows = tile_bo.0.y + hbs < ts.mi_height; // TODO: Update for 128x128 superblocks debug_assert!(fi.partition_range.max <= BlockSize::BLOCK_64X64); let must_split = is_square && (bsize > fi.partition_range.max || !has_cols || !has_rows); let can_split = // FIXME: sub-8x8 inter blocks not supported for non-4:2:0 sampling if fi.frame_type.has_inter() && fi.sequence.chroma_sampling != ChromaSampling::Cs420 && bsize <= BlockSize::BLOCK_8X8 { false } else { (bsize > fi.partition_range.min && is_square) || must_split }; let mut rdo_output = block_output.clone().unwrap_or_else(|| PartitionGroupParameters { part_type: PartitionType::PARTITION_INVALID, rd_cost: std::f64::MAX, part_modes: ArrayVec::new(), }); let partition = if must_split { PartitionType::PARTITION_SPLIT } else if can_split { debug_assert!(bsize.is_sqr()); // Blocks of sizes within the supported range are subjected to a partitioning decision rdo_output = rdo_partition_decision( fi, ts, cw, w_pre_cdef, w_post_cdef, bsize, tile_bo, &rdo_output, &[PartitionType::PARTITION_SPLIT, PartitionType::PARTITION_NONE], rdo_type, inter_cfg, ); rdo_output.part_type } else { // Blocks of sizes below the supported range are encoded directly PartitionType::PARTITION_NONE }; debug_assert!(partition != PartitionType::PARTITION_INVALID); let subsize = bsize.subsize(partition).unwrap(); if bsize >= BlockSize::BLOCK_8X8 && is_square { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; cw.write_partition(w, tile_bo, partition, bsize); } match partition { PartitionType::PARTITION_NONE => { let rdo_decision; let part_decision = if let Some(part_mode) = rdo_output.part_modes.first() { // The optimal prediction mode is known from a previous iteration part_mode } else { // Make a prediction mode decision for blocks encoded with no rdo_partition_decision call (e.g. 
edges) rdo_decision = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); &rdo_decision }; let mut mode_luma = part_decision.pred_mode_luma; let mut mode_chroma = part_decision.pred_mode_chroma; let cfl = part_decision.pred_cfl_params; let skip = part_decision.skip; let ref_frames = part_decision.ref_frames; let mvs = part_decision.mvs; let mut cdef_coded = cw.bc.cdef_coded; // Set correct segmentation ID before encoding and before // rdo_tx_size_type(). cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, part_decision.sidx); // NOTE: Cannot avoid calling rdo_tx_size_type() here again, // because, with top-down partition RDO, the neighboring contexts // of current partition can change, i.e. neighboring partitions can split down more. let (tx_size, tx_type) = rdo_tx_size_type( fi, ts, cw, bsize, tile_bo, mode_luma, ref_frames, mvs, skip, ); let mut mv_stack = ArrayVec::::new(); let is_compound = ref_frames[1] != NONE_FRAME; let mode_context = cw.find_mvrefs( tile_bo, ref_frames, &mut mv_stack, bsize, fi, is_compound, ); // TODO: proper remap when is_compound is true if !mode_luma.is_intra() { if is_compound && mode_luma != PredictionMode::GLOBAL_GLOBALMV { let match0 = mv_stack[0].this_mv.row == mvs[0].row && mv_stack[0].this_mv.col == mvs[0].col; let match1 = mv_stack[0].comp_mv.row == mvs[1].row && mv_stack[0].comp_mv.col == mvs[1].col; let match2 = mv_stack[1].this_mv.row == mvs[0].row && mv_stack[1].this_mv.col == mvs[0].col; let match3 = mv_stack[1].comp_mv.row == mvs[1].row && mv_stack[1].comp_mv.col == mvs[1].col; let match4 = mv_stack.len() > 2 && mv_stack[2].this_mv == mvs[0]; let match5 = mv_stack.len() > 2 && mv_stack[2].comp_mv == mvs[1]; let match6 = mv_stack.len() > 3 && mv_stack[3].this_mv == mvs[0]; let match7 = mv_stack.len() > 3 && mv_stack[3].comp_mv == mvs[1]; mode_luma = if match0 && match1 { PredictionMode::NEAREST_NEARESTMV } else if match2 && match3 { PredictionMode::NEAR_NEAR0MV } else if match4 && match5 { PredictionMode::NEAR_NEAR1MV } else if match6 && match7 { PredictionMode::NEAR_NEAR2MV } else if match0 { PredictionMode::NEAREST_NEWMV } else if match1 { PredictionMode::NEW_NEARESTMV } else { PredictionMode::NEW_NEWMV }; if mode_luma != PredictionMode::NEAREST_NEARESTMV && mvs[0].row == 0 && mvs[0].col == 0 && mvs[1].row == 0 && mvs[1].col == 0 { mode_luma = PredictionMode::GLOBAL_GLOBALMV; } mode_chroma = mode_luma; } else if !is_compound && mode_luma != PredictionMode::GLOBALMV { mode_luma = PredictionMode::NEWMV; for (c, m) in mv_stack.iter().take(4).zip( [ PredictionMode::NEARESTMV, PredictionMode::NEAR0MV, PredictionMode::NEAR1MV, PredictionMode::NEAR2MV, ] .iter(), ) { if c.this_mv.row == mvs[0].row && c.this_mv.col == mvs[0].col { mode_luma = *m; } } if mode_luma == PredictionMode::NEWMV && mvs[0].row == 0 && mvs[0].col == 0 { mode_luma = if mv_stack.is_empty() { PredictionMode::NEARESTMV } else if mv_stack.len() == 1 { PredictionMode::NEAR0MV } else { PredictionMode::GLOBALMV }; } mode_chroma = mode_luma; } save_block_motion( ts, part_decision.bsize, part_decision.bo, part_decision.ref_frames[0].to_index(), part_decision.mvs[0], ); } // FIXME: every final block that has gone through the RDO decision process is encoded twice cdef_coded = encode_block_pre_cdef( &fi.sequence, ts, cw, if cdef_coded { w_post_cdef } else { w_pre_cdef }, bsize, tile_bo, skip, ); encode_block_post_cdef( fi, ts, cw, if cdef_coded { w_post_cdef } else { w_pre_cdef }, mode_luma, mode_chroma, part_decision.angle_delta, ref_frames, mvs, bsize, tile_bo, skip, cfl, tx_size, 
tx_type, mode_context, &mv_stack, RDOType::PixelDistRealRate, true, Some(enc_stats), ); } PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { if !rdo_output.part_modes.is_empty() { debug_assert!(can_split && !must_split); // The optimal prediction modes for each split block is known from an rdo_partition_decision() call for mode in rdo_output.part_modes { // Each block is subjected to a new splitting decision encode_partition_topdown( fi, ts, cw, w_pre_cdef, w_post_cdef, subsize, mode.bo, &Some(PartitionGroupParameters { rd_cost: mode.rd_cost, part_type: PartitionType::PARTITION_NONE, part_modes: [mode][..].try_into().unwrap(), }), inter_cfg, enc_stats, ); } } else { debug_assert!(must_split); let hbsw = subsize.width_mi(); // Half the block size width in blocks let hbsh = subsize.height_mi(); // Half the block size height in blocks let four_partitions = [ tile_bo, TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y, }), TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh, }), TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y + hbsh, }), ]; let partitions = get_sub_partitions(&four_partitions, partition); partitions.iter().for_each(|&offset| { encode_partition_topdown( fi, ts, cw, w_pre_cdef, w_post_cdef, subsize, offset, &None, inter_cfg, enc_stats, ); }); } } _ => unreachable!(), } if is_square && bsize >= BlockSize::BLOCK_8X8 && (bsize == BlockSize::BLOCK_8X8 || partition != PartitionType::PARTITION_SPLIT) { cw.bc.update_partition_context(tile_bo, subsize, bsize); } } fn get_initial_cdfcontext(fi: &FrameInvariants) -> CDFContext { let cdf = if fi.primary_ref_frame == PRIMARY_REF_NONE { None } else { let ref_frame_idx = fi.ref_frames[fi.primary_ref_frame as usize] as usize; let ref_frame = fi.rec_buffer.frames[ref_frame_idx].as_ref(); ref_frame.map(|rec| rec.cdfs) }; // return the retrieved instance if any, a new one otherwise cdf.unwrap_or_else(|| CDFContext::new(fi.base_q_idx)) } #[profiling::function] fn encode_tile_group( fi: &FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) -> Vec { let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; let mut blocks = FrameBlocks::new(fi.w_in_b, fi.h_in_b); let ti = &fi.sequence.tiling; let initial_cdf = get_initial_cdfcontext(fi); // dynamic allocation: once per frame let mut cdfs = vec![initial_cdf; ti.tile_count()]; let (raw_tiles, stats): (Vec<_>, Vec<_>) = ti .tile_iter_mut(fs, &mut blocks) .zip(cdfs.iter_mut()) .collect::>() .into_par_iter() .map(|(mut ctx, cdf)| { encode_tile(fi, &mut ctx.ts, cdf, &mut ctx.tb, inter_cfg) }) .unzip(); for tile_stats in stats { fs.enc_stats += &tile_stats; } /* Frame deblocking operates over a single large tile wrapping the * frame rather than the frame itself so that deblocking is * available inside RDO when needed */ /* TODO: Don't apply if lossless */ let levels = fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; deblock_filter_optimize( fi, &rec.as_const(), &ts.input.as_tile(), &blocks.as_tile_blocks(), fi.width, fi.height, ) }); fs.deblock.levels = levels; if fs.deblock.levels[0] != 0 || fs.deblock.levels[1] != 0 { fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; deblock_filter_frame( ts.deblock, rec, &blocks.as_tile_blocks(), fi.width, fi.height, fi.sequence.bit_depth, planes, ); }); } if fi.sequence.enable_restoration { // Until the loop filters are better pipelined, we'll need to keep // around a copy of both the deblocked and cdeffed frame. 
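    // CDEF and loop restoration below both read from this deblocked copy while
    // writing their output into the reconstruction, so the whole-frame clone is
    // needed for now.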
let deblocked_frame = (*fs.rec).clone(); /* TODO: Don't apply if lossless */ if fi.sequence.enable_cdef { fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec); }); } /* TODO: Don't apply if lossless */ fs.restoration.lrf_filter_frame( Arc::get_mut(&mut fs.rec).unwrap(), &deblocked_frame, fi, ); } else { /* TODO: Don't apply if lossless */ if fi.sequence.enable_cdef { let deblocked_frame = (*fs.rec).clone(); fs.apply_tile_state_mut(|ts| { let rec = &mut ts.rec; cdef_filter_tile(fi, &deblocked_frame, &blocks.as_tile_blocks(), rec); }); } } let (idx_max, max_len) = raw_tiles .iter() .map(Vec::len) .enumerate() .max_by_key(|&(_, len)| len) .unwrap(); if !fi.disable_frame_end_update_cdf { // use the biggest tile (in bytes) for CDF update fs.context_update_tile_id = idx_max; fs.cdfs = cdfs[idx_max]; fs.cdfs.reset_counts(); } let max_tile_size_bytes = ((ILog::ilog(max_len) + 7) / 8) as u32; debug_assert!(max_tile_size_bytes > 0 && max_tile_size_bytes <= 4); fs.max_tile_size_bytes = max_tile_size_bytes; build_raw_tile_group(ti, &raw_tiles, max_tile_size_bytes) } fn build_raw_tile_group( ti: &TilingInfo, raw_tiles: &[Vec], max_tile_size_bytes: u32, ) -> Vec { // let mut raw = Vec::new(); let mut bw = BitWriter::endian(&mut raw, BigEndian); if ti.cols * ti.rows > 1 { // tile_start_and_end_present_flag bw.write_bit(false).unwrap(); } bw.byte_align().unwrap(); for (i, raw_tile) in raw_tiles.iter().enumerate() { let last = raw_tiles.len() - 1; if i != last { let tile_size_minus_1 = raw_tile.len() - 1; bw.write_le(max_tile_size_bytes, tile_size_minus_1 as u64).unwrap(); } bw.write_bytes(raw_tile).unwrap(); } raw } pub struct SBSQueueEntry { pub sbo: TileSuperBlockOffset, pub lru_index: [i32; MAX_PLANES], pub cdef_coded: bool, pub w_pre_cdef: WriterBase, pub w_post_cdef: WriterBase, } #[profiling::function] fn check_lf_queue( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut WriterBase, sbs_q: &mut VecDeque, last_lru_ready: &mut [i32; 3], last_lru_rdoed: &mut [i32; 3], last_lru_coded: &mut [i32; 3], deblock_p: bool, ) { let mut check_queue = true; let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { MAX_PLANES }; // Walk queue from the head, see if anything is ready for RDO and flush while check_queue { if let Some(qe) = sbs_q.front_mut() { for pli in 0..planes { if qe.lru_index[pli] > last_lru_ready[pli] { check_queue = false; break; } } if check_queue { // yes, this entry is ready if qe.cdef_coded || fi.sequence.enable_restoration { // only RDO once for a given LRU. // One quirk worth noting: LRUs in different planes // may be different sizes; eg, one chroma LRU may // cover four luma LRUs. However, we won't get here // until all are ready for RDO because the smaller // ones all fit inside the biggest, and the biggest // doesn't trigger until everything is done. // RDO happens on all LRUs within the confines of the // biggest, all together. If any of this SB's planes' // LRUs are RDOed, in actuality they all are. // SBs tagged with a lru index of -1 are ignored in // LRU coding/rdoing decisions (but still need to rdo // for cdef). 
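            // Per the note above, if any plane's LRU for this SB has already
            // been through rdo_loop_decision(), they all have, so one match is
            // enough to skip the call.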
let mut already_rdoed = false; for pli in 0..planes { if qe.lru_index[pli] != -1 && qe.lru_index[pli] <= last_lru_rdoed[pli] { already_rdoed = true; break; } } if !already_rdoed { rdo_loop_decision(qe.sbo, fi, ts, cw, w, deblock_p); for pli in 0..planes { if qe.lru_index[pli] != -1 && last_lru_rdoed[pli] < qe.lru_index[pli] { last_lru_rdoed[pli] = qe.lru_index[pli]; } } } } // write LRF information if !fi.allow_intrabc && fi.sequence.enable_restoration { // TODO: also disallow if lossless for pli in 0..planes { if qe.lru_index[pli] != -1 && last_lru_coded[pli] < qe.lru_index[pli] { last_lru_coded[pli] = qe.lru_index[pli]; cw.write_lrf(w, &mut ts.restoration, qe.sbo, pli); } } } // Now that loop restoration is coded, we can replay the initial block bits qe.w_pre_cdef.replay(w); // Now code CDEF into the middle of the block if qe.cdef_coded { let cdef_index = cw.bc.blocks.get_cdef(qe.sbo); cw.write_cdef(w, cdef_index, fi.cdef_bits); // Code queued symbols that come after the CDEF index qe.w_post_cdef.replay(w); } sbs_q.pop_front(); } } else { check_queue = false; } } } #[profiling::function] fn encode_tile<'a, T: Pixel>( fi: &FrameInvariants, ts: &'a mut TileStateMut<'_, T>, fc: &'a mut CDFContext, blocks: &'a mut TileBlocksMut<'a>, inter_cfg: &InterConfig, ) -> (Vec, EncoderStats) { let mut enc_stats = EncoderStats::default(); let mut w = WriterEncoder::new(); let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; let bc = BlockContext::new(blocks); let mut cw = ContextWriter::new(fc, bc); let mut sbs_q: VecDeque = VecDeque::new(); let mut last_lru_ready = [-1; 3]; let mut last_lru_rdoed = [-1; 3]; let mut last_lru_coded = [-1; 3]; // main loop for sby in 0..ts.sb_height { cw.bc.reset_left_contexts(planes); for sbx in 0..ts.sb_width { cw.fc_log.clear(); let tile_sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); let mut sbs_qe = SBSQueueEntry { sbo: tile_sbo, lru_index: [-1; MAX_PLANES], cdef_coded: false, w_pre_cdef: WriterRecorder::new(), w_post_cdef: WriterRecorder::new(), }; let tile_bo = tile_sbo.block_offset(0, 0); cw.bc.cdef_coded = false; cw.bc.code_deltas = fi.delta_q_present; let is_straddle_sbx = tile_bo.0.x + BlockSize::BLOCK_64X64.width_mi() > ts.mi_width; let is_straddle_sby = tile_bo.0.y + BlockSize::BLOCK_64X64.height_mi() > ts.mi_height; // Encode SuperBlock if fi.config.speed_settings.partition.encode_bottomup || is_straddle_sbx || is_straddle_sby { encode_partition_bottomup( fi, ts, &mut cw, &mut sbs_qe.w_pre_cdef, &mut sbs_qe.w_post_cdef, BlockSize::BLOCK_64X64, tile_bo, std::f64::MAX, inter_cfg, &mut enc_stats, ); } else { encode_partition_topdown( fi, ts, &mut cw, &mut sbs_qe.w_pre_cdef, &mut sbs_qe.w_post_cdef, BlockSize::BLOCK_64X64, tile_bo, &None, inter_cfg, &mut enc_stats, ); } { let mut check_queue = false; // queue our superblock for when the LRU is complete sbs_qe.cdef_coded = cw.bc.cdef_coded; for pli in 0..planes { if let Some((lru_x, lru_y)) = ts.restoration.planes[pli].restoration_unit_index(tile_sbo, false) { let lru_index = ts.restoration.planes[pli] .restoration_unit_countable(lru_x, lru_y) as i32; sbs_qe.lru_index[pli] = lru_index; if ts.restoration.planes[pli] .restoration_unit_last_sb_for_rdo(fi, ts.sbo, tile_sbo) { last_lru_ready[pli] = lru_index; check_queue = true; } } else { // we're likely in an area stretched into a new tile // tag this SB to be ignored in LRU decisions sbs_qe.lru_index[pli] = -1; check_queue = true; } } sbs_q.push_back(sbs_qe); if check_queue && 
!fi.sequence.enable_delayed_loopfilter_rdo { check_lf_queue( fi, ts, &mut cw, &mut w, &mut sbs_q, &mut last_lru_ready, &mut last_lru_rdoed, &mut last_lru_coded, true, ); } } } } if fi.sequence.enable_delayed_loopfilter_rdo { // Solve deblocking for just this tile /* TODO: Don't apply if lossless */ let deblock_levels = deblock_filter_optimize( fi, &ts.rec.as_const(), &ts.input_tile, &cw.bc.blocks.as_const(), fi.width, fi.height, ); if deblock_levels[0] != 0 || deblock_levels[1] != 0 { // copy reconstruction to a temp frame to restore it later let rec_copy = if planes == 3 { vec![ ts.rec.planes[0].scratch_copy(), ts.rec.planes[1].scratch_copy(), ts.rec.planes[2].scratch_copy(), ] } else { vec![ts.rec.planes[0].scratch_copy()] }; // copy ts.deblock because we need to set some of our own values here let mut deblock_copy = *ts.deblock; deblock_copy.levels = deblock_levels; // temporarily deblock the reference deblock_filter_frame( &deblock_copy, &mut ts.rec, &cw.bc.blocks.as_const(), fi.width, fi.height, fi.sequence.bit_depth, planes, ); // rdo lf and write check_lf_queue( fi, ts, &mut cw, &mut w, &mut sbs_q, &mut last_lru_ready, &mut last_lru_rdoed, &mut last_lru_coded, false, ); // copy original reference back in for pli in 0..planes { let dst = &mut ts.rec.planes[pli]; let src = &rec_copy[pli]; for (dst_row, src_row) in dst.rows_iter_mut().zip(src.rows_iter()) { for (out, input) in dst_row.iter_mut().zip(src_row) { *out = *input; } } } } else { // rdo lf and write check_lf_queue( fi, ts, &mut cw, &mut w, &mut sbs_q, &mut last_lru_ready, &mut last_lru_rdoed, &mut last_lru_coded, false, ); } } assert!( sbs_q.is_empty(), "Superblock queue not empty in tile at offset {}:{}", ts.sbo.0.x, ts.sbo.0.y ); (w.done(), enc_stats) } #[allow(unused)] fn write_tile_group_header(tile_start_and_end_present_flag: bool) -> Vec { let mut buf = Vec::new(); { let mut bw = BitWriter::endian(&mut buf, BigEndian); bw.write_bit(tile_start_and_end_present_flag).unwrap(); bw.byte_align().unwrap(); } buf } /// Write a packet containing only the placeholder that tells the decoder /// to present the already decoded frame present at `frame_to_show_map_idx` /// /// See `av1-spec` Section 6.8.2 and 7.18. 
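///
/// The packet produced here carries no tile data: after any key frame OBUs
/// and T.35 metadata OBUs it holds a single `OBU_FRAME_HEADER`, written as
/// the OBU header followed by a ULEB128 byte count and then the frame header
/// payload. The referenced reconstruction is also copied into `fs.rec`
/// before returning.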
/// /// # Panics /// /// - If the frame packets cannot be written #[profiling::function] pub fn encode_show_existing_frame( fi: &FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) -> Vec { debug_assert!(fi.is_show_existing_frame()); let obu_extension = 0; let mut packet = Vec::new(); if fi.frame_type == FrameType::KEY { write_key_frame_obus(&mut packet, fi, obu_extension).unwrap(); } for t35 in fi.t35_metadata.iter() { let mut t35_buf = Vec::new(); let mut t35_bw = BitWriter::endian(&mut t35_buf, BigEndian); t35_bw.write_t35_metadata_obu(t35).unwrap(); packet.write_all(&t35_buf).unwrap(); t35_buf.clear(); } let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); { let mut bw2 = BitWriter::endian(&mut buf2, BigEndian); bw2.write_frame_header_obu(fi, fs, inter_cfg).unwrap(); } { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_obu_header(ObuType::OBU_FRAME_HEADER, obu_extension).unwrap(); } packet.write_all(&buf1).unwrap(); buf1.clear(); { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_uleb128(buf2.len() as u64).unwrap(); } packet.write_all(&buf1).unwrap(); buf1.clear(); packet.write_all(&buf2).unwrap(); buf2.clear(); let map_idx = fi.frame_to_show_map_idx as usize; if let Some(ref rec) = fi.rec_buffer.frames[map_idx] { let fs_rec = Arc::get_mut(&mut fs.rec).unwrap(); let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; for p in 0..planes { fs_rec.planes[p].data.copy_from_slice(&rec.frame.planes[p].data); } } packet } fn get_initial_segmentation( fi: &FrameInvariants, ) -> SegmentationState { let segmentation = if fi.primary_ref_frame == PRIMARY_REF_NONE { None } else { let ref_frame_idx = fi.ref_frames[fi.primary_ref_frame as usize] as usize; let ref_frame = fi.rec_buffer.frames[ref_frame_idx].as_ref(); ref_frame.map(|rec| rec.segmentation) }; // return the retrieved instance if any, a new one otherwise segmentation.unwrap_or_default() } /// # Panics /// /// - If the frame packets cannot be written #[profiling::function] pub fn encode_frame( fi: &FrameInvariants, fs: &mut FrameState, inter_cfg: &InterConfig, ) -> Vec { debug_assert!(!fi.is_show_existing_frame()); let obu_extension = 0; let mut packet = Vec::new(); if fi.enable_segmentation { fs.segmentation = get_initial_segmentation(fi); segmentation_optimize(fi, fs); } let tile_group = encode_tile_group(fi, fs, inter_cfg); if fi.frame_type == FrameType::KEY { write_key_frame_obus(&mut packet, fi, obu_extension).unwrap(); } for t35 in fi.t35_metadata.iter() { let mut t35_buf = Vec::new(); let mut t35_bw = BitWriter::endian(&mut t35_buf, BigEndian); t35_bw.write_t35_metadata_obu(t35).unwrap(); packet.write_all(&t35_buf).unwrap(); t35_buf.clear(); } let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); { let mut bw2 = BitWriter::endian(&mut buf2, BigEndian); bw2.write_frame_header_obu(fi, fs, inter_cfg).unwrap(); } { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_obu_header(ObuType::OBU_FRAME, obu_extension).unwrap(); } packet.write_all(&buf1).unwrap(); buf1.clear(); { let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_uleb128((buf2.len() + tile_group.len()) as u64).unwrap(); } packet.write_all(&buf1).unwrap(); buf1.clear(); packet.write_all(&buf2).unwrap(); buf2.clear(); packet.write_all(&tile_group).unwrap(); packet } pub fn update_rec_buffer( output_frameno: u64, fi: &mut FrameInvariants, fs: &FrameState, ) { let rfs = Arc::new(ReferenceFrame { order_hint: fi.order_hint, width: fi.width as u32, height: fi.height as u32, 
render_width: fi.render_width, render_height: fi.render_height, frame: fs.rec.clone(), input_hres: fs.input_hres.clone(), input_qres: fs.input_qres.clone(), cdfs: fs.cdfs, frame_me_stats: fs.frame_me_stats.clone(), output_frameno, segmentation: fs.segmentation, }); for i in 0..REF_FRAMES { if (fi.refresh_frame_flags & (1 << i)) != 0 { fi.rec_buffer.frames[i] = Some(Arc::clone(&rfs)); fi.rec_buffer.deblock[i] = fs.deblock; } } } #[cfg(test)] mod test { use super::*; #[test] fn check_partition_types_order() { assert_eq!( RAV1E_PARTITION_TYPES[RAV1E_PARTITION_TYPES.len() - 1], PartitionType::PARTITION_SPLIT ); } } rav1e-0.7.1/src/entropymode.rs000064400000000000000000000744731046102023000143700ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_upper_case_globals)] use crate::context::*; use crate::partition::*; use crate::predict::*; use crate::transform::*; use crate::util::*; pub const PALETTE_BSIZE_CTXS: usize = 7; pub const PALETTE_Y_MODE_CONTEXTS: usize = 3; pub const PALETTE_UV_MODE_CONTEXTS: usize = 2; const PALETTE_COLOR_INDEX_CONTEXTS: usize = 5; const RESTORE_SWITCHABLE_TYPES: usize = 3; pub const TX_SIZE_CONTEXTS: usize = 3; // from seg_common.h const MAX_SEGMENTS: usize = 8; const SPATIAL_PREDICTION_PROBS: usize = 3; const SEG_TEMPORAL_PRED_CTXS: usize = 3; // enums.h const TX_SIZE_LUMA_MIN: usize = TxSize::TX_4X4 as usize; const TX_SIZE_CTX_MIN: usize = TX_SIZE_LUMA_MIN + 1; pub const MAX_TX_CATS: usize = TxSize::TX_SIZES - TX_SIZE_CTX_MIN; pub const BIG_TX_CATS: usize = MAX_TX_CATS - 1; // All except 8x8, which has lower max depth. 
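// Worked example of the constants above, assuming TxSize::TX_SIZES counts the
// five square sizes 4x4..64x64: TX_SIZE_CTX_MIN = TX_4X4 + 1 = 1, so
// MAX_TX_CATS = 5 - 1 = 4 depth-signalling categories (8x8, 16x16, 32x32,
// 64x64) and BIG_TX_CATS = 3. The 8x8 category can only be split once, so it
// uses the two-symbol `default_tx_size_8x8_cdf` below, while the three larger
// categories use the (MAX_TX_DEPTH + 1)-symbol `default_tx_size_cdf`.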
pub const MAX_TX_DEPTH: usize = 2; // LUTS --------------------- pub static default_kf_y_mode_cdf: [[[u16; INTRA_MODES]; KF_MODE_CONTEXTS]; KF_MODE_CONTEXTS] = cdf_3d([ [ [ 15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, 24189, 28165, 29093, 30466, ], [ 12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, 24434, 28658, 30172, 31409, ], [ 10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, 26160, 29336, 29929, 31567, ], [ 14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, 24746, 29585, 30958, 32462, ], [ 12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, 26437, 30261, 31073, 32475, ], ], [ [ 10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, 25381, 29014, 30482, 31436, ], [ 5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, 27610, 29905, 31276, 31794, ], [ 7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, 24469, 27915, 29090, 30492, ], [ 8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, 24649, 29153, 31096, 32210, ], [ 7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, 26001, 29675, 30981, 31994, ], ], [ [ 12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, 25729, 29538, 30305, 32077, ], [ 9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, 23219, 27743, 29211, 30907, ], [ 6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, 30467, 30794, 32086, ], [ 10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, 23878, 28975, 30287, 32252, ], [ 9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, 30072, 30737, 32463, ], ], [ [ 12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, 25060, 29696, 30917, 32409, ], [ 8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, 25225, 29485, 31158, 32342, ], [ 8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, 29118, 30078, 32018, ], [ 7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, 30389, 31536, 32528, ], [ 8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, 25769, 29953, 30983, 32485, ], ], [ [ 12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, 26219, 30214, 31150, 32477, ], [ 9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, 25380, 29653, 31143, 32277, ], [ 8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, 29900, 30523, 32261, ], [ 10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, 24615, 29489, 30883, 32482, ], [ 7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, 31355, 31802, 32593, ], ], ]); pub static default_angle_delta_cdf: [[u16; 2 * MAX_ANGLE_DELTA + 1]; DIRECTIONAL_MODES] = cdf_2d([ [2180, 5032, 7567, 22776, 26989, 30217], [2301, 5608, 8801, 23487, 26974, 30330], [3780, 11018, 13699, 19354, 23083, 31286], [4581, 11226, 15147, 17138, 21834, 28397], [1737, 10927, 14509, 19588, 22745, 28823], [2664, 10176, 12485, 17650, 21600, 30495], [2240, 11096, 15453, 20341, 22561, 28917], [3605, 10428, 12459, 17676, 21244, 30655], ]); pub static default_if_y_mode_cdf: [[u16; INTRA_MODES]; BLOCK_SIZE_GROUPS] = cdf_2d([ [ 22801, 23489, 24293, 24756, 25601, 26123, 26606, 27418, 27945, 29228, 29685, 30349, ], [ 18673, 19845, 22631, 23318, 23950, 24649, 25527, 27364, 28152, 29701, 29984, 30852, ], [ 19770, 20979, 23396, 23939, 24241, 24654, 25136, 27073, 27830, 29360, 29730, 30659, ], [ 20155, 21301, 22838, 23178, 23261, 23533, 23703, 24804, 25352, 26575, 27016, 28049, ], ]); pub static default_uv_mode_cdf: [[u16; INTRA_MODES]; INTRA_MODES] = cdf_2d([ [ 22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, 28244, 30059, 30941, 31961, ], [ 9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, 28359, 29505, 29800, 31796, ], [ 9845, 
9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, 30764, 31777, 32029, ], [ 13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, 28577, 30612, 31355, 32493, ], [ 9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, 31101, 31744, 32363, ], [ 11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, 29711, 31161, 31441, 32550, ], [ 14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, 30245, 31837, 32342, 32667, ], [ 12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, 29267, 30643, 31961, 32461, ], [ 12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, 28443, 30388, 30767, 32416, ], [ 19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, 23174, 28861, 30379, 32175, ], [ 18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, 23527, 27053, 31397, 32148, ], [ 17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, 22482, 25896, 26541, 31819, ], [ 12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, 15255, 15753, 16039, 16606, ], ]); pub static default_uv_mode_cfl_cdf: [[u16; UV_INTRA_MODES]; INTRA_MODES] = cdf_2d([ [ 10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, 15986, 20086, 20995, 22455, 24212, ], [ 4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, 22099, 24228, 24693, 27032, 29472, ], [ 5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, 23138, 24256, 24703, 26679, ], [ 6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, 21520, 22206, 23389, 24182, ], [ 4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, 24911, 25380, 26027, 26376, ], [ 5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, 24780, 25386, 26517, 27176, ], [ 4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, 23188, 23763, 24455, 24940, ], [ 6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, 22336, 23204, 23964, 24793, ], [ 5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, 22494, 23139, 24764, 25989, ], [ 10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, 15534, 20714, 21789, 23443, 24861, ], [ 10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, 15902, 20102, 22696, 23774, 25838, ], [ 10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, 15636, 19676, 20474, 23519, 25208, ], [ 3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, 9875, 10521, 29048, ], ]); pub const default_partition_w8_cdf: [[u16; 4]; PARTITION_TYPES] = cdf_2d([ [19132, 25510, 30392], [13928, 19855, 28540], [12522, 23679, 28629], [9896, 18783, 25853], ]); pub const default_partition_cdf: [[u16; EXT_PARTITION_TYPES]; 3 * PARTITION_TYPES] = cdf_2d([ [15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902], [7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834], [5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117], [2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171], [18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544], [7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052], [6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047], [1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899], [20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724], [6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104], [5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238], [870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332], ]); pub const default_partition_w128_cdf: [[u16; 8]; PARTITION_TYPES] = cdf_2d([ [27899, 28219, 28529, 32484, 32539, 32619, 32639], [6607, 6990, 8268, 32060, 32219, 32338, 32371], [5429, 6676, 7122, 32027, 32227, 32531, 32582], [711, 966, 1172, 32448, 32538, 32617, 32664], 
]); pub static default_intra_tx_1_cdf: [[[u16; 7]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS] = cdf_3d([ [ [1535, 8035, 9461, 12751, 23467, 27825], [564, 3335, 9709, 10870, 18143, 28094], [672, 3247, 3676, 11982, 19415, 23127], [5279, 13885, 15487, 18044, 23527, 30252], [4423, 6074, 7985, 10416, 25693, 29298], [1486, 4241, 9460, 10662, 16456, 27694], [439, 2838, 3522, 6737, 18058, 23754], [1190, 4233, 4855, 11670, 20281, 24377], [1045, 4312, 8647, 10159, 18644, 29335], [202, 3734, 4747, 7298, 17127, 24016], [447, 4312, 6819, 8884, 16010, 23858], [277, 4369, 5255, 8905, 16465, 22271], [3409, 5436, 10599, 15599, 19687, 24040], ], [ [1870, 13742, 14530, 16498, 23770, 27698], [326, 8796, 14632, 15079, 19272, 27486], [484, 7576, 7712, 14443, 19159, 22591], [1126, 15340, 15895, 17023, 20896, 30279], [655, 4854, 5249, 5913, 22099, 27138], [1299, 6458, 8885, 9290, 14851, 25497], [311, 5295, 5552, 6885, 16107, 22672], [883, 8059, 8270, 11258, 17289, 21549], [741, 7580, 9318, 10345, 16688, 29046], [110, 7406, 7915, 9195, 16041, 23329], [363, 7974, 9357, 10673, 15629, 24474], [153, 7647, 8112, 9936, 15307, 19996], [3511, 6332, 11165, 15335, 19323, 23594], ], [ [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], ], [ [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], [4681, 9362, 14043, 18725, 23406, 28087], ], ]); pub static default_intra_tx_2_cdf: [[[u16; 5]; INTRA_MODES]; TX_SIZE_SQR_CONTEXTS] = cdf_3d([ [ [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], ], [ [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], ], [ [1127, 12814, 22772, 27483], [145, 6761, 11980, 26667], [362, 5887, 11678, 16725], [385, 15213, 18587, 30693], [25, 2914, 23134, 27903], [60, 4470, 11749, 23991], [37, 3332, 14511, 21448], [157, 6320, 13036, 17439], [119, 6719, 12906, 29396], [47, 5537, 12576, 21499], [269, 6076, 11258, 23115], [83, 5615, 12001, 17228], [1968, 5556, 12023, 18547], ], [ [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 
19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], [6554, 13107, 19661, 26214], ], ]); pub static default_inter_tx_1_cdf: [[u16; TX_TYPES]; TX_SIZE_SQR_CONTEXTS] = cdf_2d([ [ 4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, 22848, 23934, 25474, 27727, 28915, 30631, ], [ 1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, 20408, 22517, 25010, 27116, 28856, 30749, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], ]); pub static default_inter_tx_2_cdf: [[u16; 12]; TX_SIZE_SQR_CONTEXTS] = cdf_2d([ [2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037], [2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037], [770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, 28526, 30529], [2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, 24576, 27307, 30037], ]); pub static default_inter_tx_3_cdf: [[u16; 2]; TX_SIZE_SQR_CONTEXTS] = cdf_2d([[16384], [4167], [1998], [748]]); pub static default_cfl_sign_cdf: [u16; CFL_JOINT_SIGNS] = cdf([1418, 2123, 13340, 18405, 26972, 28343, 32294]); pub static default_cfl_alpha_cdf: [[u16; CFL_ALPHABET_SIZE]; CFL_ALPHA_CONTEXTS] = cdf_2d([ [ 7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, 32704, 32708, 32712, 32716, 32720, 32724, ], [ 14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, 32647, 32668, 32672, 32676, 32680, 32684, ], [ 11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, 32677, 32681, 32685, 32689, 32693, 32697, ], [ 26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, 32712, 32716, 32720, 32724, 32728, 32732, ], [ 17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, 32464, 32516, 32560, 32576, 32593, 32622, ], [ 14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, 32413, 32520, 32594, 32622, 32656, 32660, ], ]); // This does not appear to be used in the rust project currently const SWITCHABLE_FILTERS: usize = 3; const SWITCHABLE_FILTER_CONTEXTS: usize = (SWITCHABLE_FILTERS + 1) * 4; #[allow(unused)] pub static default_switchable_interp_cdf: [[u16; SWITCHABLE_FILTERS]; SWITCHABLE_FILTER_CONTEXTS] = cdf_2d([ [31935, 32720], [5568, 32719], [422, 2938], [28244, 32608], [31206, 31953], [4862, 32121], [770, 1152], [20889, 25637], [31910, 32724], [4120, 32712], [305, 2247], [27403, 32636], [31022, 32009], [2963, 32093], [601, 943], [14969, 21398], ]); pub static default_newmv_cdf: [[u16; 2]; NEWMV_MODE_CONTEXTS] = [ cdf([24035]), cdf([16630]), cdf([15339]), cdf([8386]), cdf([12222]), cdf([4676]), [0; 2], ]; pub static default_zeromv_cdf: [[u16; 2]; GLOBALMV_MODE_CONTEXTS] = cdf_2d([[2175], [1054]]); pub static default_refmv_cdf: [[u16; 2]; REFMV_MODE_CONTEXTS] = cdf_2d([[23974], [24188], [17848], [28622], [24312], [19923]]); pub static default_drl_cdf: [[u16; 2]; DRL_MODE_CONTEXTS] = cdf_2d([[13104], [24560], [18945]]); pub static default_compound_mode_cdf: [[u16; INTER_COMPOUND_MODES]; INTER_MODE_CONTEXTS] = cdf_2d([ [7760, 13823, 15808, 17641, 19156, 20666, 26891], [10730, 19452, 21145, 22749, 24039, 25131, 28724], [10664, 20221, 21588, 22906, 24295, 25387, 28436], [13298, 16984, 20471, 24182, 25067, 25736, 26422], [18904, 23325, 25242, 27432, 27898, 
28258, 30758], [10725, 17454, 20124, 22820, 24195, 25168, 26046], [17125, 24273, 25814, 27492, 28214, 28704, 30592], [13046, 23214, 24505, 25942, 27435, 28442, 29330], ]); #[allow(unused)] pub static default_interintra_cdf: [[u16; 2]; BLOCK_SIZE_GROUPS] = cdf_2d([[16384], [26887], [27597], [30237]]); #[allow(unused)] pub static default_interintra_mode_cdf: [[u16; InterIntraMode::INTERINTRA_MODES as usize]; BLOCK_SIZE_GROUPS] = cdf_2d([ [8192, 16384, 24576], [1875, 11082, 27332], [2473, 9996, 26388], [4238, 11537, 25926], ]); #[allow(unused)] pub static default_wedge_interintra_cdf: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [16384], [16384], [16384], [20036], [24957], [26704], [27530], [29564], [29444], [26872], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], ]); #[allow(unused)] pub static default_compound_type_cdf: [[u16; CompoundType::COMPOUND_TYPES as usize - 1]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [16384], [16384], [16384], [23431], [13171], [11470], [9770], [9100], [8233], [6172], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [11820], [7701], [16384], [16384], ]); #[allow(unused)] pub static default_wedge_idx_cdf: [[u16; 16]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359, 22362, 24127, 25702, 27752, 29450, 31171, ], [ 806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367, 18452, 19422, 22839, 26127, 29629, ], [ 2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332, 24520, 27470, 29456, 30529, 31656, ], [ 1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163, 20961, 22884, 24471, 26719, 28714, 30877, ], [ 1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730, 18114, 19313, 22521, 26012, 29550, ], [ 2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270, 20533, 23434, 25972, 27944, 29570, 31416, ], [ 1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638, 22038, 23963, 25311, 26988, 28766, 31012, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284, 24985, 25684, 27259, 28883, 30911, ], [ 1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057, 27251, 29173, 30089, 30960, 31933, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 20480, 22528, 24576, 26624, 28672, 30720, ], [ 2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, 
20480, 22528, 24576, 26624, 28672, 30720, ], ]); #[allow(unused)] pub static default_motion_mode_cdf: [[u16; MotionMode::MOTION_MODES as usize]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [10923, 21845], [10923, 21845], [10923, 21845], [7651, 24760], [4738, 24765], [5391, 25528], [19419, 26810], [5123, 23606], [11606, 24308], [26260, 29116], [20360, 28062], [21679, 26830], [29516, 30701], [28898, 30397], [30878, 31335], [32507, 32558], [10923, 21845], [10923, 21845], [28799, 31390], [26431, 30774], [28973, 31594], [29742, 31203], ]); #[allow(unused)] pub static default_obmc_cdf: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [16384], [16384], [16384], [10437], [9371], [9301], [17432], [14423], [15142], [25817], [22823], [22083], [30128], [31014], [31560], [32638], [16384], [16384], [23664], [20901], [24008], [26879], ]); pub static default_intra_inter_cdf: [[u16; 2]; INTRA_INTER_CONTEXTS] = cdf_2d([[806], [16662], [20186], [26538]]); pub static default_comp_mode_cdf: [[u16; 2]; COMP_INTER_CONTEXTS] = cdf_2d([[26828], [24035], [12031], [10640], [2901]]); pub static default_comp_ref_type_cdf: [[u16; 2]; COMP_REF_TYPE_CONTEXTS] = cdf_2d([[1198], [2070], [9166], [7499], [22475]]); #[allow(unused)] pub static default_uni_comp_ref_cdf: [[[u16; 2]; UNIDIR_COMP_REFS - 1]; UNI_COMP_REF_CONTEXTS] = cdf_3d([ [[5284], [3865], [3128]], [[23152], [14173], [15270]], [[31774], [25120], [26710]], ]); pub static default_single_ref_cdf: [[[u16; 2]; SINGLE_REFS - 1]; REF_CONTEXTS] = cdf_3d([ [[4897], [1555], [4236], [8650], [904], [1444]], [[16973], [16751], [19647], [24773], [11014], [15087]], [[29744], [30279], [31194], [31895], [26875], [30304]], ]); pub static default_comp_ref_cdf: [[[u16; 2]; FWD_REFS - 1]; REF_CONTEXTS] = cdf_3d([ [[4946], [9468], [1503]], [[19891], [22441], [15160]], [[30731], [31059], [27544]], ]); pub static default_comp_bwdref_cdf: [[[u16; 2]; BWD_REFS - 1]; REF_CONTEXTS] = cdf_3d([[[2235], [1423]], [[17182], [15175]], [[30606], [30489]]]); #[allow(unused)] pub static default_palette_y_size_cdf: [[u16; PaletteSize::PALETTE_SIZES as usize]; PALETTE_BSIZE_CTXS] = cdf_2d([ [7952, 13000, 18149, 21478, 25527, 29241], [7139, 11421, 16195, 19544, 23666, 28073], [7788, 12741, 17325, 20500, 24315, 28530], [8271, 14064, 18246, 21564, 25071, 28533], [12725, 19180, 21863, 24839, 27535, 30120], [9711, 14888, 16923, 21052, 25661, 27875], [14940, 20797, 21678, 24186, 27033, 28999], ]); #[allow(unused)] pub static default_palette_uv_size_cdf: [[u16; PaletteSize::PALETTE_SIZES as usize]; PALETTE_BSIZE_CTXS] = cdf_2d([ [8713, 19979, 27128, 29609, 31331, 32272], [5839, 15573, 23581, 26947, 29848, 31700], [4426, 11260, 17999, 21483, 25863, 29430], [3228, 9464, 14993, 18089, 22523, 27420], [3768, 8886, 13091, 17852, 22495, 27207], [2464, 8451, 12861, 21632, 25525, 28555], [1269, 5435, 10433, 18963, 21700, 25865], ]); pub static default_palette_y_mode_cdfs: [[[u16; 2]; PALETTE_Y_MODE_CONTEXTS]; PALETTE_BSIZE_CTXS] = cdf_3d([ [[31676], [3419], [1261]], [[31912], [2859], [980]], [[31823], [3400], [781]], [[32030], [3561], [904]], [[32309], [7337], [1462]], [[32265], [4015], [1521]], [[32450], [7946], [129]], ]); pub static default_palette_uv_mode_cdfs: [[u16; 2]; PALETTE_UV_MODE_CONTEXTS] = cdf_2d([[32461], [21488]]); #[allow(unused)] pub static default_palette_y_color_index_cdf: [[[u16; PaletteColor::PALETTE_COLORS as usize]; PALETTE_COLOR_INDEX_CONTEXTS]; PaletteSize::PALETTE_SIZES as usize] = [ cdf_2d([[28710], [16384], [10553], [27036], [31603]]), cdf_2d([ [27877, 30490], [11532, 25697], [6544, 30234], 
[23018, 28072], [31915, 32385], ]), cdf_2d([ [25572, 28046, 30045], [9478, 21590, 27256], [7248, 26837, 29824], [19167, 24486, 28349], [31400, 31825, 32250], ]), cdf_2d([ [24779, 26955, 28576, 30282], [8669, 20364, 24073, 28093], [4255, 27565, 29377, 31067], [19864, 23674, 26716, 29530], [31646, 31893, 32147, 32426], ]), cdf_2d([ [23132, 25407, 26970, 28435, 30073], [7443, 17242, 20717, 24762, 27982], [6300, 24862, 26944, 28784, 30671], [18916, 22895, 25267, 27435, 29652], [31270, 31550, 31808, 32059, 32353], ]), cdf_2d([ [23105, 25199, 26464, 27684, 28931, 30318], [6950, 15447, 18952, 22681, 25567, 28563], [7560, 23474, 25490, 27203, 28921, 30708], [18544, 22373, 24457, 26195, 28119, 30045], [31198, 31451, 31670, 31882, 32123, 32391], ]), cdf_2d([ [21689, 23883, 25163, 26352, 27506, 28827, 30195], [6892, 15385, 17840, 21606, 24287, 26753, 29204], [5651, 23182, 25042, 26518, 27982, 29392, 30900], [19349, 22578, 24418, 25994, 27524, 29031, 30448], [31028, 31270, 31504, 31705, 31927, 32153, 32392], ]), ]; #[allow(unused)] pub static default_palette_uv_color_index_cdf: [[[u16; PaletteColor::PALETTE_COLORS as usize]; PALETTE_COLOR_INDEX_CONTEXTS]; PaletteSize::PALETTE_SIZES as usize] = [ cdf_2d([[29089], [16384], [8713], [29257], [31610]]), cdf_2d([ [25257, 29145], [12287, 27293], [7033, 27960], [20145, 25405], [30608, 31639], ]), cdf_2d([ [24210, 27175, 29903], [9888, 22386, 27214], [5901, 26053, 29293], [18318, 22152, 28333], [30459, 31136, 31926], ]), cdf_2d([ [22980, 25479, 27781, 29986], [8413, 21408, 24859, 28874], [2257, 29449, 30594, 31598], [19189, 21202, 25915, 28620], [31844, 32044, 32281, 32518], ]), cdf_2d([ [22217, 24567, 26637, 28683, 30548], [7307, 16406, 19636, 24632, 28424], [4441, 25064, 26879, 28942, 30919], [17210, 20528, 23319, 26750, 29582], [30674, 30953, 31396, 31735, 32207], ]), cdf_2d([ [21239, 23168, 25044, 26962, 28705, 30506], [6545, 15012, 18004, 21817, 25503, 28701], [3448, 26295, 27437, 28704, 30126, 31442], [15889, 18323, 21704, 24698, 26976, 29690], [30988, 31204, 31479, 31734, 31983, 32325], ]), cdf_2d([ [21442, 23288, 24758, 26246, 27649, 28980, 30563], [5863, 14933, 17552, 20668, 23683, 26411, 29273], [3415, 25810, 26877, 27990, 29223, 30394, 31618], [17965, 20084, 22232, 23974, 26274, 28402, 30390], [31190, 31329, 31516, 31679, 31825, 32026, 32322], ]), ]; pub static default_txfm_partition_cdf: [[u16; 2]; TXFM_PARTITION_CONTEXTS] = cdf_2d([ [28581], [23846], [20847], [24315], [18196], [12133], [18791], [10887], [11005], [27179], [20004], [11281], [26549], [19308], [14224], [28015], [21546], [14400], [28165], [22401], [16088], ]); pub static default_skip_cdfs: [[u16; 2]; SKIP_CONTEXTS] = cdf_2d([[31671], [16515], [4576]]); #[allow(unused)] pub static default_skip_mode_cdfs: [[u16; 2]; SKIP_MODE_CONTEXTS] = cdf_2d([[32621], [20708], [8127]]); #[allow(unused)] pub static default_compound_idx_cdfs: [[u16; 2]; COMP_INDEX_CONTEXTS] = cdf_2d([[18244], [12865], [7053], [13259], [9334], [4644]]); #[allow(unused)] pub static default_comp_group_idx_cdfs: [[u16; 2]; COMP_GROUP_IDX_CONTEXTS] = cdf_2d([[26607], [22891], [18840], [24594], [19934], [22674]]); #[allow(unused)] pub static default_intrabc_cdf: [u16; 2] = cdf([30531]); #[allow(unused)] pub static default_filter_intra_mode_cdf: [u16; FilterIntraMode::FILTER_INTRA_MODES as usize] = cdf([8949, 12776, 17211, 29558]); pub static default_filter_intra_cdfs: [[u16; 2]; BlockSize::BLOCK_SIZES_ALL] = cdf_2d([ [4621], [6743], [5893], [7866], [12551], [9394], [12408], [14301], [12756], [22343], [16384], [16384], 
[16384], [16384], [16384], [16384], [12770], [10368], [20229], [18101], [16384], [16384], ]); pub static default_switchable_restore_cdf: [u16; RESTORE_SWITCHABLE_TYPES] = cdf([9413, 22581]); pub static default_wiener_restore_cdf: [u16; 2] = cdf([11570]); pub static default_sgrproj_restore_cdf: [u16; 2] = cdf([16855]); #[allow(unused)] pub static default_delta_q_cdf: [u16; DELTA_Q_PROBS + 1] = cdf([28160, 32120, 32677]); pub static default_delta_lf_multi_cdf: [[u16; DELTA_LF_PROBS + 1]; FRAME_LF_COUNT] = cdf_2d([ [28160, 32120, 32677], [28160, 32120, 32677], [28160, 32120, 32677], [28160, 32120, 32677], ]); pub static default_delta_lf_cdf: [u16; DELTA_LF_PROBS + 1] = cdf([28160, 32120, 32677]); // FIXME(someone) need real defaults here #[allow(unused)] pub static default_seg_tree_cdf: [u16; MAX_SEGMENTS] = cdf([4096, 8192, 12288, 16384, 20480, 24576, 28672]); #[allow(unused)] pub static default_segment_pred_cdf: [[u16; 2]; SEG_TEMPORAL_PRED_CTXS] = cdf_2d([[128 * 128], [128 * 128], [128 * 128]]); pub static default_spatial_pred_seg_tree_cdf: [[u16; MAX_SEGMENTS]; SPATIAL_PREDICTION_PROBS] = cdf_2d([ [5622, 7893, 16093, 18233, 27809, 28373, 32533], [14274, 18230, 22557, 24935, 29980, 30851, 32344], [27527, 28487, 28723, 28890, 32397, 32647, 32679], ]); pub static default_tx_size_8x8_cdf: [[u16; MAX_TX_DEPTH]; TX_SIZE_CONTEXTS] = cdf_2d([[19968], [19968], [24320]]); pub static default_tx_size_cdf: [[[u16; MAX_TX_DEPTH + 1]; TX_SIZE_CONTEXTS]; BIG_TX_CATS] = cdf_3d([ [[12272, 30172], [12272, 30172], [18677, 30848]], [[12986, 15180], [12986, 15180], [24302, 25602]], [[5782, 11475], [5782, 11475], [16803, 22759]], ]); rav1e-0.7.1/src/ext/x86/x86inc.asm000064400000000000000000001571151046102023000145160ustar 00000000000000;***************************************************************************** ;* x86inc.asm: x86 abstraction layer ;***************************************************************************** ;* Copyright (C) 2005-2022 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner ;* Anton Mitrofanov ;* Fiona Glaser ;* ;* Permission to use, copy, modify, and/or distribute this software for any ;* purpose with or without fee is hereby granted, provided that the above ;* copyright notice and this permission notice appear in all copies. ;* ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ;***************************************************************************** ; This is a header file for the x86inc.asm assembly language, which uses ; NASM/YASM syntax combined with a large number of macros to provide easy ; abstraction between different calling conventions (x86_32, win64, linux64). ; It also has various other useful features to simplify writing the kind of ; DSP functions that are most often used. 
%ifndef private_prefix %error private_prefix not defined %endif %ifndef public_prefix %define public_prefix private_prefix %endif %ifndef STACK_ALIGNMENT %if ARCH_X86_64 %define STACK_ALIGNMENT 16 %else %define STACK_ALIGNMENT 4 %endif %endif %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,win64 %define WIN64 1 %elifidn __OUTPUT_FORMAT__,x64 %define WIN64 1 %else %define UNIX64 1 %endif %endif %define FORMAT_ELF 0 %define FORMAT_MACHO 0 %ifidn __OUTPUT_FORMAT__,elf %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf32 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,elf64 %define FORMAT_ELF 1 %elifidn __OUTPUT_FORMAT__,macho %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho32 %define FORMAT_MACHO 1 %elifidn __OUTPUT_FORMAT__,macho64 %define FORMAT_MACHO 1 %endif %ifdef PREFIX %define mangle(x) _ %+ x %else %define mangle(x) x %endif ; Use VEX-encoding even in non-AVX functions %ifndef FORCE_VEX_ENCODING %define FORCE_VEX_ENCODING 0 %endif %macro SECTION_RODATA 0-1 16 %ifidn __OUTPUT_FORMAT__,win32 SECTION .rdata align=%1 %elif WIN64 SECTION .rdata align=%1 %else SECTION .rodata align=%1 %endif %endmacro %if ARCH_X86_64 %define PIC 1 ; always use PIC on x86-64 default rel %elifidn __OUTPUT_FORMAT__,win32 %define PIC 0 ; PIC isn't used on 32-bit Windows %elifndef PIC %define PIC 0 %endif %define HAVE_PRIVATE_EXTERN 1 %ifdef __NASM_VER__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 %define HAVE_PRIVATE_EXTERN 0 %endif %endif ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that ; covers most use cases. ; PROLOGUE: ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. ; %4 = (optional) stack size to be allocated. The stack will be aligned before ; allocating the specified stack size. If the required stack alignment is ; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. ; %4+/%5+ = list of names to define to registers ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. ; cglobal foo, 2,3,7,0x40, dst, src, tmp ; declares a function (foo) that automatically loads two arguments (dst and ; src) into registers, uses one additional register (tmp) plus 7 vector ; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. If they're the ; last args then you can just not declare them, but if they're in the middle ; we need more flexible macro. ; RET: ; Pops anything that was pushed by PROLOGUE, and returns. ; REP_RET: ; Use this instead of RET if it's a branch target. 
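; Illustrative example of the REP_RET rule above (hypothetical labels):
;
;       test r2d, r2d
;       jz   .end           ; jumps straight to the function epilogue
;       ...
;   .end:
;       REP_RET             ; this ret is a branch target, which the automatic
;                           ; detection in RET cannot see, so the 2-byte form
;                           ; is requested explicitly (see the AMD <=K10 note
;                           ; near REP_RET's definition further down)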
; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size ; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size %macro DECLARE_REG 2-3 %define r%1q %2 %define r%1d %2d %define r%1w %2w %define r%1b %2b %define r%1h %2h %define %2q %2 %if %0 == 2 %define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory %define r%1m [rstk + stack_offset + %3] %define r%1mp qword r %+ %1 %+ m %else %define r%1m [rstk + stack_offset + %3] %define r%1mp dword r %+ %1 %+ m %endif %define r%1 %2 %endmacro %macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 %define r%1h %3 %define e%1h %3 %define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 %define r%1 e%1 %endif %endmacro DECLARE_REG_SIZE ax, al, ah DECLARE_REG_SIZE bx, bl, bh DECLARE_REG_SIZE cx, cl, ch DECLARE_REG_SIZE dx, dl, dh DECLARE_REG_SIZE si, sil, null DECLARE_REG_SIZE di, dil, null DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments %macro DECLARE_REG_TMP 1-* %assign %%i 0 %rep %0 CAT_XDEFINE t, %%i, r%1 %assign %%i %%i+1 %rotate 1 %endrep %endmacro %macro DECLARE_REG_TMP_SIZE 0-* %rep %0 %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w %define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep %endmacro DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if ARCH_X86_64 %define gprsize 8 %else %define gprsize 4 %endif %macro LEA 2 %if ARCH_X86_64 lea %1, [%2] %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 add %1, (%2)-$+1 %else mov %1, %2 %endif %endmacro ; Repeats an instruction/operation for multiple arguments. 
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3" %macro REPX 2-* ; operation, args %xdefine %%f(x) %1 %rep %0 - 1 %rotate 1 %%f(%1) %endrep %endmacro %macro PUSH 1 push %1 %ifidn rstk, rsp %assign stack_offset stack_offset+gprsize %endif %endmacro %macro POP 1 pop %1 %ifidn rstk, rsp %assign stack_offset stack_offset-gprsize %endif %endmacro %macro PUSH_IF_USED 1-* %rep %0 %if %1 < regs_used PUSH r%1 %endif %rotate 1 %endrep %endmacro %macro POP_IF_USED 1-* %rep %0 %if %1 < regs_used pop r%1 %endif %rotate 1 %endrep %endmacro %macro LOAD_IF_USED 1-* %rep %0 %if %1 < num_args mov r%1, r %+ %1 %+ mp %endif %rotate 1 %endrep %endmacro %macro SUB 2 sub %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset+(%2) %endif %endmacro %macro ADD 2 add %1, %2 %ifidn %1, rstk %assign stack_offset stack_offset-(%2) %endif %endmacro %macro movifnidn 2 %ifnidn %1, %2 mov %1, %2 %endif %endmacro %if ARCH_X86_64 == 0 %define movsxd movifnidn %endif %macro movsxdifnidn 2 %ifnidn %1, %2 movsxd %1, %2 %endif %endmacro %macro ASSERT 1 %if (%1) == 0 %error assertion ``%1'' failed %endif %endmacro %macro DEFINE_ARGS 0-* %ifdef n_arg_names %assign %%i 0 %rep n_arg_names CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp CAT_UNDEF arg_name, %%i %assign %%i %%i+1 %endrep %endif %xdefine %%stack_offset stack_offset %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine %assign %%i 0 %rep %0 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w %xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp CAT_XDEFINE arg_name, %%i, %1 %assign %%i %%i+1 %rotate 1 %endrep %xdefine stack_offset %%stack_offset %assign n_arg_names %0 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) ; Large stack allocations on Windows need to use stack probing in order ; to guarantee that all stack memory is committed before accessing it. ; This is done by ensuring that the guard page(s) at the end of the ; currently committed pages are touched prior to any pages beyond that. %if WIN64 %assign STACK_PROBE_SIZE 8192 %elifidn __OUTPUT_FORMAT__, win32 %assign STACK_PROBE_SIZE 4096 %else %assign STACK_PROBE_SIZE 0 %endif %macro PROBE_STACK 1 ; stack_size %if STACK_PROBE_SIZE %assign %%i STACK_PROBE_SIZE %rep %1 / STACK_PROBE_SIZE mov eax, [rsp-%%i] %assign %%i %%i+STACK_PROBE_SIZE %endrep %endif %endmacro %macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space %if mmsize != 8 %assign xmm_regs_used %2 %if xmm_regs_used > 8 %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT ; maintain the current stack alignment %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) PROBE_STACK stack_size_padded SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) %xdefine rstk r %+ %%reg_num ; align stack, and save original stack location directly above ; it, i.e. 
in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) %if %1 < 0 ; need to store rsp on stack %xdefine rstkm [rsp + stack_size + %%pad] %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function %xdefine rstkm rstk %endif %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) PROBE_STACK stack_size_padded mov rstk, rsp and rsp, ~(required_stack_alignment-1) sub rsp, stack_size_padded movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif %endif %endmacro %macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 ; Reserve an additional register for storing the original stack pointer, but avoid using ; eax/rax for this purpose since it can potentially get overwritten as a return value. %assign regs_used (regs_used + 1) %if ARCH_X86_64 && regs_used == 7 %assign regs_used 8 %elif ARCH_X86_64 == 0 && regs_used == 1 %assign regs_used 2 %endif %endif %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax) ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used. %assign regs_used 5 + UNIX64 * 3 %endif %endif %endif %endmacro %if WIN64 ; Windows x64 ;================================================= DECLARE_REG 0, rcx DECLARE_REG 1, rdx DECLARE_REG 2, R8 DECLARE_REG 3, R9 DECLARE_REG 4, R10, 40 DECLARE_REG 5, R11, 48 DECLARE_REG 6, rax, 56 DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 DECLARE_REG 11, R14, 96 DECLARE_REG 12, R15, 104 DECLARE_REG 13, R12, 112 DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 ALLOC_STACK %4, %3 %if mmsize != 8 && stack_size == 0 WIN64_SPILL_XMM %3 %endif LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i 8 %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep %endif %endmacro %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 + high_mm_regs %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
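    ; Worked example: with xmm_regs_used = 10 and no high (zmm16+) registers,
    ; %%xmm_regs_on_stack = 2, so %%pad = 2*16 + 32 = 64 bytes (two spill
    ; slots plus the 32-byte shadow space), before the alignment term below
    ; pads the allocation so the stack stays aligned.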
%assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 %if %%xmm_regs_on_stack > 0 %assign %%i xmm_regs_used - high_mm_regs %rep %%xmm_regs_on_stack %assign %%i %%i-1 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif %if xmm_regs_used > 7 + high_mm_regs movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif %if xmm_regs_used > 6 + high_mm_regs movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %elif ARCH_X86_64 ; *nix x64 ;============================================= DECLARE_REG 0, rdi DECLARE_REG 1, rsi DECLARE_REG 2, rdx DECLARE_REG 3, rcx DECLARE_REG 4, R8 DECLARE_REG 5, R9 DECLARE_REG 6, rax, 8 DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 DECLARE_REG 11, R14, 48 DECLARE_REG 12, R15, 56 DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 %assign regs_used %2 %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 ALLOC_STACK %4 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %else ; X86_32 ;============================================================== DECLARE_REG 0, eax, 4 DECLARE_REG 1, ecx, 8 DECLARE_REG 2, edx, 12 DECLARE_REG 3, ebx, 16 DECLARE_REG 4, esi, 20 DECLARE_REG 5, edi, 24 DECLARE_REG 6, ebp, 28 %define rsp esp %macro DECLARE_ARG 1-* %rep %0 %define r%1m [rstk + stack_offset + 4*%1 + 4] %define r%1mp dword r%1m %rotate 1 %endrep %endmacro DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 ASSERT regs_used >= num_args %if num_args > 7 %assign num_args 7 %endif %if regs_used > 7 %assign regs_used 7 %endif SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 ALLOC_STACK %4 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 DEFINE_ARGS %5 %else DEFINE_ARGS %4, %5 %endif %elifnnum %4 DEFINE_ARGS %4 %endif %endmacro %define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 %if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded %endif %endif POP_IF_USED 6, 5, 4, 3 %if vzeroupper_required vzeroupper %endif AUTO_REP_RET %endmacro %endif ;====================================================================== %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 %endmacro %macro WIN64_RESTORE_XMM 0 %assign xmm_regs_used 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro %endif ; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either ; a branch or a branch target. So switch to a 2-byte form of ret in that case. ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 %if has_epilogue || cpuflag(ssse3) RET %else rep ret %endif annotate_function_size %endmacro %define last_branch_adr $$ %macro AUTO_REP_RET 0 %if notcpuflag(ssse3) times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. %endif ret annotate_function_size %endmacro %macro BRANCH_INSTR 0-* %rep %0 %macro %1 1-2 %1 %2 %1 %if notcpuflag(ssse3) %%branch_instr equ $ %xdefine last_branch_adr %%branch_instr %endif %endmacro %rotate 1 %endrep %endmacro BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp %macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent %if has_epilogue call %1 RET %elif %2 jmp %1 %endif annotate_function_size %endmacro ;============================================================================= ; arch-independent part ;============================================================================= %assign function_align 16 ; Begin a function. ; Applies any symbol mangling needed for C linkage, and sets up a define such that ; subsequent uses of the function name automatically refer to the mangled version. ; Appends cpuflags to the function name if cpuflags has been specified. ; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX ; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). 
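; Illustrative sketch of how a function is typically declared through these
; macros (hypothetical name, not part of this file):
;
;   INIT_XMM sse2
;   cglobal plane_copy, 2, 2, 1, dst, src
;       mova  m0, [srcq]
;       mova  [dstq], m0
;       RET
;
; cglobal mangles the name with private_prefix and the _sse2 cpu suffix,
; PROLOGUE loads the two pointer arguments into dstq/srcq, and RET undoes
; anything PROLOGUE pushed or allocated.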
%macro cglobal 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 1, %1 %+ SUFFIX, %2 %endmacro %macro cvisible 1-2+ "" ; name, [PROLOGUE args] cglobal_internal 0, %1 %+ SUFFIX, %2 %endmacro %macro cglobal_internal 2-3+ annotate_function_size %ifndef cglobaled_%2 %if %1 %xdefine %2 mangle(private_prefix %+ _ %+ %2) %else %xdefine %2 mangle(public_prefix %+ _ %+ %2) %endif %xdefine %2.skip_prologue %2 %+ .skip_prologue CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 %xdefine current_function_section __SECT__ %if FORMAT_ELF %if %1 global %2:function hidden %else global %2:function %endif %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1 global %2:private_extern %else global %2 %endif align function_align %2: RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif %endmacro ; Create a global symbol from a local label with the correct name mangling and type %macro cglobal_label 1 %if FORMAT_ELF global current_function %+ %1:function hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global current_function %+ %1:private_extern %else global current_function %+ %1 %endif %1: %endmacro %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro ; like cextern, but without the prefix %macro cextern_naked 1 %ifdef PREFIX %xdefine %1 mangle(%1) %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) %if FORMAT_ELF global %1:data hidden %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN global %1:private_extern %else global %1 %endif %1: %2 %endmacro ; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. %if FORMAT_ELF [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; Tell debuggers how large the function was. ; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. ; This is invoked by RET and similar macros, and also cglobal does it for the previous function, ; but if the last function in a source file doesn't use any of the standard macros for its epilogue, ; then its size might be unspecified. 
%macro annotate_function_size 0 %ifdef __YASM_VER__ %ifdef current_function %if FORMAT_ELF current_function_section %%ecf equ $ size current_function %%ecf - current_function __SECT__ %endif %endif %endif %endmacro ; cpuflags %assign cpuflags_mmx (1<<0) %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx %assign cpuflags_3dnow (1<<2) | cpuflags_mmx %assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 %assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 %assign cpuflags_sse3 (1<<8) | cpuflags_sse2 %assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 %assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 %assign cpuflags_sse42 (1<<11) | cpuflags_sse4 %assign cpuflags_aesni (1<<12) | cpuflags_sse42 %assign cpuflags_gfni (1<<13) | cpuflags_sse42 %assign cpuflags_avx (1<<14) | cpuflags_sse42 %assign cpuflags_xop (1<<15) | cpuflags_avx %assign cpuflags_fma4 (1<<16) | cpuflags_avx %assign cpuflags_fma3 (1<<17) | cpuflags_avx %assign cpuflags_bmi1 (1<<18) | cpuflags_avx|cpuflags_lzcnt %assign cpuflags_bmi2 (1<<19) | cpuflags_bmi1 %assign cpuflags_avx2 (1<<20) | cpuflags_fma3|cpuflags_bmi2 %assign cpuflags_avx512 (1<<21) | cpuflags_avx2 ; F, CD, BW, DQ, VL %assign cpuflags_avx512icl (1<<22) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ %assign cpuflags_cache32 (1<<23) %assign cpuflags_cache64 (1<<24) %assign cpuflags_aligned (1<<25) ; not a cpu feature, but a function variant %assign cpuflags_atom (1<<26) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) %define notcpuflag(x) (cpuflag(x) ^ 1) ; Takes an arbitrary number of cpuflags from the above list. ; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. ; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. %macro INIT_CPUFLAGS 0-* %xdefine SUFFIX %undef cpuname %assign cpuflags 0 %if %0 >= 1 %rep %0 %ifdef cpuname %xdefine cpuname cpuname %+ _%1 %else %xdefine cpuname %1 %endif %assign cpuflags cpuflags | cpuflags_%1 %rotate 1 %endrep %xdefine SUFFIX _ %+ cpuname %if cpuflag(avx) %assign avx_enabled 1 %endif %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) %define mova movaps %define movu movups %define movnta movntps %endif %if cpuflag(aligned) %define movu mova %elif cpuflag(sse3) && notcpuflag(ssse3) %define movu lddqu %endif %endif %if ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VER__ ALIGNMODE p6 %else CPU amdnop %endif %else %ifdef __NASM_VER__ ALIGNMODE nop %else CPU basicnop %endif %endif %endmacro ; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# ; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# ; (All 4 remain in sync through SWAP.) 
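; A sketch of what the merged names resolve to once a vector width has been
; selected (srcq/dstq are hypothetical argument names):
;
;   INIT_YMM avx2
;       mova   m0, [srcq]      ; m0 is ymm0 (mmsize == 32)
;       paddw  m0, m1          ; emitted as vpaddw ymm0, ymm0, ymm1
;       mova   [dstq], xm0     ; xm0 is xmm0, the low 128 bits of the same register
;
; Under INIT_XMM the same source operates on xmm registers and xm0 == m0.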
%macro CAT_XDEFINE 3 %xdefine %1%2 %3 %endmacro %macro CAT_UNDEF 2 %undef %1%2 %endmacro %macro DEFINE_MMREGS 1 ; mmtype %assign %%prev_mmregs 0 %ifdef num_mmregs %assign %%prev_mmregs num_mmregs %endif %assign num_mmregs 8 %if ARCH_X86_64 && mmsize >= 16 %assign num_mmregs 16 %if cpuflag(avx512) || mmsize == 64 %assign num_mmregs 32 %endif %endif %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1 %+ %%i CAT_XDEFINE nn%1, %%i, %%i %assign %%i %%i+1 %endrep %if %%prev_mmregs > num_mmregs %rep %%prev_mmregs - num_mmregs CAT_UNDEF m, %%i CAT_UNDEF nn %+ mmtype, %%i %assign %%i %%i+1 %endrep %endif %xdefine mmtype %1 %endmacro ; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper %macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg %if ARCH_X86_64 && cpuflag(avx512) %assign %%i %1 %rep 16-%1 %assign %%i_high %%i+16 SWAP %%i, %%i_high %assign %%i %%i+1 %endrep %endif %endmacro %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 %define mmsize 8 %define mova movq %define movu movq %define movh movd %define movnta movntq INIT_CPUFLAGS %1 DEFINE_MMREGS mm %endmacro %macro INIT_XMM 0-1+ %assign avx_enabled FORCE_VEX_ENCODING %define RESET_MM_PERMUTATION INIT_XMM %1 %define mmsize 16 %define mova movdqa %define movu movdqu %define movh movq %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS xmm %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif %xdefine bcstd 1to4 %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_YMM %1 %define mmsize 32 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION %xdefine bcstd 1to8 %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ %assign avx_enabled 1 %define RESET_MM_PERMUTATION INIT_ZMM %1 %define mmsize 64 %define mova movdqa %define movu movdqu %undef movh %define movnta movntdq INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION %xdefine bcstd 1to16 %xdefine bcstq 1to8 %endmacro INIT_XMM %macro DECLARE_MMCAST 1 %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 %define ymmzmm%1 ymm%1 %define zmmmm%1 mm%1 %define zmmxmm%1 xmm%1 %define zmmymm%1 ymm%1 %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 %define zm%1 zmm %+ m%1 %endmacro %assign i 0 %rep 32 DECLARE_MMCAST i %assign i i+1 %endrep ; I often want to use macros that permute their arguments. e.g. there's no ; efficient way to implement butterfly or transpose or dct without swapping some ; arguments. ; ; I would like to not have to manually keep track of the permutations: ; If I insert a permutation in the middle of a function, it should automatically ; change everything that follows. For more complex macros I may also have multiple ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. ; ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that ; permutes its arguments. It's equivalent to exchanging the contents of the ; registers, except that this way you exchange the register names instead, so it ; doesn't cost any cycles. 
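; A sketch of the idea, using a hypothetical butterfly helper: ending the macro
; with a SWAP lets callers keep referring to the sum as m%1 and the difference
; as m%2 regardless of which physical registers they landed in.
;
;   %macro SUMSUB 2            ; %1 = a, %2 = b  ->  %1 = a+b, %2 = a-b
;       psubw  m%1, m%2        ; m%1 = a - b
;       paddw  m%2, m%2        ; m%2 = 2*b
;       paddw  m%2, m%1        ; m%2 = a + b
;       SWAP   %1, %2          ; exchange the register *names*, not the data
;   %endmacro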
%macro PERMUTE 2-* ; takes a list of pairs to swap %rep %0/2 %xdefine %%tmp%2 m%2 %rotate 2 %endrep %rep %0/2 %xdefine m%1 %%tmp%2 CAT_XDEFINE nn, m%1, %1 %rotate 2 %endrep %endmacro %macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) %ifnum %1 ; SWAP 0, 1, ... SWAP_INTERNAL_NUM %1, %2 %else ; SWAP m0, m1, ... SWAP_INTERNAL_NAME %1, %2 %endif %endmacro %macro SWAP_INTERNAL_NUM 2-* %rep %0-1 %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp CAT_XDEFINE nn, m%1, %1 CAT_XDEFINE nn, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* %xdefine %%args nn %+ %1 %rep %0-1 %xdefine %%args %%args, nn %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args %endmacro ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later ; calls to that function will automatically load the permutation, so values can ; be returned in mmregs. %macro SAVE_MM_PERMUTATION 0-1 %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %assign %%i 0 %rep num_mmregs %xdefine %%tmp m %+ %%i CAT_XDEFINE %%f, %%i, regnumof %+ %%tmp %assign %%i %%i+1 %endrep %endmacro %macro LOAD_MM_PERMUTATION 0-1 ; name to load from %if %0 %xdefine %%f %1_m %else %xdefine %%f current_function %+ _m %endif %xdefine %%tmp %%f %+ 0 %ifnum %%tmp DEFINE_MMREGS mmtype %assign %%i 0 %rep num_mmregs %xdefine %%tmp %%f %+ %%i CAT_XDEFINE %%m, %%i, m %+ %%tmp %assign %%i %%i+1 %endrep %rep num_mmregs %assign %%i %%i-1 CAT_XDEFINE m, %%i, %%m %+ %%i CAT_XDEFINE nn, m %+ %%i, %%i %endrep %endif %endmacro ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 %ifid %1 call_internal %1 %+ SUFFIX, %1 %else call %1 %endif %endmacro %macro call_internal 2 %xdefine %%i %2 %ifndef cglobaled_%2 %ifdef cglobaled_%1 %xdefine %%i %1 %endif %endif call %%i LOAD_MM_PERMUTATION %%i %endmacro ; Substitutions that reduce instruction size but are functionally equivalent %macro add 2 %ifnum %2 %if %2==128 sub %1, -128 %else add %1, %2 %endif %else add %1, %2 %endif %endmacro %macro sub 2 %ifnum %2 %if %2==128 add %1, -128 %else sub %1, %2 %endif %else sub %1, %2 %endif %endmacro ;============================================================================= ; AVX abstraction layer ;============================================================================= %assign i 0 %rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 CAT_XDEFINE sizeofzmm, i, 64 CAT_XDEFINE regnumofxmm, i, i CAT_XDEFINE regnumofymm, i, i CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i %macro CHECK_AVX_INSTR_EMU 3-* %xdefine %%opcode %1 %xdefine %%dst %2 %rep %0-2 %ifidn %%dst, %3 %error non-avx emulation of ``%%opcode'' is not supported %endif %rotate 1 %endrep %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. 
doesn't matter which src arg is which), 0 if not ;%6+: operands %macro RUN_AVX_INSTR 6-9+ %ifnum sizeof%7 %assign __sizeofreg sizeof%7 %elifnum sizeof%6 %assign __sizeofreg sizeof%6 %else %assign __sizeofreg mmsize %endif %assign __emulate_avx 0 %if avx_enabled && __sizeofreg >= 16 %xdefine __instr v%1 %else %xdefine __instr %1 %if %0 >= 8+%4 %assign __emulate_avx 1 %endif %endif %ifnidn %2, fnord %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 16 && notcpuflag(sse2) %error use of ``%1'' sse2 instruction in cpuname function: current_function %elif %3 == 0 && __sizeofreg == 32 && notcpuflag(avx2) %error use of ``%1'' avx2 instruction in cpuname function: current_function %elif __sizeofreg == 16 && notcpuflag(sse) %error use of ``%1'' sse instruction in cpuname function: current_function %elif __sizeofreg == 32 && notcpuflag(avx) %error use of ``%1'' avx instruction in cpuname function: current_function %elif __sizeofreg == 64 && notcpuflag(avx512) %error use of ``%1'' avx512 instruction in cpuname function: current_function %elifidn %1, pextrw ; special case because the base instruction is mmx2, %ifnid %6 ; but sse4 is required for memory operands %if notcpuflag(sse4) %error use of ``%1'' sse4 instruction in cpuname function: current_function %endif %endif %endif %endif %endif %if __emulate_avx %xdefine __src1 %7 %xdefine __src2 %8 %if %5 && %4 == 0 %ifnidn %6, %7 %ifidn %6, %8 %xdefine __src1 %8 %xdefine __src2 %7 %elifnnum sizeof%8 ; 3-operand AVX instructions with a memory arg can only have it in src2, ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). ; So, if the instruction is commutative with a memory arg, swap them. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %endif %ifnidn %6, __src1 %if %0 >= 9 CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, __src2, %9 %else CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, __src2 %endif %if __sizeofreg == 8 MOVQ %6, __src1 %elif %3 MOVAPS %6, __src1 %else MOVDQA %6, __src1 %endif %endif %if %0 >= 9 %1 %6, __src2, %9 %else %1 %6, __src2 %endif %elif %0 >= 9 %if avx_enabled && __sizeofreg >= 16 && %4 == 1 %ifnnum regnumof%7 %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8, %9 %else __instr %6, %7, %8, %9 %endif %else __instr %6, %7, %8, %9 %endif %elif %0 == 8 %if avx_enabled && __sizeofreg >= 16 && %4 == 0 %xdefine __src1 %7 %xdefine __src2 %8 %if %5 %ifnum regnumof%7 %ifnum regnumof%8 %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32 ; Most VEX-encoded instructions require an additional byte to encode when ; src2 is a high register (e.g. m8..15). If the instruction is commutative ; we can swap src1 and src2 when doing so reduces the instruction length. %xdefine __src1 %8 %xdefine __src2 %7 %endif %endif %elifnum regnumof%8 ; put memory operands in src2 when possible %xdefine __src1 %8 %xdefine __src2 %7 %else %assign __emulate_avx 1 %endif %elifnnum regnumof%7 ; EVEX allows imm8 shift instructions to be used with memory operands, ; but VEX does not. This handles those special cases. 
%ifnnum %8 %assign __emulate_avx 1 %elif notcpuflag(avx512) %assign __emulate_avx 1 %endif %endif %if __emulate_avx ; a separate load is required %if %3 vmovaps %6, %7 %else vmovdqa %6, %7 %endif __instr %6, %6, %8 %else __instr %6, __src1, __src2 %endif %else __instr %6, %7, %8 %endif %elif %0 == 7 %if avx_enabled && __sizeofreg >= 16 && %5 %xdefine __src1 %6 %xdefine __src2 %7 %ifnum regnumof%6 %ifnum regnumof%7 %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32 %xdefine __src1 %7 %xdefine __src2 %6 %endif %endif %endif __instr %6, __src1, __src2 %else __instr %6, %7 %endif %else __instr %6 %endif %endmacro ;%1 == instruction ;%2 == minimal instruction set ;%3 == 1 if float, 0 if int ;%4 == 1 if 4-operand emulation, 0 if 3-operand emulation, 255 otherwise (no emulation) ;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not %macro AVX_INSTR 1-5 fnord, 0, 255, 0 %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 %ifidn %2, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 %elifidn %3, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 %elifidn %4, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 %elifidn %5, fnord RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 %else RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 %endif %endmacro %endmacro ; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 AVX_INSTR addsd, sse2, 1, 0, 0 AVX_INSTR addss, sse, 1, 0, 0 AVX_INSTR addsubpd, sse3, 1, 0, 0 AVX_INSTR addsubps, sse3, 1, 0, 0 AVX_INSTR aesdec, aesni, 0, 0, 0 AVX_INSTR aesdeclast, aesni, 0, 0, 0 AVX_INSTR aesenc, aesni, 0, 0, 0 AVX_INSTR aesenclast, aesni, 0, 0, 0 AVX_INSTR aesimc, aesni AVX_INSTR aeskeygenassist, aesni AVX_INSTR andnpd, sse2, 1, 0, 0 AVX_INSTR andnps, sse, 1, 0, 0 AVX_INSTR andpd, sse2, 1, 0, 1 AVX_INSTR andps, sse, 1, 0, 1 AVX_INSTR blendpd, sse4, 1, 1, 0 AVX_INSTR blendps, sse4, 1, 1, 0 AVX_INSTR blendvpd, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR blendvps, sse4, 1, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR cmpeqpd, sse2, 1, 0, 1 AVX_INSTR cmpeqps, sse, 1, 0, 1 AVX_INSTR cmpeqsd, sse2, 1, 0, 0 AVX_INSTR cmpeqss, sse, 1, 0, 0 AVX_INSTR cmplepd, sse2, 1, 0, 0 AVX_INSTR cmpleps, sse, 1, 0, 0 AVX_INSTR cmplesd, sse2, 1, 0, 0 AVX_INSTR cmpless, sse, 1, 0, 0 AVX_INSTR cmpltpd, sse2, 1, 0, 0 AVX_INSTR cmpltps, sse, 1, 0, 0 AVX_INSTR cmpltsd, sse2, 1, 0, 0 AVX_INSTR cmpltss, sse, 1, 0, 0 AVX_INSTR cmpneqpd, sse2, 1, 0, 1 AVX_INSTR cmpneqps, sse, 1, 0, 1 AVX_INSTR cmpneqsd, sse2, 1, 0, 0 AVX_INSTR cmpneqss, sse, 1, 0, 0 AVX_INSTR cmpnlepd, sse2, 1, 0, 0 AVX_INSTR cmpnleps, sse, 1, 0, 0 AVX_INSTR cmpnlesd, sse2, 1, 0, 0 AVX_INSTR cmpnless, sse, 1, 0, 0 AVX_INSTR cmpnltpd, sse2, 1, 0, 0 AVX_INSTR cmpnltps, sse, 1, 0, 0 AVX_INSTR cmpnltsd, sse2, 1, 0, 0 AVX_INSTR cmpnltss, sse, 1, 0, 0 AVX_INSTR cmpordpd, sse2 1, 0, 1 AVX_INSTR cmpordps, sse 1, 0, 1 AVX_INSTR cmpordsd, sse2 1, 0, 0 AVX_INSTR cmpordss, sse 1, 0, 0 AVX_INSTR cmppd, sse2, 1, 1, 0 AVX_INSTR cmpps, sse, 1, 1, 0 AVX_INSTR cmpsd, sse2, 1, 1, 0 AVX_INSTR cmpss, sse, 1, 1, 0 AVX_INSTR cmpunordpd, sse2, 1, 0, 1 AVX_INSTR cmpunordps, sse, 1, 0, 1 AVX_INSTR cmpunordsd, sse2, 1, 0, 0 AVX_INSTR cmpunordss, sse, 1, 0, 0 AVX_INSTR comisd, sse2, 1 AVX_INSTR comiss, sse, 1 AVX_INSTR cvtdq2pd, sse2, 1 AVX_INSTR cvtdq2ps, sse2, 1 AVX_INSTR cvtpd2dq, sse2, 1 AVX_INSTR cvtpd2ps, sse2, 1 AVX_INSTR cvtps2dq, 
sse2, 1 AVX_INSTR cvtps2pd, sse2, 1 AVX_INSTR cvtsd2si, sse2, 1 AVX_INSTR cvtsd2ss, sse2, 1, 0, 0 AVX_INSTR cvtsi2sd, sse2, 1, 0, 0 AVX_INSTR cvtsi2ss, sse, 1, 0, 0 AVX_INSTR cvtss2sd, sse2, 1, 0, 0 AVX_INSTR cvtss2si, sse, 1 AVX_INSTR cvttpd2dq, sse2, 1 AVX_INSTR cvttps2dq, sse2, 1 AVX_INSTR cvttsd2si, sse2, 1 AVX_INSTR cvttss2si, sse, 1 AVX_INSTR divpd, sse2, 1, 0, 0 AVX_INSTR divps, sse, 1, 0, 0 AVX_INSTR divsd, sse2, 1, 0, 0 AVX_INSTR divss, sse, 1, 0, 0 AVX_INSTR dppd, sse4, 1, 1, 0 AVX_INSTR dpps, sse4, 1, 1, 0 AVX_INSTR extractps, sse4, 1 AVX_INSTR gf2p8affineinvqb, gfni, 0, 1, 0 AVX_INSTR gf2p8affineqb, gfni, 0, 1, 0 AVX_INSTR gf2p8mulb, gfni, 0, 0, 0 AVX_INSTR haddpd, sse3, 1, 0, 0 AVX_INSTR haddps, sse3, 1, 0, 0 AVX_INSTR hsubpd, sse3, 1, 0, 0 AVX_INSTR hsubps, sse3, 1, 0, 0 AVX_INSTR insertps, sse4, 1, 1, 0 AVX_INSTR lddqu, sse3 AVX_INSTR ldmxcsr, sse, 1 AVX_INSTR maskmovdqu, sse2 AVX_INSTR maxpd, sse2, 1, 0, 1 AVX_INSTR maxps, sse, 1, 0, 1 AVX_INSTR maxsd, sse2, 1, 0, 0 AVX_INSTR maxss, sse, 1, 0, 0 AVX_INSTR minpd, sse2, 1, 0, 1 AVX_INSTR minps, sse, 1, 0, 1 AVX_INSTR minsd, sse2, 1, 0, 0 AVX_INSTR minss, sse, 1, 0, 0 AVX_INSTR movapd, sse2, 1 AVX_INSTR movaps, sse, 1 AVX_INSTR movd, mmx AVX_INSTR movddup, sse3, 1 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 AVX_INSTR movhlps, sse, 1, 0, 0 AVX_INSTR movhpd, sse2, 1, 0, 0 AVX_INSTR movhps, sse, 1, 0, 0 AVX_INSTR movlhps, sse, 1, 0, 0 AVX_INSTR movlpd, sse2, 1, 0, 0 AVX_INSTR movlps, sse, 1, 0, 0 AVX_INSTR movmskpd, sse2, 1 AVX_INSTR movmskps, sse, 1 AVX_INSTR movntdq, sse2 AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2, 1 AVX_INSTR movntps, sse, 1 AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3, 1 AVX_INSTR movsldup, sse3, 1 AVX_INSTR movss, sse, 1, 0, 0 AVX_INSTR movupd, sse2, 1 AVX_INSTR movups, sse, 1 AVX_INSTR mpsadbw, sse4, 0, 1, 0 AVX_INSTR mulpd, sse2, 1, 0, 1 AVX_INSTR mulps, sse, 1, 0, 1 AVX_INSTR mulsd, sse2, 1, 0, 0 AVX_INSTR mulss, sse, 1, 0, 0 AVX_INSTR orpd, sse2, 1, 0, 1 AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 AVX_INSTR packssdw, mmx, 0, 0, 0 AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 AVX_INSTR pclmulqdq, fnord, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpestri, sse42 AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpistri, sse42 AVX_INSTR pcmpistrm, sse42 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 AVX_INSTR 
phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4, 0, 1, 0 AVX_INSTR pinsrd, sse4, 0, 1, 0 AVX_INSTR pinsrq, sse4, 0, 1, 0 AVX_INSTR pinsrw, mmx2, 0, 1, 0 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaxsb, sse4, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 AVX_INSTR pshufb, ssse3, 0, 0, 0 AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, mmx, 0, 0, 0 AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, sse, 1, 0, 0 AVX_INSTR roundpd, sse4, 1 AVX_INSTR roundps, sse4, 1 AVX_INSTR roundsd, sse4, 1, 1, 0 AVX_INSTR roundss, sse4, 1, 1, 0 AVX_INSTR rsqrtps, sse, 1 AVX_INSTR rsqrtss, sse, 1, 0, 0 AVX_INSTR shufpd, sse2, 1, 1, 0 AVX_INSTR shufps, sse, 1, 1, 0 AVX_INSTR sqrtpd, sse2, 1 AVX_INSTR sqrtps, sse, 1 AVX_INSTR sqrtsd, sse2, 1, 0, 0 AVX_INSTR sqrtss, sse, 1, 0, 0 AVX_INSTR stmxcsr, sse, 1 AVX_INSTR subpd, sse2, 1, 0, 0 AVX_INSTR subps, sse, 1, 0, 0 AVX_INSTR subsd, sse2, 1, 0, 0 AVX_INSTR subss, sse, 1, 0, 0 AVX_INSTR ucomisd, sse2, 1 AVX_INSTR ucomiss, sse, 1 AVX_INSTR unpckhpd, sse2, 1, 0, 0 AVX_INSTR unpckhps, sse, 1, 0, 0 AVX_INSTR unpcklpd, sse2, 1, 0, 0 AVX_INSTR unpcklps, sse, 1, 0, 0 AVX_INSTR xorpd, sse2, 1, 0, 1 AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 AVX_INSTR pfmul, 3dnow, 1, 0, 1 AVX_INSTR pfsub, 3dnow, 1, 0, 0 ;%1 == instruction ;%2 == minimal instruction set %macro GPR_INSTR 2 %macro %1 2-5 fnord, %1, %2 %ifdef cpuname %if notcpuflag(%5) %error use of ``%4'' %5 instruction in cpuname function: current_function %endif %endif %ifidn 
%3, fnord %4 %1, %2 %else %4 %1, %2, %3 %endif %endmacro %endmacro GPR_INSTR andn, bmi1 GPR_INSTR bextr, bmi1 GPR_INSTR blsi, bmi1 GPR_INSTR blsmsk, bmi1 GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 GPR_INSTR pext, bmi2 GPR_INSTR popcnt, sse42 GPR_INSTR rorx, bmi2 GPR_INSTR sarx, bmi2 GPR_INSTR shlx, bmi2 GPR_INSTR shrx, bmi2 ; base-4 constants for shuffles %assign i 0 %rep 256 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) %if j < 10 CAT_XDEFINE q000, j, i %elif j < 100 CAT_XDEFINE q00, j, i %elif j < 1000 CAT_XDEFINE q0, j, i %else CAT_XDEFINE q, j, i %endif %assign i i+1 %endrep %undef i %undef j %macro FMA_INSTR 3 %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 %else %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. ; FMA3 is only possible if dst is the same as one of the src registers. ; Either src2 or src3 can be a memory operand. %macro FMA4_INSTR 2-* %push fma4_instr %xdefine %$prefix %1 %rep %0 - 1 %macro %$prefix%2 4-6 %$prefix, %2 %if notcpuflag(fma3) && notcpuflag(fma4) %error use of ``%5%6'' fma instruction in cpuname function: current_function %elif cpuflag(fma4) v%5%6 %1, %2, %3, %4 %elifidn %1, %2 ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. %ifnum sizeof%3 v%{5}213%6 %2, %3, %4 %else v%{5}132%6 %2, %4, %3 %endif %elifidn %1, %3 v%{5}213%6 %3, %2, %4 %elifidn %1, %4 v%{5}231%6 %4, %2, %3 %else %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported %endif %endmacro %rotate 1 %endrep %pop %endmacro FMA4_INSTR fmadd, pd, ps, sd, ss FMA4_INSTR fmaddsub, pd, ps FMA4_INSTR fmsub, pd, ps, sd, ss FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss ; Macros for converting VEX instructions to equivalent EVEX ones. %macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex %macro %1 2-7 fnord, fnord, %1, %2, %3 %ifidn %3, fnord %define %%args %1, %2 %elifidn %4, fnord %define %%args %1, %2, %3 %else %define %%args %1, %2, %3, %4 %endif %assign %%evex_required cpuflag(avx512) & %7 %ifnum regnumof%1 %if regnumof%1 >= 16 || sizeof%1 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%2 %if regnumof%2 >= 16 || sizeof%2 > 32 %assign %%evex_required 1 %endif %endif %ifnum regnumof%3 %if regnumof%3 >= 16 || sizeof%3 > 32 %assign %%evex_required 1 %endif %endif %if %%evex_required %6 %%args %else %5 %%args ; Prefer VEX over EVEX due to shorter instruction length %endif %endmacro %endmacro EVEX_INSTR vbroadcastf128, vbroadcastf32x4 EVEX_INSTR vbroadcasti128, vbroadcasti32x4 EVEX_INSTR vextractf128, vextractf32x4 EVEX_INSTR vextracti128, vextracti32x4 EVEX_INSTR vinsertf128, vinsertf32x4 EVEX_INSTR vinserti128, vinserti32x4 EVEX_INSTR vmovdqa, vmovdqa32 EVEX_INSTR vmovdqu, vmovdqu32 EVEX_INSTR vpand, vpandd EVEX_INSTR vpandn, vpandnd EVEX_INSTR vpor, vpord EVEX_INSTR vpxor, vpxord EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision EVEX_INSTR vrcpss, vrcp14ss, 1 EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 rav1e-0.7.1/src/frame/mod.rs000064400000000000000000000057441046102023000136670ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use num_derive::FromPrimitive; use crate::api::{Opaque, T35}; use crate::context::SB_SIZE; use crate::mc::SUBPEL_FILTER_SIZE; use crate::util::*; use crate::tiling::*; mod plane; pub use plane::*; const FRAME_MARGIN: usize = 16 + SUBPEL_FILTER_SIZE; const LUMA_PADDING: usize = SB_SIZE + FRAME_MARGIN; /// Override the frame type decision /// /// Only certain frame types can be selected. #[derive(Debug, PartialEq, Eq, Clone, Copy, FromPrimitive, Default)] #[repr(C)] pub enum FrameTypeOverride { /// Do not force any decision. #[default] No, /// Force the frame to be a Keyframe. Key, } /// Optional per-frame encoder parameters #[derive(Debug, Default)] pub struct FrameParameters { /// Force emitted frame to be of the type selected pub frame_type_override: FrameTypeOverride, /// Output the provided data in the matching encoded Packet pub opaque: Option, /// List of t35 metadata associated with this frame pub t35_metadata: Box<[T35]>, } pub use v_frame::frame::Frame; /// Public Trait Interface for Frame Allocation pub(crate) trait FrameAlloc { /// Initialise new frame default type fn new(width: usize, height: usize, chroma_sampling: ChromaSampling) -> Self; } impl FrameAlloc for Frame { /// Creates a new frame with the given parameters. /// new function calls `new_with_padding` function which takes `luma_padding` /// as parameter fn new( width: usize, height: usize, chroma_sampling: ChromaSampling, ) -> Self { v_frame::frame::Frame::new_with_padding( width, height, chroma_sampling, LUMA_PADDING, ) } } /// Public Trait for calculating Padding pub(crate) trait FramePad { fn pad(&mut self, w: usize, h: usize, planes: usize); } impl FramePad for Frame { fn pad(&mut self, w: usize, h: usize, planes: usize) { for pli in 0..planes { self.planes[pli].pad(w, h); } } } /// Public Trait for new Tile of a frame pub(crate) trait AsTile { fn as_tile(&self) -> Tile<'_, T>; fn as_tile_mut(&mut self) -> TileMut<'_, T>; } impl AsTile for Frame { #[inline(always)] fn as_tile(&self) -> Tile<'_, T> { let PlaneConfig { width, height, .. } = self.planes[0].cfg; Tile::new(self, TileRect { x: 0, y: 0, width, height }) } #[inline(always)] fn as_tile_mut(&mut self) -> TileMut<'_, T> { let PlaneConfig { width, height, .. } = self.planes[0].cfg; TileMut::new(self, TileRect { x: 0, y: 0, width, height }) } } rav1e-0.7.1/src/frame/plane.rs000064400000000000000000000032471046102023000142030ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
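// A usage sketch for the trait below (`frame` is a hypothetical `Frame<T>`):
// `frame.planes[0].as_region()` yields a read-only `PlaneRegion` covering the
// whole luma plane, while `frame.planes[0].region(area)` restricts the view to
// the sub-rectangle described by `area`.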
use crate::tiling::*; use crate::util::*; pub use v_frame::plane::*; pub trait AsRegion { fn as_region(&self) -> PlaneRegion<'_, T>; fn as_region_mut(&mut self) -> PlaneRegionMut<'_, T>; fn region_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T>; fn region(&self, area: Area) -> PlaneRegion<'_, T>; } impl AsRegion for Plane { #[inline(always)] fn region(&self, area: Area) -> PlaneRegion<'_, T> { let rect = area.to_rect( self.cfg.xdec, self.cfg.ydec, self.cfg.stride - self.cfg.xorigin, self.cfg.alloc_height - self.cfg.yorigin, ); PlaneRegion::new(self, rect) } #[inline(always)] fn region_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T> { let rect = area.to_rect( self.cfg.xdec, self.cfg.ydec, self.cfg.stride - self.cfg.xorigin, self.cfg.alloc_height - self.cfg.yorigin, ); PlaneRegionMut::new(self, rect) } #[inline(always)] fn as_region(&self) -> PlaneRegion<'_, T> { PlaneRegion::new_from_plane(self) } #[inline(always)] fn as_region_mut(&mut self) -> PlaneRegionMut<'_, T> { PlaneRegionMut::new_from_plane(self) } } rav1e-0.7.1/src/fuzzing.rs000064400000000000000000000265611046102023000135120ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use std::marker::PhantomData; use std::sync::Arc; use libfuzzer_sys::arbitrary::{Arbitrary, Error, Unstructured}; use crate::prelude::*; // Adding new fuzz targets // // 1. Add a function to this file which looks like this: // // pub fn fuzz_something(data: Data) { // // Invoke everything you need. // // // // Your function may accept a value of any type that implements // // Arbitrary [1]. This is how fuzzer affects the execution—by // // feeding in different bytes, which result in different // // arbitrary values being generated. // // [1]: https://docs.rs/arbitrary/0.3.3/arbitrary/trait.Arbitrary.html // // // // Derive Debug for the structures you create with arbitrary data. // } // // 2. cargo fuzz add something // 3. Copy the contents of any other .rs file from fuzz/fuzz_targets/ into the // newly created fuzz/fuzz_targets/something.rs and change the function // being called to fuzz_something. // // Now you can fuzz the new target with cargo fuzz. #[derive(Debug)] pub struct ArbitraryConfig { config: Config, } #[inline] fn arbitrary_rational(u: &mut Unstructured<'_>) -> Result { Ok(Rational::new(Arbitrary::arbitrary(u)?, Arbitrary::arbitrary(u)?)) } #[inline] fn arbitrary_color_description( u: &mut Unstructured<'_>, ) -> Result, Error> { if Arbitrary::arbitrary(u)? 
{ return Ok(None); } Ok(Some(ColorDescription { color_primaries: *u.choose(&[ ColorPrimaries::BT709, ColorPrimaries::Unspecified, ColorPrimaries::BT470M, ColorPrimaries::BT470BG, ColorPrimaries::BT601, ColorPrimaries::SMPTE240, ColorPrimaries::GenericFilm, ColorPrimaries::BT2020, ColorPrimaries::XYZ, ColorPrimaries::SMPTE431, ColorPrimaries::SMPTE432, ColorPrimaries::EBU3213, ])?, transfer_characteristics: *u.choose(&[ TransferCharacteristics::BT709, TransferCharacteristics::Unspecified, TransferCharacteristics::BT470M, TransferCharacteristics::BT470BG, TransferCharacteristics::BT601, TransferCharacteristics::SMPTE240, TransferCharacteristics::Linear, TransferCharacteristics::Log100, TransferCharacteristics::Log100Sqrt10, TransferCharacteristics::IEC61966, TransferCharacteristics::BT1361, TransferCharacteristics::SRGB, TransferCharacteristics::BT2020_10Bit, TransferCharacteristics::BT2020_12Bit, TransferCharacteristics::SMPTE2084, TransferCharacteristics::SMPTE428, TransferCharacteristics::HLG, ])?, matrix_coefficients: *u.choose(&[ MatrixCoefficients::Identity, MatrixCoefficients::BT709, MatrixCoefficients::Unspecified, MatrixCoefficients::FCC, MatrixCoefficients::BT470BG, MatrixCoefficients::BT601, MatrixCoefficients::SMPTE240, MatrixCoefficients::YCgCo, MatrixCoefficients::BT2020NCL, MatrixCoefficients::BT2020CL, MatrixCoefficients::SMPTE2085, MatrixCoefficients::ChromatNCL, MatrixCoefficients::ChromatCL, MatrixCoefficients::ICtCp, ])?, })) } #[inline] fn arbitrary_chromacity_point( u: &mut Unstructured<'_>, ) -> Result { Ok(ChromaticityPoint { x: Arbitrary::arbitrary(u)?, y: Arbitrary::arbitrary(u)?, }) } #[inline] fn arbitrary_mastering_display( u: &mut Unstructured<'_>, ) -> Result, Error> { if Arbitrary::arbitrary(u)? { return Ok(None); } Ok(Some(MasteringDisplay { primaries: [ arbitrary_chromacity_point(u)?, arbitrary_chromacity_point(u)?, arbitrary_chromacity_point(u)?, ], white_point: arbitrary_chromacity_point(u)?, max_luminance: Arbitrary::arbitrary(u)?, min_luminance: Arbitrary::arbitrary(u)?, })) } #[inline] fn arbitrary_content_light( u: &mut Unstructured<'_>, ) -> Result, Error> { if Arbitrary::arbitrary(u)? 
{ return Ok(None); } Ok(Some(ContentLight { max_content_light_level: Arbitrary::arbitrary(u)?, max_frame_average_light_level: Arbitrary::arbitrary(u)?, })) } impl Arbitrary<'_> for ArbitraryConfig { fn arbitrary(u: &mut Unstructured<'_>) -> Result { let mut enc = EncoderConfig::with_speed_preset(Arbitrary::arbitrary(u)?); enc.width = Arbitrary::arbitrary(u)?; enc.height = Arbitrary::arbitrary(u)?; enc.bit_depth = u.int_in_range(0..=16)?; enc.still_picture = Arbitrary::arbitrary(u)?; enc.time_base = arbitrary_rational(u)?; enc.min_key_frame_interval = Arbitrary::arbitrary(u)?; enc.max_key_frame_interval = Arbitrary::arbitrary(u)?; enc.reservoir_frame_delay = Arbitrary::arbitrary(u)?; enc.low_latency = Arbitrary::arbitrary(u)?; enc.quantizer = Arbitrary::arbitrary(u)?; enc.min_quantizer = Arbitrary::arbitrary(u)?; enc.bitrate = Arbitrary::arbitrary(u)?; enc.tile_cols = Arbitrary::arbitrary(u)?; enc.tile_rows = Arbitrary::arbitrary(u)?; enc.tiles = Arbitrary::arbitrary(u)?; enc.speed_settings.rdo_lookahead_frames = Arbitrary::arbitrary(u)?; let config = Config::new().with_encoder_config(enc).with_threads(1); Ok(Self { config }) } } pub fn fuzz_construct_context(arbitrary: ArbitraryConfig) { let _: Result, _> = arbitrary.config.new_context(); } fn encode_frames( ctx: &mut Context, mut frames: impl Iterator>, ) -> Result<(), EncoderStatus> { loop { let rv = ctx.receive_packet(); debug!("ctx.receive_packet() = {:#?}", rv); match rv { Ok(_packet) => {} Err(EncoderStatus::Encoded) => {} Err(EncoderStatus::LimitReached) => { break; } Err(EncoderStatus::NeedMoreData) => { ctx.send_frame(frames.next().map(Arc::new))?; } Err(EncoderStatus::EnoughData) => { unreachable!(); } Err(EncoderStatus::NotReady) => { unreachable!(); } Err(EncoderStatus::Failure) => { return Err(EncoderStatus::Failure); } } } Ok(()) } #[derive(Debug)] pub struct ArbitraryEncoder { config: Config, frame_count: u8, pixels: Box<[u8]>, } impl Arbitrary<'_> for ArbitraryEncoder { fn arbitrary(u: &mut Unstructured<'_>) -> Result { let enc = EncoderConfig { speed_settings: SpeedSettings::from_preset(u.int_in_range(0..=10)?), width: u.int_in_range(1..=256)?, height: u.int_in_range(1..=256)?, still_picture: Arbitrary::arbitrary(u)?, time_base: arbitrary_rational(u)?, min_key_frame_interval: u.int_in_range(0..=3)?, max_key_frame_interval: u.int_in_range(1..=4)?, low_latency: Arbitrary::arbitrary(u)?, quantizer: Arbitrary::arbitrary(u)?, min_quantizer: Arbitrary::arbitrary(u)?, bitrate: Arbitrary::arbitrary(u)?, tile_cols: u.int_in_range(0..=2)?, tile_rows: u.int_in_range(0..=2)?, tiles: u.int_in_range(0..=16)?, chroma_sampling: *u.choose(&[ ChromaSampling::Cs420, ChromaSampling::Cs422, ChromaSampling::Cs444, ChromaSampling::Cs400, ])?, chroma_sample_position: *u.choose(&[ ChromaSamplePosition::Unknown, ChromaSamplePosition::Vertical, ChromaSamplePosition::Colocated, ])?, pixel_range: *u.choose(&[PixelRange::Limited, PixelRange::Full])?, error_resilient: Arbitrary::arbitrary(u)?, reservoir_frame_delay: Arbitrary::arbitrary(u)?, sample_aspect_ratio: arbitrary_rational(u)?, bit_depth: 8, color_description: arbitrary_color_description(u)?, mastering_display: arbitrary_mastering_display(u)?, content_light: arbitrary_content_light(u)?, level_idx: Some(31), enable_timing_info: Arbitrary::arbitrary(u)?, switch_frame_interval: u.int_in_range(0..=3)?, tune: *u.choose(&[Tune::Psnr, Tune::Psychovisual])?, film_grain_params: None, }; let frame_count = if enc.still_picture { 1 } else { u.int_in_range(1..=3)? 
}; if u.is_empty() { return Err(Error::NotEnoughData); } let pixels = u.bytes(u.len())?.to_vec().into_boxed_slice(); let config = Config::new().with_encoder_config(enc).with_threads(1); Ok(Self { config, frame_count, pixels }) } } pub fn fuzz_encode(arbitrary: ArbitraryEncoder) { let res = arbitrary.config.new_context(); if res.is_err() { return; } let mut context: Context = res.unwrap(); let mut pixels = arbitrary.pixels.iter().cycle(); let mut frame = context.new_frame(); let frames = (0..arbitrary.frame_count).map(|_| { for plane in &mut frame.planes { let stride = plane.cfg.stride; for row in plane.data_origin_mut().chunks_mut(stride) { for pixel in row { *pixel = *pixels.next().unwrap(); } } } frame.clone() }); let _ = encode_frames(&mut context, frames); } #[derive(Debug)] pub struct DecodeTestParameters { w: usize, h: usize, speed: u8, q: usize, limit: usize, bit_depth: usize, chroma_sampling: ChromaSampling, min_keyint: u64, max_keyint: u64, switch_frame_interval: u64, low_latency: bool, error_resilient: bool, bitrate: i32, tile_cols_log2: usize, tile_rows_log2: usize, still_picture: bool, pixel: PhantomData, } impl Arbitrary<'_> for DecodeTestParameters { fn arbitrary(u: &mut Unstructured<'_>) -> Result { let mut p = Self { w: u.int_in_range(16..=16 + 255)?, h: u.int_in_range(16..=16 + 255)?, speed: u.int_in_range(0..=10)?, q: u8::arbitrary(u)?.into(), limit: u.int_in_range(1..=3)?, bit_depth: 8, chroma_sampling: *u.choose(&[ ChromaSampling::Cs420, ChromaSampling::Cs422, ChromaSampling::Cs444, ChromaSampling::Cs400, ])?, min_keyint: u.int_in_range(0..=3)?, max_keyint: u.int_in_range(1..=4)?, switch_frame_interval: u.int_in_range(0..=3)?, low_latency: bool::arbitrary(u)?, error_resilient: bool::arbitrary(u)?, bitrate: u16::arbitrary(u)?.into(), tile_cols_log2: u.int_in_range(0..=2)?, tile_rows_log2: u.int_in_range(0..=2)?, still_picture: bool::arbitrary(u)?, pixel: PhantomData, }; if matches!(T::type_enum(), PixelType::U16) { p.bit_depth = *u.choose(&[8, 10, 12])?; } if !p.low_latency { p.switch_frame_interval = 0; } if p.still_picture { p.limit = 1 } Ok(p) } } #[cfg(feature = "decode_test_dav1d")] pub fn fuzz_encode_decode(p: DecodeTestParameters) { use crate::test_encode_decode::*; let mut dec = get_decoder::("dav1d", p.w, p.h); dec.encode_decode( true, p.w, p.h, p.speed, p.q, p.limit, p.bit_depth, p.chroma_sampling, p.min_keyint, p.max_keyint, p.switch_frame_interval, p.low_latency, p.error_resilient, p.bitrate, p.tile_cols_log2, p.tile_rows_log2, p.still_picture, None, ); } rav1e-0.7.1/src/header.rs000064400000000000000000001141671046102023000132460ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
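// Framing sketch for the writers below: every OBU emitted by this module
// starts with a one-byte header (forbidden bit = 0, a 4-bit obu_type, an
// extension flag, has_size_field = 1 and a reserved bit), followed by the
// payload length coded by `write_uleb128`. Worked example for the length
// field: 300 (0b1_0101100) is split into 7-bit groups, least significant
// first, with the top bit of each byte signalling that more bytes follow:
//
//     300 & 0x7F = 0x2C, continuation bit set -> 0xAC
//     300 >> 7   = 0x02, final group          -> 0x02
//
// so a 300-byte payload is preceded by the two length bytes 0xAC 0x02.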
use crate::api::*; use crate::context::*; use crate::ec::*; use crate::lrf::*; use crate::partition::*; use crate::tiling::MAX_TILE_WIDTH; use crate::util::Fixed; use crate::util::Pixel; use crate::DeblockState; use crate::FrameInvariants; use crate::FrameState; use crate::SegmentationState; use crate::Sequence; use arrayvec::ArrayVec; use bitstream_io::{BigEndian, BitWrite, BitWriter, LittleEndian}; use std::io; pub const PRIMARY_REF_NONE: u32 = 7; pub const ALL_REF_FRAMES_MASK: u32 = (1 << REF_FRAMES) - 1; const PRIMARY_REF_BITS: u32 = 3; #[allow(unused)] const OP_POINTS_IDC_BITS: usize = 12; #[allow(unused)] const LEVEL_MAJOR_MIN: usize = 2; #[allow(unused)] const LEVEL_MAJOR_BITS: usize = 3; #[allow(unused)] const LEVEL_MINOR_BITS: usize = 2; #[allow(unused)] const LEVEL_BITS: usize = LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS; #[allow(dead_code, non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ReferenceMode { SINGLE = 0, COMPOUND = 1, SELECT = 2, } #[allow(non_camel_case_types)] #[allow(unused)] pub enum ObuType { OBU_SEQUENCE_HEADER = 1, OBU_TEMPORAL_DELIMITER = 2, OBU_FRAME_HEADER = 3, OBU_TILE_GROUP = 4, OBU_METADATA = 5, OBU_FRAME = 6, OBU_REDUNDANT_FRAME_HEADER = 7, OBU_TILE_LIST = 8, OBU_PADDING = 15, } #[derive(Clone, Copy)] #[allow(non_camel_case_types)] #[allow(unused)] pub enum ObuMetaType { OBU_META_HDR_CLL = 1, OBU_META_HDR_MDCV = 2, OBU_META_SCALABILITY = 3, OBU_META_ITUT_T35 = 4, OBU_META_TIMECODE = 5, } impl ObuMetaType { const fn size(self) -> u64 { use self::ObuMetaType::*; match self { OBU_META_HDR_CLL => 4, OBU_META_HDR_MDCV => 24, _ => 0, } } } pub trait ULEB128Writer { fn write_uleb128(&mut self, payload: u64) -> io::Result<()>; } impl ULEB128Writer for BitWriter { fn write_uleb128(&mut self, payload: u64) -> io::Result<()> { // NOTE from libaom: // Disallow values larger than 32-bits to ensure consistent behavior on 32 and // 64 bit targets: value is typically used to determine buffer allocation size // when decoded. let mut coded_value: ArrayVec = ArrayVec::new(); let mut value = payload as u32; loop { let mut byte = (value & 0x7f) as u8; value >>= 7u8; if value != 0 { // Signal that more bytes follow. byte |= 0x80; } coded_value.push(byte); if value == 0 { // We have to break at the end of the loop // because there must be at least one byte written. 
break; } } for byte in coded_value { self.write(8, byte)?; } Ok(()) } } pub trait LEWriter { fn write_le(&mut self, bytes: u32, payload: u64) -> io::Result<()>; } // to write little endian values in a globally big-endian BitWriter impl LEWriter for BitWriter { fn write_le(&mut self, bytes: u32, value: u64) -> io::Result<()> { let mut data = Vec::new(); let mut bwle = BitWriter::endian(&mut data, LittleEndian); bwle.write(bytes * 8, value)?; self.write_bytes(&data) } } pub trait UncompressedHeader { // Start of OBU Headers fn write_obu_header( &mut self, obu_type: ObuType, obu_extension: u32, ) -> io::Result<()>; fn write_sequence_metadata_obu( &mut self, obu_meta_type: ObuMetaType, seq: &Sequence, ) -> io::Result<()>; fn write_sequence_header_obu( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_frame_header_obu( &mut self, fi: &FrameInvariants, fs: &FrameState, inter_cfg: &InterConfig, ) -> io::Result<()>; fn write_sequence_header( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_color_config(&mut self, seq: &Sequence) -> io::Result<()>; fn write_t35_metadata_obu(&mut self, t35: &T35) -> io::Result<()>; // End of OBU Headers fn write_max_frame_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_frame_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_render_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_frame_size_with_refs( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_deblock_filter_a( &mut self, fi: &FrameInvariants, deblock: &DeblockState, ) -> io::Result<()>; fn write_deblock_filter_b( &mut self, fi: &FrameInvariants, deblock: &DeblockState, ) -> io::Result<()>; fn write_frame_cdef( &mut self, fi: &FrameInvariants, ) -> io::Result<()>; fn write_frame_lrf( &mut self, fi: &FrameInvariants, rs: &RestorationState, ) -> io::Result<()>; fn write_segment_data( &mut self, fi: &FrameInvariants, segmentation: &SegmentationState, ) -> io::Result<()>; fn write_delta_q(&mut self, delta_q: i8) -> io::Result<()>; } impl UncompressedHeader for BitWriter { // Start of OBU Headers // Write OBU Header syntax fn write_obu_header( &mut self, obu_type: ObuType, obu_extension: u32, ) -> io::Result<()> { self.write_bit(false)?; // forbidden bit. 
self.write(4, obu_type as u32)?; self.write_bit(obu_extension != 0)?; self.write_bit(true)?; // obu_has_payload_length_field self.write_bit(false)?; // reserved if obu_extension != 0 { unimplemented!(); //self.write(8, obu_extension & 0xFF)?; size += 8; } Ok(()) } fn write_sequence_metadata_obu( &mut self, obu_meta_type: ObuMetaType, seq: &Sequence, ) -> io::Result<()> { // header self.write_obu_header(ObuType::OBU_METADATA, 0)?; // uleb128() - length // we use a constant value to avoid computing the OBU size every time // since it is fixed (depending on the metadata) // +2 is for the metadata_type field and the trailing bits byte self.write_uleb128(obu_meta_type.size() + 2)?; // uleb128() - metadata_type (1 byte) self.write_uleb128(obu_meta_type as u64)?; match obu_meta_type { ObuMetaType::OBU_META_HDR_CLL => { let cll = seq.content_light.unwrap(); self.write(16, cll.max_content_light_level)?; self.write(16, cll.max_frame_average_light_level)?; } ObuMetaType::OBU_META_HDR_MDCV => { let mdcv = seq.mastering_display.unwrap(); for i in 0..3 { self.write(16, mdcv.primaries[i].x)?; self.write(16, mdcv.primaries[i].y)?; } self.write(16, mdcv.white_point.x)?; self.write(16, mdcv.white_point.y)?; self.write(32, mdcv.max_luminance)?; self.write(32, mdcv.min_luminance)?; } _ => {} } // trailing bits (1 byte) self.write_bit(true)?; self.byte_align()?; Ok(()) } fn write_t35_metadata_obu(&mut self, t35: &T35) -> io::Result<()> { self.write_obu_header(ObuType::OBU_METADATA, 0)?; // metadata type + country code + optional extension + trailing bits self.write_uleb128( t35.data.len() as u64 + if t35.country_code == 0xFF { 4 } else { 3 }, )?; self.write_uleb128(ObuMetaType::OBU_META_ITUT_T35 as u64)?; self.write(8, t35.country_code)?; if t35.country_code == 0xFF { self.write(8, t35.country_code_extension_byte)?; } self.write_bytes(&t35.data)?; // trailing bits (1 byte) self.write_bit(true)?; self.byte_align()?; Ok(()) } fn write_sequence_header_obu( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { assert!( !fi.sequence.reduced_still_picture_hdr || fi.sequence.still_picture ); self.write(3, fi.sequence.profile)?; // profile self.write_bit(fi.sequence.still_picture)?; // still_picture self.write_bit(fi.sequence.reduced_still_picture_hdr)?; // reduced_still_picture_header assert!(fi.sequence.level_idx[0] <= 31); if fi.sequence.reduced_still_picture_hdr { assert!(!fi.sequence.timing_info_present); assert!(!fi.sequence.decoder_model_info_present_flag); assert_eq!(fi.sequence.operating_points_cnt_minus_1, 0); assert_eq!(fi.sequence.operating_point_idc[0], 0); self.write(5, fi.sequence.level_idx[0])?; // level assert_eq!(fi.sequence.tier[0], 0); } else { self.write_bit(fi.sequence.timing_info_present)?; // timing info present if fi.sequence.timing_info_present { self.write(32, fi.sequence.time_base.num)?; self.write(32, fi.sequence.time_base.den)?; self.write_bit(true)?; // equal picture interval self.write_bit(true)?; // zero interval self.write_bit(false)?; // decoder model info present flag } self.write_bit(false)?; // initial display delay present flag self.write(5, 0)?; // one operating point self.write(12, 0)?; // idc self.write(5, fi.sequence.level_idx[0])?; // level if fi.sequence.level_idx[0] > 7 { self.write(1, 0)?; // tier } } self.write_sequence_header(fi)?; self.write_color_config(&fi.sequence)?; self.write_bit(fi.sequence.film_grain_params_present)?; Ok(()) } fn write_sequence_header( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { self.write_max_frame_size(fi)?; let seq = &fi.sequence; 
if seq.reduced_still_picture_hdr { assert!(!seq.frame_id_numbers_present_flag); } else { self.write_bit(seq.frame_id_numbers_present_flag)?; } if seq.frame_id_numbers_present_flag { // We must always have delta_frame_id_length < frame_id_length, // in order for a frame to be referenced with a unique delta. // Avoid wasting bits by using a coding that enforces this restriction. self.write(4, seq.delta_frame_id_length - 2)?; self.write(3, seq.frame_id_length - seq.delta_frame_id_length - 1)?; } self.write_bit(seq.use_128x128_superblock)?; self.write_bit(seq.enable_filter_intra)?; self.write_bit(seq.enable_intra_edge_filter)?; if seq.reduced_still_picture_hdr { assert!(!seq.enable_interintra_compound); assert!(!seq.enable_masked_compound); assert!(!seq.enable_warped_motion); assert!(!seq.enable_dual_filter); assert!(!seq.enable_order_hint); assert!(!seq.enable_jnt_comp); assert!(!seq.enable_ref_frame_mvs); assert!(seq.force_screen_content_tools == 2); assert!(seq.force_integer_mv == 2); } else { self.write_bit(seq.enable_interintra_compound)?; self.write_bit(seq.enable_masked_compound)?; self.write_bit(seq.enable_warped_motion)?; self.write_bit(seq.enable_dual_filter)?; self.write_bit(seq.enable_order_hint)?; if seq.enable_order_hint { self.write_bit(seq.enable_jnt_comp)?; self.write_bit(seq.enable_ref_frame_mvs)?; } if seq.force_screen_content_tools == 2 { self.write_bit(true)?; } else { self.write_bit(false)?; self.write_bit(seq.force_screen_content_tools != 0)?; } if seq.force_screen_content_tools > 0 { if seq.force_integer_mv == 2 { self.write_bit(true)?; } else { self.write_bit(false)?; self.write_bit(seq.force_integer_mv != 0)?; } } else { assert!(seq.force_integer_mv == 2); } if seq.enable_order_hint { self.write(3, seq.order_hint_bits_minus_1)?; } } self.write_bit(seq.enable_superres)?; self.write_bit(seq.enable_cdef)?; self.write_bit(seq.enable_restoration)?; Ok(()) } // fn write_color_config(&mut self, seq: &Sequence) -> io::Result<()> { let high_bitdepth = seq.bit_depth > 8; self.write_bit(high_bitdepth)?; if seq.profile == 2 && high_bitdepth { self.write_bit(seq.bit_depth == 12)?; // twelve_bit } let monochrome = seq.chroma_sampling == ChromaSampling::Cs400; if seq.profile == 1 { assert!(!monochrome); } else { self.write_bit(monochrome)?; // mono_chrome } // color_description_present_flag self.write_bit(seq.color_description.is_some())?; let mut srgb_triple = false; if let Some(color_description) = seq.color_description { self.write(8, color_description.color_primaries as u8)?; self.write(8, color_description.transfer_characteristics as u8)?; self.write(8, color_description.matrix_coefficients as u8)?; srgb_triple = color_description.is_srgb_triple(); } if monochrome || !srgb_triple { self.write_bit(seq.pixel_range == PixelRange::Full)?; // color_range } if monochrome { return Ok(()); } else if srgb_triple { assert!(seq.pixel_range == PixelRange::Full); assert!(seq.chroma_sampling == ChromaSampling::Cs444); } else { if seq.profile == 0 { assert!(seq.chroma_sampling == ChromaSampling::Cs420); } else if seq.profile == 1 { assert!(seq.chroma_sampling == ChromaSampling::Cs444); } else if seq.bit_depth == 12 { let subsampling_x = seq.chroma_sampling != ChromaSampling::Cs444; let subsampling_y = seq.chroma_sampling == ChromaSampling::Cs420; self.write_bit(subsampling_x)?; if subsampling_x { self.write_bit(subsampling_y)?; } } else { assert!(seq.chroma_sampling == ChromaSampling::Cs422); } if seq.chroma_sampling == ChromaSampling::Cs420 { self.write(2, seq.chroma_sample_position as 
u32)?; } } self.write_bit(true)?; // separate_uv_delta_q Ok(()) } #[allow(unused)] fn write_frame_header_obu( &mut self, fi: &FrameInvariants, fs: &FrameState, inter_cfg: &InterConfig, ) -> io::Result<()> { if fi.sequence.reduced_still_picture_hdr { assert!(!fi.is_show_existing_frame()); assert!(fi.frame_type == FrameType::KEY); assert!(fi.show_frame); assert!(!fi.showable_frame); } else { self.write_bit(fi.is_show_existing_frame())?; if fi.is_show_existing_frame() { self.write(3, fi.frame_to_show_map_idx)?; //TODO: /* temporal_point_info(); if fi.sequence.decoder_model_info_present_flag && timing_info.equal_picture_interval == 0 { // write frame_presentation_delay; } if fi.sequence.frame_id_numbers_present_flag { // write display_frame_id; }*/ self.write_bit(true)?; // trailing bit self.byte_align()?; return Ok(()); } self.write(2, fi.frame_type as u32)?; self.write_bit(fi.show_frame)?; // show frame if fi.show_frame { //TODO: /* temporal_point_info(); if fi.sequence.decoder_model_info_present_flag && timing_info.equal_picture_interval == 0 { // write frame_presentation_delay; }*/ } else { self.write_bit(fi.showable_frame)?; } if fi.error_resilient { assert!(fi.primary_ref_frame == PRIMARY_REF_NONE); } if fi.frame_type == FrameType::SWITCH { assert!(fi.error_resilient); } else if !(fi.frame_type == FrameType::KEY && fi.show_frame) { self.write_bit(fi.error_resilient)?; // error resilient } } self.write_bit(fi.disable_cdf_update)?; if fi.sequence.force_screen_content_tools == 2 { self.write_bit(fi.allow_screen_content_tools != 0)?; } else { assert!( fi.allow_screen_content_tools == fi.sequence.force_screen_content_tools ); } if fi.allow_screen_content_tools > 0 { if fi.sequence.force_integer_mv == 2 { self.write_bit(fi.force_integer_mv != 0)?; } else { assert!(fi.force_integer_mv == fi.sequence.force_integer_mv); } } assert!( fi.force_integer_mv == u32::from(fi.frame_type == FrameType::KEY || fi.intra_only) ); if fi.sequence.frame_id_numbers_present_flag { unimplemented!(); //TODO: //let frame_id_len = fi.sequence.frame_id_length; //self.write(frame_id_len, fi.current_frame_id); } if fi.frame_type != FrameType::SWITCH && !fi.sequence.reduced_still_picture_hdr { self.write_bit(fi.frame_size_override_flag)?; // frame size overhead flag } if fi.sequence.enable_order_hint { let n = fi.sequence.order_hint_bits_minus_1 + 1; let mask = (1 << n) - 1; self.write(n, fi.order_hint & mask)?; } if !fi.error_resilient && !fi.intra_only { self.write(PRIMARY_REF_BITS, fi.primary_ref_frame)?; } if fi.sequence.decoder_model_info_present_flag { unimplemented!(); } if fi.frame_type == FrameType::KEY { if !fi.show_frame { // unshown keyframe (forward keyframe) unimplemented!(); self.write(REF_FRAMES as u32, fi.refresh_frame_flags)?; } else { assert!(fi.refresh_frame_flags == ALL_REF_FRAMES_MASK); } } else if fi.frame_type == FrameType::SWITCH { assert!(fi.refresh_frame_flags == ALL_REF_FRAMES_MASK); } else { // Inter frame info goes here if fi.intra_only { assert!(fi.refresh_frame_flags != ALL_REF_FRAMES_MASK); } else { // TODO: This should be set once inter mode is used } self.write(REF_FRAMES as u32, fi.refresh_frame_flags)?; }; if (!fi.intra_only || fi.refresh_frame_flags != ALL_REF_FRAMES_MASK) { // Write all ref frame order hints if error_resilient_mode == 1 if (fi.error_resilient && fi.sequence.enable_order_hint) { for i in 0..REF_FRAMES { let n = fi.sequence.order_hint_bits_minus_1 + 1; let mask = (1 << n) - 1; if let Some(ref rec) = fi.rec_buffer.frames[i] { let ref_hint = rec.order_hint; self.write(n, 
ref_hint & mask)?; } else { self.write(n, 0)?; } } } } // if KEY or INTRA_ONLY frame if fi.intra_only { self.write_frame_size(fi)?; self.write_render_size(fi)?; if fi.allow_screen_content_tools != 0 { // TODO: && UpscaledWidth == FrameWidth. self.write_bit(fi.allow_intrabc)?; } } let frame_refs_short_signaling = false; if fi.frame_type == FrameType::KEY || fi.intra_only { // Done by above } else { if fi.sequence.enable_order_hint { self.write_bit(frame_refs_short_signaling)?; if frame_refs_short_signaling { unimplemented!(); } } for i in 0..INTER_REFS_PER_FRAME { if !frame_refs_short_signaling { self.write(REF_FRAMES_LOG2 as u32, fi.ref_frames[i])?; } if fi.sequence.frame_id_numbers_present_flag { unimplemented!(); } } if !fi.error_resilient && fi.frame_size_override_flag { self.write_frame_size_with_refs(fi)?; } else { self.write_frame_size(fi)?; self.write_render_size(fi)?; } if fi.force_integer_mv == 0 { self.write_bit(fi.allow_high_precision_mv); } self.write_bit(fi.is_filter_switchable)?; if !fi.is_filter_switchable { self.write(2, fi.default_filter as u8)?; } self.write_bit(fi.is_motion_mode_switchable)?; if (!fi.error_resilient && fi.sequence.enable_ref_frame_mvs) { self.write_bit(fi.use_ref_frame_mvs)?; } } if fi.sequence.reduced_still_picture_hdr || fi.disable_cdf_update { assert!(fi.disable_frame_end_update_cdf); } else { self.write_bit(fi.disable_frame_end_update_cdf)?; } // tile // // Can we use the uniform spacing tile syntax? 'Uniform spacing' // is a slight misnomer; it's more constrained than just a uniform // spacing. let ti = &fi.sequence.tiling; if fi.sb_width.align_power_of_two_and_shift(ti.tile_cols_log2) == ti.tile_width_sb && fi.sb_height.align_power_of_two_and_shift(ti.tile_rows_log2) == ti.tile_height_sb { // yes; our actual tile width/height setting (which is always // currently uniform) also matches the constrained width/height // calculation implicit in the uniform spacing flag. 
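// The log2 tile counts are coded as increments above the minimum: emit one
// 1-bit per step of (tile_cols_log2 - min_tile_cols_log2), then a
// terminating 0-bit unless the maximum has already been reached (the same
// scheme is used for rows just below). Illustrative numbers, not tied to any
// particular stream: with min_tile_cols_log2 = 0, max_tile_cols_log2 = 4 and
// tile_cols_log2 = 2, the bits written are 1, 1, 0.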
self.write_bit(true)?; // uniform_tile_spacing_flag let cols_ones = ti.tile_cols_log2 - ti.min_tile_cols_log2; for _ in 0..cols_ones { self.write_bit(true); } if ti.tile_cols_log2 < ti.max_tile_cols_log2 { self.write_bit(false); } let rows_ones = ti.tile_rows_log2 - ti.min_tile_rows_log2; for _ in 0..rows_ones { self.write_bit(true); } if ti.tile_rows_log2 < ti.max_tile_rows_log2 { self.write_bit(false); } } else { self.write_bit(false)?; // uniform_tile_spacing_flag let mut sofar = 0; let mut widest_tile_sb = 0; for _ in 0..ti.cols { let max = (MAX_TILE_WIDTH >> if fi.sequence.use_128x128_superblock { 7 } else { 6 }) .min(fi.sb_width - sofar) as u16; let this_sb_width = ti.tile_width_sb.min(fi.sb_width - sofar); self.write_quniform(max, (this_sb_width - 1) as u16); sofar += this_sb_width; widest_tile_sb = widest_tile_sb.max(this_sb_width); } let max_tile_area_sb = if ti.min_tiles_log2 > 0 { (fi.sb_height * fi.sb_width) >> (ti.min_tiles_log2 + 1) } else { fi.sb_height * fi.sb_width }; let max_tile_height_sb = (max_tile_area_sb / widest_tile_sb).max(1); sofar = 0; for i in 0..ti.rows { let max = max_tile_height_sb.min(fi.sb_height - sofar) as u16; let this_sb_height = ti.tile_height_sb.min(fi.sb_height - sofar); self.write_quniform(max, (this_sb_height - 1) as u16); sofar += this_sb_height; } } let tiles_log2 = ti.tile_cols_log2 + ti.tile_rows_log2; if tiles_log2 > 0 { // context_update_tile_id // for now, always use the first tile CDF self.write(tiles_log2 as u32, fs.context_update_tile_id as u32)?; // tile_size_bytes_minus_1 self.write(2, fs.max_tile_size_bytes - 1); } // quantization assert!(fi.base_q_idx > 0); self.write(8, fi.base_q_idx)?; // base_q_idx self.write_delta_q(fi.dc_delta_q[0])?; if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { assert!(fi.ac_delta_q[0] == 0); let diff_uv_delta = fi.dc_delta_q[1] != fi.dc_delta_q[2] || fi.ac_delta_q[1] != fi.ac_delta_q[2]; self.write_bit(diff_uv_delta)?; self.write_delta_q(fi.dc_delta_q[1])?; self.write_delta_q(fi.ac_delta_q[1])?; if diff_uv_delta { self.write_delta_q(fi.dc_delta_q[2])?; self.write_delta_q(fi.ac_delta_q[2])?; } } self.write_bit(false)?; // no qm // segmentation self.write_segment_data(fi, &fs.segmentation)?; // delta_q self.write_bit(false)?; // delta_q_present_flag: no delta q // delta_lf_params in the spec self.write_deblock_filter_a(fi, &fs.deblock)?; // code for features not yet implemented.... 
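// The remaining frame-header syntax below follows the spec ordering:
// loop_filter_params, cdef_params, lr_params, the tx mode flag,
// frame_reference_mode, skip_mode_params, allow_warped_motion,
// reduced_tx_set, global_motion_params and, last, film grain.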
// loop_filter_params in the spec self.write_deblock_filter_b(fi, &fs.deblock)?; // cdef self.write_frame_cdef(fi)?; // loop restoration self.write_frame_lrf(fi, &fs.restoration)?; self.write_bit(fi.tx_mode_select)?; // tx mode let mut reference_select = false; if !fi.intra_only { reference_select = fi.reference_mode != ReferenceMode::SINGLE; self.write_bit(reference_select)?; } let skip_mode_allowed = fi.sequence.get_skip_mode_allowed(fi, inter_cfg, reference_select); if skip_mode_allowed { self.write_bit(false)?; // skip_mode_present } if fi.intra_only || fi.error_resilient || !fi.sequence.enable_warped_motion { } else { self.write_bit(fi.allow_warped_motion)?; // allow_warped_motion } self.write_bit(fi.use_reduced_tx_set)?; // reduced tx // global motion if !fi.intra_only { for i in 0..7 { let mode = fi.globalmv_transformation_type[i]; self.write_bit(mode != GlobalMVMode::IDENTITY)?; if mode != GlobalMVMode::IDENTITY { self.write_bit(mode == GlobalMVMode::ROTZOOM)?; if mode != GlobalMVMode::ROTZOOM { self.write_bit(mode == GlobalMVMode::TRANSLATION)?; } } match mode { GlobalMVMode::IDENTITY => { /* Nothing to do */ } GlobalMVMode::TRANSLATION => { let mv_x = 0; let mv_x_ref = 0; let mv_y = 0; let mv_y_ref = 0; let bits = 12 - 6 + 3 - !fi.allow_high_precision_mv as u8; let bits_diff = 12 - 3 + fi.allow_high_precision_mv as u8; BCodeWriter::write_s_refsubexpfin( self, (1 << bits) + 1, 3, mv_x_ref >> bits_diff, mv_x >> bits_diff, )?; BCodeWriter::write_s_refsubexpfin( self, (1 << bits) + 1, 3, mv_y_ref >> bits_diff, mv_y >> bits_diff, )?; } GlobalMVMode::ROTZOOM => unimplemented!(), GlobalMVMode::AFFINE => unimplemented!(), }; } } if fi.sequence.film_grain_params_present { if let Some(grain_params) = fi.film_grain_params() { // Apply grain self.write_bit(true)?; self.write(16, grain_params.random_seed)?; if fi.frame_type == FrameType::INTER { // For the purposes of photon noise, // it's simpler to always update the params, // and the output will be the same. 
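// Writing a 1 here corresponds to update_grain in the spec: the full set of
// grain parameters follows, so no film_grain_params_ref_idx needs to be
// signalled.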
self.write_bit(true)?; } self.write(4, grain_params.scaling_points_y.len() as u8)?; for point in &grain_params.scaling_points_y { self.write(8, point[0])?; self.write(8, point[1])?; } let chroma_scaling_from_luma = if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { self.write_bit(grain_params.chroma_scaling_from_luma)?; grain_params.chroma_scaling_from_luma } else { false }; if !(fi.sequence.chroma_sampling == ChromaSampling::Cs400 || chroma_scaling_from_luma || (fi.sequence.chroma_sampling == ChromaSampling::Cs420 && grain_params.scaling_points_y.is_empty())) { self.write(4, grain_params.scaling_points_cb.len() as u8)?; for point in &grain_params.scaling_points_cb { self.write(8, point[0])?; self.write(8, point[1])?; } self.write(4, grain_params.scaling_points_cr.len() as u8)?; for point in &grain_params.scaling_points_cr { self.write(8, point[0])?; self.write(8, point[1])?; } } self.write(2, grain_params.scaling_shift - 8)?; self.write(2, grain_params.ar_coeff_lag)?; let mut num_pos_luma = (2 * grain_params.ar_coeff_lag * (grain_params.ar_coeff_lag + 1)) as usize; let mut num_pos_chroma; if !grain_params.scaling_points_y.is_empty() { num_pos_chroma = num_pos_luma + 1; for i in 0..num_pos_luma { self.write(8, grain_params.ar_coeffs_y[i] as i16 + 128)?; } } else { num_pos_chroma = num_pos_luma; } if chroma_scaling_from_luma || !grain_params.scaling_points_cb.is_empty() { for i in 0..num_pos_chroma { self.write(8, grain_params.ar_coeffs_cb[i] as i16 + 128)?; } } if chroma_scaling_from_luma || !grain_params.scaling_points_cr.is_empty() { for i in 0..num_pos_chroma { self.write(8, grain_params.ar_coeffs_cr[i] as i16 + 128)?; } } self.write(2, grain_params.ar_coeff_shift - 6)?; self.write(2, grain_params.grain_scale_shift)?; if !grain_params.scaling_points_cb.is_empty() { self.write(8, grain_params.cb_mult)?; self.write(8, grain_params.cb_luma_mult)?; self.write(9, grain_params.cb_offset)?; } if !grain_params.scaling_points_cr.is_empty() { self.write(8, grain_params.cr_mult)?; self.write(8, grain_params.cr_luma_mult)?; self.write(9, grain_params.cr_offset)?; } self.write_bit(grain_params.overlap_flag)?; self.write_bit(fi.sequence.pixel_range == PixelRange::Limited)?; } else { // No film grain for this frame self.write_bit(false)?; } } if fi.large_scale_tile { unimplemented!(); } self.byte_align()?; Ok(()) } // End of OBU Headers fn write_max_frame_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { // width_bits and height_bits will have to be moved to the sequence header OBU // when we add support for it. let width = fi.width - 1; let height = fi.height - 1; let width_bits = log_in_base_2(width as u32) as u32 + 1; let height_bits = log_in_base_2(height as u32) as u32 + 1; assert!(width_bits <= 16); assert!(height_bits <= 16); self.write(4, width_bits - 1)?; self.write(4, height_bits - 1)?; self.write(width_bits, width as u16)?; self.write(height_bits, height as u16)?; Ok(()) } fn write_frame_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { // width_bits and height_bits will have to be moved to the sequence header OBU // when we add support for it. 
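// The bit widths are rederived exactly as in write_max_frame_size above:
// floor(log2(dimension - 1)) + 1 bits per dimension. Illustrative numbers:
// for a 1920x1080 frame, width - 1 = 1919 and height - 1 = 1079 each need
// 11 bits, so the override branch below writes an 11-bit 1919 followed by an
// 11-bit 1079.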
if fi.frame_size_override_flag { let width = fi.width - 1; let height = fi.height - 1; let width_bits = log_in_base_2(width as u32) as u32 + 1; let height_bits = log_in_base_2(height as u32) as u32 + 1; assert!(width_bits <= 16); assert!(height_bits <= 16); self.write(width_bits, width as u16)?; self.write(height_bits, height as u16)?; } if fi.sequence.enable_superres { unimplemented!(); } Ok(()) } fn write_render_size( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { self.write_bit(fi.render_and_frame_size_different)?; if fi.render_and_frame_size_different { self.write(16, fi.render_width - 1)?; self.write(16, fi.render_height - 1)?; } Ok(()) } fn write_frame_size_with_refs( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { let mut found_ref = false; for i in 0..INTER_REFS_PER_FRAME { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[i] as usize] { if rec.width == fi.width as u32 && rec.height == fi.height as u32 && rec.render_width == fi.render_width && rec.render_height == fi.render_height { self.write_bit(true)?; found_ref = true; break; } else { self.write_bit(false)?; } } else { self.write_bit(false)?; } } if !found_ref { self.write_frame_size(fi)?; self.write_render_size(fi)?; } else if fi.sequence.enable_superres { unimplemented!(); } Ok(()) } fn write_deblock_filter_a( &mut self, fi: &FrameInvariants, deblock: &DeblockState, ) -> io::Result<()> { if fi.delta_q_present { if !fi.allow_intrabc { self.write_bit(deblock.block_deltas_enabled)?; } if deblock.block_deltas_enabled { self.write(2, deblock.block_delta_shift)?; self.write_bit(deblock.block_delta_multi)?; } } Ok(()) } fn write_deblock_filter_b( &mut self, fi: &FrameInvariants, deblock: &DeblockState, ) -> io::Result<()> { let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { MAX_PLANES }; assert!(deblock.levels[0] < 64); self.write(6, deblock.levels[0])?; // loop deblocking filter level 0 assert!(deblock.levels[1] < 64); self.write(6, deblock.levels[1])?; // loop deblocking filter level 1 if planes > 1 && (deblock.levels[0] > 0 || deblock.levels[1] > 0) { assert!(deblock.levels[2] < 64); self.write(6, deblock.levels[2])?; // loop deblocking filter level 2 assert!(deblock.levels[3] < 64); self.write(6, deblock.levels[3])?; // loop deblocking filter level 3 } self.write(3, deblock.sharpness)?; // deblocking filter sharpness self.write_bit(deblock.deltas_enabled)?; // loop deblocking filter deltas enabled if deblock.deltas_enabled { self.write_bit(deblock.delta_updates_enabled)?; // deltas updates enabled if deblock.delta_updates_enabled { // conditionally write ref delta updates let prev_ref_deltas = if fi.primary_ref_frame == PRIMARY_REF_NONE { [1, 0, 0, 0, 0, -1, -1, -1] } else { fi.rec_buffer.deblock [fi.ref_frames[fi.primary_ref_frame as usize] as usize] .ref_deltas }; for i in 0..REF_FRAMES { let update = deblock.ref_deltas[i] != prev_ref_deltas[i]; self.write_bit(update)?; if update { self.write_signed(7, deblock.ref_deltas[i])?; } } // conditionally write mode delta updates let prev_mode_deltas = if fi.primary_ref_frame == PRIMARY_REF_NONE { [0, 0] } else { fi.rec_buffer.deblock [fi.ref_frames[fi.primary_ref_frame as usize] as usize] .mode_deltas }; for i in 0..2 { let update = deblock.mode_deltas[i] != prev_mode_deltas[i]; self.write_bit(update)?; if update { self.write_signed(7, deblock.mode_deltas[i])?; } } } } Ok(()) } fn write_frame_cdef( &mut self, fi: &FrameInvariants, ) -> io::Result<()> { if fi.sequence.enable_cdef && !fi.allow_intrabc { assert!(fi.cdef_damping >= 
3); assert!(fi.cdef_damping <= 6); self.write(2, fi.cdef_damping - 3)?; assert!(fi.cdef_bits < 4); self.write(2, fi.cdef_bits)?; // cdef bits for i in 0..(1 << fi.cdef_bits) { assert!(fi.cdef_y_strengths[i] < 64); assert!(fi.cdef_uv_strengths[i] < 64); self.write(6, fi.cdef_y_strengths[i])?; // cdef y strength if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { self.write(6, fi.cdef_uv_strengths[i])?; // cdef uv strength } } } Ok(()) } fn write_frame_lrf( &mut self, fi: &FrameInvariants, rs: &RestorationState, ) -> io::Result<()> { if fi.sequence.enable_restoration && !fi.allow_intrabc { // && !self.lossless let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { MAX_PLANES }; let mut use_lrf = false; let mut use_chroma_lrf = false; for i in 0..planes { self.write(2, rs.planes[i].cfg.lrf_type)?; // filter type by plane if rs.planes[i].cfg.lrf_type != RESTORE_NONE { use_lrf = true; if i > 0 { use_chroma_lrf = true; } } } if use_lrf { // The Y shift value written here indicates shift up from superblock size if !fi.sequence.use_128x128_superblock { self.write(1, u8::from(rs.planes[0].cfg.unit_size > 64))?; } if rs.planes[0].cfg.unit_size > 64 { self.write(1, u8::from(rs.planes[0].cfg.unit_size > 128))?; } if use_chroma_lrf && fi.sequence.chroma_sampling == ChromaSampling::Cs420 { self.write( 1, u8::from(rs.planes[0].cfg.unit_size > rs.planes[1].cfg.unit_size), )?; } } } Ok(()) } fn write_segment_data( &mut self, fi: &FrameInvariants, segmentation: &SegmentationState, ) -> io::Result<()> { assert_eq!(fi.enable_segmentation, segmentation.enabled); self.write_bit(fi.enable_segmentation)?; if segmentation.enabled { if fi.primary_ref_frame == PRIMARY_REF_NONE { assert!(segmentation.update_map); assert!(segmentation.update_data); } else { self.write_bit(segmentation.update_map)?; if segmentation.update_map { self.write_bit(false)?; /* Without using temporal prediction */ } self.write_bit(segmentation.update_data)?; } if segmentation.update_data { for i in 0..8 { for j in 0..SegLvl::SEG_LVL_MAX as usize { self.write_bit(segmentation.features[i][j])?; if segmentation.features[i][j] { let bits = seg_feature_bits[j]; let data = segmentation.data[i][j]; if seg_feature_is_signed[j] { self.write_signed(bits + 1, data)?; } else { self.write(bits, data)?; } } } } } } Ok(()) } fn write_delta_q(&mut self, delta_q: i8) -> io::Result<()> { self.write_bit(delta_q != 0)?; if delta_q != 0 { assert!((-63..=63).contains(&delta_q)); self.write_signed(6 + 1, delta_q)?; } Ok(()) } } #[cfg(test)] mod tests { use super::ULEB128Writer; use bitstream_io::{BigEndian, BitWriter}; use nom::error::Error; use nom::IResult; use quickcheck::quickcheck; fn leb128(mut input: &[u8]) -> IResult<&[u8], u64, Error<&[u8]>> { use nom::bytes::complete::take; let mut value = 0u64; for i in 0..8u8 { let result = take(1usize)(input)?; input = result.0; let leb128_byte = result.1[0]; value |= u64::from(leb128_byte & 0x7f) << (i * 7); if (leb128_byte & 0x80) == 0 { break; } } Ok((input, value)) } quickcheck! { fn validate_leb128_write(val: u32) -> bool { let mut buf1 = Vec::new(); let mut bw1 = BitWriter::endian(&mut buf1, BigEndian); bw1.write_uleb128(val as u64).unwrap(); let result = leb128(&buf1).unwrap(); u64::from(val) == result.1 && result.0.is_empty() } } } rav1e-0.7.1/src/levels.rs000064400000000000000000000045371046102023000133070ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub static AV1_LEVEL_DEFINED: [bool; 32] = [ true, // 2.0 true, // 2.1 false, false, true, // 3.0 true, // 3.1 false, false, true, // 4.0 true, // 4.1 false, false, true, // 5.0 true, // 5.1 true, // 5.2 true, // 5.3 true, // 6.0 true, // 6.1 true, // 6.2 true, // 6.3 false, false, false, false, false, false, false, false, false, false, false, false, ]; pub static AV1_LEVEL_MAX_PIC_SIZE: [usize; 32] = [ 147456, // 2.0 278784, // 2.1 0, 0, 665856, // 3.0 1065024, // 3.1 0, 0, 2359296, // 4.0 23592960, // 4.1 0, 0, 8912896, // 5.0 8912896, // 5.1 8912896, // 5.2 8912896, // 5.3 35651584, // 6.0 35651584, // 6.1 35651584, // 6.2 35651584, // 6.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; pub static AV1_LEVEL_MAX_H_SIZE: [usize; 32] = [ 2048, // 2.0 2816, // 2.1 0, 0, 4352, // 3.0 5504, // 3.1 0, 0, 6144, // 4.0 6144, // 4.1 0, 0, 8192, // 5.0 8192, // 5.1 8192, // 5.2 8192, // 5.3 16384, // 6.0 16384, // 6.1 16384, // 6.2 16384, // 6.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; pub static AV1_LEVEL_MAX_V_SIZE: [usize; 32] = [ 1152, // 2.0 1584, // 2.1 0, 0, 2448, // 3.0 3096, // 3.1 0, 0, 3456, // 4.0 3456, // 4.1 0, 0, 4352, // 5.0 4352, // 5.1 4352, // 5.2 4352, // 5.3 8704, // 6.0 8704, // 6.1 8704, // 6.2 8704, // 6.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; pub static AV1_LEVEL_MAX_DISPLAY_RATE: [usize; 32] = [ 4_423_680, // 2.0 8_363_520, // 2.1 0, 0, 19_975_680, // 3.0 31_950_720, // 3.1 0, 0, 70_778_880, // 4.0 141_557_760, // 4.1 0, 0, 267_386_880, // 5.0 534_773_760, // 5.1 1_069_547_520, // 5.2 1_069_547_520, // 5.3 1_069_547_520, // 6.0 2_139_095_040, // 6.1 4_278_190_080, // 6.2 4_278_190_080, // 6.3 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; rav1e-0.7.1/src/lib.rs000064400000000000000000000253001046102023000125520ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. //! rav1e is an [AV1] video encoder. It is designed to eventually cover all use //! cases, though in its current form it is most suitable for cases where //! libaom (the reference encoder) is too slow. //! //! ## Features //! //! * Intra and inter frames //! * 64x64 superblocks //! * 4x4 to 64x64 RDO-selected square and 2:1/1:2 rectangular blocks //! * DC, H, V, Paeth, smooth, and a subset of directional prediction modes //! * DCT, (FLIP-)ADST and identity transforms (up to 64x64, 16x16 and 32x32 //! respectively) //! * 8-, 10- and 12-bit depth color //! * 4:2:0 (full support), 4:2:2 and 4:4:4 (limited) chroma sampling //! * Variable speed settings //! * Near real-time encoding at high speed levels //! //! ## Usage //! //! Encoding is done through the [`Context`] struct. 
Examples on //! [`Context::receive_packet`] show how to create a [`Context`], send frames //! into it and receive packets of encoded data. //! //! [AV1]: https://aomediacodec.github.io/av1-spec/av1-spec.pdf //! [`Context`]: struct.Context.html //! [`Context::receive_packet`]: struct.Context.html#method.receive_packet // Safety lints #![deny(bare_trait_objects)] #![deny(clippy::as_ptr_cast_mut)] #![deny(clippy::large_stack_arrays)] // Performance lints #![warn(clippy::inefficient_to_string)] #![warn(clippy::invalid_upcast_comparisons)] #![warn(clippy::iter_with_drain)] #![warn(clippy::linkedlist)] #![warn(clippy::mutex_integer)] #![warn(clippy::naive_bytecount)] #![warn(clippy::needless_bitwise_bool)] #![warn(clippy::needless_collect)] #![warn(clippy::or_fun_call)] #![warn(clippy::stable_sort_primitive)] #![warn(clippy::suboptimal_flops)] #![warn(clippy::trivial_regex)] #![warn(clippy::trivially_copy_pass_by_ref)] #![warn(clippy::unnecessary_join)] #![warn(clippy::unused_async)] #![warn(clippy::zero_sized_map_values)] // Correctness lints #![deny(clippy::case_sensitive_file_extension_comparisons)] #![deny(clippy::copy_iterator)] #![deny(clippy::expl_impl_clone_on_copy)] #![deny(clippy::float_cmp)] #![warn(clippy::imprecise_flops)] #![deny(clippy::manual_instant_elapsed)] #![deny(clippy::mem_forget)] #![deny(clippy::path_buf_push_overwrite)] #![deny(clippy::same_functions_in_if_condition)] #![deny(clippy::unchecked_duration_subtraction)] #![deny(clippy::unicode_not_nfc)] // Clarity/formatting lints #![warn(clippy::checked_conversions)] #![allow(clippy::comparison_chain)] #![warn(clippy::derive_partial_eq_without_eq)] #![allow(clippy::enum_variant_names)] #![warn(clippy::explicit_deref_methods)] #![warn(clippy::filter_map_next)] #![warn(clippy::flat_map_option)] #![warn(clippy::fn_params_excessive_bools)] #![warn(clippy::implicit_clone)] #![warn(clippy::iter_not_returning_iterator)] #![warn(clippy::iter_on_empty_collections)] #![warn(clippy::macro_use_imports)] #![warn(clippy::manual_clamp)] #![warn(clippy::manual_let_else)] #![warn(clippy::manual_ok_or)] #![warn(clippy::manual_string_new)] #![warn(clippy::map_flatten)] #![warn(clippy::match_bool)] #![warn(clippy::mut_mut)] #![warn(clippy::needless_borrow)] #![warn(clippy::needless_continue)] #![allow(clippy::needless_range_loop)] #![allow(clippy::too_many_arguments)] #![warn(clippy::range_minus_one)] #![warn(clippy::range_plus_one)] #![warn(clippy::ref_binding_to_reference)] #![warn(clippy::ref_option_ref)] #![warn(clippy::trait_duplication_in_bounds)] #![warn(clippy::unused_peekable)] #![warn(clippy::unused_rounding)] #![warn(clippy::unused_self)] #![allow(clippy::upper_case_acronyms)] #![warn(clippy::verbose_bit_mask)] #![warn(clippy::verbose_file_reads)] // Documentation lints #![warn(clippy::doc_link_with_quotes)] #![warn(clippy::doc_markdown)] #![warn(clippy::missing_errors_doc)] #![warn(clippy::missing_panics_doc)] // FIXME: We should fix instances of this lint and change it to `warn` #![allow(clippy::missing_safety_doc)] // Override assert! and assert_eq! in tests #[cfg(test)] #[macro_use] extern crate pretty_assertions; #[macro_use] extern crate log; pub(crate) mod built_info { // The file has been placed there by the build script. include!(concat!(env!("OUT_DIR"), "/built.rs")); } mod serialize { cfg_if::cfg_if! { if #[cfg(feature="serialize")] { pub use serde::*; } else { pub use noop_proc_macro::{Deserialize, Serialize}; } } } mod wasm_bindgen { cfg_if::cfg_if! 
{ if #[cfg(feature="wasm")] { pub use wasm_bindgen::prelude::*; } else { pub use noop_proc_macro::wasm_bindgen; } } } #[cfg(any(cargo_c, feature = "capi"))] pub mod capi; #[macro_use] mod transform; #[macro_use] mod cpu_features; mod activity; pub(crate) mod asm; mod dist; mod ec; mod partition; mod predict; mod quantize; mod rdo; mod rdo_tables; #[macro_use] mod util; mod cdef; #[doc(hidden)] pub mod context; mod deblock; mod encoder; mod entropymode; mod levels; mod lrf; mod mc; mod me; mod rate; mod recon_intra; mod sad_plane; mod scan_order; #[cfg(feature = "scenechange")] pub mod scenechange; #[cfg(not(feature = "scenechange"))] mod scenechange; mod segmentation; mod stats; #[doc(hidden)] pub mod tiling; mod token_cdfs; mod api; mod frame; mod header; use crate::encoder::*; pub use crate::api::{ Config, Context, EncoderConfig, EncoderStatus, InvalidConfig, Packet, }; pub use crate::frame::Frame; pub use crate::util::{CastFromPrimitive, Pixel, PixelType}; /// Commonly used types and traits. pub mod prelude { pub use crate::api::*; pub use crate::encoder::{Sequence, Tune}; pub use crate::frame::{ Frame, FrameParameters, FrameTypeOverride, Plane, PlaneConfig, }; pub use crate::partition::BlockSize; pub use crate::predict::PredictionMode; pub use crate::transform::TxType; pub use crate::util::{CastFromPrimitive, Pixel, PixelType}; } /// Basic data structures pub mod data { pub use crate::api::{ ChromaticityPoint, EncoderStatus, FrameType, Packet, Rational, }; pub use crate::frame::{Frame, FrameParameters}; pub use crate::stats::EncoderStats; pub use crate::util::{CastFromPrimitive, Pixel, PixelType}; } pub use crate::api::color; /// Encoder configuration and settings pub mod config { pub use crate::api::config::{ GrainTableSegment, NoiseGenArgs, TransferFunction, NUM_UV_COEFFS, NUM_UV_POINTS, NUM_Y_COEFFS, NUM_Y_POINTS, }; pub use crate::api::{ Config, EncoderConfig, InvalidConfig, PredictionModesSetting, RateControlConfig, RateControlError, RateControlSummary, SpeedSettings, }; pub use crate::cpu_features::CpuFeatureLevel; } /// Version information /// /// The information is recovered from `Cargo.toml` and `git describe`, when available. /// /// ``` /// use rav1e::version; /// use semver::Version; /// /// let major = version::major(); /// let minor = version::minor(); /// let patch = version::patch(); /// /// let short = version::short(); /// /// let v1 = Version::new(major, minor, patch); /// let v2 = Version::parse(&short).unwrap(); /// /// assert_eq!(v1.major, v2.major); /// ``` pub mod version { /// Major version component /// /// It is increased every time a release presents a incompatible API change. /// /// # Panics /// /// Will panic if package is not built with Cargo, /// or if the package version is not a valid triplet of integers. pub fn major() -> u64 { env!("CARGO_PKG_VERSION_MAJOR").parse().unwrap() } /// Minor version component /// /// It is increased every time a release presents new functionalities are added /// in a backwards-compatible manner. /// /// # Panics /// /// Will panic if package is not built with Cargo, /// or if the package version is not a valid triplet of integers. pub fn minor() -> u64 { env!("CARGO_PKG_VERSION_MINOR").parse().unwrap() } /// Patch version component /// /// It is increased every time a release provides only backwards-compatible bugfixes. /// /// # Panics /// /// Will panic if package is not built with Cargo, /// or if the package version is not a valid triplet of integers. 
pub fn patch() -> u64 { env!("CARGO_PKG_VERSION_PATCH").parse().unwrap() } /// Version information as presented in `[package]` `version`. /// /// e.g. `0.1.0` /// /// Can be parsed by [semver](https://crates.io/crates/semver). pub fn short() -> String { env!("CARGO_PKG_VERSION").to_string() } /// Version information as presented in `[package] version` followed by the /// short commit hash if present. /// /// e.g. `0.1.0 - g743d464` /// pub fn long() -> String { let s = short(); let hash = hash(); if hash.is_empty() { s } else { format!("{s} - {hash}") } } cfg_if::cfg_if! { if #[cfg(feature="git_version")] { fn git_version() -> &'static str { crate::built_info::GIT_VERSION.unwrap_or_default() } fn git_hash() -> &'static str { crate::built_info::GIT_COMMIT_HASH.unwrap_or_default() } } else { fn git_version() -> &'static str { "UNKNOWN" } fn git_hash() -> &'static str { "UNKNOWN" } } } /// Commit hash (short) /// /// Short hash of the git commit used by this build /// /// e.g. `g743d464` /// pub fn hash() -> String { git_hash().to_string() } /// Version information with the information /// provided by `git describe --tags`. /// /// e.g. `0.1.0 (v0.1.0-1-g743d464)` /// pub fn full() -> String { format!("{} ({})", short(), git_version(),) } } #[cfg(all( any(test, fuzzing), any(feature = "decode_test", feature = "decode_test_dav1d") ))] mod test_encode_decode; #[cfg(feature = "bench")] pub mod bench { pub mod api { pub use crate::api::*; } pub mod cdef { pub use crate::cdef::*; } pub mod context { pub use crate::context::*; } pub mod dist { pub use crate::dist::*; } pub mod ec { pub use crate::ec::*; } pub mod encoder { pub use crate::encoder::*; } pub mod mc { pub use crate::mc::*; } pub mod partition { pub use crate::partition::*; } pub mod frame { pub use crate::frame::*; } pub mod predict { pub use crate::predict::*; } pub mod rdo { pub use crate::rdo::*; } pub mod tiling { pub use crate::tiling::*; } pub mod transform { pub use crate::transform::*; } pub mod util { pub use crate::util::*; } pub mod cpu_features { pub use crate::cpu_features::*; } } #[cfg(fuzzing)] pub mod fuzzing; rav1e-0.7.1/src/lrf.rs000064400000000000000000001433571046102023000126040ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! 
{ if #[cfg(nasm_x86_64)] { use crate::asm::x86::lrf::*; } else { use self::rust::*; } } use crate::api::SGRComplexityLevel; use crate::color::ChromaSampling::Cs400; use crate::context::{MAX_PLANES, SB_SIZE}; use crate::encoder::FrameInvariants; use crate::frame::{ AsRegion, Frame, Plane, PlaneConfig, PlaneOffset, PlaneSlice, }; use crate::tiling::{Area, PlaneRegion, PlaneRegionMut, Rect}; use crate::util::{clamp, CastFromPrimitive, ILog, Pixel}; use std::cmp; use std::iter::FusedIterator; use std::ops::{Index, IndexMut}; pub const RESTORATION_TILESIZE_MAX_LOG2: usize = 8; pub const RESTORE_NONE: u8 = 0; pub const RESTORE_SWITCHABLE: u8 = 1; pub const RESTORE_WIENER: u8 = 2; pub const RESTORE_SGRPROJ: u8 = 3; pub const WIENER_TAPS_MIN: [i8; 3] = [-5, -23, -17]; pub const WIENER_TAPS_MID: [i8; 3] = [3, -7, 15]; pub const WIENER_TAPS_MAX: [i8; 3] = [10, 8, 46]; #[allow(unused)] pub const WIENER_TAPS_K: [i8; 3] = [1, 2, 3]; pub const WIENER_BITS: usize = 7; pub const SGRPROJ_XQD_MIN: [i8; 2] = [-96, -32]; pub const SGRPROJ_XQD_MID: [i8; 2] = [-32, 31]; pub const SGRPROJ_XQD_MAX: [i8; 2] = [31, 95]; pub const SGRPROJ_PRJ_SUBEXP_K: u8 = 4; pub const SGRPROJ_PRJ_BITS: u8 = 7; pub const SGRPROJ_PARAMS_BITS: u8 = 4; pub const SGRPROJ_MTABLE_BITS: u8 = 20; pub const SGRPROJ_SGR_BITS: u8 = 8; pub const SGRPROJ_RECIP_BITS: u8 = 12; pub const SGRPROJ_RST_BITS: u8 = 4; pub const SGRPROJ_PARAMS_S: [[u32; 2]; 1 << SGRPROJ_PARAMS_BITS] = [ [140, 3236], [112, 2158], [93, 1618], [80, 1438], [70, 1295], [58, 1177], [47, 1079], [37, 996], [30, 925], [25, 863], [0, 2589], [0, 1618], [0, 1177], [0, 925], [56, 0], [22, 0], ]; // List of indices to SGRPROJ_PARAMS_S values that at a given complexity level. // SGRPROJ_ALL_SETS contains every possible index const SGRPROJ_ALL_SETS: &[u8] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; // SGRPROJ_REDUCED_SETS has half of the values. Using only these values gives // most of the gains from sgr. The decision of which values to use is somewhat // arbitrary. The sgr parameters has 3 discontinuous groups. The first has both // parameters as non-zero. The other two are distinguishable by which of the // two parameters is zero. There are an even number of each of these groups and // the non-zero parameters grow as the indices increase. This array uses the // 1st, 3rd, ... smallest params of each group. const SGRPROJ_REDUCED_SETS: &[u8] = &[1, 3, 5, 7, 9, 11, 13, 15]; pub const fn get_sgr_sets(complexity: SGRComplexityLevel) -> &'static [u8] { match complexity { SGRComplexityLevel::Full => SGRPROJ_ALL_SETS, SGRComplexityLevel::Reduced => SGRPROJ_REDUCED_SETS, } } pub const SOLVE_IMAGE_MAX: usize = 1 << RESTORATION_TILESIZE_MAX_LOG2; pub const SOLVE_IMAGE_STRIDE: usize = SOLVE_IMAGE_MAX + 6 + 2; pub const SOLVE_IMAGE_HEIGHT: usize = SOLVE_IMAGE_STRIDE; pub const SOLVE_IMAGE_SIZE: usize = SOLVE_IMAGE_STRIDE * SOLVE_IMAGE_HEIGHT; pub const STRIPE_IMAGE_MAX: usize = (1 << RESTORATION_TILESIZE_MAX_LOG2) + (1 << (RESTORATION_TILESIZE_MAX_LOG2 - 1)); pub const STRIPE_IMAGE_STRIDE: usize = STRIPE_IMAGE_MAX + 6 + 2; pub const STRIPE_IMAGE_HEIGHT: usize = 64 + 6 + 2; pub const STRIPE_IMAGE_SIZE: usize = STRIPE_IMAGE_STRIDE * STRIPE_IMAGE_HEIGHT; pub const IMAGE_WIDTH_MAX: usize = [STRIPE_IMAGE_MAX, SOLVE_IMAGE_MAX] [(STRIPE_IMAGE_MAX < SOLVE_IMAGE_MAX) as usize]; /// The buffer used in `sgrproj_stripe_filter()` and `sgrproj_solve()`. 
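/// A minimal allocation sketch (the size constants are defined above in this
/// module); this is an illustration, not a call site from the encoder itself:
///
/// ```ignore
/// let buf = IntegralImageBuffer::zeroed(SOLVE_IMAGE_SIZE);
/// assert_eq!(buf.integral_image.len(), SOLVE_IMAGE_SIZE);
/// assert_eq!(buf.sq_integral_image.len(), SOLVE_IMAGE_SIZE);
/// ```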
#[derive(Debug)] pub struct IntegralImageBuffer { pub integral_image: Vec, pub sq_integral_image: Vec, } impl IntegralImageBuffer { /// Creates a new buffer with the given size, filled with zeros. #[inline] pub fn zeroed(size: usize) -> Self { Self { integral_image: vec![0; size], sq_integral_image: vec![0; size] } } } #[allow(unused)] // Wiener coming soon! #[derive(Copy, Clone, Debug, PartialEq, Eq, Default)] pub enum RestorationFilter { #[default] None, Wiener { coeffs: [[i8; 3]; 2], }, Sgrproj { set: u8, xqd: [i8; 2], }, } impl RestorationFilter { pub const fn notequal(self, cmp: RestorationFilter) -> bool { match self { RestorationFilter::None {} => !matches!(cmp, RestorationFilter::None {}), RestorationFilter::Sgrproj { set, xqd } => { if let RestorationFilter::Sgrproj { set: set2, xqd: xqd2 } = cmp { !(set == set2 && xqd[0] == xqd2[0] && xqd[1] == xqd2[1]) } else { true } } RestorationFilter::Wiener { coeffs } => { if let RestorationFilter::Wiener { coeffs: coeffs2 } = cmp { !(coeffs[0][0] == coeffs2[0][0] && coeffs[0][1] == coeffs2[0][1] && coeffs[0][2] == coeffs2[0][2] && coeffs[1][0] == coeffs2[1][0] && coeffs[1][1] == coeffs2[1][1] && coeffs[1][2] == coeffs2[1][2]) } else { true } } } } } pub(crate) mod rust { use crate::cpu_features::CpuFeatureLevel; use crate::frame::PlaneSlice; use crate::lrf::{ get_integral_square, sgrproj_sum_finish, SGRPROJ_RST_BITS, SGRPROJ_SGR_BITS, }; use crate::util::CastFromPrimitive; use crate::Pixel; #[inline(always)] pub(crate) fn sgrproj_box_ab_internal( r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32, ) { let d: usize = r * 2 + 1; let n: usize = d * d; let one_over_n = if r == 1 { 455 } else { 164 }; assert!(iimg.len() > (y + d) * iimg_stride + stripe_w + 1 + d); assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + 1 + d); assert!(af.len() > stripe_w + 1); assert!(bf.len() > stripe_w + 1); for x in start_x..stripe_w + 2 { // SAFETY: We perform the bounds checks above, once for the whole loop unsafe { let sum = get_integral_square(iimg, iimg_stride, x, y, d); let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d); let (reta, retb) = sgrproj_sum_finish::(ssq, sum, n as u32, one_over_n, s); *af.get_unchecked_mut(x) = reta; *bf.get_unchecked_mut(x) = retb; } } } // computes an intermediate (ab) row for stripe_w + 2 columns at row y pub(crate) fn sgrproj_box_ab_r1( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, _cpu: CpuFeatureLevel, ) { sgrproj_box_ab_internal::( 1, af, bf, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, ); } // computes an intermediate (ab) row for stripe_w + 2 columns at row y pub(crate) fn sgrproj_box_ab_r2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, y: usize, stripe_w: usize, s: u32, _cpu: CpuFeatureLevel, ) { sgrproj_box_ab_internal::( 2, af, bf, iimg, iimg_sq, iimg_stride, 0, y, stripe_w, s, ); } pub(crate) fn sgrproj_box_f_r0( f: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r0_internal(f, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r0_internal( f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let line = cdeffed.row(y); for (fp, &v) in f[start_x..w].iter_mut().zip(line[start_x..w].iter()) { *fp = u32::cast_from(v) << SGRPROJ_RST_BITS; } } pub(crate) fn sgrproj_box_f_r1( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut 
[u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r1_internal(af, bf, f, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r1_internal( af: &[&[u32]; 3], bf: &[&[u32]; 3], f: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let line = cdeffed.row(y); for x in start_x..w { let a = 3 * (af[0][x] + af[2][x] + af[0][x + 2] + af[2][x + 2]) + 4 * (af[1][x] + af[0][x + 1] + af[1][x + 1] + af[2][x + 1] + af[1][x + 2]); let b = 3 * (bf[0][x] + bf[2][x] + bf[0][x + 2] + bf[2][x + 2]) + 4 * (bf[1][x] + bf[0][x + 1] + bf[1][x + 1] + bf[2][x + 1] + bf[1][x + 2]); let v = a * u32::cast_from(line[x]) + b; f[x] = (v + (1 << shift >> 1)) >> shift; } } pub(crate) fn sgrproj_box_f_r2( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], y: usize, w: usize, cdeffed: &PlaneSlice, _cpu: CpuFeatureLevel, ) { sgrproj_box_f_r2_internal(af, bf, f0, f1, 0, y, w, cdeffed); } #[inline(always)] pub(crate) fn sgrproj_box_f_r2_internal( af: &[&[u32]; 2], bf: &[&[u32]; 2], f0: &mut [u32], f1: &mut [u32], start_x: usize, y: usize, w: usize, cdeffed: &PlaneSlice, ) { let shift = 5 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let shifto = 4 + SGRPROJ_SGR_BITS - SGRPROJ_RST_BITS; let line = cdeffed.row(y); let line1 = cdeffed.row(y + 1); let af0 = af[0][start_x..w + 3].windows(3); let af1 = af[1][start_x..w + 3].windows(3); let bf0 = bf[0][start_x..w + 3].windows(3); let bf1 = bf[1][start_x..w + 3].windows(3); let af_it = af0.zip(af1); let bf_it = bf0.zip(bf1); let in0 = line[start_x..w].iter(); let in1 = line1[start_x..w].iter(); let o0 = f0[start_x..w].iter_mut(); let o1 = f1[start_x..w].iter_mut(); let in_iter = in0.zip(in1); let out_iter = o0.zip(o1); let io_iter = out_iter.zip(in_iter); for (((o0, o1), (&p0, &p1)), ((af_0, af_1), (bf_0, bf_1))) in io_iter.zip(af_it.zip(bf_it)) { let a = 5 * (af_0[0] + af_0[2]) + 6 * af_0[1]; let b = 5 * (bf_0[0] + bf_0[2]) + 6 * bf_0[1]; let ao = 5 * (af_1[0] + af_1[2]) + 6 * af_1[1]; let bo = 5 * (bf_1[0] + bf_1[2]) + 6 * bf_1[1]; let v = (a + ao) * u32::cast_from(p0) + b + bo; *o0 = (v + (1 << shift >> 1)) >> shift; let vo = ao * u32::cast_from(p1) + bo; *o1 = (vo + (1 << shifto >> 1)) >> shifto; } } } #[inline(always)] fn sgrproj_sum_finish( ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, ) -> (u32, u32) { let bdm8 = BD - 8; let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8); let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8; let p = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum); let z = (p * s + (1 << SGRPROJ_MTABLE_BITS >> 1)) >> SGRPROJ_MTABLE_BITS; let a = if z >= 255 { 256 } else if z == 0 { 1 } else { ((z << SGRPROJ_SGR_BITS) + z / 2) / (z + 1) }; let b = ((1 << SGRPROJ_SGR_BITS) - a) * sum * one_over_n; (a, (b + (1 << SGRPROJ_RECIP_BITS >> 1)) >> SGRPROJ_RECIP_BITS) } // Using an integral image, compute the sum of a square region // SAFETY: The size of `iimg` must be at least `(y + size) * stride + x + size` #[inline(always)] unsafe fn get_integral_square( iimg: &[u32], stride: usize, x: usize, y: usize, size: usize, ) -> u32 { // Cancel out overflow in iimg by using wrapping arithmetic let top_left = *iimg.get_unchecked(y * stride + x); let top_right = *iimg.get_unchecked(y * stride + x + size); let bottom_left = *iimg.get_unchecked((y + size) * stride + x); let bottom_right = *iimg.get_unchecked((y + size) * stride + x + size); top_left .wrapping_add(bottom_right) .wrapping_sub(bottom_left) 
.wrapping_sub(top_right) } struct VertPaddedIter<'a, T: Pixel> { // The two sources that can be selected when clipping deblocked: &'a Plane, cdeffed: &'a Plane, // x index to choice where on the row to start x: isize, // y index that will be mutated y: isize, // The index at which to terminate. Can be larger than the slice length. end: isize, // Used for source buffer choice/clipping. May (and regularly will) // be negative. stripe_begin: isize, // Also used for source buffer choice/clipping. May specify a stripe boundary // less than, equal to, or larger than the buffers we're accessing. stripe_end: isize, // Active area cropping is done by specifying a value smaller than the height // of the plane. crop: isize, } impl<'a, T: Pixel> VertPaddedIter<'a, T> { fn new( cdeffed: &PlaneSlice<'a, T>, deblocked: &PlaneSlice<'a, T>, stripe_h: usize, crop: usize, ) -> VertPaddedIter<'a, T> { // cdeffed and deblocked must start at the same coordinates from their // underlying planes. Since cropping is provided via a separate params, the // height of the underlying planes do not need to match. assert_eq!(cdeffed.x, deblocked.x); assert_eq!(cdeffed.y, deblocked.y); // To share integral images, always use the max box filter radius of 2 let r = 2; // The number of rows outside the stripe are needed let rows_above = r + 2; let rows_below = 2; // Offset crop and stripe_h so they are relative to the underlying plane // and not the plane slice. let crop = crop as isize + deblocked.y; let stripe_end = stripe_h as isize + deblocked.y; // Move y up the number rows above. // If y is negative we repeat the first row let y = deblocked.y - rows_above as isize; VertPaddedIter { deblocked: deblocked.plane, cdeffed: cdeffed.plane, x: deblocked.x, y, end: (rows_above + stripe_h + rows_below) as isize + y, stripe_begin: deblocked.y, stripe_end, crop, } } } impl<'a, T: Pixel> Iterator for VertPaddedIter<'a, T> { type Item = &'a [T]; #[inline(always)] fn next(&mut self) -> Option { if self.end > self.y { // clamp before deciding the source // clamp vertically to storage at top and passed-in height at bottom let cropped_y = clamp(self.y, 0, self.crop - 1); // clamp vertically to stripe limits let ly = clamp(cropped_y, self.stripe_begin - 2, self.stripe_end + 1); // decide if we're vertically inside or outside the strip let src_plane = if ly >= self.stripe_begin && ly < self.stripe_end { self.cdeffed } else { self.deblocked }; // cannot directly return self.ps.row(row) due to lifetime issue let range = src_plane.row_range(self.x, ly); self.y += 1; Some(&src_plane.data[range]) } else { None } } fn size_hint(&self) -> (usize, Option) { let remaining = self.end - self.y; debug_assert!(remaining >= 0); let remaining = remaining as usize; (remaining, Some(remaining)) } } impl ExactSizeIterator for VertPaddedIter<'_, T> {} impl FusedIterator for VertPaddedIter<'_, T> {} struct HorzPaddedIter<'a, T: Pixel> { // Active area cropping is done using the length of the slice slice: &'a [T], // x index of the iterator // When less than 0, repeat the first element. When greater than end, repeat // the last element index: isize, // The index at which to terminate. Can be larger than the slice length. 
end: usize, } impl<'a, T: Pixel> HorzPaddedIter<'a, T> { fn new( slice: &'a [T], start_index: isize, width: usize, ) -> HorzPaddedIter<'a, T> { HorzPaddedIter { slice, index: start_index, end: (width as isize + start_index) as usize, } } } impl<'a, T: Pixel> Iterator for HorzPaddedIter<'a, T> { type Item = &'a T; #[inline(always)] fn next(&mut self) -> Option { if self.index < self.end as isize { // clamp to the edges of the frame let x = clamp(self.index, 0, self.slice.len() as isize - 1) as usize; self.index += 1; Some(&self.slice[x]) } else { None } } #[inline(always)] fn size_hint(&self) -> (usize, Option) { let size: usize = (self.end as isize - self.index) as usize; (size, Some(size)) } } impl ExactSizeIterator for HorzPaddedIter<'_, T> {} impl FusedIterator for HorzPaddedIter<'_, T> {} #[profiling::function] pub fn setup_integral_image( integral_image_buffer: &mut IntegralImageBuffer, integral_image_stride: usize, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, cdeffed: &PlaneSlice, deblocked: &PlaneSlice, ) { let integral_image = &mut integral_image_buffer.integral_image; let sq_integral_image = &mut integral_image_buffer.sq_integral_image; // Number of elements outside the stripe let left_w = 4; // max radius of 2 + 2 padding let right_w = 3; // max radius of 2 + 1 padding assert_eq!(cdeffed.x, deblocked.x); // Find how many unique elements to use to the left and right let left_uniques = if cdeffed.x == 0 { 0 } else { left_w }; let right_uniques = right_w.min(crop_w - stripe_w); // Find the total number of unique elements used let row_uniques = left_uniques + stripe_w + right_uniques; // Negative start indices result in repeating the first element of the row let start_index_x = if cdeffed.x == 0 { -(left_w as isize) } else { 0 }; let mut rows_iter = VertPaddedIter::new( // Move left to encompass all the used data &cdeffed.go_left(left_uniques), &deblocked.go_left(left_uniques), // since r2 uses every other row, we need an extra row if stripe_h is odd stripe_h + (stripe_h & 1), crop_h, ) .map(|row: &[T]| { HorzPaddedIter::new( // Limit how many unique elements we use &row[..row_uniques], start_index_x, left_w + stripe_w + right_w, ) }); // Setup the first row { let mut sum: u32 = 0; let mut sq_sum: u32 = 0; // Remove the first row and use it outside of the main loop let row = rows_iter.next().unwrap(); for (src, (integral, sq_integral)) in row.zip(integral_image.iter_mut().zip(sq_integral_image.iter_mut())) { let current = u32::cast_from(*src); // Wrap adds to prevent undefined behaviour on overflow. Overflow is // cancelled out when calculating the sum of a region. sum = sum.wrapping_add(current); *integral = sum; sq_sum = sq_sum.wrapping_add(current * current); *sq_integral = sq_sum; } } // Calculate all other rows let mut integral_slice = &mut integral_image[..]; let mut sq_integral_slice = &mut sq_integral_image[..]; for row in rows_iter { let mut sum: u32 = 0; let mut sq_sum: u32 = 0; // Split the data between the previous row and future rows. // This allows us to mutate the current row while accessing the // previous row. 
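// In other words, for every row after the first the recurrence is
// integral[y][x] = integral[y - 1][x] + sum(src[y][0..=x]), computed with a
// running (wrapping) per-row prefix sum, and likewise for the squared values.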
let (integral_row_prev, integral_row) = integral_slice.split_at_mut(integral_image_stride); let (sq_integral_row_prev, sq_integral_row) = sq_integral_slice.split_at_mut(integral_image_stride); for ( src, ((integral_above, sq_integral_above), (integral, sq_integral)), ) in row.zip( integral_row_prev .iter() .zip(sq_integral_row_prev.iter()) .zip(integral_row.iter_mut().zip(sq_integral_row.iter_mut())), ) { let current = u32::cast_from(*src); // Wrap adds to prevent undefined behaviour on overflow. Overflow is // cancelled out when calculating the sum of a region. sum = sum.wrapping_add(current); *integral = sum.wrapping_add(*integral_above); sq_sum = sq_sum.wrapping_add(current * current); *sq_integral = sq_sum.wrapping_add(*sq_integral_above); } // The current row also contains all future rows. Replacing the slice with // it moves down a row. integral_slice = integral_row; sq_integral_slice = sq_integral_row; } } #[profiling::function] pub fn sgrproj_stripe_filter( set: u8, xqd: [i8; 2], fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, integral_image_stride: usize, cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, ) { let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect(); let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; let fn_ab_r1 = match fi.sequence.bit_depth { 8 => sgrproj_box_ab_r1::<8>, 10 => sgrproj_box_ab_r1::<10>, 12 => sgrproj_box_ab_r1::<12>, _ => unimplemented!(), }; let fn_ab_r2 = match fi.sequence.bit_depth { 8 => sgrproj_box_ab_r2::<8>, 10 => sgrproj_box_ab_r2::<10>, 12 => sgrproj_box_ab_r2::<12>, _ => unimplemented!(), }; /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation // of every row (of a, b and f), only half of the rows (every-other // row) are actually used. let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { fn_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, sq_integral_image, integral_image_stride, 0, stripe_w, s_r2, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; fn_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, 0, stripe_w, s_r1, fi.cpu_feature_level, ); fn_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, 1, stripe_w, s_r1, fi.cpu_feature_level, ); } /* iterate by row */ // Increment by two to handle the use of even rows by r=2 and run a nested // loop to handle increments of one. 
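// For example, with stripe_h = 5 the outer loop visits y = 0, 2, 4 and the
// inner dy loop covers rows (0, 1), (2, 3) and finally row 4 on its own.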
for y in (0..stripe_h).step_by(2) { // get results to use y and y+1 let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 { fn_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, sq_integral_image, integral_image_stride, y + 2, stripe_w, s_r2, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]]; sgrproj_box_f_r2( &ap0, &bp0, &mut f_r2_0, &mut f_r2_1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); [&f_r2_0, &f_r2_1] } else { sgrproj_box_f_r0( &mut f_r2_0, y, stripe_w, cdeffed, fi.cpu_feature_level, ); // share results for both rows [&f_r2_0, &f_r2_0] }; for dy in 0..(2.min(stripe_h - y)) { let y = y + dy; if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; fn_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], integral_image_stride, y + 2, stripe_w, s_r1, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] = [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]]; let bp1: [&[u32]; 3] = [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]]; sgrproj_box_f_r1( &ap1, &bp1, &mut f_r1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); } else { sgrproj_box_f_r0( &mut f_r1, y, stripe_w, cdeffed, fi.cpu_feature_level, ); } /* apply filter */ let w0 = xqd[0] as i32; let w1 = xqd[1] as i32; let w2 = (1 << SGRPROJ_PRJ_BITS) - w0 - w1; let line = &cdeffed[y]; #[inline(always)] fn apply_filter( out: &mut [U], line: &[U], f_r1: &[u32], f_r2_ab: &[u32], stripe_w: usize, bit_depth: usize, w0: i32, w1: i32, w2: i32, ) { let line_it = line[..stripe_w].iter(); let f_r2_ab_it = f_r2_ab[..stripe_w].iter(); let f_r1_it = f_r1[..stripe_w].iter(); let out_it = out[..stripe_w].iter_mut(); for ((o, &u), (&f_r2_ab, &f_r1)) in out_it.zip(line_it).zip(f_r2_ab_it.zip(f_r1_it)) { let u = i32::cast_from(u) << SGRPROJ_RST_BITS; let v = w0 * f_r2_ab as i32 + w1 * u + w2 * f_r1 as i32; let s = (v + (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) >> 1)) >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); *o = U::cast_from(clamp(s, 0, (1 << bit_depth) - 1)); } } apply_filter( &mut out[y], line, &f_r1, f_r2_ab[dy], stripe_w, fi.sequence.bit_depth, w0, w1, w2, ); } } } // Frame inputs below aren't all equal, and will change as work // continues. There's no deblocked reconstruction available at this // point of RDO, so we use the non-deblocked reconstruction, cdef and // input. The input can be a full-sized frame. Cdef input is a partial // frame constructed specifically for RDO. // For simplicity, this ignores stripe segmentation (it's possible the // extra complexity isn't worth it and we'll ignore stripes // permanently during RDO, but that's not been tested yet). Data // access inside the cdef frame is monolithic and clipped to the cdef // borders. // Input params follow the same rules as sgrproj_stripe_filter. // Inputs are relative to the colocated slice views. 
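// Conceptually this is a two-tap least-squares fit: the function accumulates
// the 2x2 normal matrix H and the vector c over the region, solves H * x = c
// for the raw projection weights, and converts the solution into the clamped
// (xqd0, xqd1) pair, within SGRPROJ_XQD_MIN / SGRPROJ_XQD_MAX, that is
// actually signalled.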
#[profiling::function] pub fn sgrproj_solve( set: u8, fi: &FrameInvariants, integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, ) -> (i8, i8) { let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut f_r2_0: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut f_r2_1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let mut a_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut b_r1: [[u32; IMAGE_WIDTH_MAX + 2]; 3] = [[0; IMAGE_WIDTH_MAX + 2]; 3]; let mut f_r1: [u32; IMAGE_WIDTH_MAX] = [0; IMAGE_WIDTH_MAX]; let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]]; let mut c: [f64; 2] = [0., 0.]; let fn_ab_r1 = match fi.sequence.bit_depth { 8 => sgrproj_box_ab_r1::<8>, 10 => sgrproj_box_ab_r1::<10>, 12 => sgrproj_box_ab_r1::<12>, _ => unimplemented!(), }; let fn_ab_r2 = match fi.sequence.bit_depth { 8 => sgrproj_box_ab_r2::<8>, 10 => sgrproj_box_ab_r2::<10>, 12 => sgrproj_box_ab_r2::<12>, _ => unimplemented!(), }; /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation // of every row (of a, b and f), only half of the rows (every-other // row) are actually used. let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { fn_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, sq_integral_image, SOLVE_IMAGE_STRIDE, 0, cdef_w, s_r2, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; fn_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, 0, cdef_w, s_r1, fi.cpu_feature_level, ); fn_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, 1, cdef_w, s_r1, fi.cpu_feature_level, ); } /* iterate by row */ // Increment by two to handle the use of even rows by r=2 and run a nested // loop to handle increments of one. 
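// As in sgrproj_stripe_filter above, the r=2 box output is produced for a
// pair of rows at a time (f_r2_0 / f_r2_1, or f_r2_0 shared twice when
// s_r2 == 0) and reused for both y and y + 1.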
for y in (0..cdef_h).step_by(2) { // get results to use y and y+1 let f_r2_01: [&[u32]; 2] = if s_r2 > 0 { fn_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, sq_integral_image, SOLVE_IMAGE_STRIDE, y + 2, cdef_w, s_r2, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; let bp0: [&[u32]; 2] = [&b_r2[(y / 2) % 2], &b_r2[(y / 2 + 1) % 2]]; sgrproj_box_f_r2( &ap0, &bp0, &mut f_r2_0, &mut f_r2_1, y, cdef_w, cdeffed, fi.cpu_feature_level, ); [&f_r2_0, &f_r2_1] } else { sgrproj_box_f_r0(&mut f_r2_0, y, cdef_w, cdeffed, fi.cpu_feature_level); // share results for both rows [&f_r2_0, &f_r2_0] }; for dy in 0..(2.min(cdef_h - y)) { let y = y + dy; if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; fn_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], &sq_integral_image[integral_image_offset..], SOLVE_IMAGE_STRIDE, y + 2, cdef_w, s_r1, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] = [&a_r1[y % 3], &a_r1[(y + 1) % 3], &a_r1[(y + 2) % 3]]; let bp1: [&[u32]; 3] = [&b_r1[y % 3], &b_r1[(y + 1) % 3], &b_r1[(y + 2) % 3]]; sgrproj_box_f_r1( &ap1, &bp1, &mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level, ); } else { sgrproj_box_f_r0(&mut f_r1, y, cdef_w, cdeffed, fi.cpu_feature_level); } #[inline(always)] fn process_line( h: &mut [[f64; 2]; 2], c: &mut [f64; 2], cdeffed: &[T], input: &[T], f_r1: &[u32], f_r2_ab: &[u32], cdef_w: usize, ) { let cdeffed_it = cdeffed[..cdef_w].iter(); let input_it = input[..cdef_w].iter(); let f_r2_ab_it = f_r2_ab[..cdef_w].iter(); let f_r1_it = f_r1[..cdef_w].iter(); #[derive(Debug, Copy, Clone)] struct Sums { h: [[i64; 2]; 2], c: [i64; 2], } let sums: Sums = cdeffed_it .zip(input_it) .zip(f_r2_ab_it.zip(f_r1_it)) .map(|((&u, &i), (&f2, &f1))| { let u = i32::cast_from(u) << SGRPROJ_RST_BITS; let s = (i32::cast_from(i) << SGRPROJ_RST_BITS) - u; let f2 = f2 as i32 - u; let f1 = f1 as i32 - u; (s as i64, f1 as i64, f2 as i64) }) .fold(Sums { h: [[0; 2]; 2], c: [0; 2] }, |sums, (s, f1, f2)| { let mut ret: Sums = sums; ret.h[0][0] += f2 * f2; ret.h[1][1] += f1 * f1; ret.h[0][1] += f1 * f2; ret.c[0] += f2 * s; ret.c[1] += f1 * s; ret }); h[0][0] += sums.h[0][0] as f64; h[1][1] += sums.h[1][1] as f64; h[0][1] += sums.h[0][1] as f64; c[0] += sums.c[0] as f64; c[1] += sums.c[1] as f64; } process_line( &mut h, &mut c, &cdeffed[y], &input[y], &f_r1, f_r2_01[dy], cdef_w, ); } } // this is lifted almost in-tact from libaom let n = cdef_w as f64 * cdef_h as f64; h[0][0] /= n; h[0][1] /= n; h[1][1] /= n; h[1][0] = h[0][1]; c[0] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n; c[1] *= (1 << SGRPROJ_PRJ_BITS) as f64 / n; let (xq0, xq1) = if s_r2 == 0 { // H matrix is now only the scalar h[1][1] // C vector is now only the scalar c[1] if h[1][1] == 0. { (0, 0) } else { (0, (c[1] / h[1][1]).round() as i32) } } else if s_r1 == 0 { // H matrix is now only the scalar h[0][0] // C vector is now only the scalar c[0] if h[0][0] == 0. { (0, 0) } else { ((c[0] / h[0][0]).round() as i32, 0) } } else { let det = h[0][0].mul_add(h[1][1], -h[0][1] * h[1][0]); if det == 0. 
{ (0, 0) } else { // If scaling up dividend would overflow, instead scale down the divisor let div1 = h[1][1].mul_add(c[0], -h[0][1] * c[1]); let div2 = h[0][0].mul_add(c[1], -h[1][0] * c[0]); ((div1 / det).round() as i32, (div2 / det).round() as i32) } }; { let xqd0 = clamp(xq0, SGRPROJ_XQD_MIN[0] as i32, SGRPROJ_XQD_MAX[0] as i32); let xqd1 = clamp( (1 << SGRPROJ_PRJ_BITS) - xqd0 - xq1, SGRPROJ_XQD_MIN[1] as i32, SGRPROJ_XQD_MAX[1] as i32, ); (xqd0 as i8, xqd1 as i8) } } #[profiling::function] fn wiener_stripe_filter( coeffs: [[i8; 3]; 2], fi: &FrameInvariants, crop_w: usize, crop_h: usize, stripe_w: usize, stripe_h: usize, stripe_x: usize, stripe_y: isize, cdeffed: &Plane, deblocked: &Plane, out: &mut Plane, ) { let bit_depth = fi.sequence.bit_depth; let round_h = if bit_depth == 12 { 5 } else { 3 }; let round_v = if bit_depth == 12 { 9 } else { 11 }; let offset = 1 << (bit_depth + WIENER_BITS - round_h - 1); let limit = (1 << (bit_depth + 1 + WIENER_BITS - round_h)) - 1; let mut coeffs_ = [[0; 3]; 2]; for i in 0..2 { for j in 0..3 { coeffs_[i][j] = i32::from(coeffs[i][j]); } } let mut work: [i32; SB_SIZE + 7] = [0; SB_SIZE + 7]; let vfilter: [i32; 7] = [ coeffs_[0][0], coeffs_[0][1], coeffs_[0][2], 128 - 2 * (coeffs_[0][0] + coeffs_[0][1] + coeffs_[0][2]), coeffs_[0][2], coeffs_[0][1], coeffs_[0][0], ]; let hfilter: [i32; 7] = [ coeffs_[1][0], coeffs_[1][1], coeffs_[1][2], 128 - 2 * (coeffs_[1][0] + coeffs_[1][1] + coeffs_[1][2]), coeffs_[1][2], coeffs_[1][1], coeffs_[1][0], ]; // unlike x, our y can be negative to start as the first stripe // starts off the top of the frame by 8 pixels, and can also run off the end of the frame let start_wi = if stripe_y < 0 { -stripe_y } else { 0 } as usize; let start_yi = if stripe_y < 0 { 0 } else { stripe_y } as usize; let end_i = cmp::max( 0, if stripe_h as isize + stripe_y > crop_h as isize { crop_h as isize - stripe_y - start_wi as isize } else { stripe_h as isize - start_wi as isize }, ) as usize; let mut out_slice = out.mut_slice(PlaneOffset { x: 0, y: start_yi as isize }); for xi in stripe_x..stripe_x + stripe_w { let n = cmp::min(7, crop_w as isize + 3 - xi as isize); for yi in stripe_y - 3..stripe_y + stripe_h as isize + 4 { let mut acc = 0; let src = if yi < stripe_y { let ly = cmp::max(clamp(yi, 0, crop_h as isize - 1), stripe_y - 2); deblocked.row(ly) } else if yi < stripe_y + stripe_h as isize { let ly = clamp(yi, 0, crop_h as isize - 1); cdeffed.row(ly) } else { let ly = cmp::min( clamp(yi, 0, crop_h as isize - 1), stripe_y + stripe_h as isize + 1, ); deblocked.row(ly) }; let start = i32::cast_from(src[0]); let end = i32::cast_from(src[crop_w - 1]); for i in 0..3 - xi as isize { acc += hfilter[i as usize] * start; } let off = 3 - (xi as isize); let s = cmp::max(0, off) as usize; let s1 = (s as isize - off) as usize; let n1 = (n - off) as usize; for (hf, &v) in hfilter[s..n as usize].iter().zip(src[s1..n1].iter()) { acc += hf * i32::cast_from(v); } for i in n..7 { acc += hfilter[i as usize] * end; } acc = (acc + (1 << round_h >> 1)) >> round_h; work[(yi - stripe_y + 3) as usize] = clamp(acc, -offset, limit - offset); } for (wi, dst) in (start_wi..start_wi + end_i) .zip(out_slice.rows_iter_mut().map(|row| &mut row[xi]).take(end_i)) { let mut acc = 0; for (i, src) in (0..7).zip(work[wi..wi + 7].iter_mut()) { acc += vfilter[i] * *src; } *dst = T::cast_from(clamp( (acc + (1 << round_v >> 1)) >> round_v, 0, (1 << bit_depth) - 1, )); } } } #[derive(Copy, Clone, Debug, Default)] pub struct RestorationUnit { pub filter: RestorationFilter, } 
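// --- Illustrative sketch (not part of rav1e) ---------------------------------
// A minimal, self-contained restatement of the final step of `sgrproj_solve`
// above: solve the 2x2 normal equations H * x = c for the two self-guided
// projection weights, then clamp them to the signalable range. The constants
// below are illustrative stand-ins assumed for SGRPROJ_PRJ_BITS and
// SGRPROJ_XQD_MIN/MAX; the authoritative values live elsewhere in this crate.
fn solve_2x2_and_clamp(h: [[f64; 2]; 2], c: [f64; 2]) -> (i8, i8) {
  const PRJ_BITS: i32 = 7; // assumed SGRPROJ_PRJ_BITS
  const XQD_MIN: [i32; 2] = [-96, -32]; // assumed SGRPROJ_XQD_MIN
  const XQD_MAX: [i32; 2] = [31, 95]; // assumed SGRPROJ_XQD_MAX
  let det = h[0][0] * h[1][1] - h[0][1] * h[1][0];
  let (xq0, xq1) = if det == 0. {
    (0, 0)
  } else {
    // Cramer's rule, rounded to the nearest integer.
    (
      ((h[1][1] * c[0] - h[0][1] * c[1]) / det).round() as i32,
      ((h[0][0] * c[1] - h[1][0] * c[0]) / det).round() as i32,
    )
  };
  // The second weight is re-derived so that the two weights plus the implicit
  // identity term sum to 1 << PRJ_BITS, mirroring the code above.
  let xqd0 = xq0.clamp(XQD_MIN[0], XQD_MAX[0]);
  let xqd1 = ((1 << PRJ_BITS) - xqd0 - xq1).clamp(XQD_MIN[1], XQD_MAX[1]);
  (xqd0 as i8, xqd1 as i8)
}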
#[derive(Clone, Debug)] pub struct FrameRestorationUnits { units: Box<[RestorationUnit]>, pub cols: usize, pub rows: usize, } impl FrameRestorationUnits { pub fn new(cols: usize, rows: usize) -> Self { Self { units: vec![RestorationUnit::default(); cols * rows].into_boxed_slice(), cols, rows, } } } impl Index for FrameRestorationUnits { type Output = [RestorationUnit]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { &self.units[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameRestorationUnits { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.units[index * self.cols..(index + 1) * self.cols] } } #[derive(Clone, Debug)] pub struct RestorationPlaneConfig { pub lrf_type: u8, pub unit_size: usize, // (1 << sb_x_shift) gives the number of superblocks horizontally or // vertically in a restoration unit, not accounting for RU stretching pub sb_h_shift: usize, pub sb_v_shift: usize, pub sb_cols: usize, // actual number of SB cols in this LRU (accounting for stretch and crop) pub sb_rows: usize, // actual number of SB rows in this LRU (accounting for stretch and crop) // stripe height is 64 in all cases except 4:2:0 chroma planes where // it is 32. This is independent of all other setup parameters pub stripe_height: usize, pub cols: usize, pub rows: usize, } #[derive(Clone, Debug)] pub struct RestorationPlane { pub cfg: RestorationPlaneConfig, pub units: FrameRestorationUnits, } #[derive(Clone, Default)] pub struct RestorationPlaneOffset { pub row: usize, pub col: usize, } impl RestorationPlane { pub fn new( lrf_type: u8, unit_size: usize, sb_h_shift: usize, sb_v_shift: usize, sb_cols: usize, sb_rows: usize, stripe_decimate: usize, cols: usize, rows: usize, ) -> RestorationPlane { let stripe_height = if stripe_decimate != 0 { 32 } else { 64 }; RestorationPlane { cfg: RestorationPlaneConfig { lrf_type, unit_size, sb_h_shift, sb_v_shift, sb_cols, sb_rows, stripe_height, cols, rows, }, units: FrameRestorationUnits::new(cols, rows), } } // Stripes are always 64 pixels high in a non-subsampled // frame, and decimated from 64 pixels in chroma. When // filtering, they are not co-located on Y with superblocks. fn restoration_unit_index_by_stripe( &self, stripenum: usize, rux: usize, ) -> (usize, usize) { ( cmp::min(rux, self.cfg.cols - 1), cmp::min( stripenum * self.cfg.stripe_height / self.cfg.unit_size, self.cfg.rows - 1, ), ) } pub fn restoration_unit_by_stripe( &self, stripenum: usize, rux: usize, ) -> &RestorationUnit { let (x, y) = self.restoration_unit_index_by_stripe(stripenum, rux); &self.units[y][x] } } #[derive(Clone, Debug)] pub struct RestorationState { pub planes: [RestorationPlane; MAX_PLANES], } impl RestorationState { pub fn new(fi: &FrameInvariants, input: &Frame) -> Self { let PlaneConfig { xdec, ydec, .. } = input.planes[1].cfg; // stripe size is decimated in 4:2:0 (and only 4:2:0) let stripe_uv_decimate = usize::from(xdec > 0 && ydec > 0); let y_sb_log2 = if fi.sequence.use_128x128_superblock { 7 } else { 6 }; let uv_sb_h_log2 = y_sb_log2 - xdec; let uv_sb_v_log2 = y_sb_log2 - ydec; let (lrf_y_shift, lrf_uv_shift) = if fi.sequence.enable_large_lru && fi.sequence.enable_restoration { assert!( fi.width > 1 && fi.height > 1, "Width and height must be higher than 1 for LRF setup" ); // Specific content does affect optimal LRU size choice, but the // quantizer in use is a surprisingly strong selector. 
let lrf_base_shift = if fi.base_q_idx > 200 { 0 // big } else if fi.base_q_idx > 160 { 1 } else { 2 // small }; let lrf_chroma_shift = if stripe_uv_decimate > 0 { // 4:2:0 only if lrf_base_shift == 2 { 1 // smallest chroma LRU is a win at low quant } else { // Will a down-shifted chroma LRU eliminate stretch in chroma? // If so, that's generally a win. let lrf_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_base_shift); let unshifted_stretch = ((fi.width >> xdec) - 1) % lrf_unit_size <= lrf_unit_size / 2 || ((fi.height >> ydec) - 1) % lrf_unit_size <= lrf_unit_size / 2; let shifted_stretch = ((fi.width >> xdec) - 1) % (lrf_unit_size >> 1) <= lrf_unit_size / 4 || ((fi.height >> ydec) - 1) % (lrf_unit_size >> 1) <= lrf_unit_size / 4; // shift to eliminate stretch if needed, // otherwise do not shift and save the signaling bits usize::from(unshifted_stretch && !shifted_stretch) } } else { 0 }; (lrf_base_shift, lrf_base_shift + lrf_chroma_shift) } else { // Explicit request to tie LRU size to superblock size == // smallest possible LRU size let lrf_y_shift = if fi.sequence.use_128x128_superblock { 1 } else { 2 }; (lrf_y_shift, lrf_y_shift + stripe_uv_decimate) }; let mut y_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_y_shift); let mut uv_unit_size = 1 << (RESTORATION_TILESIZE_MAX_LOG2 - lrf_uv_shift); let tiling = fi.sequence.tiling; // Right now we defer to tiling setup: don't choose an LRU size // large enough that a tile is not an integer number of LRUs // wide/high. if tiling.cols > 1 || tiling.rows > 1 { // despite suggestions to the contrary, tiles can be // non-powers-of-2. let trailing_h_zeros = tiling.tile_width_sb.trailing_zeros() as usize; let trailing_v_zeros = tiling.tile_height_sb.trailing_zeros() as usize; let tile_aligned_y_unit_size = 1 << (y_sb_log2 + trailing_h_zeros.min(trailing_v_zeros)); let tile_aligned_uv_h_unit_size = 1 << (uv_sb_h_log2 + trailing_h_zeros); let tile_aligned_uv_v_unit_size = 1 << (uv_sb_v_log2 + trailing_v_zeros); y_unit_size = y_unit_size.min(tile_aligned_y_unit_size); uv_unit_size = uv_unit_size .min(tile_aligned_uv_h_unit_size.min(tile_aligned_uv_v_unit_size)); // But it's actually worse: LRUs can't span tiles (in our // one-pass design that is, spec allows it). However, the spec // mandates the last LRU stretches forward into any // less-than-half-LRU span of superblocks at the right and // bottom of a frame. These superblocks may well be in a // different tile! Even if LRUs are minimum size (one // superblock), when the right or bottom edge of the frame is a // superblock that's less than half the width/height of a normal // superblock, the LRU is forced by the spec to span into it // (and thus a different tile). Tiling is under no such // restriction; it could decide the right/left sliver will be in // its own tile row/column. We can't disallow the combination // here. The tiling code will have to either prevent it or // tolerate it. (prayer mechanic == Issue #1629). } // When coding 4:2:2 and 4:4:4, spec requires Y and UV LRU sizes // to be the same*. If they differ at this // point, it's due to a tiling restriction enforcing a maximum // size, so force both to the smaller value. // // *see sec 5.9.20, "Loop restoration params syntax". The // bitstream provides means of coding a different UV LRU size only // when chroma is in use and both x and y are subsampled in the // chroma planes. 
if ydec == 0 && y_unit_size != uv_unit_size { y_unit_size = uv_unit_size.min(y_unit_size); uv_unit_size = y_unit_size; } // derive the rest let y_unit_log2 = y_unit_size.ilog() - 1; let uv_unit_log2 = uv_unit_size.ilog() - 1; let y_cols = ((fi.width + (y_unit_size >> 1)) / y_unit_size).max(1); let y_rows = ((fi.height + (y_unit_size >> 1)) / y_unit_size).max(1); let uv_cols = ((((fi.width + (1 << xdec >> 1)) >> xdec) + (uv_unit_size >> 1)) / uv_unit_size) .max(1); let uv_rows = ((((fi.height + (1 << ydec >> 1)) >> ydec) + (uv_unit_size >> 1)) / uv_unit_size) .max(1); RestorationState { planes: [ RestorationPlane::new( RESTORE_SWITCHABLE, y_unit_size, y_unit_log2 - y_sb_log2, y_unit_log2 - y_sb_log2, fi.sb_width, fi.sb_height, 0, y_cols, y_rows, ), RestorationPlane::new( RESTORE_SWITCHABLE, uv_unit_size, uv_unit_log2 - uv_sb_h_log2, uv_unit_log2 - uv_sb_v_log2, fi.sb_width, fi.sb_height, stripe_uv_decimate, uv_cols, uv_rows, ), RestorationPlane::new( RESTORE_SWITCHABLE, uv_unit_size, uv_unit_log2 - uv_sb_h_log2, uv_unit_log2 - uv_sb_v_log2, fi.sb_width, fi.sb_height, stripe_uv_decimate, uv_cols, uv_rows, ), ], } } #[profiling::function] pub fn lrf_filter_frame( &mut self, out: &mut Frame, pre_cdef: &Frame, fi: &FrameInvariants, ) { let cdeffed = out.clone(); let planes = if fi.sequence.chroma_sampling == Cs400 { 1 } else { MAX_PLANES }; // unlike the other loop filters that operate over the padded // frame dimensions, restoration filtering and source pixel // accesses are clipped to the original frame dimensions // that's why we use fi.width and fi.height instead of PlaneConfig fields // number of stripes (counted according to colocated Y luma position) let stripe_n = (fi.height + 7) / 64 + 1; // Buffers for the stripe filter. let mut stripe_filter_buffer = IntegralImageBuffer::zeroed(STRIPE_IMAGE_SIZE); for pli in 0..planes { let rp = &self.planes[pli]; let xdec = out.planes[pli].cfg.xdec; let ydec = out.planes[pli].cfg.ydec; let crop_w = (fi.width + (1 << xdec >> 1)) >> xdec; let crop_h = (fi.height + (1 << ydec >> 1)) >> ydec; for si in 0..stripe_n { let (stripe_start_y, stripe_size) = if si == 0 { (0, (64 - 8) >> ydec) } else { let start = (si * 64 - 8) >> ydec; ( start as isize, // one past, unlike spec (64 >> ydec).min(crop_h - start), ) }; // horizontally, go rdu-by-rdu for rux in 0..rp.cfg.cols { // stripe x pixel locations must be clipped to frame, last may need to stretch let x = rux * rp.cfg.unit_size; let size = if rux == rp.cfg.cols - 1 { crop_w - x } else { rp.cfg.unit_size }; let ru = rp.restoration_unit_by_stripe(si, rux); match ru.filter { RestorationFilter::Wiener { coeffs } => { wiener_stripe_filter( coeffs, fi, crop_w, crop_h, size, stripe_size, x, stripe_start_y, &cdeffed.planes[pli], &pre_cdef.planes[pli], &mut out.planes[pli], ); } RestorationFilter::Sgrproj { set, xqd } => { if !fi.sequence.enable_cdef { continue; } setup_integral_image( &mut stripe_filter_buffer, STRIPE_IMAGE_STRIDE, crop_w - x, (crop_h as isize - stripe_start_y) as usize, size, stripe_size, &cdeffed.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), &pre_cdef.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), ); sgrproj_stripe_filter( set, xqd, fi, &stripe_filter_buffer, STRIPE_IMAGE_STRIDE, &cdeffed.planes[pli] .slice(PlaneOffset { x: x as isize, y: stripe_start_y }), &mut out.planes[pli].region_mut(Area::Rect { x: x as isize, y: stripe_start_y, width: size, height: stripe_size, }), ); } RestorationFilter::None => { // do nothing } } } } } } } 
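// --- Illustrative sketch (not part of rav1e) ---------------------------------
// Worked example of the restoration-unit grid sizing used in
// `RestorationState::new` above: the unit count is a round-to-nearest division
// by the unit size (via the half-unit bias), with a minimum of one unit.
fn lru_grid(width: usize, height: usize, unit_size: usize) -> (usize, usize) {
  let cols = ((width + (unit_size >> 1)) / unit_size).max(1);
  let rows = ((height + (unit_size >> 1)) / unit_size).max(1);
  (cols, rows)
}

#[test]
fn lru_grid_example() {
  // 1920 / 256 = 7.5 rounds up to 8 columns; 1080 / 256 ~= 4.2 rounds down to
  // 4 rows.
  assert_eq!(lru_grid(1920, 1080, 256), (8, 4));
}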
rav1e-0.7.1/src/mc.rs000064400000000000000000000351511046102023000124100ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::mc::*; } else if #[cfg(asm_neon)] { pub use crate::asm::aarch64::mc::*; } else { pub use self::rust::*; } } use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use crate::tiling::*; use crate::util::*; use simd_helpers::cold_for_target_arch; use std::ops; #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub struct MotionVector { pub row: i16, pub col: i16, } impl MotionVector { #[inline] pub const fn quantize_to_fullpel(self) -> Self { Self { row: (self.row / 8) * 8, col: (self.col / 8) * 8 } } #[inline] pub const fn is_zero(self) -> bool { self.row == 0 && self.col == 0 } #[inline] pub const fn is_valid(self) -> bool { use crate::context::{MV_LOW, MV_UPP}; ((MV_LOW as i16) < self.row && self.row < (MV_UPP as i16)) && ((MV_LOW as i16) < self.col && self.col < (MV_UPP as i16)) } } impl ops::Mul for MotionVector { type Output = MotionVector; #[inline] fn mul(self, rhs: i16) -> MotionVector { MotionVector { row: self.row * rhs, col: self.col * rhs } } } impl ops::Mul for MotionVector { type Output = MotionVector; #[inline] fn mul(self, rhs: u16) -> MotionVector { MotionVector { row: self.row * rhs as i16, col: self.col * rhs as i16 } } } impl ops::Shr for MotionVector { type Output = MotionVector; #[inline] fn shr(self, rhs: u8) -> MotionVector { MotionVector { row: self.row >> rhs, col: self.col >> rhs } } } impl ops::Shl for MotionVector { type Output = MotionVector; #[inline] fn shl(self, rhs: u8) -> MotionVector { MotionVector { row: self.row << rhs, col: self.col << rhs } } } impl ops::Add for MotionVector { type Output = MotionVector; #[inline] fn add(self, rhs: MotionVector) -> MotionVector { MotionVector { row: self.row + rhs.row, col: self.col + rhs.col } } } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] #[allow(unused)] pub enum FilterMode { REGULAR = 0, SMOOTH = 1, SHARP = 2, BILINEAR = 3, SWITCHABLE = 4, } pub const SUBPEL_FILTER_SIZE: usize = 8; const SUBPEL_FILTERS: [[[i32; SUBPEL_FILTER_SIZE]; 16]; 6] = [ [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 2, -6, 126, 8, -2, 0, 0], [0, 2, -10, 122, 18, -4, 0, 0], [0, 2, -12, 116, 28, -8, 2, 0], [0, 2, -14, 110, 38, -10, 2, 0], [0, 2, -14, 102, 48, -12, 2, 0], [0, 2, -16, 94, 58, -12, 2, 0], [0, 2, -14, 84, 66, -12, 2, 0], [0, 2, -14, 76, 76, -14, 2, 0], [0, 2, -12, 66, 84, -14, 2, 0], [0, 2, -12, 58, 94, -16, 2, 0], [0, 2, -12, 48, 102, -14, 2, 0], [0, 2, -10, 38, 110, -14, 2, 0], [0, 2, -8, 28, 116, -12, 2, 0], [0, 0, -4, 18, 122, -10, 2, 0], [0, 0, -2, 8, 126, -6, 2, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 2, 28, 62, 34, 2, 0, 0], [0, 0, 26, 62, 36, 4, 0, 0], [0, 0, 22, 62, 40, 4, 0, 0], [0, 0, 20, 60, 42, 6, 0, 0], [0, 0, 18, 58, 44, 8, 0, 0], [0, 0, 16, 56, 46, 10, 0, 0], [0, -2, 16, 54, 48, 12, 0, 0], [0, -2, 14, 52, 52, 14, -2, 0], [0, 0, 12, 48, 54, 16, -2, 0], [0, 0, 10, 46, 56, 16, 0, 0], [0, 0, 8, 44, 58, 18, 0, 0], [0, 0, 6, 42, 60, 
20, 0, 0], [0, 0, 4, 40, 62, 22, 0, 0], [0, 0, 4, 36, 62, 26, 0, 0], [0, 0, 2, 34, 62, 28, 2, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [-2, 2, -6, 126, 8, -2, 2, 0], [-2, 6, -12, 124, 16, -6, 4, -2], [-2, 8, -18, 120, 26, -10, 6, -2], [-4, 10, -22, 116, 38, -14, 6, -2], [-4, 10, -22, 108, 48, -18, 8, -2], [-4, 10, -24, 100, 60, -20, 8, -2], [-4, 10, -24, 90, 70, -22, 10, -2], [-4, 12, -24, 80, 80, -24, 12, -4], [-2, 10, -22, 70, 90, -24, 10, -4], [-2, 8, -20, 60, 100, -24, 10, -4], [-2, 8, -18, 48, 108, -22, 10, -4], [-2, 6, -14, 38, 116, -22, 10, -4], [-2, 6, -10, 26, 120, -18, 8, -2], [-2, 4, -6, 16, 124, -12, 6, -2], [0, 2, -2, 8, 126, -6, 2, -2], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, 0, 120, 8, 0, 0, 0], [0, 0, 0, 112, 16, 0, 0, 0], [0, 0, 0, 104, 24, 0, 0, 0], [0, 0, 0, 96, 32, 0, 0, 0], [0, 0, 0, 88, 40, 0, 0, 0], [0, 0, 0, 80, 48, 0, 0, 0], [0, 0, 0, 72, 56, 0, 0, 0], [0, 0, 0, 64, 64, 0, 0, 0], [0, 0, 0, 56, 72, 0, 0, 0], [0, 0, 0, 48, 80, 0, 0, 0], [0, 0, 0, 40, 88, 0, 0, 0], [0, 0, 0, 32, 96, 0, 0, 0], [0, 0, 0, 24, 104, 0, 0, 0], [0, 0, 0, 16, 112, 0, 0, 0], [0, 0, 0, 8, 120, 0, 0, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, -4, 126, 8, -2, 0, 0], [0, 0, -8, 122, 18, -4, 0, 0], [0, 0, -10, 116, 28, -6, 0, 0], [0, 0, -12, 110, 38, -8, 0, 0], [0, 0, -12, 102, 48, -10, 0, 0], [0, 0, -14, 94, 58, -10, 0, 0], [0, 0, -12, 84, 66, -10, 0, 0], [0, 0, -12, 76, 76, -12, 0, 0], [0, 0, -10, 66, 84, -12, 0, 0], [0, 0, -10, 58, 94, -14, 0, 0], [0, 0, -10, 48, 102, -12, 0, 0], [0, 0, -8, 38, 110, -12, 0, 0], [0, 0, -6, 28, 116, -10, 0, 0], [0, 0, -4, 18, 122, -8, 0, 0], [0, 0, -2, 8, 126, -4, 0, 0], ], [ [0, 0, 0, 128, 0, 0, 0, 0], [0, 0, 30, 62, 34, 2, 0, 0], [0, 0, 26, 62, 36, 4, 0, 0], [0, 0, 22, 62, 40, 4, 0, 0], [0, 0, 20, 60, 42, 6, 0, 0], [0, 0, 18, 58, 44, 8, 0, 0], [0, 0, 16, 56, 46, 10, 0, 0], [0, 0, 14, 54, 48, 12, 0, 0], [0, 0, 12, 52, 52, 12, 0, 0], [0, 0, 12, 48, 54, 14, 0, 0], [0, 0, 10, 46, 56, 16, 0, 0], [0, 0, 8, 44, 58, 18, 0, 0], [0, 0, 6, 42, 60, 20, 0, 0], [0, 0, 4, 40, 62, 22, 0, 0], [0, 0, 4, 36, 62, 26, 0, 0], [0, 0, 2, 34, 62, 30, 0, 0], ], ]; pub(crate) mod rust { use super::*; use num_traits::*; unsafe fn run_filter>( src: *const T, stride: usize, filter: [i32; 8], ) -> i32 { filter .iter() .enumerate() .map(|(i, f)| { let p = src.add(i * stride); f * (*p).as_() }) .sum::() } fn get_filter( mode: FilterMode, frac: i32, length: usize, ) -> [i32; SUBPEL_FILTER_SIZE] { let filter_idx = if mode == FilterMode::BILINEAR || length > 4 { mode as usize } else { (mode as usize).min(1) + 4 }; SUBPEL_FILTERS[filter_idx][frac as usize] } #[cold_for_target_arch("x86_64")] pub fn put_8tap( dst: &mut PlaneRegionMut<'_, T>, src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); let ref_stride = src.plane.cfg.stride; let y_filter = get_filter(mode_y, row_frac, height); let x_filter = get_filter(mode_x, col_frac, width); let max_sample_val = (1 << bit_depth) - 1; let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; match (col_frac, row_frac) { (0, 0) => { for r in 0..height { let src_slice = &src[r]; let dst_slice = &mut dst[r]; dst_slice[..width].copy_from_slice(&src_slice[..width]); } } (0, _) => { let offset_slice = src.go_up(3); for r in 0..height { let src_slice = &offset_slice[r]; let dst_slice = 
&mut dst[r]; for c in 0..width { dst_slice[c] = T::cast_from( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter) }, 7, ) .clamp(0, max_sample_val), ); } } } (_, 0) => { let offset_slice = src.go_left(3); for r in 0..height { let src_slice = &offset_slice[r]; let dst_slice = &mut dst[r]; for c in 0..width { dst_slice[c] = T::cast_from( round_shift( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ), intermediate_bits, ) .clamp(0, max_sample_val), ); } } } (_, _) => { let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)]; let offset_slice = src.go_left(3).go_up(3); for cg in (0..width).step_by(8) { for r in 0..height + 7 { let src_slice = &offset_slice[r]; for c in cg..(cg + 8).min(width) { intermediate[8 * r + (c - cg)] = round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ) as i16; } } for r in 0..height { let dst_slice = &mut dst[r]; for c in cg..(cg + 8).min(width) { dst_slice[c] = T::cast_from( round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter( intermediate[8 * r + c - cg..].as_ptr(), 8, y_filter, ) }, 7 + intermediate_bits, ) .clamp(0, max_sample_val), ); } } } } } } // HBD output interval is [-20588, 36956] (10-bit), [-20602, 36983] (12-bit) // Subtract PREP_BIAS to ensure result fits in i16 and matches dav1d assembly const PREP_BIAS: i32 = 8192; #[cold_for_target_arch("x86_64")] pub fn prep_8tap( tmp: &mut [i16], src: PlaneSlice<'_, T>, width: usize, height: usize, col_frac: i32, row_frac: i32, mode_x: FilterMode, mode_y: FilterMode, bit_depth: usize, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); let ref_stride = src.plane.cfg.stride; let y_filter = get_filter(mode_y, row_frac, height); let x_filter = get_filter(mode_x, col_frac, width); let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS }; match (col_frac, row_frac) { (0, 0) => { for r in 0..height { let src_slice = &src[r]; for c in 0..width { tmp[r * width + c] = (i16::cast_from(src_slice[c]) << intermediate_bits) - prep_bias as i16; } } } (0, _) => { let offset_slice = src.go_up(3); for r in 0..height { let src_slice = &offset_slice[r]; for c in 0..width { tmp[r * width + c] = (round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), ref_stride, y_filter) }, 7 - intermediate_bits, ) - prep_bias) as i16; } } } (_, 0) => { let offset_slice = src.go_left(3); for r in 0..height { let src_slice = &offset_slice[r]; for c in 0..width { tmp[r * width + c] = (round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. 
unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ) - prep_bias) as i16; } } } (_, _) => { let mut intermediate: [i16; 8 * (128 + 7)] = [0; 8 * (128 + 7)]; let offset_slice = src.go_left(3).go_up(3); for cg in (0..width).step_by(8) { for r in 0..height + 7 { let src_slice = &offset_slice[r]; for c in cg..(cg + 8).min(width) { intermediate[8 * r + (c - cg)] = round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter(src_slice[c..].as_ptr(), 1, x_filter) }, 7 - intermediate_bits, ) as i16; } } for r in 0..height { for c in cg..(cg + 8).min(width) { tmp[r * width + c] = (round_shift( // SAFETY: We pass this a raw pointer, but it's created from a // checked slice, so we are safe. unsafe { run_filter( intermediate[8 * r + c - cg..].as_ptr(), 8, y_filter, ) }, 7, ) - prep_bias) as i16; } } } } } } #[cold_for_target_arch("x86_64")] pub fn mc_avg( dst: &mut PlaneRegionMut<'_, T>, tmp1: &[i16], tmp2: &[i16], width: usize, height: usize, bit_depth: usize, _cpu: CpuFeatureLevel, ) { // The assembly only supports even heights and valid uncropped widths assert_eq!(height & 1, 0); assert!(width.is_power_of_two() && (2..=128).contains(&width)); let max_sample_val = (1 << bit_depth) - 1; let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 }; let prep_bias = if bit_depth == 8 { 0 } else { PREP_BIAS * 2 }; for r in 0..height { let dst_slice = &mut dst[r]; for c in 0..width { dst_slice[c] = T::cast_from( round_shift( tmp1[r * width + c] as i32 + tmp2[r * width + c] as i32 + prep_bias, intermediate_bits + 1, ) .clamp(0, max_sample_val), ); } } } } rav1e-0.7.1/src/me.rs000064400000000000000000001320561046102023000124140ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
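// --- Illustrative sketch (not part of rav1e) ---------------------------------
// The motion-compensation code above (`put_8tap`, `prep_8tap`, `mc_avg`) keeps
// `intermediate_bits` of extra precision between the horizontal and vertical
// filter passes. The `round_shift` below is an assumed restatement of the
// crate's rounding right-shift helper; the example shows the two passes
// composing to a full 14-bit normalization (7 + 7 filter bits) on a flat
// 8-bit region, recovering the input value exactly.
fn round_shift(value: i32, bit: usize) -> i32 {
  // assumed semantics: add half of the divisor, then shift right
  (value + (1 << bit >> 1)) >> bit
}

fn two_pass_rounding_example() {
  let bit_depth = 8usize;
  let intermediate_bits = 4 - if bit_depth == 12 { 2 } else { 0 };
  // An 8-tap filter whose taps sum to 128 applied to a flat region of value
  // 128 produces a horizontal sum of 128 * 128.
  let horizontal_sum = 128 * 128;
  let intermediate = round_shift(horizontal_sum, 7 - intermediate_bits);
  // The vertical pass applies another 128-sum filter and then removes the
  // remaining precision.
  let output = round_shift(intermediate * 128, 7 + intermediate_bits);
  assert_eq!(output, 128);
}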
use crate::api::InterConfig; use crate::context::{ BlockOffset, PlaneBlockOffset, SuperBlockOffset, TileBlockOffset, TileSuperBlockOffset, MAX_SB_SIZE_LOG2, MIB_SIZE_LOG2, MI_SIZE, MI_SIZE_LOG2, SB_SIZE, }; use crate::dist::*; use crate::frame::*; use crate::mc::MotionVector; use crate::partition::*; use crate::predict::PredictionMode; use crate::tiling::*; use crate::util::ILog; use crate::util::{clamp, Pixel}; use crate::FrameInvariants; use arrayvec::*; use std::ops::{Index, IndexMut}; use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; #[derive(Debug, Copy, Clone, Default)] pub struct MEStats { pub mv: MotionVector, /// sad value, on the scale of a 128x128 block pub normalized_sad: u32, } #[derive(Debug, Clone)] pub struct FrameMEStats { stats: Box<[MEStats]>, pub cols: usize, pub rows: usize, } /// cbindgen:ignore pub type RefMEStats = Arc>; /// cbindgen:ignore pub type ReadGuardMEStats<'a> = RwLockReadGuard<'a, [FrameMEStats; REF_FRAMES]>; /// cbindgen:ignore pub type WriteGuardMEStats<'a> = RwLockWriteGuard<'a, [FrameMEStats; REF_FRAMES]>; impl FrameMEStats { #[inline] pub fn rows_iter(&self) -> std::slice::ChunksExact<'_, MEStats> { self.stats.chunks_exact(self.cols) } pub fn new(cols: usize, rows: usize) -> Self { Self { // dynamic allocation: once per frame stats: vec![MEStats::default(); cols * rows].into_boxed_slice(), cols, rows, } } pub fn new_arc_array(cols: usize, rows: usize) -> RefMEStats { Arc::new(RwLock::new([ FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), FrameMEStats::new(cols, rows), ])) } } impl Index for FrameMEStats { type Output = [MEStats]; #[inline] fn index(&self, index: usize) -> &Self::Output { &self.stats[index * self.cols..(index + 1) * self.cols] } } impl IndexMut for FrameMEStats { #[inline] fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.stats[index * self.cols..(index + 1) * self.cols] } } /// Result of motion search. #[derive(Debug, Copy, Clone)] pub struct MotionSearchResult { /// Motion vector chosen by the motion search. pub mv: MotionVector, /// Rate distortion data associated with `mv`. pub rd: MVCandidateRD, } impl MotionSearchResult { /// Creates an 'empty' value. /// /// To be considered empty, cost is set higher than any naturally occurring /// cost value. The idea is that comparing to any valid rd output, the search /// result will always be replaced. #[inline(always)] pub fn empty() -> MotionSearchResult { MotionSearchResult { mv: MotionVector::default(), rd: MVCandidateRD::empty(), } } /// Check if the value should be considered to be empty. #[inline(always)] const fn is_empty(&self) -> bool { self.rd.cost == u64::MAX } } /// Holds data from computing rate distortion of a motion vector. #[derive(Debug, Copy, Clone)] pub struct MVCandidateRD { /// Rate distortion cost of the motion vector. pub cost: u64, /// Distortion metric value for the motion vector. pub sad: u32, } impl MVCandidateRD { /// Creates an 'empty' value. /// /// To be considered empty, cost is set higher than any naturally occurring /// cost value. The idea is that comparing to any valid rd output, the search /// result will always be replaced. 
#[inline(always)] const fn empty() -> MVCandidateRD { MVCandidateRD { sad: u32::MAX, cost: u64::MAX } } } #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub enum MVSamplingMode { INIT, CORNER { right: bool, bottom: bool }, } pub fn estimate_tile_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, inter_cfg: &InterConfig, ) { let init_size = MIB_SIZE_LOG2; let mut prev_ssdec: Option = None; for mv_size_in_b_log2 in (2..=init_size).rev() { let init = mv_size_in_b_log2 == init_size; // Choose subsampling. Pass one is quarter res and pass two is at half res. let ssdec = match init_size - mv_size_in_b_log2 { 0 => 2, 1 => 1, _ => 0, }; let new_subsampling = if let Some(prev) = prev_ssdec { prev != ssdec } else { false }; prev_ssdec = Some(ssdec); // 0.5 and 0.125 are a fudge factors let lambda = (fi.me_lambda * 256.0 / (1 << (2 * ssdec)) as f64 * if ssdec == 0 { 0.5 } else { 0.125 }) as u32; for sby in 0..ts.sb_height { for sbx in 0..ts.sb_width { let mut tested_frames_flags = 0; for &ref_frame in inter_cfg.allowed_ref_frames() { let frame_flag = 1 << fi.ref_frames[ref_frame.to_index()]; if tested_frames_flags & frame_flag == frame_flag { continue; } tested_frames_flags |= frame_flag; let tile_bo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }) .block_offset(0, 0); if new_subsampling { refine_subsampled_sb_motion( fi, ts, ref_frame, mv_size_in_b_log2 + 1, tile_bo, ssdec, lambda, ); } estimate_sb_motion( fi, ts, ref_frame, mv_size_in_b_log2, tile_bo, init, ssdec, lambda, ); } } } } } fn estimate_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, init: bool, ssdec: u8, lambda: u32, ) { let pix_offset = tile_bo.to_luma_plane_offset(); let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); let mv_size = MI_SIZE << mv_size_in_b_log2; // Process in blocks, cropping at edges. for y in (0..sb_h).step_by(mv_size) { for x in (0..sb_w).step_by(mv_size) { let corner: MVSamplingMode = if init { MVSamplingMode::INIT } else { // Processing the block a size up produces data that can be used by // the right and bottom corners. MVSamplingMode::CORNER { right: x & mv_size == mv_size, bottom: y & mv_size == mv_size, } }; let sub_bo = tile_bo .with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); // Clamp to frame edge, rounding up in the case of subsampling. // The rounding makes some assumptions about how subsampling is done. let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; // Run motion estimation. // Note that the initial search (init) instructs the called function to // perform a more extensive search. 
if let Some(results) = estimate_motion( fi, ts, w, h, sub_bo, ref_frame, None, corner, init, ssdec, Some(lambda), ) { // normalize sad to 128x128 block let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32; save_me_stats( ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats { mv: results.mv, normalized_sad: sad }, ); } } } } fn refine_subsampled_sb_motion( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, ref_frame: RefType, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ssdec: u8, lambda: u32, ) { let pix_offset = tile_bo.to_luma_plane_offset(); let sb_h: usize = SB_SIZE.min(ts.height - pix_offset.y as usize); let sb_w: usize = SB_SIZE.min(ts.width - pix_offset.x as usize); let mv_size = MI_SIZE << mv_size_in_b_log2; // Process in blocks, cropping at edges. for y in (0..sb_h).step_by(mv_size) { for x in (0..sb_w).step_by(mv_size) { let sub_bo = tile_bo .with_offset(x as isize >> MI_SIZE_LOG2, y as isize >> MI_SIZE_LOG2); // Clamp to frame edge, rounding up in the case of subsampling. // The rounding makes some assumptions about how subsampling is done. let w = mv_size.min(sb_w - x + (1 << ssdec) - 1) >> ssdec; let h = mv_size.min(sb_h - y + (1 << ssdec) - 1) >> ssdec; // Refine the existing motion estimate if let Some(results) = refine_subsampled_motion_estimate( fi, ts, w, h, sub_bo, ref_frame, ssdec, lambda, ) { // normalize sad to 128x128 block let sad = (((results.rd.sad as u64) << (MAX_SB_SIZE_LOG2 * 2)) / (w * h) as u64) as u32; save_me_stats( ts, mv_size_in_b_log2, sub_bo, ref_frame, MEStats { mv: results.mv, normalized_sad: sad }, ); } } } } fn save_me_stats( ts: &mut TileStateMut<'_, T>, mv_size_in_b_log2: usize, tile_bo: TileBlockOffset, ref_frame: RefType, stats: MEStats, ) { let size_in_b = 1 << mv_size_in_b_log2; let tile_me_stats = &mut ts.me_stats[ref_frame.to_index()]; let tile_bo_x_end = (tile_bo.0.x + size_in_b).min(ts.mi_width); let tile_bo_y_end = (tile_bo.0.y + size_in_b).min(ts.mi_height); for mi_y in tile_bo.0.y..tile_bo_y_end { for a in tile_me_stats[mi_y][tile_bo.0.x..tile_bo_x_end].iter_mut() { *a = stats; } } } fn get_mv_range( w_in_b: usize, h_in_b: usize, bo: PlaneBlockOffset, blk_w: usize, blk_h: usize, ) -> (isize, isize, isize, isize) { let border_w = 128 + blk_w as isize * 8; let border_h = 128 + blk_h as isize * 8; let mvx_min = -(bo.0.x as isize) * (8 * MI_SIZE) as isize - border_w; let mvx_max = ((w_in_b - bo.0.x) as isize - (blk_w / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_w; let mvy_min = -(bo.0.y as isize) * (8 * MI_SIZE) as isize - border_h; let mvy_max = ((h_in_b - bo.0.y) as isize - (blk_h / MI_SIZE) as isize) * (8 * MI_SIZE) as isize + border_h; // use crate::context::{MV_LOW, MV_UPP}; ( mvx_min.max(MV_LOW as isize + 1), mvx_max.min(MV_UPP as isize - 1), mvy_min.max(MV_LOW as isize + 1), mvy_max.min(MV_UPP as isize - 1), ) } struct MotionEstimationSubsets { min_sad: u32, median: Option, subset_b: ArrayVec, subset_c: ArrayVec, } impl MotionEstimationSubsets { fn all_mvs(&self) -> ArrayVec { let mut all = ArrayVec::new(); if let Some(median) = self.median { all.push(median); } all.extend(self.subset_b.iter().copied()); all.extend(self.subset_c.iter().copied()); all } } #[profiling::function] fn get_subset_predictors( tile_bo: TileBlockOffset, tile_me_stats: &TileMEStats<'_>, frame_ref_opt: Option>, ref_frame_id: usize, pix_w: usize, pix_h: usize, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, corner: MVSamplingMode, ssdec: u8, ) -> MotionEstimationSubsets { let mut min_sad: u32 = 
u32::MAX; let mut subset_b = ArrayVec::::new(); let mut subset_c = ArrayVec::::new(); // rounded up width in blocks let w = ((pix_w << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; let h = ((pix_h << ssdec) + MI_SIZE - 1) >> MI_SIZE_LOG2; // Get predictors from the same frame. let clipped_half_w = (w >> 1).min(tile_me_stats.cols() - 1 - tile_bo.0.x); let clipped_half_h = (h >> 1).min(tile_me_stats.rows() - 1 - tile_bo.0.y); let mut process_cand = |stats: MEStats| -> MotionVector { min_sad = min_sad.min(stats.normalized_sad); let mv = stats.mv.quantize_to_fullpel(); MotionVector { col: clamp(mv.col as isize, mvx_min, mvx_max) as i16, row: clamp(mv.row as isize, mvy_min, mvy_max) as i16, } }; // Sample the middle of all block edges bordering this one. // Note: If motion vectors haven't been precomputed to a given blocksize, then // the right and bottom edges will be duplicates of the center predictor when // processing in raster order. // left if tile_bo.0.x > 0 { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x - 1], )); } // top if tile_bo.0.y > 0 { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y - 1][tile_bo.0.x + clipped_half_w], )); } // Sampling far right and far bottom edges was tested, but had worse results // without an extensive threshold test (with threshold being applied after // checking median and the best of each subset). // right if let MVSamplingMode::CORNER { right: true, bottom: _ } = corner { if tile_bo.0.x + w < tile_me_stats.cols() { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h][tile_bo.0.x + w], )); } } // bottom if let MVSamplingMode::CORNER { right: _, bottom: true } = corner { if tile_bo.0.y + h < tile_me_stats.rows() { subset_b.push(process_cand( tile_me_stats[tile_bo.0.y + h][tile_bo.0.x + clipped_half_w], )); } } let median = if corner != MVSamplingMode::INIT { // Sample the center of the current block. Some(process_cand( tile_me_stats[tile_bo.0.y + clipped_half_h] [tile_bo.0.x + clipped_half_w], )) } else if subset_b.len() != 3 { None } else { let mut rows: ArrayVec = subset_b.iter().map(|&a| a.row).collect(); let mut cols: ArrayVec = subset_b.iter().map(|&a| a.col).collect(); rows.as_mut_slice().sort_unstable(); cols.as_mut_slice().sort_unstable(); Some(MotionVector { row: rows[1], col: cols[1] }) }; // Zero motion vector, don't use add_cand since it skips zero vectors. subset_b.push(MotionVector::default()); // EPZS subset C predictors. // Sample the middle of bordering side of the left, right, top and bottom // blocks of the previous frame. // Sample the middle of this block in the previous frame. 
if let Some(frame_me_stats) = frame_ref_opt { let prev_frame = &frame_me_stats[ref_frame_id]; let frame_bo = PlaneBlockOffset(BlockOffset { x: tile_me_stats.x() + tile_bo.0.x, y: tile_me_stats.y() + tile_bo.0.y, }); let clipped_half_w = (w >> 1).min(prev_frame.cols - 1 - frame_bo.0.x); let clipped_half_h = (h >> 1).min(prev_frame.rows - 1 - frame_bo.0.y); // left if frame_bo.0.x > 0 { subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x - 1], )); } // top if frame_bo.0.y > 0 { subset_c.push(process_cand( prev_frame[frame_bo.0.y - 1][frame_bo.0.x + clipped_half_w], )); } // right if frame_bo.0.x + w < prev_frame.cols { subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + w], )); } // bottom if frame_bo.0.y + h < prev_frame.rows { subset_c.push(process_cand( prev_frame[frame_bo.0.y + h][frame_bo.0.x + clipped_half_w], )); } subset_c.push(process_cand( prev_frame[frame_bo.0.y + clipped_half_h][frame_bo.0.x + clipped_half_w], )); } // Undo normalization to 128x128 block size let min_sad = ((min_sad as u64 * (pix_w * pix_h) as u64) >> (MAX_SB_SIZE_LOG2 * 2)) as u32; let dec_mv = |mv: MotionVector| MotionVector { col: mv.col >> ssdec, row: mv.row >> ssdec, }; let median = median.map(dec_mv); for mv in subset_b.iter_mut() { *mv = dec_mv(*mv); } for mv in subset_c.iter_mut() { *mv = dec_mv(*mv); } MotionEstimationSubsets { min_sad, median, subset_b, subset_c } } pub fn estimate_motion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, pmv: Option<[MotionVector; 2]>, corner: MVSamplingMode, extensive_search: bool, ssdec: u8, lambda: Option, ) -> Option { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let frame_bo = ts.to_frame_block_offset(tile_bo); let (mvx_min, mvx_max, mvy_min, mvy_max) = get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); let lambda = lambda.unwrap_or({ // 0.5 is a fudge factor (fi.me_lambda * 256.0 * 0.5) as u32 }); let global_mv = [MotionVector { row: 0, col: 0 }; 2]; let po = frame_bo.to_luma_plane_offset(); let (mvx_min, mvx_max, mvy_min, mvy_max) = (mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec); let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec }; let p_ref = match ssdec { 0 => &rec.frame.planes[0], 1 => &rec.input_hres, 2 => &rec.input_qres, _ => unimplemented!(), }; let org_region = &match ssdec { 0 => ts.input_tile.planes[0] .subregion(Area::BlockStartingAt { bo: tile_bo.0 }), 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), _ => unimplemented!(), }; let mut best: MotionSearchResult = full_pixel_me( fi, ts, org_region, p_ref, tile_bo, po, lambda, pmv.unwrap_or(global_mv), w, h, mvx_min, mvx_max, mvy_min, mvy_max, ref_frame, corner, extensive_search, ssdec, ); if let Some(pmv) = pmv { let use_satd: bool = fi.config.speed_settings.motion.use_satd_subpel; if use_satd { best.rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, fi.sequence.bit_depth, pmv, lambda, use_satd, mvx_min, mvx_max, mvy_min, mvy_max, w, h, best.mv, ); } sub_pixel_me( fi, po, org_region, p_ref, lambda, pmv, mvx_min, mvx_max, mvy_min, mvy_max, w, h, use_satd, &mut best, ref_frame, ); } // Scale motion vectors to full res size best.mv = best.mv << ssdec; Some(best) } else { None } } /// Refine motion estimation that was computed one level of subsampling up. 
fn refine_subsampled_motion_estimate( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, w: usize, h: usize, tile_bo: TileBlockOffset, ref_frame: RefType, ssdec: u8, lambda: u32, ) -> Option { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let frame_bo = ts.to_frame_block_offset(tile_bo); let (mvx_min, mvx_max, mvy_min, mvy_max) = get_mv_range(fi.w_in_b, fi.h_in_b, frame_bo, w << ssdec, h << ssdec); let pmv = [MotionVector { row: 0, col: 0 }; 2]; let po = frame_bo.to_luma_plane_offset(); let (mvx_min, mvx_max, mvy_min, mvy_max) = (mvx_min >> ssdec, mvx_max >> ssdec, mvy_min >> ssdec, mvy_max >> ssdec); let po = PlaneOffset { x: po.x >> ssdec, y: po.y >> ssdec }; let p_ref = match ssdec { 0 => &rec.frame.planes[0], 1 => &rec.input_hres, 2 => &rec.input_qres, _ => unimplemented!(), }; let org_region = &match ssdec { 0 => ts.input_tile.planes[0] .subregion(Area::BlockStartingAt { bo: tile_bo.0 }), 1 => ts.input_hres.region(Area::StartingAt { x: po.x, y: po.y }), 2 => ts.input_qres.region(Area::StartingAt { x: po.x, y: po.y }), _ => unimplemented!(), }; let mv = ts.me_stats[ref_frame.to_index()][tile_bo.0.y][tile_bo.0.x].mv >> ssdec; // Given a motion vector at 0 at higher subsampling: // | -1 | 0 | 1 | // then the vectors at -1 to 2 should be tested at the current subsampling. // |-------------| // | -2 -1 | 0 1 | 2 3 | // This corresponds to a 4x4 full search. let x_lo = po.x + (mv.col as isize / 8 - 1).max(mvx_min / 8); let x_hi = po.x + (mv.col as isize / 8 + 2).min(mvx_max / 8); let y_lo = po.y + (mv.row as isize / 8 - 1).max(mvy_min / 8); let y_hi = po.y + (mv.row as isize / 8 + 2).min(mvy_max / 8); let mut results = full_search( fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, 1, lambda, pmv, ); // Scale motion vectors to full res size results.mv = results.mv << ssdec; Some(results) } else { None } } #[profiling::function] fn full_pixel_me( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, org_region: &PlaneRegion, p_ref: &Plane, tile_bo: TileBlockOffset, po: PlaneOffset, lambda: u32, pmv: [MotionVector; 2], w: usize, h: usize, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, ref_frame: RefType, corner: MVSamplingMode, extensive_search: bool, ssdec: u8, ) -> MotionSearchResult { let ref_frame_id = ref_frame.to_index(); let tile_me_stats = &ts.me_stats[ref_frame_id].as_const(); let frame_ref = fi.rec_buffer.frames[fi.ref_frames[0] as usize] .as_ref() .map(|frame_ref| frame_ref.frame_me_stats.read().expect("poisoned lock")); let subsets = get_subset_predictors( tile_bo, tile_me_stats, frame_ref, ref_frame_id, w, h, mvx_min, mvx_max, mvy_min, mvy_max, corner, ssdec, ); let try_cands = |predictors: &[MotionVector], best: &mut MotionSearchResult| { let mut results = get_best_predictor( fi, po, org_region, p_ref, predictors, fi.sequence.bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, ); fullpel_diamond_search( fi, po, org_region, p_ref, &mut results, fi.sequence.bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, ); if results.rd.cost < best.rd.cost { *best = results; } }; let mut best: MotionSearchResult = MotionSearchResult::empty(); if !extensive_search { try_cands(&subsets.all_mvs(), &mut best); best } else { // Perform a more thorough search before resorting to full search. // Search the median, the best mvs of neighboring blocks, and motion vectors // from the previous frame. Stop once a candidate with a sad less than a // threshold is found. 
let thresh = (subsets.min_sad as f32 * 1.2) as u32 + (((w * h) as u32) << (fi.sequence.bit_depth - 8)); if let Some(median) = subsets.median { try_cands(&[median], &mut best); if best.rd.sad < thresh { return best; } } try_cands(&subsets.subset_b, &mut best); if best.rd.sad < thresh { return best; } try_cands(&subsets.subset_c, &mut best); if best.rd.sad < thresh { return best; } // Preform UMH search, either as the last possible search when full search // is disabled, or as the last search before resorting to full search. uneven_multi_hex_search( fi, po, org_region, p_ref, &mut best, fi.sequence.bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, // Use 24, since it is the largest range that x264 uses. 24, ); if !fi.config.speed_settings.motion.me_allow_full_search || best.rd.sad < thresh { return best; } { let range_x = (192 * fi.me_range_scale as isize) >> ssdec; let range_y = (64 * fi.me_range_scale as isize) >> ssdec; let x_lo = po.x + (-range_x).max(mvx_min / 8); let x_hi = po.x + (range_x).min(mvx_max / 8); let y_lo = po.y + (-range_y).max(mvy_min / 8); let y_hi = po.y + (range_y).min(mvy_max / 8); let results = full_search( fi, x_lo, x_hi, y_lo, y_hi, w, h, org_region, p_ref, po, // Full search is run at quarter resolution, except for short edges. // When subsampling is lower than usual, the step size is raised so that // the number of search locations stays the same. 4 >> ssdec, lambda, [MotionVector::default(); 2], ); if results.rd.cost < best.rd.cost { results } else { best } } } } fn sub_pixel_me( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, lambda: u32, pmv: [MotionVector; 2], mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, use_satd: bool, best: &mut MotionSearchResult, ref_frame: RefType, ) { subpel_diamond_search( fi, po, org_region, p_ref, fi.sequence.bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, use_satd, best, ref_frame, ); } #[profiling::function] fn get_best_predictor( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, predictors: &[MotionVector], bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, ) -> MotionSearchResult { let mut best: MotionSearchResult = MotionSearchResult::empty(); for &init_mv in predictors.iter() { let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, init_mv, ); if rd.cost < best.rd.cost { best.mv = init_mv; best.rd = rd; } } best } /// Declares an array of motion vectors in structure of arrays syntax. /// Compared to [`search_pattern_subpel`], this version creates motion vectors /// in fullpel resolution (x8). macro_rules! search_pattern { ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { [ $(MotionVector { $field_a: $ll_a << 3, $field_b: $ll_b << 3 } ),*] }; } /// Declares an array of motion vectors in structure of arrays syntax. macro_rules! search_pattern_subpel { ($field_a:ident: [$($ll_a:expr),*], $field_b:ident: [$($ll_b:expr),*]) => { [ $(MotionVector { $field_a: $ll_a, $field_b: $ll_b } ),*] }; } /// Diamond pattern of radius 1 as shown below. For fullpel search, use /// `DIAMOND_R1_PATTERN_FULLPEL` since it has been scaled for fullpel search. /// ```text /// X /// XoX /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. 
/// const DIAMOND_R1_PATTERN_SUBPEL: [MotionVector; 4] = search_pattern_subpel!( col: [ 0, 1, 0, -1], row: [ 1, 0, -1, 0] ); /// Diamond pattern of radius 1 as shown below. Unlike `DIAMOND_R1_PATTERN`, the /// vectors have been shifted fullpel scale. /// ```text /// X /// XoX /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. const DIAMOND_R1_PATTERN: [MotionVector; 4] = search_pattern!( col: [ 0, 1, 0, -1], row: [ 1, 0, -1, 0] ); /// Run a full pixel diamond search. The search is run on multiple step sizes. /// /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. #[profiling::function] fn fullpel_diamond_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, ) { // Define the initial and the final scale (log2) of the diamond. let (mut diamond_radius_log2, diamond_radius_end_log2) = (1u8, 0u8); loop { // Find the best candidate from the diamond pattern. let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &DIAMOND_R1_PATTERN { let cand_mv = current.mv + (offset << diamond_radius_log2); let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } // Continue the search at this scale until the can't find a better candidate // to use. if current.rd.cost <= best_cand.rd.cost { if diamond_radius_log2 == diamond_radius_end_log2 { break; } else { diamond_radius_log2 -= 1; } } else { *current = best_cand; } } assert!(!current.is_empty()); } /// A hexagon pattern around a center point. The pattern is ordered so that the /// offsets circle around the center. This is done to allow pruning locations /// covered by the last iteration. /// ```text /// 21012 /// 2 X X /// 1 /// 0 X o X /// 1 /// 2 X X /// ``` /// 'X's are motion candidates and the 'o' is the center. /// /// The illustration below shows the process of a hexagon search. /// ```text /// Step 1 Step 2 /// 1 1 1 1 2 /// /// 1(0)1 => 1 0(1)2 /// /// 1 1 1 1 2 /// ``` /// The search above has gone through the following steps. /// 1. Search '1' elements for better candidates than the center '0'. /// 2. Recenter around the best candidate ('(1)') and hexagon candidates that /// don't overlap with the previous search step (labeled '2'). const HEXAGON_PATTERN: [MotionVector; 6] = search_pattern!( col: [ 0, 2, 2, 0, -2, -2], row: [ -2, -1, 1, 2, 1, -1] ); /// A small square pattern around a center point. /// ```text /// 101 /// 1 XXX /// 0 XoX /// 1 XXX /// ``` /// 'X's are motion candidates and the 'o' is the center. const SQUARE_REFINE_PATTERN: [MotionVector; 8] = search_pattern!( col: [ -1, 0, 1, -1, 1, -1, 0, 1], row: [ 1, 1, 1, 0, 0, -1, -1, -1] ); /// Perform hexagon search and refine afterwards. /// /// In the hexagon search stage, candidate motion vectors are examined for /// improvement to the current search location. The search location is moved to /// the best candidate (if any). This is repeated until the search location /// stops moving. /// /// Refinement uses a square pattern that fits between the hexagon candidates. 
/// /// The hexagon pattern is defined by [`HEXAGON_PATTERN`] and the refinement /// is defined by [`SQUARE_REFINE_PATTERN`]. /// /// `current` provides the initial search location and serves as /// the output for the final search results. #[profiling::function] fn hexagon_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, ) { // The first iteration of hexagon search is implemented separate from // subsequent iterations, which overlap with previous iterations. // Holds what candidate is used (if any). This is used to determine which // candidates have already been tested in a previous iteration and can be // skipped. let mut best_cand_idx: usize = 0; let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); // First iteration of hexagon search. There are six candidates to consider. for i in 0..6 { let cand_mv = current.mv + HEXAGON_PATTERN[i]; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { best_cand_idx = i; best_cand.mv = cand_mv; best_cand.rd = rd; } } // Run additional iterations of hexagon search until the search location // doesn't update. while best_cand.rd.cost < current.rd.cost { // Update the search location. *current = best_cand; best_cand = MotionSearchResult::empty(); // Save the index/direction taken in the previous iteration to the current // search location. let center_cand_idx = best_cand_idx; // Look only at candidates that don't overlap with previous iterations. This // corresponds with the three offsets (2D) with the closest direction to // that traveled by the previous iteration. HEXAGON_PATTERN has clockwise // order, so the last direction -1, +0, and +1 (mod 6) give the indices for // these offsets. for idx_offset_mod6 in 5..=7 { let i = (center_cand_idx + idx_offset_mod6) % 6; let cand_mv = current.mv + HEXAGON_PATTERN[i]; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { best_cand_idx = i; best_cand.mv = cand_mv; best_cand.rd = rd; } } } // Refine the motion after completing hexagon search. let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &SQUARE_REFINE_PATTERN { let cand_mv = current.mv + offset; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } if best_cand.rd.cost < current.rd.cost { *current = best_cand; } assert!(!current.is_empty()); } /// Uneven multi-hexagon search pattern around a center point. Used for locating /// irregular movement. /// ```text /// X /// X X /// X X /// X X /// X o X /// X X /// X X /// X X /// X /// ``` /// 'X's are motion candidates and the 'o' is the center. const UMH_PATTERN: [MotionVector; 16] = search_pattern!( col: [ -2, -1, 0, 1, 2, 3, 4, 3, 2, 1, 0, -1, -2, 3, -4, -3], row: [ 4, 4, 4, 4, 4, 2, 0, -2, -4, -4, -4, -4, -4, -2, 0, 2] ); /// Perform an uneven multi-hexagon search. There are 4 stages: /// 1. Unsymmetrical-cross search: Search the horizontal and vertical directions /// for the general direction of the motion. /// 2. 
A 5x5 full search is done to refine the current candidate. /// 3. Uneven multi-hexagon search. See [`UMH_PATTERN`]. /// 4. Refinement using standard hexagon search. /// /// `current` provides the initial search location and serves as /// the output for the final search results. /// /// `me_range` parameter determines how far these stages can search. #[profiling::function] fn uneven_multi_hex_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, current: &mut MotionSearchResult, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, me_range: i16, ) { assert!(!current.is_empty()); // Search in a cross pattern to obtain a rough approximate of motion. // The cross is split into a horizontal and vertical component. Video content // tends to have more horizontal motion, so the horizontal part of the cross // is twice as large as the vertical half. // X - // | <- me_range/2 // X | // X X X XoX X X X - // X // // X // |------| // \ // me_range let center = current.mv; // The larger, horizontal, part of the cross search. for i in (1..=me_range).step_by(2) { const HORIZONTAL_LINE: [MotionVector; 2] = search_pattern!( col: [ 0, 0], row: [-1, 1] ); for &offset in &HORIZONTAL_LINE { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // The smaller, vertical, part of the cross search for i in (1..=me_range >> 1).step_by(2) { const VERTICAL_LINE: [MotionVector; 2] = search_pattern!( col: [-1, 1], row: [ 0, 0] ); for &offset in &VERTICAL_LINE { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // 5x5 full search. Search a 5x5 square region around the current best mv. let center = current.mv; for row in -2..=2 { for col in -2..=2 { if row == 0 && col == 0 { continue; } let cand_mv = center + MotionVector { row, col }; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // Run the hexagons in uneven multi-hexagon. The hexagonal pattern is tested // around the best vector at multiple scales. // Example of the UMH pattern run on a scale of 1 and 2: // 2 - // | <- me_range // 2 2 | // | // 2 1 2 | // 1 1 | // 2 1 1 2 | // 1 1 | // 2 1 o 1 2 | // 1 1 | // 2 1 1 2 | // 1 1 | // 2 1 2 | // | // 2 2 | // | // 2 - // |---------------| // \ // me_range let center = current.mv; // Divide by 4, the radius of the UMH's hexagon. let iterations = me_range >> 2; for i in 1..=iterations { for &offset in &UMH_PATTERN { let cand_mv = center + offset * i; let rd = get_fullpel_mv_rd( fi, po, org_region, p_ref, bit_depth, pmv, lambda, false, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, ); if rd.cost < current.rd.cost { current.mv = cand_mv; current.rd = rd; } } } // Refine the search results using a 'normal' hexagon search. hexagon_search( fi, po, org_region, p_ref, current, bit_depth, pmv, lambda, mvx_min, mvx_max, mvy_min, mvy_max, w, h, ); } /// Run a subpixel diamond search. The search is run on multiple step sizes. 
/// /// For each step size, candidate motion vectors are examined for improvement /// to the current search location. The search location is moved to the best /// candidate (if any). This is repeated until the search location stops moving. #[profiling::function] fn subpel_diamond_search( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, _p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, use_satd: bool, current: &mut MotionSearchResult, ref_frame: RefType, ) { use crate::util::Aligned; // Motion compensation assembly has special requirements for edges let mc_w = w.next_power_of_two(); let mc_h = (h + 1) & !1; // Metadata for subpel scratch pad. let cfg = PlaneConfig::new(mc_w, mc_h, 0, 0, 0, 0, std::mem::size_of::()); // Stack allocation for subpel scratch pad. // SAFETY: We write to the array below before reading from it. let mut buf: Aligned<[T; 128 * 128]> = unsafe { Aligned::uninitialized() }; let mut tmp_region = PlaneRegionMut::from_slice( &mut buf.data, &cfg, Rect { x: 0, y: 0, width: cfg.width, height: cfg.height }, ); // start at 1/2 pel and end at 1/4 or 1/8 pel let (mut diamond_radius_log2, diamond_radius_end_log2) = (2u8, u8::from(!fi.allow_high_precision_mv)); loop { // Find the best candidate from the diamond pattern. let mut best_cand: MotionSearchResult = MotionSearchResult::empty(); for &offset in &DIAMOND_R1_PATTERN_SUBPEL { let cand_mv = current.mv + (offset << diamond_radius_log2); let rd = get_subpel_mv_rd( fi, po, org_region, bit_depth, pmv, lambda, use_satd, mvx_min, mvx_max, mvy_min, mvy_max, w, h, cand_mv, &mut tmp_region, ref_frame, ); if rd.cost < best_cand.rd.cost { best_cand.mv = cand_mv; best_cand.rd = rd; } } // Continue the search at this scale until a better candidate isn't found. if current.rd.cost <= best_cand.rd.cost { if diamond_radius_log2 == diamond_radius_end_log2 { break; } else { diamond_radius_log2 -= 1; } } else { *current = best_cand; } } assert!(!current.is_empty()); } #[inline] fn get_fullpel_mv_rd( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, p_ref: &Plane, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min || (cand_mv.col as isize) > mvx_max || (cand_mv.row as isize) < mvy_min || (cand_mv.row as isize) > mvy_max { return MVCandidateRD::empty(); } // Convert the motion vector into an full pixel offset. 
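// Motion vectors are stored in 1/8-pel units, so an integer division by 8
// gives the whole-pixel displacement (e.g. cand_mv.col == 24 selects a region
// three full-pel columns to the right of `po`).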
let plane_ref = p_ref.region(Area::StartingAt { x: po.x + (cand_mv.col / 8) as isize, y: po.y + (cand_mv.row / 8) as isize, }); compute_mv_rd( fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, &plane_ref, ) } fn get_subpel_mv_rd( fi: &FrameInvariants, po: PlaneOffset, org_region: &PlaneRegion, bit_depth: usize, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, mvx_min: isize, mvx_max: isize, mvy_min: isize, mvy_max: isize, w: usize, h: usize, cand_mv: MotionVector, tmp_region: &mut PlaneRegionMut, ref_frame: RefType, ) -> MVCandidateRD { if (cand_mv.col as isize) < mvx_min || (cand_mv.col as isize) > mvx_max || (cand_mv.row as isize) < mvy_min || (cand_mv.row as isize) > mvy_max { return MVCandidateRD::empty(); } let tmp_width = tmp_region.rect().width; let tmp_height = tmp_region.rect().height; let tile_rect = TileRect { x: 0, y: 0, width: tmp_width, height: tmp_height }; PredictionMode::NEWMV.predict_inter_single( fi, tile_rect, 0, po, tmp_region, // motion comp's w & h on edges can be different than distortion's tmp_width, tmp_height, ref_frame, cand_mv, ); let plane_ref = tmp_region.as_const(); compute_mv_rd( fi, pmv, lambda, use_satd, bit_depth, w, h, cand_mv, org_region, &plane_ref, ) } /// Compute the rate distortion stats for a motion vector. #[inline(always)] fn compute_mv_rd( fi: &FrameInvariants, pmv: [MotionVector; 2], lambda: u32, use_satd: bool, bit_depth: usize, w: usize, h: usize, cand_mv: MotionVector, plane_org: &PlaneRegion<'_, T>, plane_ref: &PlaneRegion<'_, T>, ) -> MVCandidateRD { let sad = if use_satd { get_satd(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level) } else { get_sad(plane_org, plane_ref, w, h, bit_depth, fi.cpu_feature_level) }; let rate1 = get_mv_rate(cand_mv, pmv[0], fi.allow_high_precision_mv); let rate2 = get_mv_rate(cand_mv, pmv[1], fi.allow_high_precision_mv); let rate = rate1.min(rate2 + 1); MVCandidateRD { cost: 256 * sad as u64 + rate as u64 * lambda as u64, sad } } #[profiling::function] fn full_search( fi: &FrameInvariants, x_lo: isize, x_hi: isize, y_lo: isize, y_hi: isize, w: usize, h: usize, org_region: &PlaneRegion, p_ref: &Plane, po: PlaneOffset, step: usize, lambda: u32, pmv: [MotionVector; 2], ) -> MotionSearchResult { let search_region = p_ref.region(Area::Rect { x: x_lo, y: y_lo, width: (x_hi - x_lo) as usize + w, height: (y_hi - y_lo) as usize + h, }); let mut best: MotionSearchResult = MotionSearchResult::empty(); // Select rectangular regions within search region with vert+horz windows for vert_window in search_region.vert_windows(h).step_by(step) { for ref_window in vert_window.horz_windows(w).step_by(step) { let &Rect { x, y, .. } = ref_window.rect(); let mv = MotionVector { row: 8 * (y as i16 - po.y as i16), col: 8 * (x as i16 - po.x as i16), }; let rd = compute_mv_rd( fi, pmv, lambda, false, fi.sequence.bit_depth, w, h, mv, org_region, &ref_window, ); if rd.cost < best.rd.cost { best.rd = rd; best.mv = mv; } } } best } #[inline(always)] fn get_mv_rate( a: MotionVector, b: MotionVector, allow_high_precision_mv: bool, ) -> u32 { #[inline(always)] fn diff_to_rate(diff: i16, allow_high_precision_mv: bool) -> u32 { let d = if allow_high_precision_mv { diff } else { diff >> 1 }; 2 * ILog::ilog(d.abs()) as u32 } diff_to_rate(a.row - b.row, allow_high_precision_mv) + diff_to_rate(a.col - b.col, allow_high_precision_mv) } rav1e-0.7.1/src/partition.rs000064400000000000000000000737261046102023000140340ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_camel_case_types)] #![allow(dead_code)] use self::BlockSize::*; use self::TxSize::*; use crate::context::*; use crate::frame::*; use crate::predict::*; use crate::recon_intra::*; use crate::serialize::{Deserialize, Serialize}; use crate::tiling::*; use crate::transform::TxSize; use crate::util::*; use thiserror::Error; use std::mem::transmute; use std::mem::MaybeUninit; // LAST_FRAME through ALTREF_FRAME correspond to slots 0-6. #[derive(PartialEq, Eq, PartialOrd, Copy, Clone, Debug)] pub enum RefType { INTRA_FRAME = 0, LAST_FRAME = 1, LAST2_FRAME = 2, LAST3_FRAME = 3, GOLDEN_FRAME = 4, BWDREF_FRAME = 5, ALTREF2_FRAME = 6, ALTREF_FRAME = 7, NONE_FRAME = 8, } impl RefType { /// convert to a ref list index, 0-6 (`INTER_REFS_PER_FRAME`) /// /// # Panics /// /// - If the ref type is a None or Intra frame #[inline] pub fn to_index(self) -> usize { match self { NONE_FRAME => { panic!("Tried to get slot of NONE_FRAME"); } INTRA_FRAME => { panic!("Tried to get slot of INTRA_FRAME"); } _ => (self as usize) - 1, } } #[inline] pub const fn is_fwd_ref(self) -> bool { (self as usize) < 5 } #[inline] pub const fn is_bwd_ref(self) -> bool { (self as usize) >= 5 } } use self::RefType::*; use std::fmt; use std::fmt::Display; pub const ALL_INTER_REFS: [RefType; 7] = [ LAST_FRAME, LAST2_FRAME, LAST3_FRAME, GOLDEN_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME, ]; pub const LAST_LAST2_FRAMES: usize = 0; // { LAST_FRAME, LAST2_FRAME } pub const LAST_LAST3_FRAMES: usize = 1; // { LAST_FRAME, LAST3_FRAME } pub const LAST_GOLDEN_FRAMES: usize = 2; // { LAST_FRAME, GOLDEN_FRAME } pub const BWDREF_ALTREF_FRAMES: usize = 3; // { BWDREF_FRAME, ALTREF_FRAME } pub const LAST2_LAST3_FRAMES: usize = 4; // { LAST2_FRAME, LAST3_FRAME } pub const LAST2_GOLDEN_FRAMES: usize = 5; // { LAST2_FRAME, GOLDEN_FRAME } pub const LAST3_GOLDEN_FRAMES: usize = 6; // { LAST3_FRAME, GOLDEN_FRAME } pub const BWDREF_ALTREF2_FRAMES: usize = 7; // { BWDREF_FRAME, ALTREF2_FRAME } pub const ALTREF2_ALTREF_FRAMES: usize = 8; // { ALTREF2_FRAME, ALTREF_FRAME } pub const TOTAL_UNIDIR_COMP_REFS: usize = 9; // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs // that are explicitly signaled. 
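// With BWDREF_ALTREF_FRAMES == 3 this evaluates to 4, covering the pairs
// { LAST, LAST2 }, { LAST, LAST3 }, { LAST, GOLDEN } and { BWDREF, ALTREF }.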
pub const UNIDIR_COMP_REFS: usize = BWDREF_ALTREF_FRAMES + 1; pub const FWD_REFS: usize = 4; pub const BWD_REFS: usize = 3; pub const SINGLE_REFS: usize = 7; pub const TOTAL_REFS_PER_FRAME: usize = 8; pub const INTER_REFS_PER_FRAME: usize = 7; pub const TOTAL_COMP_REFS: usize = FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS; pub const REF_FRAMES_LOG2: usize = 3; pub const REF_FRAMES: usize = 1 << REF_FRAMES_LOG2; pub const REF_CONTEXTS: usize = 3; pub const MVREF_ROW_COLS: usize = 3; #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Debug)] pub enum PartitionType { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, PARTITION_HORZ_A, // HORZ split and the top partition is split again PARTITION_HORZ_B, // HORZ split and the bottom partition is split again PARTITION_VERT_A, // VERT split and the left partition is split again PARTITION_VERT_B, // VERT split and the right partition is split again PARTITION_HORZ_4, // 4:1 horizontal partition PARTITION_VERT_4, // 4:1 vertical partition PARTITION_INVALID, } #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum BlockSize { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, BLOCK_64X64, BLOCK_64X128, BLOCK_128X64, BLOCK_128X128, BLOCK_4X16, BLOCK_16X4, BLOCK_8X32, BLOCK_32X8, BLOCK_16X64, BLOCK_64X16, } #[derive(Debug, Error, Copy, Clone, Eq, PartialEq)] pub struct InvalidBlockSize; impl Display for InvalidBlockSize { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str("invalid block size") } } impl PartialOrd for BlockSize { #[inline(always)] fn partial_cmp(&self, other: &Self) -> Option { use std::cmp::Ordering::{Equal, Greater, Less}; match ( self.width().cmp(&other.width()), self.height().cmp(&other.height()), ) { (Greater, Less) | (Less, Greater) => None, (Equal, Equal) => Some(Equal), (Greater, _) | (_, Greater) => Some(Greater), (Less, _) | (_, Less) => Some(Less), } } } #[cfg(test)] impl Default for BlockSize { fn default() -> Self { BlockSize::BLOCK_64X64 } } impl BlockSize { pub const BLOCK_SIZES_ALL: usize = 22; pub const BLOCK_SIZES: usize = BlockSize::BLOCK_SIZES_ALL - 6; // BLOCK_SIZES_ALL minus 4:1 non-squares, six of them #[inline] /// # Errors /// /// - Returns `InvalidBlockSize` if the given `w` and `h` do not produce /// a valid block size. pub fn from_width_and_height_opt( w: usize, h: usize, ) -> Result { match (w, h) { (4, 4) => Ok(BLOCK_4X4), (4, 8) => Ok(BLOCK_4X8), (4, 16) => Ok(BLOCK_4X16), (8, 4) => Ok(BLOCK_8X4), (8, 8) => Ok(BLOCK_8X8), (8, 16) => Ok(BLOCK_8X16), (8, 32) => Ok(BLOCK_8X32), (16, 4) => Ok(BLOCK_16X4), (16, 8) => Ok(BLOCK_16X8), (16, 16) => Ok(BLOCK_16X16), (16, 32) => Ok(BLOCK_16X32), (16, 64) => Ok(BLOCK_16X64), (32, 8) => Ok(BLOCK_32X8), (32, 16) => Ok(BLOCK_32X16), (32, 32) => Ok(BLOCK_32X32), (32, 64) => Ok(BLOCK_32X64), (64, 16) => Ok(BLOCK_64X16), (64, 32) => Ok(BLOCK_64X32), (64, 64) => Ok(BLOCK_64X64), (64, 128) => Ok(BLOCK_64X128), (128, 64) => Ok(BLOCK_128X64), (128, 128) => Ok(BLOCK_128X128), _ => Err(InvalidBlockSize), } } /// # Panics /// /// - If the given `w` and `h` do not produce a valid block size. 
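///
/// # Examples
///
/// A short illustrative sketch; the expected values mirror the lookup table
/// in `from_width_and_height_opt` above:
///
/// ```ignore
/// assert_eq!(BlockSize::from_width_and_height(32, 16), BlockSize::BLOCK_32X16);
/// assert_eq!(BlockSize::from_width_and_height(8, 8), BlockSize::BLOCK_8X8);
/// // Dimensions with no matching block size, e.g. (12, 12), panic here.
/// ```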
pub fn from_width_and_height(w: usize, h: usize) -> BlockSize { Self::from_width_and_height_opt(w, h).unwrap() } #[inline] pub fn cfl_allowed(self) -> bool { // TODO: fix me when enabling EXT_PARTITION_TYPES self <= BlockSize::BLOCK_32X32 } #[inline] pub const fn width(self) -> usize { 1 << self.width_log2() } /// width * height #[inline] pub const fn area(self) -> usize { self.width() * self.height() } #[inline] pub const fn width_log2(self) -> usize { match self { BLOCK_4X4 | BLOCK_4X8 | BLOCK_4X16 => 2, BLOCK_8X4 | BLOCK_8X8 | BLOCK_8X16 | BLOCK_8X32 => 3, BLOCK_16X4 | BLOCK_16X8 | BLOCK_16X16 | BLOCK_16X32 | BLOCK_16X64 => 4, BLOCK_32X8 | BLOCK_32X16 | BLOCK_32X32 | BLOCK_32X64 => 5, BLOCK_64X16 | BLOCK_64X32 | BLOCK_64X64 | BLOCK_64X128 => 6, BLOCK_128X64 | BLOCK_128X128 => 7, } } #[inline] pub const fn width_mi_log2(self) -> usize { self.width_log2() - 2 } #[inline] pub const fn width_mi(self) -> usize { self.width() >> MI_SIZE_LOG2 } #[inline] pub fn width_imp_b(self) -> usize { (self.width() >> (IMPORTANCE_BLOCK_TO_BLOCK_SHIFT + BLOCK_TO_PLANE_SHIFT)) .max(1) } #[inline] pub const fn height(self) -> usize { 1 << self.height_log2() } #[inline] pub const fn height_log2(self) -> usize { match self { BLOCK_4X4 | BLOCK_8X4 | BLOCK_16X4 => 2, BLOCK_4X8 | BLOCK_8X8 | BLOCK_16X8 | BLOCK_32X8 => 3, BLOCK_4X16 | BLOCK_8X16 | BLOCK_16X16 | BLOCK_32X16 | BLOCK_64X16 => 4, BLOCK_8X32 | BLOCK_16X32 | BLOCK_32X32 | BLOCK_64X32 => 5, BLOCK_16X64 | BLOCK_32X64 | BLOCK_64X64 | BLOCK_128X64 => 6, BLOCK_64X128 | BLOCK_128X128 => 7, } } #[inline] pub const fn height_mi_log2(self) -> usize { self.height_log2() - 2 } #[inline] pub const fn height_mi(self) -> usize { self.height() >> MI_SIZE_LOG2 } #[inline] pub fn height_imp_b(self) -> usize { (self.height() >> (IMPORTANCE_BLOCK_TO_BLOCK_SHIFT + BLOCK_TO_PLANE_SHIFT)) .max(1) } #[inline] pub const fn tx_size(self) -> TxSize { match self { BLOCK_4X4 => TX_4X4, BLOCK_4X8 => TX_4X8, BLOCK_8X4 => TX_8X4, BLOCK_8X8 => TX_8X8, BLOCK_8X16 => TX_8X16, BLOCK_16X8 => TX_16X8, BLOCK_16X16 => TX_16X16, BLOCK_16X32 => TX_16X32, BLOCK_32X16 => TX_32X16, BLOCK_32X32 => TX_32X32, BLOCK_32X64 => TX_32X64, BLOCK_64X32 => TX_64X32, BLOCK_4X16 => TX_4X16, BLOCK_16X4 => TX_16X4, BLOCK_8X32 => TX_8X32, BLOCK_32X8 => TX_32X8, BLOCK_16X64 => TX_16X64, BLOCK_64X16 => TX_64X16, _ => TX_64X64, } } /// Source: `Subsampled_Size` (AV1 specification section 5.11.38) /// /// # Errors /// /// - Returns `InvalidBlockSize` if the given block size cannot /// be subsampled in the requested way. 
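///
/// For example, with 4:2:0 subsampling (`xdec == 1`, `ydec == 1`) both
/// dimensions are halved and `BLOCK_16X16` maps to `BLOCK_8X8`, while 4:2:2
/// (`xdec == 1`, `ydec == 0`) halves only the width, mapping `BLOCK_16X16`
/// to `BLOCK_8X16`.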
#[inline] pub const fn subsampled_size( self, xdec: usize, ydec: usize, ) -> Result { Ok(match (xdec, ydec) { (0, 0) /* 4:4:4 */ => self, (1, 0) /* 4:2:2 */ => match self { BLOCK_4X4 | BLOCK_8X4 => BLOCK_4X4, BLOCK_8X8 => BLOCK_4X8, BLOCK_16X4 => BLOCK_8X4, BLOCK_16X8 => BLOCK_8X8, BLOCK_16X16 => BLOCK_8X16, BLOCK_32X8 => BLOCK_16X8, BLOCK_32X16 => BLOCK_16X16, BLOCK_32X32 => BLOCK_16X32, BLOCK_64X16 => BLOCK_32X16, BLOCK_64X32 => BLOCK_32X32, BLOCK_64X64 => BLOCK_32X64, BLOCK_128X64 => BLOCK_64X64, BLOCK_128X128 => BLOCK_64X128, _ => return Err(InvalidBlockSize), }, (1, 1) /* 4:2:0 */ => match self { BLOCK_4X4 | BLOCK_4X8 | BLOCK_8X4 | BLOCK_8X8 => BLOCK_4X4, BLOCK_4X16 | BLOCK_8X16 => BLOCK_4X8, BLOCK_8X32 => BLOCK_4X16, BLOCK_16X4 | BLOCK_16X8 => BLOCK_8X4, BLOCK_16X16 => BLOCK_8X8, BLOCK_16X32 => BLOCK_8X16, BLOCK_16X64 => BLOCK_8X32, BLOCK_32X8 => BLOCK_16X4, BLOCK_32X16 => BLOCK_16X8, BLOCK_32X32 => BLOCK_16X16, BLOCK_32X64 => BLOCK_16X32, BLOCK_64X16 => BLOCK_32X8, BLOCK_64X32 => BLOCK_32X16, BLOCK_64X64 => BLOCK_32X32, BLOCK_64X128 => BLOCK_32X64, BLOCK_128X64 => BLOCK_64X32, BLOCK_128X128 => BLOCK_64X64, }, _ => return Err(InvalidBlockSize), }) } /// # Panics /// /// Will panic if the subsampling is not possible #[inline] pub fn largest_chroma_tx_size(self, xdec: usize, ydec: usize) -> TxSize { let plane_bsize = self .subsampled_size(xdec, ydec) .expect("invalid block size for this subsampling mode"); let chroma_tx_size = max_txsize_rect_lookup[plane_bsize as usize]; av1_get_coded_tx_size(chroma_tx_size) } #[inline] pub const fn is_sqr(self) -> bool { self.width_log2() == self.height_log2() } #[inline] pub const fn is_sub8x8(self, xdec: usize, ydec: usize) -> bool { xdec != 0 && self.width_log2() == 2 || ydec != 0 && self.height_log2() == 2 } #[inline] pub const fn sub8x8_offset( self, xdec: usize, ydec: usize, ) -> (isize, isize) { let offset_x = if xdec != 0 && self.width_log2() == 2 { -1 } else { 0 }; let offset_y = if ydec != 0 && self.height_log2() == 2 { -1 } else { 0 }; (offset_x, offset_y) } /// # Errors /// /// - Returns `InvalidBlockSize` if the block size cannot be split /// in the requested way. 
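///
/// # Examples
///
/// A brief sketch of the mapping, mirroring the match arms below:
///
/// ```ignore
/// use PartitionType::*;
/// assert_eq!(BLOCK_64X64.subsize(PARTITION_SPLIT), Ok(BLOCK_32X32));
/// assert_eq!(BLOCK_64X64.subsize(PARTITION_HORZ_4), Ok(BLOCK_64X16));
/// assert_eq!(BLOCK_8X8.subsize(PARTITION_HORZ_4), Err(InvalidBlockSize));
/// ```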
pub const fn subsize( self, partition: PartitionType, ) -> Result { use PartitionType::*; Ok(match partition { PARTITION_NONE => self, PARTITION_SPLIT => match self { BLOCK_8X8 => BLOCK_4X4, BLOCK_16X16 => BLOCK_8X8, BLOCK_32X32 => BLOCK_16X16, BLOCK_64X64 => BLOCK_32X32, BLOCK_128X128 => BLOCK_64X64, _ => return Err(InvalidBlockSize), }, PARTITION_HORZ | PARTITION_HORZ_A | PARTITION_HORZ_B => match self { BLOCK_8X8 => BLOCK_8X4, BLOCK_16X16 => BLOCK_16X8, BLOCK_32X32 => BLOCK_32X16, BLOCK_64X64 => BLOCK_64X32, BLOCK_128X128 => BLOCK_128X64, _ => return Err(InvalidBlockSize), }, PARTITION_VERT | PARTITION_VERT_A | PARTITION_VERT_B => match self { BLOCK_8X8 => BLOCK_4X8, BLOCK_16X16 => BLOCK_8X16, BLOCK_32X32 => BLOCK_16X32, BLOCK_64X64 => BLOCK_32X64, BLOCK_128X128 => BLOCK_64X128, _ => return Err(InvalidBlockSize), }, PARTITION_HORZ_4 => match self { BLOCK_16X16 => BLOCK_16X4, BLOCK_32X32 => BLOCK_32X8, BLOCK_64X64 => BLOCK_64X16, _ => return Err(InvalidBlockSize), }, PARTITION_VERT_4 => match self { BLOCK_16X16 => BLOCK_4X16, BLOCK_32X32 => BLOCK_8X32, BLOCK_64X64 => BLOCK_16X64, _ => return Err(InvalidBlockSize), }, _ => return Err(InvalidBlockSize), }) } pub const fn is_rect_tx_allowed(self) -> bool { !matches!( self, BLOCK_4X4 | BLOCK_8X8 | BLOCK_16X16 | BLOCK_32X32 | BLOCK_64X64 | BLOCK_64X128 | BLOCK_128X64 | BLOCK_128X128 ) } } impl fmt::Display for BlockSize { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { write!( f, "{}", match self { BlockSize::BLOCK_4X4 => "4x4", BlockSize::BLOCK_4X8 => "4x8", BlockSize::BLOCK_8X4 => "8x4", BlockSize::BLOCK_8X8 => "8x8", BlockSize::BLOCK_8X16 => "8x16", BlockSize::BLOCK_16X8 => "16x8", BlockSize::BLOCK_16X16 => "16x16", BlockSize::BLOCK_16X32 => "16x32", BlockSize::BLOCK_32X16 => "32x16", BlockSize::BLOCK_32X32 => "32x32", BlockSize::BLOCK_32X64 => "32x64", BlockSize::BLOCK_64X32 => "64x32", BlockSize::BLOCK_64X64 => "64x64", BlockSize::BLOCK_64X128 => "64x128", BlockSize::BLOCK_128X64 => "128x64", BlockSize::BLOCK_128X128 => "128x128", BlockSize::BLOCK_4X16 => "4x16", BlockSize::BLOCK_16X4 => "16x4", BlockSize::BLOCK_8X32 => "8x32", BlockSize::BLOCK_32X8 => "32x8", BlockSize::BLOCK_16X64 => "16x64", BlockSize::BLOCK_64X16 => "64x16", } ) } } pub const NEWMV_MODE_CONTEXTS: usize = 7; pub const GLOBALMV_MODE_CONTEXTS: usize = 2; pub const REFMV_MODE_CONTEXTS: usize = 6; pub const INTER_COMPOUND_MODES: usize = 8; pub const REFMV_OFFSET: usize = 4; pub const GLOBALMV_OFFSET: usize = 3; pub const NEWMV_CTX_MASK: usize = (1 << GLOBALMV_OFFSET) - 1; pub const GLOBALMV_CTX_MASK: usize = (1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1; pub const REFMV_CTX_MASK: usize = (1 << (8 - REFMV_OFFSET)) - 1; pub static RAV1E_PARTITION_TYPES: &[PartitionType] = &[ PartitionType::PARTITION_NONE, PartitionType::PARTITION_HORZ, PartitionType::PARTITION_VERT, PartitionType::PARTITION_SPLIT, ]; #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum GlobalMVMode { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum MvSubpelPrecision { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, } /* Symbols for coding which components are zero jointly */ pub const MV_JOINTS: usize = 4; #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum MvJointType { MV_JOINT_ZERO = 0, /* Zero vector */ 
MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ } fn supersample_chroma_bsize( bsize: BlockSize, ss_x: usize, ss_y: usize, ) -> BlockSize { debug_assert!(ss_x < 2); debug_assert!(ss_y < 2); match bsize { BLOCK_4X4 => match (ss_x, ss_y) { (1, 1) => BLOCK_8X8, (1, 0) => BLOCK_8X4, (0, 1) => BLOCK_4X8, _ => bsize, }, BLOCK_4X8 => match (ss_x, ss_y) { (1, 1) => BLOCK_8X8, (1, 0) => BLOCK_8X8, (0, 1) => BLOCK_4X8, _ => bsize, }, BLOCK_8X4 => match (ss_x, ss_y) { (1, 1) => BLOCK_8X8, (1, 0) => BLOCK_8X4, (0, 1) => BLOCK_8X8, _ => bsize, }, BLOCK_4X16 => match (ss_x, ss_y) { (1, 1) => BLOCK_8X16, (1, 0) => BLOCK_8X16, (0, 1) => BLOCK_4X16, _ => bsize, }, BLOCK_16X4 => match (ss_x, ss_y) { (1, 1) => BLOCK_16X8, (1, 0) => BLOCK_16X4, (0, 1) => BLOCK_16X8, _ => bsize, }, _ => bsize, } } type IntraEdgeBuffer = Aligned<[MaybeUninit; 4 * MAX_TX_SIZE + 1]>; #[cfg(any(test, feature = "bench"))] type IntraEdgeMock = Aligned<[T; 4 * MAX_TX_SIZE + 1]>; pub struct IntraEdge<'a, T: Pixel>(&'a [T], &'a [T], &'a [T]); impl<'a, T: Pixel> IntraEdge<'a, T> { fn new( edge_buf: &'a mut IntraEdgeBuffer, init_left: usize, init_above: usize, ) -> Self { // SAFETY: Initialized in `get_intra_edges`. let left = unsafe { let begin_left = 2 * MAX_TX_SIZE - init_left; let end_above = 2 * MAX_TX_SIZE + 1 + init_above; slice_assume_init_mut(&mut edge_buf.data[begin_left..end_above]) }; let (left, top_left) = left.split_at(init_left); let (top_left, above) = top_left.split_at(1); Self(left, top_left, above) } pub const fn as_slices(&self) -> (&'a [T], &'a [T], &'a [T]) { (self.0, self.1, self.2) } pub const fn top_left_ptr(&self) -> *const T { self.1.as_ptr() } #[cfg(any(test, feature = "bench"))] pub fn mock(edge_buf: &'a IntraEdgeMock) -> Self { let left = &edge_buf.data[..]; let (left, top_left) = left.split_at(2 * MAX_TX_SIZE); let (top_left, above) = top_left.split_at(1); Self(left, top_left, above) } } pub fn get_intra_edges<'a, T: Pixel>( edge_buf: &'a mut IntraEdgeBuffer, dst: &PlaneRegion<'_, T>, partition_bo: TileBlockOffset, // partition bo, BlockOffset bx: usize, by: usize, partition_size: BlockSize, // partition size, BlockSize po: PlaneOffset, tx_size: TxSize, bit_depth: usize, opt_mode: Option, enable_intra_edge_filter: bool, intra_param: IntraParam, ) -> IntraEdge<'a, T> { let mut init_left: usize = 0; let mut init_above: usize = 0; let plane_cfg = &dst.plane_cfg; let base = 128u16 << (bit_depth - 8); { // left pixels are ordered from bottom to top and right-aligned let (left, not_left) = edge_buf.data.split_at_mut(2 * MAX_TX_SIZE); let (top_left, above) = not_left.split_at_mut(1); let x = po.x as usize; let y = po.y as usize; let mut needs_left = true; let mut needs_topleft = true; let mut needs_top = true; let mut needs_topright = true; let mut needs_bottomleft = true; let mut needs_topleft_filter = false; if let Some(mut mode) = opt_mode { mode = match mode { PredictionMode::PAETH_PRED => match (x, y) { (0, 0) => PredictionMode::DC_PRED, (0, _) => PredictionMode::V_PRED, (_, 0) => PredictionMode::H_PRED, _ => PredictionMode::PAETH_PRED, }, _ => mode, }; let p_angle = intra_mode_to_angle(mode) + match intra_param { IntraParam::AngleDelta(val) => (val * ANGLE_STEP) as isize, _ => 0, }; let dc_or_cfl = mode == PredictionMode::DC_PRED || mode == PredictionMode::UV_CFL_PRED; needs_left = (!dc_or_cfl || x != 0) || (p_angle > 90 && p_angle != 180); needs_topleft = mode == PredictionMode::PAETH_PRED || 
(mode.is_directional() && p_angle != 90 && p_angle != 180); needs_top = (!dc_or_cfl || y != 0) || (p_angle != 90 && p_angle < 180); needs_topright = mode.is_directional() && p_angle < 90; needs_bottomleft = mode.is_directional() && p_angle > 180; needs_topleft_filter = enable_intra_edge_filter && p_angle > 90 && p_angle < 180; } let rect_w = dst.rect().width.min(dst.plane_cfg.width - dst.rect().x as usize); let rect_h = dst.rect().height.min(dst.plane_cfg.height - dst.rect().y as usize); // Needs left if needs_left { let txh = if y + tx_size.height() > rect_h { rect_h - y } else { tx_size.height() }; if x != 0 { for i in 0..txh { debug_assert!(y + i < rect_h); left[2 * MAX_TX_SIZE - 1 - i].write(dst[y + i][x - 1]); } if txh < tx_size.height() { let val = dst[y + txh - 1][x - 1]; for i in txh..tx_size.height() { left[2 * MAX_TX_SIZE - 1 - i].write(val); } } } else { let val = if y != 0 { dst[y - 1][0] } else { T::cast_from(base + 1) }; for v in left[2 * MAX_TX_SIZE - tx_size.height()..].iter_mut() { v.write(val); } } init_left += tx_size.height(); } // Needs top if needs_top { let txw = if x + tx_size.width() > rect_w { rect_w - x } else { tx_size.width() }; if y != 0 { above[..txw].copy_from_slice( // SAFETY: &[T] and &[MaybeUninit] have the same layout unsafe { transmute::<&[T], &[MaybeUninit]>(&dst[y - 1][x..x + txw]) }, ); if txw < tx_size.width() { let val = dst[y - 1][x + txw - 1]; for i in txw..tx_size.width() { above[i].write(val); } } } else { let val = if x != 0 { dst[0][x - 1] } else { T::cast_from(base - 1) }; for v in above[..tx_size.width()].iter_mut() { v.write(val); } } init_above += tx_size.width(); } let bx4 = bx * (tx_size.width() >> MI_SIZE_LOG2); // bx,by are in tx block indices let by4 = by * (tx_size.height() >> MI_SIZE_LOG2); let have_top = by4 != 0 || if plane_cfg.ydec != 0 { partition_bo.0.y > 1 } else { partition_bo.0.y > 0 }; let have_left = bx4 != 0 || if plane_cfg.xdec != 0 { partition_bo.0.x > 1 } else { partition_bo.0.x > 0 }; let right_available = x + tx_size.width() < rect_w; let bottom_available = y + tx_size.height() < rect_h; let scaled_partition_size = supersample_chroma_bsize(partition_size, plane_cfg.xdec, plane_cfg.ydec); // Needs top right if needs_topright { debug_assert!(plane_cfg.xdec <= 1 && plane_cfg.ydec <= 1); let num_avail = if y != 0 && has_top_right( scaled_partition_size, partition_bo, have_top, right_available, tx_size, by4, bx4, plane_cfg.xdec, plane_cfg.ydec, ) { tx_size.width().min(rect_w - x - tx_size.width()) } else { 0 }; if num_avail > 0 { above[tx_size.width()..][..num_avail].copy_from_slice( // SAFETY: &[T] and &[MaybeUninit] have the same layout unsafe { transmute::<&[T], &[MaybeUninit]>( &dst[y - 1][x + tx_size.width()..][..num_avail], ) }, ); } if num_avail < tx_size.height() { let val = above[tx_size.width() + num_avail - 1]; for v in above [tx_size.width() + num_avail..tx_size.width() + tx_size.height()] .iter_mut() { *v = val; } } init_above += tx_size.height(); } // SAFETY: The blocks above have initialized the first `init_above` items. 
let above = unsafe { slice_assume_init_mut(&mut above[..init_above]) }; // Needs bottom left if needs_bottomleft { debug_assert!(plane_cfg.xdec <= 1 && plane_cfg.ydec <= 1); let num_avail = if x != 0 && has_bottom_left( scaled_partition_size, partition_bo, bottom_available, have_left, tx_size, by4, bx4, plane_cfg.xdec, plane_cfg.ydec, ) { tx_size.height().min(rect_h - y - tx_size.height()) } else { 0 }; if num_avail > 0 { for i in 0..num_avail { left[2 * MAX_TX_SIZE - tx_size.height() - 1 - i] .write(dst[y + tx_size.height() + i][x - 1]); } } if num_avail < tx_size.width() { let val = left[2 * MAX_TX_SIZE - tx_size.height() - num_avail]; for v in left[(2 * MAX_TX_SIZE - tx_size.height() - tx_size.width()) ..(2 * MAX_TX_SIZE - tx_size.height() - num_avail)] .iter_mut() { *v = val; } } init_left += tx_size.width(); } // SAFETY: The blocks above have initialized last `init_left` items. let left = unsafe { slice_assume_init_mut(&mut left[2 * MAX_TX_SIZE - init_left..]) }; // Needs top-left if needs_topleft { let top_left = top_left[0].write(match (x, y) { (0, 0) => T::cast_from(base), (_, 0) => dst[0][x - 1], (0, _) => dst[y - 1][0], _ => dst[y - 1][x - 1], }); let (w, h) = (tx_size.width(), tx_size.height()); if needs_topleft_filter && w + h >= 24 { let (l, a, tl): (u32, u32, u32) = (left[left.len() - 1].into(), above[0].into(), (*top_left).into()); let s = l * 5 + tl * 6 + a * 5; *top_left = T::cast_from((s + (1 << 3)) >> 4); } } else { top_left[0].write(T::cast_from(base)); } } IntraEdge::new(edge_buf, init_left, init_above) } pub fn has_tr(bo: TileBlockOffset, bsize: BlockSize) -> bool { let sb_mi_size = BLOCK_64X64.width_mi(); /* Assume 64x64 for now */ let mask_row = bo.0.y & LOCAL_BLOCK_MASK; let mask_col = bo.0.x & LOCAL_BLOCK_MASK; let target_n4_w = bsize.width_mi(); let target_n4_h = bsize.height_mi(); let mut bs = target_n4_w.max(target_n4_h); if bs > BLOCK_64X64.width_mi() { return false; } let mut has_tr = !((mask_row & bs) != 0 && (mask_col & bs) != 0); /* TODO: assert its a power of two */ while bs < sb_mi_size { if (mask_col & bs) != 0 { if (mask_col & (2 * bs) != 0) && (mask_row & (2 * bs) != 0) { has_tr = false; break; } } else { break; } bs <<= 1; } /* The left hand of two vertical rectangles always has a top right (as the * block above will have been decoded) */ if (target_n4_w < target_n4_h) && (bo.0.x & target_n4_w) == 0 { has_tr = true; } /* The bottom of two horizontal rectangles never has a top right (as the block * to the right won't have been decoded) */ if (target_n4_w > target_n4_h) && (bo.0.y & target_n4_h) != 0 { has_tr = false; } /* The bottom left square of a Vertical A (in the old format) does * not have a top right as it is decoded before the right hand * rectangle of the partition */ /* if blk.partition == PartitionType::PARTITION_VERT_A { if blk.n4_w == blk.n4_h { if (mask_row & bs) != 0 { has_tr = false; } } } */ has_tr } pub fn has_bl(bo: TileBlockOffset, bsize: BlockSize) -> bool { let sb_mi_size = BLOCK_64X64.width_mi(); /* Assume 64x64 for now */ let mask_row = bo.0.y & LOCAL_BLOCK_MASK; let mask_col = bo.0.x & LOCAL_BLOCK_MASK; let target_n4_w = bsize.width_mi(); let target_n4_h = bsize.height_mi(); let mut bs = target_n4_w.max(target_n4_h); if bs > BLOCK_64X64.width_mi() { return false; } let mut has_bl = (mask_row & bs) == 0 && (mask_col & bs) == 0 && bs < sb_mi_size; /* TODO: assert its a power of two */ while 2 * bs < sb_mi_size { if (mask_col & bs) == 0 { if (mask_col & (2 * bs) == 0) && (mask_row & (2 * bs) == 0) { has_bl = true; break; } } else 
{ break; } bs <<= 1; } /* The right hand of two vertical rectangles never has a bottom left (as the * block below won't have been decoded) */ if (target_n4_w < target_n4_h) && (bo.0.x & target_n4_w) != 0 { has_bl = false; } /* The top of two horizontal rectangles always has a bottom left (as the block * to the left will have been decoded) */ if (target_n4_w > target_n4_h) && (bo.0.y & target_n4_h) == 0 { has_bl = true; } /* The bottom left square of a Vertical A (in the old format) does * not have a top right as it is decoded before the right hand * rectangle of the partition */ /* if blk.partition == PartitionType::PARTITION_VERT_A { if blk.n4_w == blk.n4_h { if (mask_row & bs) != 0 { has_tr = false; } } } */ has_bl } #[cfg(test)] mod tests { use crate::partition::BlockSize::*; use crate::partition::{BlockSize, InvalidBlockSize}; #[test] fn from_wh_matches_naive() { fn from_wh_opt_naive( w: usize, h: usize, ) -> Result { match (w, h) { (4, 4) => Ok(BLOCK_4X4), (4, 8) => Ok(BLOCK_4X8), (8, 4) => Ok(BLOCK_8X4), (8, 8) => Ok(BLOCK_8X8), (8, 16) => Ok(BLOCK_8X16), (16, 8) => Ok(BLOCK_16X8), (16, 16) => Ok(BLOCK_16X16), (16, 32) => Ok(BLOCK_16X32), (32, 16) => Ok(BLOCK_32X16), (32, 32) => Ok(BLOCK_32X32), (32, 64) => Ok(BLOCK_32X64), (64, 32) => Ok(BLOCK_64X32), (64, 64) => Ok(BLOCK_64X64), (64, 128) => Ok(BLOCK_64X128), (128, 64) => Ok(BLOCK_128X64), (128, 128) => Ok(BLOCK_128X128), (4, 16) => Ok(BLOCK_4X16), (16, 4) => Ok(BLOCK_16X4), (8, 32) => Ok(BLOCK_8X32), (32, 8) => Ok(BLOCK_32X8), (16, 64) => Ok(BLOCK_16X64), (64, 16) => Ok(BLOCK_64X16), _ => Err(InvalidBlockSize), } } for w in 0..256 { for h in 0..256 { let a = BlockSize::from_width_and_height_opt(w, h); let b = from_wh_opt_naive(w, h); assert_eq!(a, b); } } } } rav1e-0.7.1/src/predict.rs000064400000000000000000001437731046102023000134550ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(dead_code)] use std::mem::MaybeUninit; cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::predict::*; } else if #[cfg(asm_neon)] { pub use crate::asm::aarch64::predict::*; } else { pub use self::rust::*; } } use crate::context::{TileBlockOffset, MAX_SB_SIZE_LOG2, MAX_TX_SIZE}; use crate::cpu_features::CpuFeatureLevel; use crate::encoder::FrameInvariants; use crate::frame::*; use crate::mc::*; use crate::partition::*; use crate::tiling::*; use crate::transform::*; use crate::util::*; use std::convert::TryInto; pub const ANGLE_STEP: i8 = 3; // TODO: Review the order of this list. // The order impacts compression efficiency. 
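// The slice below lists 13 intra prediction modes: DC, the axis-aligned H/V
// modes, the three SMOOTH variants, PAETH, and the six oblique directional
// modes.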
pub static RAV1E_INTRA_MODES: &[PredictionMode] = &[ PredictionMode::DC_PRED, PredictionMode::H_PRED, PredictionMode::V_PRED, PredictionMode::SMOOTH_PRED, PredictionMode::SMOOTH_H_PRED, PredictionMode::SMOOTH_V_PRED, PredictionMode::PAETH_PRED, PredictionMode::D45_PRED, PredictionMode::D135_PRED, PredictionMode::D113_PRED, PredictionMode::D157_PRED, PredictionMode::D203_PRED, PredictionMode::D67_PRED, ]; pub static RAV1E_INTER_MODES_MINIMAL: &[PredictionMode] = &[PredictionMode::NEARESTMV]; pub static RAV1E_INTER_COMPOUND_MODES: &[PredictionMode] = &[ PredictionMode::GLOBAL_GLOBALMV, PredictionMode::NEAREST_NEARESTMV, PredictionMode::NEW_NEWMV, PredictionMode::NEAREST_NEWMV, PredictionMode::NEW_NEARESTMV, PredictionMode::NEAR_NEAR0MV, PredictionMode::NEAR_NEAR1MV, PredictionMode::NEAR_NEAR2MV, ]; // There are more modes than in the spec because every allowed // drl index for NEAR modes is considered its own mode. #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Default)] pub enum PredictionMode { #[default] DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal D45_PRED, // Directional 45 degree D135_PRED, // Directional 135 degree D113_PRED, // Directional 113 degree D157_PRED, // Directional 157 degree D203_PRED, // Directional 203 degree D67_PRED, // Directional 67 degree SMOOTH_PRED, // Combination of horizontal and vertical interpolation SMOOTH_V_PRED, SMOOTH_H_PRED, PAETH_PRED, UV_CFL_PRED, NEARESTMV, NEAR0MV, NEAR1MV, NEAR2MV, GLOBALMV, NEWMV, // Compound ref compound modes NEAREST_NEARESTMV, NEAR_NEAR0MV, NEAR_NEAR1MV, NEAR_NEAR2MV, NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEW0MV, NEAR_NEW1MV, NEAR_NEW2MV, NEW_NEAR0MV, NEW_NEAR1MV, NEW_NEAR2MV, GLOBAL_GLOBALMV, NEW_NEWMV, } // This is a higher number than in the spec and cannot be used // for bitstream writing purposes. 
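// Breakdown of the 34: 14 intra modes (including UV_CFL_PRED), 6
// single-reference inter modes, and 14 compound inter modes in the enum above.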
pub const PREDICTION_MODES: usize = 34; #[derive(Copy, Clone, Debug)] pub enum PredictionVariant { NONE, LEFT, TOP, BOTH, } impl PredictionVariant { #[inline] const fn new(x: usize, y: usize) -> Self { match (x, y) { (0, 0) => PredictionVariant::NONE, (_, 0) => PredictionVariant::LEFT, (0, _) => PredictionVariant::TOP, _ => PredictionVariant::BOTH, } } } pub const fn intra_mode_to_angle(mode: PredictionMode) -> isize { match mode { PredictionMode::V_PRED => 90, PredictionMode::H_PRED => 180, PredictionMode::D45_PRED => 45, PredictionMode::D135_PRED => 135, PredictionMode::D113_PRED => 113, PredictionMode::D157_PRED => 157, PredictionMode::D203_PRED => 203, PredictionMode::D67_PRED => 67, _ => 0, } } impl PredictionMode { #[inline] pub fn is_compound(self) -> bool { self >= PredictionMode::NEAREST_NEARESTMV } #[inline] pub fn has_nearmv(self) -> bool { self == PredictionMode::NEAR0MV || self == PredictionMode::NEAR1MV || self == PredictionMode::NEAR2MV || self == PredictionMode::NEAR_NEAR0MV || self == PredictionMode::NEAR_NEAR1MV || self == PredictionMode::NEAR_NEAR2MV || self == PredictionMode::NEAR_NEW0MV || self == PredictionMode::NEAR_NEW1MV || self == PredictionMode::NEAR_NEW2MV || self == PredictionMode::NEW_NEAR0MV || self == PredictionMode::NEW_NEAR1MV || self == PredictionMode::NEW_NEAR2MV } #[inline] pub fn has_newmv(self) -> bool { self == PredictionMode::NEWMV || self == PredictionMode::NEW_NEWMV || self == PredictionMode::NEAREST_NEWMV || self == PredictionMode::NEW_NEARESTMV || self == PredictionMode::NEAR_NEW0MV || self == PredictionMode::NEAR_NEW1MV || self == PredictionMode::NEAR_NEW2MV || self == PredictionMode::NEW_NEAR0MV || self == PredictionMode::NEW_NEAR1MV || self == PredictionMode::NEW_NEAR2MV } #[inline] pub fn ref_mv_idx(self) -> usize { if self == PredictionMode::NEAR0MV || self == PredictionMode::NEAR1MV || self == PredictionMode::NEAR2MV { self as usize - PredictionMode::NEAR0MV as usize + 1 } else if self == PredictionMode::NEAR_NEAR0MV || self == PredictionMode::NEAR_NEAR1MV || self == PredictionMode::NEAR_NEAR2MV { self as usize - PredictionMode::NEAR_NEAR0MV as usize + 1 } else { 1 } } /// # Panics /// /// - If called on an inter `PredictionMode` pub fn predict_intra( self, tile_rect: TileRect, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, ac: &[i16], intra_param: IntraParam, ief_params: Option, edge_buf: &IntraEdge, cpu: CpuFeatureLevel, ) { assert!(self.is_intra()); let &Rect { x: frame_x, y: frame_y, .. 
} = dst.rect(); debug_assert!(frame_x >= 0 && frame_y >= 0); // x and y are expressed relative to the tile let x = frame_x as usize - tile_rect.x; let y = frame_y as usize - tile_rect.y; let variant = PredictionVariant::new(x, y); let alpha = match intra_param { IntraParam::Alpha(val) => val, _ => 0, }; let angle_delta = match intra_param { IntraParam::AngleDelta(val) => val, _ => 0, }; let mode = match self { PredictionMode::PAETH_PRED => match variant { PredictionVariant::NONE => PredictionMode::DC_PRED, PredictionVariant::TOP => PredictionMode::V_PRED, PredictionVariant::LEFT => PredictionMode::H_PRED, PredictionVariant::BOTH => PredictionMode::PAETH_PRED, }, PredictionMode::UV_CFL_PRED if alpha == 0 => PredictionMode::DC_PRED, _ => self, }; let angle = match mode { PredictionMode::UV_CFL_PRED => alpha as isize, _ => intra_mode_to_angle(mode) + (angle_delta * ANGLE_STEP) as isize, }; dispatch_predict_intra::( mode, variant, dst, tx_size, bit_depth, ac, angle, ief_params, edge_buf, cpu, ); } #[inline] pub fn is_intra(self) -> bool { self < PredictionMode::NEARESTMV } #[inline] pub fn is_cfl(self) -> bool { self == PredictionMode::UV_CFL_PRED } #[inline] pub fn is_directional(self) -> bool { self >= PredictionMode::V_PRED && self <= PredictionMode::D67_PRED } #[inline(always)] pub const fn angle_delta_count(self) -> i8 { match self { PredictionMode::V_PRED | PredictionMode::H_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => 7, _ => 1, } } // Used by inter prediction to extract the fractional component of a mv and // obtain the correct PlaneSlice to operate on. #[inline] fn get_mv_params( rec_plane: &Plane, po: PlaneOffset, mv: MotionVector, ) -> (i32, i32, PlaneSlice) { let &PlaneConfig { xdec, ydec, .. } = &rec_plane.cfg; let row_offset = mv.row as i32 >> (3 + ydec); let col_offset = mv.col as i32 >> (3 + xdec); let row_frac = ((mv.row as i32) << (1 - ydec)) & 0xf; let col_frac = ((mv.col as i32) << (1 - xdec)) & 0xf; let qo = PlaneOffset { x: po.x + col_offset as isize - 3, y: po.y + row_offset as isize - 3, }; (row_frac, col_frac, rec_plane.slice(qo).clamp().subslice(3, 3)) } /// Inter prediction with a single reference (i.e. not compound mode) /// /// # Panics /// /// - If called on an intra `PredictionMode` pub fn predict_inter_single( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frame: RefType, mv: MotionVector, ) { assert!(!self.is_intra()); let frame_po = tile_rect.to_frame_plane_offset(po); let mode = fi.default_filter; if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frame.to_index()] as usize] { let (row_frac, col_frac, src) = PredictionMode::get_mv_params(&rec.frame.planes[p], frame_po, mv); put_8tap( dst, src, width, height, col_frac, row_frac, mode, mode, fi.sequence.bit_depth, fi.cpu_feature_level, ); } } /// Inter prediction with two references. 
/// /// # Panics /// /// - If called on an intra `PredictionMode` pub fn predict_inter_compound( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2], buffer: &mut InterCompoundBuffers, ) { assert!(!self.is_intra()); let frame_po = tile_rect.to_frame_plane_offset(po); let mode = fi.default_filter; for i in 0..2 { if let Some(ref rec) = fi.rec_buffer.frames[fi.ref_frames[ref_frames[i].to_index()] as usize] { let (row_frac, col_frac, src) = PredictionMode::get_mv_params( &rec.frame.planes[p], frame_po, mvs[i], ); prep_8tap( buffer.get_buffer_mut(i), src, width, height, col_frac, row_frac, mode, mode, fi.sequence.bit_depth, fi.cpu_feature_level, ); } } mc_avg( dst, buffer.get_buffer(0), buffer.get_buffer(1), width, height, fi.sequence.bit_depth, fi.cpu_feature_level, ); } /// Inter prediction that determines whether compound mode is being used based /// on the second [`RefType`] in [`ref_frames`]. pub fn predict_inter( self, fi: &FrameInvariants, tile_rect: TileRect, p: usize, po: PlaneOffset, dst: &mut PlaneRegionMut<'_, T>, width: usize, height: usize, ref_frames: [RefType; 2], mvs: [MotionVector; 2], compound_buffer: &mut InterCompoundBuffers, ) { let is_compound = ref_frames[1] != RefType::INTRA_FRAME && ref_frames[1] != RefType::NONE_FRAME; if !is_compound { self.predict_inter_single( fi, tile_rect, p, po, dst, width, height, ref_frames[0], mvs[0], ) } else { self.predict_inter_compound( fi, tile_rect, p, po, dst, width, height, ref_frames, mvs, compound_buffer, ); } } } /// A pair of buffers holding the interpolation of two references. Use for /// compound inter prediction. #[derive(Debug)] pub struct InterCompoundBuffers { data: AlignedBoxedSlice, } impl InterCompoundBuffers { // Size of one of the two buffers used. 
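// With the 128x128 maximum superblock size (MAX_SB_SIZE_LOG2 == 7) this is
// 1 << 14 = 16384 i16 prediction samples, one per pixel of a full superblock.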
const BUFFER_SIZE: usize = 1 << (2 * MAX_SB_SIZE_LOG2); /// Get the buffer for eith #[inline] fn get_buffer_mut(&mut self, i: usize) -> &mut [i16] { match i { 0 => &mut self.data[0..Self::BUFFER_SIZE], 1 => &mut self.data[Self::BUFFER_SIZE..2 * Self::BUFFER_SIZE], _ => panic!(), } } #[inline] fn get_buffer(&self, i: usize) -> &[i16] { match i { 0 => &self.data[0..Self::BUFFER_SIZE], 1 => &self.data[Self::BUFFER_SIZE..2 * Self::BUFFER_SIZE], _ => panic!(), } } } impl Default for InterCompoundBuffers { fn default() -> Self { Self { data: AlignedBoxedSlice::new(2 * Self::BUFFER_SIZE, 0) } } } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum InterIntraMode { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES, } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum CompoundType { COMPOUND_AVERAGE, COMPOUND_WEDGE, COMPOUND_DIFFWTD, COMPOUND_TYPES, } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum MotionMode { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED MOTION_MODES, } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum PaletteSize { TWO_COLORS, THREE_COLORS, FOUR_COLORS, FIVE_COLORS, SIX_COLORS, SEVEN_COLORS, EIGHT_COLORS, PALETTE_SIZES, } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum PaletteColor { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, PALETTE_COLOR_FOUR, PALETTE_COLOR_FIVE, PALETTE_COLOR_SIX, PALETTE_COLOR_SEVEN, PALETTE_COLOR_EIGHT, PALETTE_COLORS, } #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd)] pub enum FilterIntraMode { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, FILTER_D157_PRED, FILTER_PAETH_PRED, FILTER_INTRA_MODES, } #[derive(Copy, Clone, Debug)] pub enum IntraParam { AngleDelta(i8), Alpha(i16), None, } #[derive(Debug, Clone, Copy, Default)] pub struct AngleDelta { pub y: i8, pub uv: i8, } #[derive(Copy, Clone, Default)] pub struct IntraEdgeFilterParameters { pub plane: usize, pub above_ref_frame_types: Option<[RefType; 2]>, pub left_ref_frame_types: Option<[RefType; 2]>, pub above_mode: Option, pub left_mode: Option, } impl IntraEdgeFilterParameters { pub fn new( plane: usize, above_ctx: Option, left_ctx: Option, ) -> Self { IntraEdgeFilterParameters { plane, above_mode: match above_ctx { Some(bi) => match plane { 0 => bi.luma_mode, _ => bi.chroma_mode, } .into(), None => None, }, left_mode: match left_ctx { Some(bi) => match plane { 0 => bi.luma_mode, _ => bi.chroma_mode, } .into(), None => None, }, above_ref_frame_types: above_ctx.map(|bi| bi.reference_types), left_ref_frame_types: left_ctx.map(|bi| bi.reference_types), } } /// # Panics /// /// - If the appropriate ref frame types are not set on `self` pub fn use_smooth_filter(self) -> bool { let above_smooth = match self.above_mode { Some(PredictionMode::SMOOTH_PRED) | Some(PredictionMode::SMOOTH_V_PRED) | Some(PredictionMode::SMOOTH_H_PRED) => { self.plane == 0 || self.above_ref_frame_types.unwrap()[0] == RefType::INTRA_FRAME } _ => false, }; let left_smooth = match self.left_mode { Some(PredictionMode::SMOOTH_PRED) | Some(PredictionMode::SMOOTH_V_PRED) | Some(PredictionMode::SMOOTH_H_PRED) => { self.plane == 0 || self.left_ref_frame_types.unwrap()[0] == RefType::INTRA_FRAME } _ => false, }; above_smooth || left_smooth } } // Weights are quadratic from '1' to '1 / block_size', scaled by 2^sm_weight_log2_scale. 
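// With sm_weight_log2_scale == 8 the scale is 256; each row of the table below
// starts near full weight (255) and falls off to scale / bs, e.g. the bs = 4
// row ends at 64 and the bs = 64 row ends at 4.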
const sm_weight_log2_scale: u8 = 8; // Smooth predictor weights #[rustfmt::skip] static sm_weight_arrays: [u8; 2 * MAX_TX_SIZE] = [ // Unused, because we always offset by bs, which is at least 2. 0, 0, // bs = 2 255, 128, // bs = 4 255, 149, 85, 64, // bs = 8 255, 197, 146, 105, 73, 50, 37, 32, // bs = 16 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, // bs = 32 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, // bs = 64 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, ]; #[inline(always)] const fn get_scaled_luma_q0(alpha_q3: i16, ac_pred_q3: i16) -> i32 { let scaled_luma_q6 = (alpha_q3 as i32) * (ac_pred_q3 as i32); let abs_scaled_luma_q0 = (scaled_luma_q6.abs() + 32) >> 6; if scaled_luma_q6 < 0 { -abs_scaled_luma_q0 } else { abs_scaled_luma_q0 } } /// # Returns /// /// Initialized luma AC coefficients /// /// # Panics /// /// - If the block size is invalid for subsampling /// pub fn luma_ac<'ac, T: Pixel>( ac: &'ac mut [MaybeUninit], ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, fi: &FrameInvariants, ) -> &'ac mut [i16] { use crate::context::MI_SIZE_LOG2; let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; let plane_bsize = bsize.subsampled_size(xdec, ydec).unwrap(); // ensure ac has the right length, so there aren't any uninitialized elements at the end let ac = &mut ac[..plane_bsize.area()]; let bo = if bsize.is_sub8x8(xdec, ydec) { let offset = bsize.sub8x8_offset(xdec, ydec); tile_bo.with_offset(offset.0, offset.1) } else { tile_bo }; let rec = &ts.rec.planes[0]; let luma = &rec.subregion(Area::BlockStartingAt { bo: bo.0 }); let frame_bo = ts.to_frame_block_offset(bo); let frame_clipped_bw: usize = ((fi.w_in_b - frame_bo.0.x) << MI_SIZE_LOG2).min(bsize.width()); let frame_clipped_bh: usize = ((fi.h_in_b - frame_bo.0.y) << MI_SIZE_LOG2).min(bsize.height()); // Similar to 'MaxLumaW' and 'MaxLumaH' stated in https://aomediacodec.github.io/av1-spec/#transform-block-semantics let max_luma_w = if bsize.width() > BlockSize::BLOCK_8X8.width() { let txw_log2 = tx_size.width_log2(); ((frame_clipped_bw + (1 << txw_log2) - 1) >> txw_log2) << txw_log2 } else { bsize.width() }; let max_luma_h = if bsize.height() > BlockSize::BLOCK_8X8.height() { let txh_log2 = tx_size.height_log2(); ((frame_clipped_bh + (1 << txh_log2) - 1) >> txh_log2) << txh_log2 } else { bsize.height() }; let w_pad = (bsize.width() - max_luma_w) >> (2 + xdec); let h_pad = (bsize.height() - max_luma_h) >> (2 + ydec); let cpu = fi.cpu_feature_level; (match (xdec, ydec) { (0, 0) => pred_cfl_ac::, (1, 0) => pred_cfl_ac::, (_, _) => pred_cfl_ac::, })(ac, luma, plane_bsize, w_pad, h_pad, cpu); // SAFETY: it relies on individual pred_cfl_ac implementations to initialize the ac unsafe { slice_assume_init_mut(ac) } } pub(crate) mod rust { use super::*; use crate::context::MAX_TX_SIZE; use crate::cpu_features::CpuFeatureLevel; use crate::tiling::PlaneRegionMut; use crate::transform::TxSize; use crate::util::round_shift; use crate::Pixel; use std::mem::{size_of, MaybeUninit}; #[inline(always)] pub fn dispatch_predict_intra( mode: PredictionMode, variant: PredictionVariant, dst: &mut PlaneRegionMut<'_, T>, tx_size: TxSize, bit_depth: usize, ac: &[i16], 
angle: isize, ief_params: Option, edge_buf: &IntraEdge, _cpu: CpuFeatureLevel, ) { let width = tx_size.width(); let height = tx_size.height(); // left pixels are ordered from bottom to top and right-aligned let (left, top_left, above) = edge_buf.as_slices(); let above_slice = above; let left_slice = &left[left.len().saturating_sub(height)..]; let left_and_left_below_slice = &left[left.len().saturating_sub(width + height)..]; match mode { PredictionMode::DC_PRED => { (match variant { PredictionVariant::NONE => pred_dc_128, PredictionVariant::LEFT => pred_dc_left, PredictionVariant::TOP => pred_dc_top, PredictionVariant::BOTH => pred_dc, })(dst, above_slice, left_slice, width, height, bit_depth) } PredictionMode::V_PRED if angle == 90 => { pred_v(dst, above_slice, width, height) } PredictionMode::H_PRED if angle == 180 => { pred_h(dst, left_slice, width, height) } PredictionMode::H_PRED | PredictionMode::V_PRED | PredictionMode::D45_PRED | PredictionMode::D135_PRED | PredictionMode::D113_PRED | PredictionMode::D157_PRED | PredictionMode::D203_PRED | PredictionMode::D67_PRED => pred_directional( dst, above_slice, left_and_left_below_slice, top_left, angle as usize, width, height, bit_depth, ief_params, ), PredictionMode::SMOOTH_PRED => { pred_smooth(dst, above_slice, left_slice, width, height) } PredictionMode::SMOOTH_V_PRED => { pred_smooth_v(dst, above_slice, left_slice, width, height) } PredictionMode::SMOOTH_H_PRED => { pred_smooth_h(dst, above_slice, left_slice, width, height) } PredictionMode::PAETH_PRED => { pred_paeth(dst, above_slice, left_slice, top_left[0], width, height) } PredictionMode::UV_CFL_PRED => (match variant { PredictionVariant::NONE => pred_cfl_128, PredictionVariant::LEFT => pred_cfl_left, PredictionVariant::TOP => pred_cfl_top, PredictionVariant::BOTH => pred_cfl, })( dst, ac, angle as i16, above_slice, left_slice, width, height, bit_depth, ), _ => unimplemented!(), } } pub(crate) fn pred_dc( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let edges = left[..height].iter().chain(above[..width].iter()); let len = (width + height) as u32; let avg = (edges.fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }) + (len >> 1)) / len; let avg = T::cast_from(avg); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } pub(crate) fn pred_dc_128( output: &mut PlaneRegionMut<'_, T>, _above: &[T], _left: &[T], width: usize, height: usize, bit_depth: usize, ) { let v = T::cast_from(128u32 << (bit_depth - 8)); for line in output.rows_iter_mut().take(height) { line[..width].fill(v); } } pub(crate) fn pred_dc_left( output: &mut PlaneRegionMut<'_, T>, _above: &[T], left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let sum = left[..].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }); let avg = T::cast_from((sum + (height >> 1) as u32) / height as u32); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } pub(crate) fn pred_dc_top( output: &mut PlaneRegionMut<'_, T>, above: &[T], _left: &[T], width: usize, height: usize, _bit_depth: usize, ) { let sum = above[..width].iter().fold(0u32, |acc, &v| { let v: u32 = v.into(); v + acc }); let avg = T::cast_from((sum + (width >> 1) as u32) / width as u32); for line in output.rows_iter_mut().take(height) { line[..width].fill(avg); } } pub(crate) fn pred_h( output: &mut PlaneRegionMut<'_, T>, left: &[T], width: usize, height: usize, ) { for (line, l) in 
output.rows_iter_mut().zip(left[..height].iter().rev()) { line[..width].fill(*l); } } pub(crate) fn pred_v( output: &mut PlaneRegionMut<'_, T>, above: &[T], width: usize, height: usize, ) { for line in output.rows_iter_mut().take(height) { line[..width].copy_from_slice(&above[..width]) } } pub(crate) fn pred_paeth( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], above_left: T, width: usize, height: usize, ) { for r in 0..height { let row = &mut output[r]; for c in 0..width { // Top-left pixel is fixed in libaom let raw_top_left: i32 = above_left.into(); let raw_left: i32 = left[height - 1 - r].into(); let raw_top: i32 = above[c].into(); let p_base = raw_top + raw_left - raw_top_left; let p_left = (p_base - raw_left).abs(); let p_top = (p_base - raw_top).abs(); let p_top_left = (p_base - raw_top_left).abs(); // Return nearest to base of left, top and top_left if p_left <= p_top && p_left <= p_top_left { row[c] = T::cast_from(raw_left); } else if p_top <= p_top_left { row[c] = T::cast_from(raw_top); } else { row[c] = T::cast_from(raw_top_left); } } } } pub(crate) fn pred_smooth( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, height: usize, ) { let below_pred = left[0]; // estimated by bottom-left pixel let right_pred = above[width - 1]; // estimated by top-right pixel let sm_weights_w = &sm_weight_arrays[width..]; let sm_weights_h = &sm_weight_arrays[height..]; let log2_scale = 1 + sm_weight_log2_scale; let scale = 1_u16 << sm_weight_log2_scale; // Weights sanity checks assert!((sm_weights_w[0] as u16) < scale); assert!((sm_weights_h[0] as u16) < scale); assert!((scale - sm_weights_w[width - 1] as u16) < scale); assert!((scale - sm_weights_h[height - 1] as u16) < scale); // ensures no overflow when calculating predictor assert!(log2_scale as usize + size_of::() < 31); for r in 0..height { let row = &mut output[r]; for c in 0..width { let pixels = [above[c], below_pred, left[height - 1 - r], right_pred]; let weights = [ sm_weights_h[r] as u16, scale - sm_weights_h[r] as u16, sm_weights_w[c] as u16, scale - sm_weights_w[c] as u16, ]; assert!( scale >= (sm_weights_h[r] as u16) && scale >= (sm_weights_w[c] as u16) ); // Sum up weighted pixels let mut this_pred: u32 = weights .iter() .zip(pixels.iter()) .map(|(w, p)| { let p: u32 = (*p).into(); (*w as u32) * p }) .sum(); this_pred = (this_pred + (1 << (log2_scale - 1))) >> log2_scale; row[c] = T::cast_from(this_pred); } } } pub(crate) fn pred_smooth_h( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, height: usize, ) { let right_pred = above[width - 1]; // estimated by top-right pixel let sm_weights = &sm_weight_arrays[width..]; let log2_scale = sm_weight_log2_scale; let scale = 1_u16 << sm_weight_log2_scale; // Weights sanity checks assert!((sm_weights[0] as u16) < scale); assert!((scale - sm_weights[width - 1] as u16) < scale); // ensures no overflow when calculating predictor assert!(log2_scale as usize + size_of::() < 31); for r in 0..height { let row = &mut output[r]; for c in 0..width { let pixels = [left[height - 1 - r], right_pred]; let weights = [sm_weights[c] as u16, scale - sm_weights[c] as u16]; assert!(scale >= sm_weights[c] as u16); let mut this_pred: u32 = weights .iter() .zip(pixels.iter()) .map(|(w, p)| { let p: u32 = (*p).into(); (*w as u32) * p }) .sum(); this_pred = (this_pred + (1 << (log2_scale - 1))) >> log2_scale; row[c] = T::cast_from(this_pred); } } } pub(crate) fn pred_smooth_v( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], width: usize, 
height: usize, ) { let below_pred = left[0]; // estimated by bottom-left pixel let sm_weights = &sm_weight_arrays[height..]; let log2_scale = sm_weight_log2_scale; let scale = 1_u16 << sm_weight_log2_scale; // Weights sanity checks assert!((sm_weights[0] as u16) < scale); assert!((scale - sm_weights[height - 1] as u16) < scale); // ensures no overflow when calculating predictor assert!(log2_scale as usize + size_of::() < 31); for r in 0..height { let row = &mut output[r]; for c in 0..width { let pixels = [above[c], below_pred]; let weights = [sm_weights[r] as u16, scale - sm_weights[r] as u16]; assert!(scale >= sm_weights[r] as u16); let mut this_pred: u32 = weights .iter() .zip(pixels.iter()) .map(|(w, p)| { let p: u32 = (*p).into(); (*w as u32) * p }) .sum(); this_pred = (this_pred + (1 << (log2_scale - 1))) >> log2_scale; row[c] = T::cast_from(this_pred); } } } pub(crate) fn pred_cfl_ac( ac: &mut [MaybeUninit], luma: &PlaneRegion<'_, T>, plane_bsize: BlockSize, w_pad: usize, h_pad: usize, _cpu: CpuFeatureLevel, ) { let max_luma_w = (plane_bsize.width() - w_pad * 4) << XDEC; let max_luma_h = (plane_bsize.height() - h_pad * 4) << YDEC; let max_luma_x: usize = max_luma_w.max(8) - (1 << XDEC); let max_luma_y: usize = max_luma_h.max(8) - (1 << YDEC); let mut sum: i32 = 0; let ac = &mut ac[..plane_bsize.area()]; for (sub_y, ac_rows) in ac.chunks_exact_mut(plane_bsize.width()).enumerate() { for (sub_x, ac_item) in ac_rows.iter_mut().enumerate() { // Refer to https://aomediacodec.github.io/av1-spec/#predict-chroma-from-luma-process let luma_y = sub_y << YDEC; let luma_x = sub_x << XDEC; let y = luma_y.min(max_luma_y); let x = luma_x.min(max_luma_x); let mut sample: i16 = i16::cast_from(luma[y][x]); if XDEC != 0 { sample += i16::cast_from(luma[y][x + 1]); } if YDEC != 0 { debug_assert!(XDEC != 0); sample += i16::cast_from(luma[y + 1][x]) + i16::cast_from(luma[y + 1][x + 1]); } sample <<= 3 - XDEC - YDEC; ac_item.write(sample); sum += sample as i32; } } // SAFETY: the loop above has initialized all items let ac = unsafe { slice_assume_init_mut(ac) }; let shift = plane_bsize.width_log2() + plane_bsize.height_log2(); let average = ((sum + (1 << (shift - 1))) >> shift) as i16; for val in ac { *val -= average; } } pub(crate) fn pred_cfl_inner( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, width: usize, height: usize, bit_depth: usize, ) { if alpha == 0 { return; } debug_assert!(ac.len() >= width * height); assert!(output.plane_cfg.stride >= width); assert!(output.rows_iter().len() >= height); let sample_max = (1 << bit_depth) - 1; let avg: i32 = output[0][0].into(); for (line, luma) in output.rows_iter_mut().zip(ac.chunks_exact(width)).take(height) { for (v, &l) in line[..width].iter_mut().zip(luma[..width].iter()) { *v = T::cast_from( (avg + get_scaled_luma_q0(alpha, l)).clamp(0, sample_max), ); } } } pub(crate) fn pred_cfl( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], left: &[T], width: usize, height: usize, bit_depth: usize, ) { pred_dc(output, above, left, width, height, bit_depth); pred_cfl_inner(output, ac, alpha, width, height, bit_depth); } pub(crate) fn pred_cfl_128( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], left: &[T], width: usize, height: usize, bit_depth: usize, ) { pred_dc_128(output, above, left, width, height, bit_depth); pred_cfl_inner(output, ac, alpha, width, height, bit_depth); } pub(crate) fn pred_cfl_left( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], left: &[T], width: usize, 
height: usize, bit_depth: usize, ) { pred_dc_left(output, above, left, width, height, bit_depth); pred_cfl_inner(output, ac, alpha, width, height, bit_depth); } pub(crate) fn pred_cfl_top( output: &mut PlaneRegionMut<'_, T>, ac: &[i16], alpha: i16, above: &[T], left: &[T], width: usize, height: usize, bit_depth: usize, ) { pred_dc_top(output, above, left, width, height, bit_depth); pred_cfl_inner(output, ac, alpha, width, height, bit_depth); } #[allow(clippy::collapsible_if)] #[allow(clippy::collapsible_else_if)] #[allow(clippy::needless_return)] pub(crate) const fn select_ief_strength( width: usize, height: usize, smooth_filter: bool, angle_delta: isize, ) -> u8 { let block_wh = width + height; let abs_delta = angle_delta.unsigned_abs(); if smooth_filter { if block_wh <= 8 { if abs_delta >= 64 { return 2; } if abs_delta >= 40 { return 1; } } else if block_wh <= 16 { if abs_delta >= 48 { return 2; } if abs_delta >= 20 { return 1; } } else if block_wh <= 24 { if abs_delta >= 4 { return 3; } } else { return 3; } } else { if block_wh <= 8 { if abs_delta >= 56 { return 1; } } else if block_wh <= 16 { if abs_delta >= 40 { return 1; } } else if block_wh <= 24 { if abs_delta >= 32 { return 3; } if abs_delta >= 16 { return 2; } if abs_delta >= 8 { return 1; } } else if block_wh <= 32 { if abs_delta >= 32 { return 3; } if abs_delta >= 4 { return 2; } return 1; } else { return 3; } } return 0; } pub(crate) const fn select_ief_upsample( width: usize, height: usize, smooth_filter: bool, angle_delta: isize, ) -> bool { let block_wh = width + height; let abs_delta = angle_delta.unsigned_abs(); if abs_delta == 0 || abs_delta >= 40 { false } else if smooth_filter { block_wh <= 8 } else { block_wh <= 16 } } pub(crate) fn filter_edge( size: usize, strength: u8, edge: &mut [T], ) { const INTRA_EDGE_KERNEL: [[u32; 5]; 3] = [[0, 4, 8, 4, 0], [0, 5, 6, 5, 0], [2, 4, 4, 4, 2]]; if strength == 0 { return; } // Copy the edge buffer to avoid predicting from // just-filtered samples. let mut edge_filtered = [MaybeUninit::::uninit(); MAX_TX_SIZE * 4 + 1]; let edge_filtered = init_slice_repeat_mut(&mut edge_filtered[..edge.len()], T::zero()); edge_filtered.copy_from_slice(&edge[..edge.len()]); for i in 1..size { let mut s = 0; for j in 0..INTRA_EDGE_KERNEL[0].len() { let k = (i + j).saturating_sub(2).min(size - 1); s += INTRA_EDGE_KERNEL[(strength - 1) as usize][j] * edge[k].to_u32().unwrap(); } edge_filtered[i] = T::cast_from((s + 8) >> 4); } edge.copy_from_slice(edge_filtered); } pub(crate) fn upsample_edge( size: usize, edge: &mut [T], bit_depth: usize, ) { // The input edge should be valid in the -1..size range, // where the -1 index is the top-left edge pixel. Since // negative indices are unsafe in Rust, the caller is // expected to globally offset it by 1, which makes the // input range 0..=size. let mut dup = [MaybeUninit::::uninit(); MAX_TX_SIZE]; let dup = init_slice_repeat_mut(&mut dup[..size + 3], T::zero()); dup[0] = edge[0]; dup[1..=size + 1].copy_from_slice(&edge[0..=size]); dup[size + 2] = edge[size]; // Past here the edge is being filtered, and its // effective range is shifted from -1..size to // -2..2*size-1. Again, because this is safe Rust, // we cannot use negative indices, and the actual range // will be 0..=2*size. The caller is expected to adjust // its indices on receipt of the filtered edge. 
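// Illustrative example (values are hypothetical, not from a real edge): with
// dup = [10, 10, 20, 20], the sample interpolated between the middle pair is
// (-10 + 9*10 + 9*20 - 20 + 8) / 16 = 248 / 16 = 15, i.e. roughly halfway,
// while the original samples themselves land unchanged on the even output
// indices below.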
edge[0] = dup[0]; for i in 0..size { let mut s = -dup[i].to_i32().unwrap() + (9 * dup[i + 1].to_i32().unwrap()) + (9 * dup[i + 2].to_i32().unwrap()) - dup[i + 3].to_i32().unwrap(); s = ((s + 8) / 16).clamp(0, (1 << bit_depth) - 1); edge[2 * i + 1] = T::cast_from(s); edge[2 * i + 2] = dup[i + 2]; } } pub(crate) const fn dr_intra_derivative(p_angle: usize) -> usize { match p_angle { 3 => 1023, 6 => 547, 9 => 372, 14 => 273, 17 => 215, 20 => 178, 23 => 151, 26 => 132, 29 => 116, 32 => 102, 36 => 90, 39 => 80, 42 => 71, 45 => 64, 48 => 57, 51 => 51, 54 => 45, 58 => 40, 61 => 35, 64 => 31, 67 => 27, 70 => 23, 73 => 19, 76 => 15, 81 => 11, 84 => 7, 87 => 3, _ => 0, } } pub(crate) fn pred_directional( output: &mut PlaneRegionMut<'_, T>, above: &[T], left: &[T], top_left: &[T], p_angle: usize, width: usize, height: usize, bit_depth: usize, ief_params: Option, ) { let sample_max = (1 << bit_depth) - 1; let max_x = output.plane_cfg.width as isize - 1; let max_y = output.plane_cfg.height as isize - 1; let mut upsample_above = false; let mut upsample_left = false; let mut above_edge: &[T] = above; let mut left_edge: &[T] = left; let top_left_edge: T = top_left[0]; let enable_edge_filter = ief_params.is_some(); // Initialize above and left edge buffers of the largest possible needed size if upsampled // The first value is the top left pixel, also mutable and indexed at -1 in the spec let mut above_filtered = [MaybeUninit::::uninit(); MAX_TX_SIZE * 4 + 1]; let above_filtered = init_slice_repeat_mut( &mut above_filtered[..=(width + height) * 2], T::zero(), ); let mut left_filtered = [MaybeUninit::::uninit(); MAX_TX_SIZE * 4 + 1]; let left_filtered = init_slice_repeat_mut( &mut left_filtered[..=(width + height) * 2], T::zero(), ); if enable_edge_filter { let above_len = above.len().min(above_filtered.len() - 1); let left_len = left.len().min(left_filtered.len() - 1); above_filtered[1..=above_len].clone_from_slice(&above[..above_len]); for i in 1..=left_len { left_filtered[i] = left[left.len() - i]; } let smooth_filter = ief_params.unwrap().use_smooth_filter(); if p_angle != 90 && p_angle != 180 { above_filtered[0] = top_left_edge; left_filtered[0] = top_left_edge; let num_px = ( width.min((max_x - output.rect().x + 1).try_into().unwrap()) + if p_angle < 90 { height } else { 0 } + 1, // above height.min((max_y - output.rect().y + 1).try_into().unwrap()) + if p_angle > 180 { width } else { 0 } + 1, // left ); let filter_strength = select_ief_strength( width, height, smooth_filter, p_angle as isize - 90, ); filter_edge(num_px.0, filter_strength, above_filtered); let filter_strength = select_ief_strength( width, height, smooth_filter, p_angle as isize - 180, ); filter_edge(num_px.1, filter_strength, left_filtered); } let num_px = ( width + if p_angle < 90 { height } else { 0 }, // above height + if p_angle > 180 { width } else { 0 }, // left ); upsample_above = select_ief_upsample( width, height, smooth_filter, p_angle as isize - 90, ); if upsample_above { upsample_edge(num_px.0, above_filtered, bit_depth); } upsample_left = select_ief_upsample( width, height, smooth_filter, p_angle as isize - 180, ); if upsample_left { upsample_edge(num_px.1, left_filtered, bit_depth); } left_filtered.reverse(); above_edge = above_filtered; left_edge = left_filtered; } let dx = if p_angle < 90 { dr_intra_derivative(p_angle) } else if p_angle > 90 && p_angle < 180 { dr_intra_derivative(180 - p_angle) } else { 0 // undefined }; let dy = if p_angle > 90 && p_angle < 180 { dr_intra_derivative(p_angle - 90) } else if p_angle > 
180 { dr_intra_derivative(270 - p_angle) } else { 0 // undefined }; // edge buffer index offsets applied due to the fact // that we cannot safely use negative indices in Rust let upsample_above = upsample_above as usize; let upsample_left = upsample_left as usize; let offset_above = (enable_edge_filter as usize) << upsample_above; let offset_left = (enable_edge_filter as usize) << upsample_left; if p_angle < 90 { for i in 0..height { let row = &mut output[i]; for j in 0..width { let idx = (i + 1) * dx; let base = (idx >> (6 - upsample_above)) + (j << upsample_above); let shift = (((idx << upsample_above) >> 1) & 31) as i32; let max_base_x = (height + width - 1) << upsample_above; let v = (if base < max_base_x { let a: i32 = above_edge[base + offset_above].into(); let b: i32 = above_edge[base + 1 + offset_above].into(); round_shift(a * (32 - shift) + b * shift, 5) } else { let c: i32 = above_edge[max_base_x + offset_above].into(); c }) .clamp(0, sample_max); row[j] = T::cast_from(v); } } } else if p_angle > 90 && p_angle < 180 { for i in 0..height { let row = &mut output[i]; for j in 0..width { let idx = (j << 6) as isize - ((i + 1) * dx) as isize; let base = idx >> (6 - upsample_above); if base >= -(1 << upsample_above) { let shift = (((idx << upsample_above) >> 1) & 31) as i32; let a: i32 = if !enable_edge_filter && base < 0 { top_left_edge } else { above_edge[(base + offset_above as isize) as usize] } .into(); let b: i32 = above_edge[(base + 1 + offset_above as isize) as usize].into(); let v = round_shift(a * (32 - shift) + b * shift, 5) .clamp(0, sample_max); row[j] = T::cast_from(v); } else { let idx = (i << 6) as isize - ((j + 1) * dy) as isize; let base = idx >> (6 - upsample_left); let shift = (((idx << upsample_left) >> 1) & 31) as i32; let l = left_edge.len() - 1; let a: i32 = if !enable_edge_filter && base < 0 { top_left_edge } else if (base + offset_left as isize) == -2 { left_edge[0] } else { left_edge[l - (base + offset_left as isize) as usize] } .into(); let b: i32 = if (base + offset_left as isize) == -2 { left_edge[1] } else { left_edge[l - (base + offset_left as isize + 1) as usize] } .into(); let v = round_shift(a * (32 - shift) + b * shift, 5) .clamp(0, sample_max); row[j] = T::cast_from(v); } } } } else if p_angle > 180 { for i in 0..height { let row = &mut output[i]; for j in 0..width { let idx = (j + 1) * dy; let base = (idx >> (6 - upsample_left)) + (i << upsample_left); let shift = (((idx << upsample_left) >> 1) & 31) as i32; let l = left_edge.len() - 1; let a: i32 = left_edge[l.saturating_sub(base + offset_left)].into(); let b: i32 = left_edge[l.saturating_sub(base + offset_left + 1)].into(); let v = round_shift(a * (32 - shift) + b * shift, 5).clamp(0, sample_max); row[j] = T::cast_from(v); } } } } } #[cfg(test)] mod test { use super::*; use crate::predict::rust::*; use crate::util::Aligned; use num_traits::*; #[test] fn pred_matches_u8() { let edge_buf = Aligned::from_fn(|i| (i + 32).saturating_sub(MAX_TX_SIZE * 2).as_()); let (all_left, top_left, above) = IntraEdge::mock(&edge_buf).as_slices(); let left = &all_left[all_left.len() - 4..]; let mut output = Plane::from_slice(&[0u8; 4 * 4], 4); pred_dc(&mut output.as_region_mut(), above, left, 4, 4, 8); assert_eq!(&output.data[..], [32u8; 16]); pred_dc_top(&mut output.as_region_mut(), above, left, 4, 4, 8); assert_eq!(&output.data[..], [35u8; 16]); pred_dc_left(&mut output.as_region_mut(), above, left, 4, 4, 8); assert_eq!(&output.data[..], [30u8; 16]); pred_dc_128(&mut output.as_region_mut(), above, left, 4, 4, 8); 
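// DC_128 fills the block with the mid-range value, 1 << (bit_depth - 1) = 128
// at 8 bits, regardless of the edge samples passed in, which is what the
// assertion below expects.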
assert_eq!(&output.data[..], [128u8; 16]); pred_v(&mut output.as_region_mut(), above, 4, 4); assert_eq!( &output.data[..], [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36] ); pred_h(&mut output.as_region_mut(), left, 4, 4); assert_eq!( &output.data[..], [31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28] ); pred_paeth(&mut output.as_region_mut(), above, left, top_left[0], 4, 4); assert_eq!( &output.data[..], [32, 34, 35, 36, 30, 32, 32, 36, 29, 32, 32, 32, 28, 28, 32, 32] ); pred_smooth(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!( &output.data[..], [32, 34, 35, 35, 30, 32, 33, 34, 29, 31, 32, 32, 29, 30, 32, 32] ); pred_smooth_h(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!( &output.data[..], [31, 33, 34, 35, 30, 33, 34, 35, 29, 32, 34, 34, 28, 31, 33, 34] ); pred_smooth_v(&mut output.as_region_mut(), above, left, 4, 4); assert_eq!( &output.data[..], [33, 34, 35, 36, 31, 31, 32, 33, 30, 30, 30, 31, 29, 30, 30, 30] ); let left = &all_left[all_left.len() - 8..]; let angles = [ 3, 6, 9, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45, 48, 51, 54, 58, 61, 64, 67, 70, 73, 76, 81, 84, 87, ]; let expected = [ [40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [36, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [36, 37, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [35, 36, 37, 38, 38, 39, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40], [35, 36, 37, 38, 37, 38, 39, 40, 39, 40, 40, 40, 40, 40, 40, 40], [35, 36, 37, 38, 37, 38, 39, 40, 38, 39, 40, 40, 40, 40, 40, 40], [35, 36, 37, 38, 36, 37, 38, 39, 38, 39, 40, 40, 39, 40, 40, 40], [34, 35, 36, 37, 36, 37, 38, 39, 37, 38, 39, 40, 39, 40, 40, 40], [34, 35, 36, 37, 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 40], [34, 35, 36, 37, 35, 36, 37, 38, 36, 37, 38, 39, 37, 38, 39, 40], [34, 35, 36, 37, 35, 36, 37, 38, 36, 37, 38, 39, 37, 38, 39, 40], [34, 35, 36, 37, 35, 36, 37, 38, 36, 37, 38, 39, 37, 38, 39, 40], [34, 35, 36, 37, 35, 36, 37, 38, 35, 36, 37, 38, 36, 37, 38, 39], [34, 35, 36, 37, 34, 35, 36, 37, 35, 36, 37, 38, 36, 37, 38, 39], [34, 35, 36, 37, 34, 35, 36, 37, 35, 36, 37, 38, 36, 37, 38, 39], [34, 35, 36, 37, 34, 35, 36, 37, 35, 36, 37, 38, 35, 36, 37, 38], [33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37, 35, 36, 37, 38], [33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37, 35, 36, 37, 38], [33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37, 34, 35, 36, 37], [33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37, 34, 35, 36, 37], [33, 34, 35, 36, 33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37], [33, 34, 35, 36, 33, 34, 35, 36, 34, 35, 36, 37, 34, 35, 36, 37], [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36], [33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36, 33, 34, 35, 36], ]; for (&angle, expected) in angles.iter().zip(expected.iter()) { pred_directional( &mut output.as_region_mut(), above, left, top_left, angle, 4, 4, 8, None, ); assert_eq!(&output.data[..], expected); } } #[test] fn pred_max() { let max12bit = 4096 - 1; let above = [max12bit; 32]; let left = [max12bit; 32]; let mut o = Plane::from_slice(&vec![0u16; 32 * 32], 32); pred_dc(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4, 16); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } pred_h(&mut o.as_region_mut(), &left[..4], 4, 4); for l in 
o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } pred_v(&mut o.as_region_mut(), &above[..4], 4, 4); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } let above_left = max12bit; pred_paeth( &mut o.as_region_mut(), &above[..4], &left[..4], above_left, 4, 4, ); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } pred_smooth(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } pred_smooth_h(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } pred_smooth_v(&mut o.as_region_mut(), &above[..4], &left[..4], 4, 4); for l in o.data.chunks(32).take(4) { for v in l[..4].iter() { assert_eq!(*v, max12bit); } } } } rav1e-0.7.1/src/quantize/mod.rs000064400000000000000000000270271046102023000144330ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_upper_case_globals)] mod tables; cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::quantize::*; } else { pub use self::rust::*; } } pub use tables::*; use crate::scan_order::av1_scan_orders; use crate::transform::{TxSize, TxType}; use crate::util::*; use std::convert::Into; use std::mem; use std::num::{NonZeroU16, NonZeroU32, NonZeroU64}; pub fn get_log_tx_scale(tx_size: TxSize) -> usize { let num_pixels = tx_size.area(); Into::::into(num_pixels > 256) + Into::::into(num_pixels > 1024) } pub fn dc_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 { let dc_q: [&[NonZeroU16; 256]; 3] = [&dc_qlookup_Q3, &dc_qlookup_10_Q3, &dc_qlookup_12_Q3]; let bd = ((bit_depth ^ 8) >> 1).min(2); dc_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } pub fn ac_q(qindex: u8, delta_q: i8, bit_depth: usize) -> NonZeroU16 { let ac_q: [&[NonZeroU16; 256]; 3] = [&ac_qlookup_Q3, &ac_qlookup_10_Q3, &ac_qlookup_12_Q3]; let bd = ((bit_depth ^ 8) >> 1).min(2); ac_q[bd][((qindex as isize + delta_q as isize).max(0) as usize).min(255)] } // TODO: Handle lossless properly. fn select_qi(quantizer: i64, qlookup: &[NonZeroU16; QINDEX_RANGE]) -> u8 { if quantizer < qlookup[MINQ].get() as i64 { MINQ as u8 } else if quantizer >= qlookup[MAXQ].get() as i64 { MAXQ as u8 } else { match qlookup .binary_search(&NonZeroU16::new(quantizer as u16).expect("Not zero")) { Ok(qi) => qi as u8, Err(qi) => { debug_assert!(qi > MINQ); debug_assert!(qi <= MAXQ); // Pick the closest quantizer in the log domain. 
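// Equivalently, qi - 1 wins when the requested quantizer lies below the
// geometric mean of the two neighbouring table entries. Illustrative numbers
// (not from the actual tables): with qlookup[qi - 1] = 100 and
// qlookup[qi] = 121 the crossover is at sqrt(100 * 121) = 110, so a request
// of 109 maps to qi - 1 and a request of 111 maps to qi.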
let qthresh = (qlookup[qi - 1].get() as i32) * (qlookup[qi].get() as i32); let q2_i32 = (quantizer as i32) * (quantizer as i32); if q2_i32 < qthresh { (qi - 1) as u8 } else { qi as u8 } } } } } pub fn select_dc_qi(quantizer: i64, bit_depth: usize) -> u8 { let qlookup = match bit_depth { 8 => &dc_qlookup_Q3, 10 => &dc_qlookup_10_Q3, 12 => &dc_qlookup_12_Q3, _ => unimplemented!(), }; select_qi(quantizer, qlookup) } pub fn select_ac_qi(quantizer: i64, bit_depth: usize) -> u8 { let qlookup = match bit_depth { 8 => &ac_qlookup_Q3, 10 => &ac_qlookup_10_Q3, 12 => &ac_qlookup_12_Q3, _ => unimplemented!(), }; select_qi(quantizer, qlookup) } #[derive(Debug, Clone, Copy)] pub struct QuantizationContext { log_tx_scale: usize, dc_quant: NonZeroU16, dc_offset: u32, dc_mul_add: (u32, u32, u32), ac_quant: NonZeroU16, ac_offset_eob: u32, ac_offset0: u32, ac_offset1: u32, ac_mul_add: (u32, u32, u32), } impl Default for QuantizationContext { fn default() -> Self { QuantizationContext { dc_quant: NonZeroU16::new(1).expect("Not zero"), ac_quant: NonZeroU16::new(1).expect("Not zero"), log_tx_scale: Default::default(), dc_offset: Default::default(), dc_mul_add: Default::default(), ac_offset_eob: Default::default(), ac_offset0: Default::default(), ac_offset1: Default::default(), ac_mul_add: Default::default(), } } } fn divu_gen(d: NonZeroU32) -> (u32, u32, u32) { let nbits = (mem::size_of_val(&d) as u64) * 8; let m = nbits - d.leading_zeros() as u64 - 1; if d.is_power_of_two() { (0xFFFF_FFFF, 0xFFFF_FFFF, m as u32) } else { let d = NonZeroU64::from(d); let t = (1u64 << (m + nbits)) / d; let d = d.get(); let r = (t * d + d) & ((1 << nbits) - 1); if r <= 1u64 << m { (t as u32 + 1, 0u32, m as u32) } else { (t as u32, t as u32, m as u32) } } } #[inline] const fn divu_pair(x: u32, d: (u32, u32, u32)) -> u32 { let x = x as u64; let (a, b, shift) = d; let shift = shift as u64; let a = a as u64; let b = b as u64; (((a * x + b) >> 32) >> shift) as u32 } #[inline] const fn copysign(value: u32, signed: i32) -> i32 { if signed < 0 { -(value as i32) } else { value as i32 } } #[cfg(test)] mod test { use super::*; use crate::transform::TxSize::*; #[test] fn test_divu_pair() { for d in 1..1024 { for x in 0..1000 { let ab = divu_gen(NonZeroU32::new(d).unwrap()); assert_eq!(x / d, divu_pair(x, ab)); } } } #[test] fn gen_divu_table() { let b: Vec<(u32, u32, u32)> = dc_qlookup_Q3.iter().map(|&v| divu_gen(v.into())).collect(); println!("{:?}", b); } #[test] fn test_tx_log_scale() { let tx_sizes = [ (TX_4X4, 0), (TX_8X8, 0), (TX_16X16, 0), (TX_32X32, 1), (TX_64X64, 2), (TX_4X8, 0), (TX_8X4, 0), (TX_8X16, 0), (TX_16X8, 0), (TX_16X32, 1), (TX_32X16, 1), (TX_32X64, 2), (TX_64X32, 2), (TX_4X16, 0), (TX_16X4, 0), (TX_8X32, 0), (TX_32X8, 0), (TX_16X64, 1), (TX_64X16, 1), ]; for &tx_size in tx_sizes.iter() { assert!(tx_size.1 == get_log_tx_scale(tx_size.0)); } } } impl QuantizationContext { pub fn update( &mut self, qindex: u8, tx_size: TxSize, is_intra: bool, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, ) { self.log_tx_scale = get_log_tx_scale(tx_size); self.dc_quant = dc_q(qindex, dc_delta_q, bit_depth); self.dc_mul_add = divu_gen(self.dc_quant.into()); self.ac_quant = ac_q(qindex, ac_delta_q, bit_depth); self.ac_mul_add = divu_gen(self.ac_quant.into()); // All of these biases were derived by measuring the cost of coding // a zero vs coding a one on any given coefficient position, or, in // the case of the EOB bias, the cost of coding the block with // the chosen EOB (rounding to one) vs rounding to zero and continuing // to choose a 
new EOB. This was done over several clips, with the // average of the bit costs taken over all blocks in the set, and a new // bias derived via the method outlined in Jean-Marc Valin's // Journal of Dubious Theoretical Results[1], aka: // // lambda = ln(2) / 6.0 // threshold = 0.5 + (lambda * avg_rate_diff) / 2.0 // bias = 1 - threshold // // lambda is a constant since our offsets are already adjusted for the // quantizer. // // Biases were then updated, and cost collection was re-run, until // the calculated biases started to converge after 2-4 iterations. // // In theory, the rounding biases for inter should be somewhat smaller // than the biases for intra, but this turns out to only be the case // for EOB optimization, or at least, is covered by EOB optimization. // The RD-optimal rounding biases for the actual coefficients seem // to be quite close (+/- 1/256), for both inter and intra, // post-deadzoning. // // [1] https://jmvalin.ca/notes/theoretical_results.pdf self.dc_offset = self.dc_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256; self.ac_offset0 = self.ac_quant.get() as u32 * (if is_intra { 98 } else { 97 }) / 256; self.ac_offset1 = self.ac_quant.get() as u32 * (if is_intra { 109 } else { 108 }) / 256; self.ac_offset_eob = self.ac_quant.get() as u32 * (if is_intra { 88 } else { 44 }) / 256; } #[inline] pub fn quantize( &self, coeffs: &[T], qcoeffs: &mut [T], tx_size: TxSize, tx_type: TxType, ) -> u16 { let scan = av1_scan_orders[tx_size as usize][tx_type as usize].scan; let iscan = av1_scan_orders[tx_size as usize][tx_type as usize].iscan; qcoeffs[0] = { let coeff: i32 = i32::cast_from(coeffs[0]) << self.log_tx_scale; let abs_coeff = coeff.unsigned_abs(); T::cast_from(copysign( divu_pair(abs_coeff + self.dc_offset, self.dc_mul_add), coeff, )) }; // Find the last non-zero coefficient using our smaller biases and // zero everything else. // This threshold is such that `abs(coeff) < deadzone` implies: // (abs(coeff << log_tx_scale) + ac_offset_eob) / ac_quant == 0 let deadzone = T::cast_from( (self.ac_quant.get() as usize - self.ac_offset_eob as usize) .align_power_of_two_and_shift(self.log_tx_scale), ); let eob = { let eob_minus_one = iscan .iter() .zip(coeffs) .map(|(&i, &c)| if c.abs() >= deadzone { i } else { 0 }) .max() .unwrap_or(0); // We skip the DC coefficient since it has its own quantizer index. if eob_minus_one > 0 { eob_minus_one + 1 } else { u16::from(qcoeffs[0] != T::cast_from(0)) } }; // Here we use different rounding biases depending on whether we've // had recent coefficients that are larger than one, or less than // one. The reason for this is that a block usually has a chunk of // large coefficients and a tail of zeroes and ones, and the tradeoffs // for coding these two are different. In the tail of zeroes and ones, // you'll likely end up spending most bits just saying where that // coefficient is in the block, whereas in the chunk of larger // coefficients, most bits will be spent on coding its magnitude. // To that end, we want to bias more toward rounding to zero for // that tail of zeroes and ones than we do for the larger coefficients. 
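// Illustrative numbers (made up, with log_tx_scale == 0): for an intra block
// with ac_quant = 32 and level_mode == 1, ac_offset1 = 32 * 109 / 256 = 13,
// so an absolute coefficient of 50 stays at level 50 / 32 = 1 because
// 50 + 13 < 2 * 32, while 52 rounds up to 2 because 52 + 13 >= 64.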
let mut level_mode = 1; let ac_quant = self.ac_quant.get() as u32; for &pos in scan.iter().take(usize::from(eob)).skip(1) { let coeff = i32::cast_from(coeffs[pos as usize]) << self.log_tx_scale; let abs_coeff = coeff.unsigned_abs(); let level0 = divu_pair(abs_coeff, self.ac_mul_add); let offset = if level0 > 1 - level_mode { self.ac_offset1 } else { self.ac_offset0 }; let abs_qcoeff: u32 = level0 + (abs_coeff + offset >= (level0 + 1) * ac_quant) as u32; if level_mode != 0 && abs_qcoeff == 0 { level_mode = 0; } else if abs_qcoeff > 1 { level_mode = 1; } qcoeffs[pos as usize] = T::cast_from(copysign(abs_qcoeff, coeff)); } // Rather than zeroing the tail in scan order, assume that qcoeffs is // pre-filled with zeros. // Check the eob is correct debug_assert_eq!( usize::from(eob), scan .iter() .rposition(|&i| qcoeffs[i as usize] != T::cast_from(0)) .map(|n| n + 1) .unwrap_or(0) ); eob } } pub mod rust { use super::*; use crate::cpu_features::CpuFeatureLevel; use std::mem::MaybeUninit; pub fn dequantize( qindex: u8, coeffs: &[T], _eob: u16, rcoeffs: &mut [MaybeUninit], tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel, ) { let log_tx_scale = get_log_tx_scale(tx_size) as i32; let offset = (1 << log_tx_scale) - 1; let dc_quant = dc_q(qindex, dc_delta_q, bit_depth).get() as i32; let ac_quant = ac_q(qindex, ac_delta_q, bit_depth).get() as i32; for (i, (r, c)) in rcoeffs .iter_mut() .zip(coeffs.iter().map(|&c| i32::cast_from(c))) .enumerate() { let quant = if i == 0 { dc_quant } else { ac_quant }; r.write(T::cast_from( (c * quant + ((c >> 31) & offset)) >> log_tx_scale, )); } } } rav1e-0.7.1/src/quantize/tables.rs000064400000000000000000000274551046102023000151330ustar 00000000000000// Copyright (c) 2017-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
use std::{ mem::{transmute, MaybeUninit}, num::NonZeroU16, }; pub const MINQ: usize = 0; pub const MAXQ: usize = 255; pub(super) const QINDEX_RANGE: usize = MAXQ - MINQ + 1; pub(super) static dc_qlookup_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(dc_qlookup_Q3_raw); pub(super) static dc_qlookup_10_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(dc_qlookup_10_Q3_raw); pub(super) static dc_qlookup_12_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(dc_qlookup_12_Q3_raw); pub(super) static ac_qlookup_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(ac_qlookup_Q3_raw); pub(super) static ac_qlookup_10_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(ac_qlookup_10_Q3_raw); pub(super) static ac_qlookup_12_Q3: [NonZeroU16; QINDEX_RANGE] = nonzero_checked(ac_qlookup_12_Q3_raw); #[rustfmt::skip] const dc_qlookup_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, ]; #[rustfmt::skip] const dc_qlookup_10_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, ]; #[rustfmt::skip] const dc_qlookup_12_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 
103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, ]; #[rustfmt::skip] const ac_qlookup_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, ]; #[rustfmt::skip] const ac_qlookup_10_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, 
1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, ]; #[rustfmt::skip] const ac_qlookup_12_Q3_raw: [u16; QINDEX_RANGE] = [ 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, ]; /// # Panics /// /// - If any value is zero #[allow(clippy::let_unit_value)] #[allow(clippy::unused_unit)] const fn nonzero_checked( raw: [u16; QINDEX_RANGE], ) -> [NonZeroU16; QINDEX_RANGE] { // SAFETY: We initialize everything before exiting this function unsafe { let mut nonzero = [MaybeUninit::uninit(); QINDEX_RANGE]; let mut i = 0; while i < QINDEX_RANGE { let _ = if raw[i] == 0 { [(); 0][i] } else { () }; nonzero[i] = MaybeUninit::new(NonZeroU16::new_unchecked(raw[i])); i += 1; } transmute(nonzero) } } rav1e-0.7.1/src/rate.rs000064400000000000000000001734211046102023000127470ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
use crate::api::color::ChromaSampling; use crate::api::ContextInner; use crate::encoder::TEMPORAL_DELIMITER; use crate::quantize::{ac_q, dc_q, select_ac_qi, select_dc_qi}; use crate::util::{ bexp64, bexp_q24, blog64, clamp, q24_to_q57, q57, q57_to_q24, Pixel, }; use std::cmp; // The number of frame sub-types for which we track distinct parameters. // This does not include FRAME_SUBTYPE_SEF, because we don't need to do any // parameter tracking for Show Existing Frame frames. pub const FRAME_NSUBTYPES: usize = 4; pub const FRAME_SUBTYPE_I: usize = 0; pub const FRAME_SUBTYPE_P: usize = 1; #[allow(unused)] pub const FRAME_SUBTYPE_B0: usize = 2; #[allow(unused)] pub const FRAME_SUBTYPE_B1: usize = 3; pub const FRAME_SUBTYPE_SEF: usize = 4; const PASS_SINGLE: i32 = 0; const PASS_1: i32 = 1; const PASS_2: i32 = 2; const PASS_2_PLUS_1: i32 = 3; // Magic value at the start of the 2-pass stats file const TWOPASS_MAGIC: i32 = 0x50324156; // Version number for the 2-pass stats file const TWOPASS_VERSION: i32 = 1; // 4 byte magic + 4 byte version + 4 byte TU count + 4 byte SEF frame count // + FRAME_NSUBTYPES*(4 byte frame count + 1 byte exp + 8 byte scale_sum) pub(crate) const TWOPASS_HEADER_SZ: usize = 16 + FRAME_NSUBTYPES * (4 + 1 + 8); // 4 byte frame type (show_frame and fti jointly coded) + 4 byte log_scale_q24 const TWOPASS_PACKET_SZ: usize = 8; const SEF_BITS: i64 = 24; // The scale of AV1 quantizer tables (relative to the pixel domain), i.e., Q3. pub(crate) const QSCALE: i32 = 3; // We clamp the actual I and B frame delays to a minimum of 10 to work // within the range of values where later incrementing the delay works as // designed. // 10 is not an exact choice, but rather a good working trade-off. const INTER_DELAY_TARGET_MIN: i32 = 10; // The base quantizer for a frame is adjusted based on the frame type using the // formula (log_qp*mqp + dqp), where log_qp is the base-2 logarithm of the // "linear" quantizer (the actual factor by which coefficients are divided). // Because log_qp has an implicit offset built in based on the scale of the // coefficients (which depends on the pixel bit depth and the transform // scale), we normalize the quantizer to the equivalent for 8-bit pixels with // orthonormal transforms for the purposes of rate modeling. const MQP_Q12: &[i32; FRAME_NSUBTYPES] = &[ // TODO: Use a const function once f64 operations in const functions are // stable. (1.0 * (1 << 12) as f64) as i32, (1.0 * (1 << 12) as f64) as i32, (1.0 * (1 << 12) as f64) as i32, (1.0 * (1 << 12) as f64) as i32, ]; // The ratio 33_810_170.0 / 86_043_287.0 was derived by approximating the median // of a change of 15 quantizer steps in the quantizer tables. 
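// Numerically 33_810_170 / 86_043_287 ~= 0.393, so in the base-2 log domain
// the I-frame quantizer sits about 0.393 below the base value (a linear
// factor of roughly 2^-0.393 ~= 0.76), B0 the same amount above it, and B1
// about 0.786 above (roughly 1.72x), with P frames left at the base value.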
const DQP_Q57: &[i64; FRAME_NSUBTYPES] = &[ (-(33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64, (0.0 * (1i64 << 57) as f64) as i64, ((33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64, (2.0 * (33_810_170.0 / 86_043_287.0) * (1i64 << 57) as f64) as i64, ]; // For 8-bit-depth inter frames, log_q_y is derived from log_target_q with a // linear model: // log_q_y = log_target_q + (log_target_q >> 32) * Q_MODEL_MUL + Q_MODEL_ADD // Derivation of the linear models: // https://github.com/xiph/rav1e/blob/d02bdbd3b0b7b2cb9fc301031cc6a4e67a567a5c/doc/quantizer-weight-analysis.ipynb #[rustfmt::skip] const Q_MODEL_ADD: [i64; 4] = [ // 4:2:0 -0x24_4FE7_ECB3_DD90, // 4:2:2 -0x37_41DA_38AD_0924, // 4:4:4 -0x70_83BD_A626_311C, // 4:0:0 0, ]; #[rustfmt::skip] const Q_MODEL_MUL: [i64; 4] = [ // 4:2:0 0x8A0_50DD, // 4:2:2 0x887_7666, // 4:4:4 0x8D4_A712, // 4:0:0 0, ]; #[rustfmt::skip] const ROUGH_TAN_LOOKUP: &[u16; 18] = &[ 0, 358, 722, 1098, 1491, 1910, 2365, 2868, 3437, 4096, 4881, 5850, 7094, 8784, 11254, 15286, 23230, 46817 ]; // A digital approximation of a 2nd-order low-pass Bessel follower. // We use this for rate control because it has fast reaction time, but is // critically damped. pub struct IIRBessel2 { c: [i32; 2], g: i32, x: [i32; 2], y: [i32; 2], } // alpha is Q24 in the range [0,0.5). // The return value is 5.12. fn warp_alpha(alpha: i32) -> i32 { let i = ((alpha * 36) >> 24).min(16); let t0 = ROUGH_TAN_LOOKUP[i as usize]; let t1 = ROUGH_TAN_LOOKUP[i as usize + 1]; let d = alpha * 36 - (i << 24); ((((t0 as i64) << 32) + (((t1 - t0) << 8) as i64) * (d as i64)) >> 32) as i32 } // Compute Bessel filter coefficients with the specified delay. // Return: Filter parameters (c[0], c[1], g). fn iir_bessel2_get_parameters(delay: i32) -> (i32, i32, i32) { // This borrows some code from an unreleased version of Postfish. // See the recipe at http://unicorn.us.com/alex/2polefilters.html for details // on deriving the filter coefficients. // alpha is Q24 let alpha = (1 << 24) / delay; // warp is 7.12 (5.12? the max value is 70386 in Q12). let warp = warp_alpha(alpha).max(1) as i64; // k1 is 9.12 (6.12?) let k1 = 3 * warp; // k2 is 16.24 (11.24?) let k2 = k1 * warp; // d is 16.15 (10.15?) let d = ((((1 << 12) + k1) << 12) + k2 + 256) >> 9; // a is 0.32, since d is larger than both 1.0 and k2 let a = (k2 << 23) / d; // ik2 is 25.24 let ik2 = (1i64 << 48) / k2; // b1 is Q56; in practice, the integer ranges between -2 and 2. let b1 = 2 * a * (ik2 - (1i64 << 24)); // b2 is Q56; in practice, the integer ranges between -2 and 2. let b2 = (1i64 << 56) - ((4 * a) << 24) - b1; // All of the filter parameters are Q24. ( ((b1 + (1i64 << 31)) >> 32) as i32, ((b2 + (1i64 << 31)) >> 32) as i32, ((a + 128) >> 8) as i32, ) } impl IIRBessel2 { pub fn new(delay: i32, value: i32) -> IIRBessel2 { let (c0, c1, g) = iir_bessel2_get_parameters(delay); IIRBessel2 { c: [c0, c1], g, x: [value, value], y: [value, value] } } // Re-initialize Bessel filter coefficients with the specified delay. // This does not alter the x/y state, but changes the reaction time of the // filter. // Altering the time constant of a reactive filter without altering internal // state is something that has to be done carefully, but our design operates // at high enough delays and with small enough time constant changes to make // it safe. 
pub fn reinit(&mut self, delay: i32) { let (c0, c1, g) = iir_bessel2_get_parameters(delay); self.c[0] = c0; self.c[1] = c1; self.g = g; } pub fn update(&mut self, x: i32) -> i32 { let c0 = self.c[0] as i64; let c1 = self.c[1] as i64; let g = self.g as i64; let x0 = self.x[0] as i64; let x1 = self.x[1] as i64; let y0 = self.y[0] as i64; let y1 = self.y[1] as i64; let ya = ((((x as i64) + x0 * 2 + x1) * g + y0 * c0 + y1 * c1 + (1i64 << 23)) >> 24) as i32; self.x[1] = self.x[0]; self.x[0] = x; self.y[1] = self.y[0]; self.y[0] = ya; ya } } #[derive(Copy, Clone)] struct RCFrameMetrics { // The log base 2 of the scale factor for this frame in Q24 format. log_scale_q24: i32, // The frame type from pass 1 fti: usize, // Whether or not the frame was hidden in pass 1 show_frame: bool, // TODO: The input frame number corresponding to this frame in the input. // input_frameno: u32 // TODO vfr: PTS } impl RCFrameMetrics { const fn new() -> RCFrameMetrics { RCFrameMetrics { log_scale_q24: 0, fti: 0, show_frame: false } } } /// Rate control pass summary /// /// It contains encoding information related to the whole previous /// encoding pass. #[derive(Debug, Default, Clone)] pub struct RCSummary { pub(crate) ntus: i32, nframes: [i32; FRAME_NSUBTYPES + 1], exp: [u8; FRAME_NSUBTYPES], scale_sum: [i64; FRAME_NSUBTYPES], pub(crate) total: i32, } // Backing storage to deserialize Summary and Per-Frame pass data // // Can store up to a full header size since it is the largest of the two // packet kinds. pub(crate) struct RCDeserialize { // The current byte position in the frame metrics buffer. pass2_buffer_pos: usize, // In pass 2, this represents the number of bytes that are available in the // input buffer. pass2_buffer_fill: usize, // Buffer for current frame metrics in pass 2. pass2_buffer: [u8; TWOPASS_HEADER_SZ], } impl Default for RCDeserialize { fn default() -> Self { RCDeserialize { pass2_buffer: [0; TWOPASS_HEADER_SZ], pass2_buffer_pos: 0, pass2_buffer_fill: 0, } } } impl RCDeserialize { // Fill the backing storage by reading enough bytes from the // buf slice until goal bytes are available for parsing. // // goal must be at most TWOPASS_HEADER_SZ. pub(crate) fn buffer_fill( &mut self, buf: &[u8], consumed: usize, goal: usize, ) -> usize { let mut consumed = consumed; while self.pass2_buffer_fill < goal && consumed < buf.len() { self.pass2_buffer[self.pass2_buffer_fill] = buf[consumed]; self.pass2_buffer_fill += 1; consumed += 1; } consumed } // Read the next n bytes as i64. // n must be within 1 and 8 fn unbuffer_val(&mut self, n: usize) -> i64 { let mut bytes = n; let mut ret = 0; let mut shift = 0; while bytes > 0 { bytes -= 1; ret |= (self.pass2_buffer[self.pass2_buffer_pos] as i64) << shift; self.pass2_buffer_pos += 1; shift += 8; } ret } // Read metrics for the next frame. fn parse_metrics(&mut self) -> Result { debug_assert!(self.pass2_buffer_fill >= TWOPASS_PACKET_SZ); let ft_val = self.unbuffer_val(4); let show_frame = (ft_val >> 31) != 0; let fti = (ft_val & 0x7FFFFFFF) as usize; // Make sure the frame type is valid. if fti > FRAME_NSUBTYPES { return Err("Invalid frame type".to_string()); } let log_scale_q24 = self.unbuffer_val(4) as i32; Ok(RCFrameMetrics { log_scale_q24, fti, show_frame }) } // Read the summary header data. pub(crate) fn parse_summary(&mut self) -> Result { // check the magic value and version number. 
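// (The full packet, as read below, is: magic (4 bytes), version (4),
// TU count (4), FRAME_NSUBTYPES + 1 frame counts (4 each, the last being
// SEF frames), FRAME_NSUBTYPES exponents (1 each) and FRAME_NSUBTYPES
// scale sums (8 each), for a total of TWOPASS_HEADER_SZ = 68 bytes.)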
if self.unbuffer_val(4) != TWOPASS_MAGIC as i64 { return Err("Magic value mismatch".to_string()); } if self.unbuffer_val(4) != TWOPASS_VERSION as i64 { return Err("Version number mismatch".to_string()); } let mut s = RCSummary { ntus: self.unbuffer_val(4) as i32, ..Default::default() }; // Make sure the file claims to have at least one TU. // Otherwise we probably got the placeholder data from an aborted // pass 1. if s.ntus < 1 { return Err("No TUs found in first pass summary".to_string()); } let mut total: i32 = 0; for nframes in s.nframes.iter_mut() { let n = self.unbuffer_val(4) as i32; if n < 0 { return Err("Got negative frame count".to_string()); } total = total .checked_add(n) .ok_or_else(|| "Frame count too large".to_string())?; *nframes = n; } // We can't have more TUs than frames. if s.ntus > total { return Err("More TUs than frames".to_string()); } s.total = total; for exp in s.exp.iter_mut() { *exp = self.unbuffer_val(1) as u8; } for scale_sum in s.scale_sum.iter_mut() { *scale_sum = self.unbuffer_val(8); if *scale_sum < 0 { return Err("Got negative scale sum".to_string()); } } Ok(s) } } pub struct RCState { // The target bit-rate in bits per second. target_bitrate: i32, // The number of TUs over which to distribute the reservoir usage. // We use TUs because in our leaky bucket model, we only add bits to the // reservoir on TU boundaries. reservoir_frame_delay: i32, // Whether or not the reservoir_frame_delay was explicitly specified by the // user, or is the default value. reservoir_frame_delay_is_set: bool, // The maximum quantizer index to allow (for the luma AC coefficients, other // quantizers will still be adjusted to match). maybe_ac_qi_max: Option, // The minimum quantizer index to allow (for the luma AC coefficients). ac_qi_min: u8, // Will we drop frames to meet bitrate requirements? drop_frames: bool, // Do we respect the maximum reservoir fullness? cap_overflow: bool, // Can the reservoir go negative? cap_underflow: bool, // The log of the first-pass base quantizer. pass1_log_base_q: i64, // Two-pass mode state. // PASS_SINGLE => 1-pass encoding. // PASS_1 => 1st pass of 2-pass encoding. // PASS_2 => 2nd pass of 2-pass encoding. // PASS_2_PLUS_1 => 2nd pass of 2-pass encoding, but also emitting pass 1 // data again. twopass_state: i32, // The log of the number of pixels in a frame in Q57 format. log_npixels: i64, // The target average bits per Temporal Unit (input frame). bits_per_tu: i64, // The current bit reservoir fullness (bits available to be used). reservoir_fullness: i64, // The target buffer fullness. // This is where we'd like to be by the last keyframe that appears in the // next reservoir_frame_delay frames. reservoir_target: i64, // The maximum buffer fullness (total size of the buffer). reservoir_max: i64, // The log of estimated scale factor for the rate model in Q57 format. // // TODO: Convert to Q23 or figure out a better way to avoid overflow // once 2-pass mode is introduced, if required. log_scale: [i64; FRAME_NSUBTYPES], // The exponent used in the rate model in Q6 format. exp: [u8; FRAME_NSUBTYPES], // The log of an estimated scale factor used to obtain the real framerate, // for VFR sources or, e.g., 12 fps content doubled to 24 fps, etc. // TODO vfr: log_vfr_scale: i64, // Second-order lowpass filters to track scale and VFR. scalefilter: [IIRBessel2; FRAME_NSUBTYPES], // TODO vfr: vfrfilter: IIRBessel2, // The number of frames of each type we have seen, for filter adaptation // purposes. 
// These are only 32 bits to guarantee that we can sum the scales over the // whole file without overflow in a 64-bit int. // That limits us to 2.268 years at 60 fps (minus 33% with re-ordering). nframes: [i32; FRAME_NSUBTYPES + 1], inter_delay: [i32; FRAME_NSUBTYPES - 1], inter_delay_target: i32, // The total accumulated estimation bias. rate_bias: i64, // The number of (non-Show Existing Frame) frames that have been encoded. nencoded_frames: i64, // The number of Show Existing Frames that have been emitted. nsef_frames: i64, // Buffer for current frame metrics in pass 1. pass1_buffer: [u8; TWOPASS_HEADER_SZ], // Whether or not the user has retrieved the pass 1 data for the last frame. // For PASS_1 or PASS_2_PLUS_1 encoding, this is set to false after each // frame is encoded, and must be set to true by calling twopass_out() before // the next frame can be encoded. pub pass1_data_retrieved: bool, // Marks whether or not the user has retrieved the summary data at the end of // the encode. pass1_summary_retrieved: bool, // Whether or not the user has provided enough data to encode in the second // pass. // For PASS_2 or PASS_2_PLUS_1 encoding, this is set to false after each // frame, and must be set to true by calling twopass_in() before the next // frame can be encoded. pass2_data_ready: bool, // TODO: Add a way to force the next frame to be a keyframe in 2-pass mode. // Right now we are relying on keyframe detection to detect the same // keyframes. // The metrics for the previous frame. prev_metrics: RCFrameMetrics, // The metrics for the current frame. cur_metrics: RCFrameMetrics, // The buffered metrics for future frames. frame_metrics: Vec, // The total number of frames still in use in the circular metric buffer. nframe_metrics: usize, // The index of the current frame in the circular metric buffer. frame_metrics_head: usize, // Data deserialization des: RCDeserialize, // The TU count encoded so far. ntus: i32, // The TU count for the whole file. ntus_total: i32, // The remaining TU count. ntus_left: i32, // The frame count of each frame subtype in the whole file. nframes_total: [i32; FRAME_NSUBTYPES + 1], // The sum of those counts. nframes_total_total: i32, // The number of frames of each subtype yet to be processed. nframes_left: [i32; FRAME_NSUBTYPES + 1], // The sum of the scale values for each frame subtype. scale_sum: [i64; FRAME_NSUBTYPES], // The number of TUs represented by the current scale sums. scale_window_ntus: i32, // The frame count of each frame subtype in the current scale window. scale_window_nframes: [i32; FRAME_NSUBTYPES + 1], // The sum of the scale values for each frame subtype in the current window. scale_window_sum: [i64; FRAME_NSUBTYPES], } // TODO: Separate qi values for each color plane. pub struct QuantizerParameters { // The full-precision, unmodulated log quantizer upon which our modulated // quantizer indices are based. // This is only used to limit sudden quality changes from frame to frame, and // as such is not adjusted when we encounter buffer overrun or underrun. pub log_base_q: i64, // The full-precision log quantizer modulated by the current frame type upon // which our quantizer indices are based (including any adjustments to // prevent buffer overrun or underrun). // This is used when estimating the scale parameter once we know the actual // bit usage of a frame. 
pub log_target_q: i64, pub dc_qi: [u8; 3], pub ac_qi: [u8; 3], pub lambda: f64, pub dist_scale: [f64; 3], } const Q57_SQUARE_EXP_SCALE: f64 = (2.0 * ::std::f64::consts::LN_2) / ((1i64 << 57) as f64); // Daala style log-offset for chroma quantizers // TODO: Optimal offsets for more configurations than just BT.709 fn chroma_offset( log_target_q: i64, chroma_sampling: ChromaSampling, ) -> (i64, i64) { let x = log_target_q.max(0); // Gradient optimized for CIEDE2000+PSNR on subset3 let y = match chroma_sampling { ChromaSampling::Cs400 => 0, ChromaSampling::Cs420 => (x >> 2) + (x >> 6), // 0.266 ChromaSampling::Cs422 => (x >> 3) + (x >> 4) - (x >> 7), // 0.180 ChromaSampling::Cs444 => (x >> 4) + (x >> 5) + (x >> 8), // 0.098 }; // blog64(7) - blog64(4); blog64(5) - blog64(4) (0x19D_5D9F_D501_0B37 - y, 0xA4_D3C2_5E68_DC58 - y) } impl QuantizerParameters { fn new_from_log_q( log_base_q: i64, log_target_q: i64, bit_depth: usize, chroma_sampling: ChromaSampling, is_intra: bool, log_isqrt_mean_scale: i64, ) -> QuantizerParameters { let scale = log_isqrt_mean_scale + q57(QSCALE + bit_depth as i32 - 8); let mut log_q_y = log_target_q; if !is_intra && bit_depth == 8 { log_q_y = log_target_q + (log_target_q >> 32) * Q_MODEL_MUL[chroma_sampling as usize] + Q_MODEL_ADD[chroma_sampling as usize]; } let quantizer = bexp64(log_q_y + scale); let (offset_u, offset_v) = chroma_offset(log_q_y + log_isqrt_mean_scale, chroma_sampling); let mono = chroma_sampling == ChromaSampling::Cs400; let log_q_u = log_q_y + offset_u; let log_q_v = log_q_y + offset_v; let quantizer_u = bexp64(log_q_u + scale); let quantizer_v = bexp64(log_q_v + scale); let lambda = (::std::f64::consts::LN_2 / 6.0) * (((log_target_q + log_isqrt_mean_scale) as f64) * Q57_SQUARE_EXP_SCALE) .exp(); let scale = |q| bexp64((log_target_q - q) * 2 + q57(16)) as f64 / 65536.; let dist_scale = [scale(log_q_y), scale(log_q_u), scale(log_q_v)]; let base_q_idx = select_ac_qi(quantizer, bit_depth).max(1); // delta_q only gets 6 bits + a sign bit, so it can differ by 63 at most. let min_qi = base_q_idx.saturating_sub(63).max(1); let max_qi = base_q_idx.saturating_add(63).min(255); let clamp_qi = |qi: u8| qi.clamp(min_qi, max_qi); QuantizerParameters { log_base_q, log_target_q, // TODO: Allow lossless mode; i.e. qi == 0. dc_qi: [ clamp_qi(select_dc_qi(quantizer, bit_depth)), if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_u, bit_depth)) }, if mono { 0 } else { clamp_qi(select_dc_qi(quantizer_v, bit_depth)) }, ], ac_qi: [ base_q_idx, if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_u, bit_depth)) }, if mono { 0 } else { clamp_qi(select_ac_qi(quantizer_v, bit_depth)) }, ], lambda, dist_scale, } } } impl RCState { pub fn new( frame_width: i32, frame_height: i32, framerate_num: i64, framerate_den: i64, target_bitrate: i32, maybe_ac_qi_max: Option, ac_qi_min: u8, max_key_frame_interval: i32, maybe_reservoir_frame_delay: Option, ) -> RCState { // The default buffer size is set equal to 1.5x the keyframe interval, or 240 // frames; whichever is smaller, with a minimum of 12. // For user set values, we enforce a minimum of 12. // The interval is short enough to allow reaction, but long enough to allow // looking into the next GOP (avoiding the case where the last frames // before an I-frame get starved), in most cases. // The 12 frame minimum gives us some chance to distribute bit estimation // errors in the worst case. 
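// For example: with max_key_frame_interval = 240 the default below is
// min(360, 240) = 240 TUs, with an interval of 30 it is 45 TUs, and an
// explicit user value of 5 is raised to the 12-TU minimum.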
let reservoir_frame_delay = maybe_reservoir_frame_delay .unwrap_or_else(|| ((max_key_frame_interval * 3) >> 1).min(240)) .max(12); // TODO: What are the limits on these? let npixels = (frame_width as i64) * (frame_height as i64); // Insane framerates or frame sizes mean insane bitrates. // Let's not get carried away. // We also subtract 16 bits from each temporal unit to account for the // temporal delimiter, whose bits are not included in the frame sizes // reported to update_state(). // TODO: Support constraints imposed by levels. let bits_per_tu = clamp( (target_bitrate as i64) * framerate_den / framerate_num, 40, 0x4000_0000_0000, ) - (TEMPORAL_DELIMITER.len() * 8) as i64; let reservoir_max = bits_per_tu * (reservoir_frame_delay as i64); // Start with a buffer fullness and fullness target of 50%. let reservoir_target = (reservoir_max + 1) >> 1; // Pick exponents and initial scales for quantizer selection. let ibpp = npixels / bits_per_tu; // These have been derived by encoding many clips at every quantizer // and running a piecewise-linear regression in binary log space. let (i_exp, i_log_scale) = if ibpp < 1 { (48u8, blog64(36) - q57(QSCALE)) } else if ibpp < 4 { (61u8, blog64(55) - q57(QSCALE)) } else { (77u8, blog64(129) - q57(QSCALE)) }; let (p_exp, p_log_scale) = if ibpp < 2 { (69u8, blog64(32) - q57(QSCALE)) } else if ibpp < 139 { (104u8, blog64(84) - q57(QSCALE)) } else { (83u8, blog64(19) - q57(QSCALE)) }; let (b0_exp, b0_log_scale) = if ibpp < 2 { (84u8, blog64(30) - q57(QSCALE)) } else if ibpp < 92 { (120u8, blog64(68) - q57(QSCALE)) } else { (68u8, blog64(4) - q57(QSCALE)) }; let (b1_exp, b1_log_scale) = if ibpp < 2 { (87u8, blog64(27) - q57(QSCALE)) } else if ibpp < 126 { (139u8, blog64(84) - q57(QSCALE)) } else { (61u8, blog64(1) - q57(QSCALE)) }; // TODO: Add support for "golden" P frames. 
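// Worked example for the piecewise model above (hypothetical input, not a
// measured clip): 1920x1080 at 30 fps and 5 Mb/s gives
// bits_per_tu = 5_000_000 / 30 - 16 = 166_650 and
// ibpp = 2_073_600 / 166_650 = 12, which takes the ibpp >= 4 branch for I
// frames (exponent 77) and the middle branches for P, B0 and B1
// (exponents 104, 120 and 139).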
RCState { target_bitrate, reservoir_frame_delay, reservoir_frame_delay_is_set: maybe_reservoir_frame_delay.is_some(), maybe_ac_qi_max, ac_qi_min, drop_frames: false, cap_overflow: true, cap_underflow: false, pass1_log_base_q: 0, twopass_state: PASS_SINGLE, log_npixels: blog64(npixels), bits_per_tu, reservoir_fullness: reservoir_target, reservoir_target, reservoir_max, log_scale: [i_log_scale, p_log_scale, b0_log_scale, b1_log_scale], exp: [i_exp, p_exp, b0_exp, b1_exp], scalefilter: [ IIRBessel2::new(4, q57_to_q24(i_log_scale)), IIRBessel2::new(INTER_DELAY_TARGET_MIN, q57_to_q24(p_log_scale)), IIRBessel2::new(INTER_DELAY_TARGET_MIN, q57_to_q24(b0_log_scale)), IIRBessel2::new(INTER_DELAY_TARGET_MIN, q57_to_q24(b1_log_scale)), ], // TODO VFR nframes: [0; FRAME_NSUBTYPES + 1], inter_delay: [INTER_DELAY_TARGET_MIN; FRAME_NSUBTYPES - 1], inter_delay_target: reservoir_frame_delay >> 1, rate_bias: 0, nencoded_frames: 0, nsef_frames: 0, pass1_buffer: [0; TWOPASS_HEADER_SZ], pass1_data_retrieved: true, pass1_summary_retrieved: false, pass2_data_ready: false, prev_metrics: RCFrameMetrics::new(), cur_metrics: RCFrameMetrics::new(), frame_metrics: Vec::new(), nframe_metrics: 0, frame_metrics_head: 0, ntus: 0, ntus_total: 0, ntus_left: 0, nframes_total: [0; FRAME_NSUBTYPES + 1], nframes_total_total: 0, nframes_left: [0; FRAME_NSUBTYPES + 1], scale_sum: [0; FRAME_NSUBTYPES], scale_window_ntus: 0, scale_window_nframes: [0; FRAME_NSUBTYPES + 1], scale_window_sum: [0; FRAME_NSUBTYPES], des: RCDeserialize::default(), } } pub(crate) fn select_first_pass_qi( &self, bit_depth: usize, fti: usize, chroma_sampling: ChromaSampling, ) -> QuantizerParameters { // Adjust the quantizer for the frame type, result is Q57: let log_q = ((self.pass1_log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[fti] as i64) + DQP_Q57[fti]; QuantizerParameters::new_from_log_q( self.pass1_log_base_q, log_q, bit_depth, chroma_sampling, fti == 0, 0, ) } // TODO: Separate quantizers for Cb and Cr. #[profiling::function] pub(crate) fn select_qi( &self, ctx: &ContextInner, output_frameno: u64, fti: usize, maybe_prev_log_base_q: Option, log_isqrt_mean_scale: i64, ) -> QuantizerParameters { // Is rate control active? if self.target_bitrate <= 0 { // Rate control is not active. // Derive quantizer directly from frame type. let bit_depth = ctx.config.bit_depth; let chroma_sampling = ctx.config.chroma_sampling; let (log_base_q, log_q) = Self::calc_flat_quantizer(ctx.config.quantizer as u8, bit_depth, fti); QuantizerParameters::new_from_log_q( log_base_q, log_q, bit_depth, chroma_sampling, fti == 0, log_isqrt_mean_scale, ) } else { let mut nframes: [i32; FRAME_NSUBTYPES + 1] = [0; FRAME_NSUBTYPES + 1]; let mut log_scale: [i64; FRAME_NSUBTYPES] = self.log_scale; let mut reservoir_tus = self.reservoir_frame_delay.min(self.ntus_left); let mut reservoir_frames = 0; let mut log_cur_scale = (self.scalefilter[fti].y[0] as i64) << 33; match self.twopass_state { // First pass of 2-pass mode: use a fixed base quantizer. PASS_1 => { return self.select_first_pass_qi( ctx.config.bit_depth, fti, ctx.config.chroma_sampling, ); } // Second pass of 2-pass mode: we know exactly how much of each frame // type there is in the current buffer window, and have estimates for // the scales. PASS_2 | PASS_2_PLUS_1 => { let mut scale_window_sum: [i64; FRAME_NSUBTYPES] = self.scale_window_sum; let mut scale_window_nframes: [i32; FRAME_NSUBTYPES + 1] = self.scale_window_nframes; // Intentionally exclude Show Existing Frame frames from this. 
for ftj in 0..FRAME_NSUBTYPES { reservoir_frames += scale_window_nframes[ftj]; } // If we're approaching the end of the file, add some slack to keep // us from slamming into a rail. // Our rate accuracy goes down, but it keeps the result sensible. // We position the target where the first forced keyframe beyond the // end of the file would be (for consistency with 1-pass mode). // TODO: let mut buf_pad = self.reservoir_frame_delay.min(...); // if buf_delay < buf_pad { // buf_pad -= buf_delay; // } // else ... // Otherwise, search for the last keyframe in the buffer window and // target that. // Currently we only do this when using a finite buffer. // We could save the position of the last keyframe in the stream in // the summary data and do it with a whole-file buffer as well, but // it isn't likely to make a difference. if !self.frame_metrics.is_empty() { let mut fm_tail = self.frame_metrics_head + self.nframe_metrics; if fm_tail >= self.frame_metrics.len() { fm_tail -= self.frame_metrics.len(); } let mut fmi = fm_tail; loop { if fmi == 0 { fmi += self.frame_metrics.len(); } fmi -= 1; // Stop before we remove the first frame. if fmi == self.frame_metrics_head { break; } // If we find a keyframe, remove it and everything past it. if self.frame_metrics[fmi].fti == FRAME_SUBTYPE_I { while fmi != fm_tail { let m = &self.frame_metrics[fmi]; let ftj = m.fti; scale_window_nframes[ftj] -= 1; if ftj < FRAME_NSUBTYPES { scale_window_sum[ftj] -= bexp_q24(m.log_scale_q24); reservoir_frames -= 1; } if m.show_frame { reservoir_tus -= 1; } fmi += 1; if fmi >= self.frame_metrics.len() { fmi = 0; } } // And stop scanning backwards. break; } } } nframes = scale_window_nframes; // If we're not using the same frame type as in pass 1 (because // someone changed some encoding parameters), remove that scale // estimate. // We'll add a replacement for the correct frame type below. if self.cur_metrics.fti != fti { scale_window_nframes[self.cur_metrics.fti] -= 1; if self.cur_metrics.fti != FRAME_SUBTYPE_SEF { scale_window_sum[self.cur_metrics.fti] -= bexp_q24(self.cur_metrics.log_scale_q24); } } else { log_cur_scale = (self.cur_metrics.log_scale_q24 as i64) << 33; } // If we're approaching the end of the file, add some slack to keep // us from slamming into a rail. // Our rate accuracy goes down, but it keeps the result sensible. // We position the target where the first forced keyframe beyond the // end of the file would be (for consistency with 1-pass mode). if reservoir_tus >= self.ntus_left && self.ntus_total as u64 > ctx.gop_input_frameno_start[&output_frameno] { let nfinal_gop_tus = self.ntus_total - (ctx.gop_input_frameno_start[&output_frameno] as i32); if ctx.config.max_key_frame_interval as i32 > nfinal_gop_tus { let reservoir_pad = (ctx.config.max_key_frame_interval as i32 - nfinal_gop_tus) .min(self.reservoir_frame_delay - reservoir_tus); let (guessed_reservoir_frames, guessed_reservoir_tus) = ctx .guess_frame_subtypes( &mut nframes, reservoir_tus + reservoir_pad, ); reservoir_frames = guessed_reservoir_frames; reservoir_tus = guessed_reservoir_tus; } } // Blend in the low-pass filtered scale according to how many // frames of each type we need to add compared to the actual sums in // our window. for ftj in 0..FRAME_NSUBTYPES { let scale = scale_window_sum[ftj] + bexp_q24(self.scalefilter[ftj].y[0]) * (nframes[ftj] - scale_window_nframes[ftj]) as i64; log_scale[ftj] = if nframes[ftj] > 0 { blog64(scale) - blog64(nframes[ftj] as i64) - q57(24) } else { -self.log_npixels }; } } // Single pass. 
_ => { // Figure out how to re-distribute bits so that we hit our fullness // target before the last keyframe in our current buffer window // (after the current frame), or the end of the buffer window, // whichever comes first. // Count the various types and classes of frames. let (guessed_reservoir_frames, guessed_reservoir_tus) = ctx.guess_frame_subtypes(&mut nframes, self.reservoir_frame_delay); reservoir_frames = guessed_reservoir_frames; reservoir_tus = guessed_reservoir_tus; // TODO: Scale for VFR. } } // If we've been missing our target, add a penalty term. let rate_bias = (self.rate_bias / (self.nencoded_frames + 100)) * (reservoir_frames as i64); // rate_total is the total bits available over the next // reservoir_tus TUs. let rate_total = self.reservoir_fullness - self.reservoir_target + rate_bias + (reservoir_tus as i64) * self.bits_per_tu; // Find a target quantizer that meets our rate target for the // specific mix of frame types we'll have over the next // reservoir_frame frames. // We model the rate<->quantizer relationship as // rate = scale*(quantizer**-exp) // In this case, we have our desired rate, an exponent selected in // setup, and a scale that's been measured over our frame history, // so we're solving for the quantizer. // Exponentiation with arbitrary exponents is expensive, so we work // in the binary log domain (binary exp and log aren't too bad): // rate = exp2(log2(scale) - log2(quantizer)*exp) // There's no easy closed form solution, so we bisection searh for it. let bit_depth = ctx.config.bit_depth; let chroma_sampling = ctx.config.chroma_sampling; // TODO: Proper handling of lossless. let mut log_qlo = blog64(ac_q(self.ac_qi_min, 0, bit_depth).get() as i64) - q57(QSCALE + bit_depth as i32 - 8); // The AC quantizer tables map to values larger than the DC quantizer // tables, so we use that as the upper bound to make sure we can use // the full table if needed. let mut log_qhi = blog64( ac_q(self.maybe_ac_qi_max.unwrap_or(255), 0, bit_depth).get() as i64, ) - q57(QSCALE + bit_depth as i32 - 8); let mut log_base_q = (log_qlo + log_qhi) >> 1; while log_qlo < log_qhi { // Count bits contributed by each frame type using the model. let mut bits = 0i64; for ftj in 0..FRAME_NSUBTYPES { // Modulate base quantizer by frame type. let log_q = ((log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[ftj] as i64) + DQP_Q57[ftj]; // All the fields here are Q57 except for the exponent, which is // Q6. bits += (nframes[ftj] as i64) * bexp64( log_scale[ftj] + self.log_npixels - ((log_q + 32) >> 6) * (self.exp[ftj] as i64), ); } // The number of bits for Show Existing Frame frames is constant. bits += (nframes[FRAME_SUBTYPE_SEF] as i64) * SEF_BITS; let diff = bits - rate_total; if diff > 0 { log_qlo = log_base_q + 1; } else if diff < 0 { log_qhi = log_base_q - 1; } else { break; } log_base_q = (log_qlo + log_qhi) >> 1; } // If this was not one of the initial frames, limit the change in // base quantizer to within [0.8*Q, 1.2*Q] where Q is the previous // frame's base quantizer. if let Some(prev_log_base_q) = maybe_prev_log_base_q { log_base_q = clamp( log_base_q, prev_log_base_q - 0xA4_D3C2_5E68_DC58, prev_log_base_q + 0xA4_D3C2_5E68_DC58, ); } // Modulate base quantizer by frame type. let mut log_q = ((log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[fti] as i64) + DQP_Q57[fti]; // The above allocation looks only at the total rate we'll accumulate // in the next reservoir_frame_delay frames. // However, we could overflow the bit reservoir on the very next // frame. 
// Check for that here if we're not using a soft target. if self.cap_overflow { // Allow 3% of the buffer for prediction error. // This should be plenty, and we don't mind if we go a bit over. // We only want to keep these bits from being completely wasted. let margin = (self.reservoir_max + 31) >> 5; // We want to use at least this many bits next frame. let soft_limit = self.reservoir_fullness + self.bits_per_tu - (self.reservoir_max - margin); if soft_limit > 0 { let log_soft_limit = blog64(soft_limit); // If we're predicting we won't use that many bits... // TODO: When using frame re-ordering, we should include the rate // for all of the frames in the current TU. // When there is more than one frame, there will be no direct // solution for the required adjustment, however. let log_scale_pixels = log_cur_scale + self.log_npixels; let exp = self.exp[fti] as i64; let mut log_q_exp = ((log_q + 32) >> 6) * exp; if log_scale_pixels - log_q_exp < log_soft_limit { // Scale the adjustment based on how far into the margin we are. log_q_exp += ((log_scale_pixels - log_soft_limit - log_q_exp) >> 32) * ((margin.min(soft_limit) << 32) / margin); log_q = ((log_q_exp + (exp >> 1)) / exp) << 6; } } } // We just checked we don't overflow the reservoir next frame, now // check we don't underflow and bust the budget (when not using a // soft target). if self.maybe_ac_qi_max.is_none() { // Compute the maximum number of bits we can use in the next frame. // Allow 50% of the rate for a single frame for prediction error. // This may not be enough for keyframes or sudden changes in // complexity. let log_hard_limit = blog64(self.reservoir_fullness + (self.bits_per_tu >> 1)); // If we're predicting we'll use more than this... // TODO: When using frame re-ordering, we should include the rate // for all of the frames in the current TU. // When there is more than one frame, there will be no direct // solution for the required adjustment, however. let log_scale_pixels = log_cur_scale + self.log_npixels; let exp = self.exp[fti] as i64; let mut log_q_exp = ((log_q + 32) >> 6) * exp; if log_scale_pixels - log_q_exp > log_hard_limit { // Force the target to hit our limit exactly. log_q_exp = log_scale_pixels - log_hard_limit; log_q = ((log_q_exp + (exp >> 1)) / exp) << 6; // If that target is unreasonable, oh well; we'll have to drop. } } if let Some(qi_max) = self.maybe_ac_qi_max { let (max_log_base_q, max_log_q) = Self::calc_flat_quantizer(qi_max, ctx.config.bit_depth, fti); log_base_q = cmp::min(log_base_q, max_log_base_q); log_q = cmp::min(log_q, max_log_q); } if self.ac_qi_min > 0 { let (min_log_base_q, min_log_q) = Self::calc_flat_quantizer(self.ac_qi_min, ctx.config.bit_depth, fti); log_base_q = cmp::max(log_base_q, min_log_base_q); log_q = cmp::max(log_q, min_log_q); } QuantizerParameters::new_from_log_q( log_base_q, log_q, bit_depth, chroma_sampling, fti == 0, log_isqrt_mean_scale, ) } } // Computes a quantizer directly from the frame type and base quantizer index, // without consideration for rate control. fn calc_flat_quantizer( base_qi: u8, bit_depth: usize, fti: usize, ) -> (i64, i64) { // TODO: Rename "quantizer" something that indicates it is a quantizer // index, and move it somewhere more sensible (or choose a better way to // parameterize a "quality" configuration parameter). // We use the AC quantizer as the source quantizer since its quantizer // tables have unique entries, while the DC tables do not. 
let ac_quantizer = ac_q(base_qi, 0, bit_depth).get() as i64; // Pick the nearest DC entry since an exact match may be unavailable. let dc_qi = select_dc_qi(ac_quantizer, bit_depth); let dc_quantizer = dc_q(dc_qi, 0, bit_depth).get() as i64; // Get the log quantizers as Q57. let log_ac_q = blog64(ac_quantizer) - q57(QSCALE + bit_depth as i32 - 8); let log_dc_q = blog64(dc_quantizer) - q57(QSCALE + bit_depth as i32 - 8); // Target the midpoint of the chosen entries. let log_base_q = (log_ac_q + log_dc_q + 1) >> 1; // Adjust the quantizer for the frame type, result is Q57: let log_q = ((log_base_q + (1i64 << 11)) >> 12) * (MQP_Q12[fti] as i64) + DQP_Q57[fti]; (log_base_q, log_q) } #[profiling::function] pub fn update_state( &mut self, bits: i64, fti: usize, show_frame: bool, log_target_q: i64, trial: bool, droppable: bool, ) -> bool { if trial { assert!(self.needs_trial_encode(fti)); assert!(bits > 0); } let mut dropped = false; // Update rate control only if rate control is active. if self.target_bitrate > 0 { let mut estimated_bits = 0; let mut bits = bits; let mut droppable = droppable; let mut log_scale = q57(-64); // Drop frames is also disabled for now in the case of infinite-buffer // two-pass mode. if !self.drop_frames || fti == FRAME_SUBTYPE_SEF || (self.twopass_state == PASS_2 || self.twopass_state == PASS_2_PLUS_1) && !self.frame_metrics.is_empty() { droppable = false; } if fti == FRAME_SUBTYPE_SEF { debug_assert!(bits == SEF_BITS); debug_assert!(show_frame); // Please don't make trial encodes of a SEF. debug_assert!(!trial); estimated_bits = SEF_BITS; self.nsef_frames += 1; } else { let log_q_exp = ((log_target_q + 32) >> 6) * (self.exp[fti] as i64); let prev_log_scale = self.log_scale[fti]; if bits <= 0 { // We didn't code any blocks in this frame. bits = 0; dropped = true; // TODO: Adjust VFR rate based on drop count. } else { // Compute the estimated scale factor for this frame type. let log_bits = blog64(bits); log_scale = (log_bits - self.log_npixels + log_q_exp).min(q57(16)); estimated_bits = bexp64(prev_log_scale + self.log_npixels - log_q_exp); if !trial { self.nencoded_frames += 1; } } } let log_scale_q24 = q57_to_q24(log_scale); // Special two-pass processing. if self.twopass_state == PASS_2 || self.twopass_state == PASS_2_PLUS_1 { // Pass 2 mode: if !trial { // Move the current metrics back one frame. self.prev_metrics = self.cur_metrics; // Back out the last frame's statistics from the sliding window. let ftj = self.prev_metrics.fti; self.nframes_left[ftj] -= 1; self.scale_window_nframes[ftj] -= 1; if ftj < FRAME_NSUBTYPES { self.scale_window_sum[ftj] -= bexp_q24(self.prev_metrics.log_scale_q24); } if self.prev_metrics.show_frame { self.ntus_left -= 1; self.scale_window_ntus -= 1; } // Free the corresponding entry in the circular buffer. if !self.frame_metrics.is_empty() { self.nframe_metrics -= 1; self.frame_metrics_head += 1; if self.frame_metrics_head >= self.frame_metrics.len() { self.frame_metrics_head = 0; } } // Mark us ready for the next 2-pass packet. self.pass2_data_ready = false; // Update state, so the user doesn't have to keep calling // twopass_in() after they've fed in all the data when we're using // a finite buffer. self.twopass_in(None).unwrap_or(0); } } if self.twopass_state == PASS_1 || self.twopass_state == PASS_2_PLUS_1 { // Pass 1 mode: save the metrics for this frame. 
self.prev_metrics.log_scale_q24 = log_scale_q24; self.prev_metrics.fti = fti; self.prev_metrics.show_frame = show_frame; self.pass1_data_retrieved = false; } // Common to all passes: if fti != FRAME_SUBTYPE_SEF && bits > 0 { // If this is the first example of the given frame type we've seen, // we immediately replace the default scale factor guess with the // estimate we just computed using the first frame. if trial || self.nframes[fti] <= 0 { let f = &mut self.scalefilter[fti]; let x = log_scale_q24; f.x[0] = x; f.x[1] = x; f.y[0] = x; f.y[1] = x; self.log_scale[fti] = log_scale; // TODO: Duplicate regular P frame state for first golden P frame. } else { // Lengthen the time constant for the inter filters as we collect // more frame statistics, until we reach our target. if fti > 0 && self.inter_delay[fti - 1] < self.inter_delay_target && self.nframes[fti] >= self.inter_delay[fti - 1] { self.inter_delay[fti - 1] += 1; self.scalefilter[fti].reinit(self.inter_delay[fti - 1]); } // Update the low-pass scale filter for this frame type regardless // of whether or not we will ultimately drop this frame. self.log_scale[fti] = q24_to_q57(self.scalefilter[fti].update(log_scale_q24)); } // If this frame busts our budget, it must be dropped. if droppable && self.reservoir_fullness + self.bits_per_tu < bits { // TODO: Adjust VFR rate based on drop count. bits = 0; dropped = true; } else { // TODO: Update a low-pass filter to estimate the "real" frame rate // taking timestamps and drops into account. // This is only done if the frame is coded, as it needs the final // count of dropped frames. } } if !trial { // Increment the frame count for filter adaptation purposes. if !trial && self.nframes[fti] < ::std::i32::MAX { self.nframes[fti] += 1; } self.reservoir_fullness -= bits; if show_frame { self.reservoir_fullness += self.bits_per_tu; // TODO: Properly account for temporal delimiter bits. } // If we're too quick filling the buffer and overflow is capped, that // rate is lost forever. if self.cap_overflow { self.reservoir_fullness = self.reservoir_fullness.min(self.reservoir_max); } // If we're too quick draining the buffer and underflow is capped, // don't try to make up that rate later. if self.cap_underflow { self.reservoir_fullness = self.reservoir_fullness.max(0); } // Adjust the bias for the real bits we've used. self.rate_bias += estimated_bits - bits; } } dropped } pub const fn needs_trial_encode(&self, fti: usize) -> bool { self.target_bitrate > 0 && self.nframes[fti] == 0 } pub(crate) const fn ready(&self) -> bool { match self.twopass_state { PASS_SINGLE => true, PASS_1 => self.pass1_data_retrieved, PASS_2 => self.pass2_data_ready, _ => self.pass1_data_retrieved && self.pass2_data_ready, } } fn buffer_val(&mut self, val: i64, bytes: usize, cur_pos: usize) -> usize { let mut val = val; let mut bytes = bytes; let mut cur_pos = cur_pos; while bytes > 0 { bytes -= 1; self.pass1_buffer[cur_pos] = val as u8; cur_pos += 1; val >>= 8; } cur_pos } pub(crate) fn select_pass1_log_base_q( &self, ctx: &ContextInner, output_frameno: u64, ) -> i64 { assert_eq!(self.twopass_state, PASS_SINGLE); self.select_qi(ctx, output_frameno, FRAME_SUBTYPE_I, None, 0).log_base_q } // Initialize the first pass and emit a placeholder summary pub(crate) fn init_first_pass( &mut self, maybe_pass1_log_base_q: Option, ) { if let Some(pass1_log_base_q) = maybe_pass1_log_base_q { assert_eq!(self.twopass_state, PASS_SINGLE); // Pick first-pass qi for scale calculations. 
self.pass1_log_base_q = pass1_log_base_q; } else { debug_assert!(self.twopass_state == PASS_2); } self.twopass_state += PASS_1; } // Prepare a placeholder summary fn emit_placeholder_summary(&mut self) -> &[u8] { // Fill in dummy summary values. let mut cur_pos = 0; cur_pos = self.buffer_val(TWOPASS_MAGIC as i64, 4, cur_pos); cur_pos = self.buffer_val(TWOPASS_VERSION as i64, 4, cur_pos); cur_pos = self.buffer_val(0, TWOPASS_HEADER_SZ - 8, cur_pos); debug_assert!(cur_pos == TWOPASS_HEADER_SZ); self.pass1_data_retrieved = true; &self.pass1_buffer[..cur_pos] } // Frame-specific pass data pub(crate) fn emit_frame_data(&mut self) -> Option<&[u8]> { let mut cur_pos = 0; let fti = self.prev_metrics.fti; if fti < FRAME_NSUBTYPES { self.scale_sum[fti] += bexp_q24(self.prev_metrics.log_scale_q24); } if self.prev_metrics.show_frame { self.ntus += 1; } // If we have encoded too many frames, prevent us from reaching the // ready state required to encode more. if self.nencoded_frames + self.nsef_frames >= std::i32::MAX as i64 { None? } cur_pos = self.buffer_val( (self.prev_metrics.show_frame as i64) << 31 | self.prev_metrics.fti as i64, 4, cur_pos, ); cur_pos = self.buffer_val(self.prev_metrics.log_scale_q24 as i64, 4, cur_pos); debug_assert!(cur_pos == TWOPASS_PACKET_SZ); self.pass1_data_retrieved = true; Some(&self.pass1_buffer[..cur_pos]) } // Summary of the whole encoding process. pub(crate) fn emit_summary(&mut self) -> &[u8] { let mut cur_pos = 0; cur_pos = self.buffer_val(TWOPASS_MAGIC as i64, 4, cur_pos); cur_pos = self.buffer_val(TWOPASS_VERSION as i64, 4, cur_pos); cur_pos = self.buffer_val(self.ntus as i64, 4, cur_pos); for fti in 0..=FRAME_NSUBTYPES { cur_pos = self.buffer_val(self.nframes[fti] as i64, 4, cur_pos); } for fti in 0..FRAME_NSUBTYPES { cur_pos = self.buffer_val(self.exp[fti] as i64, 1, cur_pos); } for fti in 0..FRAME_NSUBTYPES { cur_pos = self.buffer_val(self.scale_sum[fti], 8, cur_pos); } debug_assert!(cur_pos == TWOPASS_HEADER_SZ); self.pass1_summary_retrieved = true; &self.pass1_buffer[..cur_pos] } // Emit either summary or frame-specific data depending on the previous call pub(crate) fn twopass_out( &mut self, done_processing: bool, ) -> Option<&[u8]> { if !self.pass1_data_retrieved { if self.twopass_state != PASS_1 && self.twopass_state != PASS_2_PLUS_1 { Some(self.emit_placeholder_summary()) } else { self.emit_frame_data() } } else if done_processing && !self.pass1_summary_retrieved { Some(self.emit_summary()) } else { // The data for this frame has already been retrieved. None } } // Initialize the rate control for second pass encoding pub(crate) fn init_second_pass(&mut self) { if self.twopass_state == PASS_SINGLE || self.twopass_state == PASS_1 { // Initialize the second pass. self.twopass_state += PASS_2; // If the user requested a finite buffer, reserve the space required for // it. if self.reservoir_frame_delay_is_set { debug_assert!(self.reservoir_frame_delay > 0); // reservoir_frame_delay counts in TUs, but RCFrameMetrics are stored // per frame (including Show Existing Frame frames). // When re-ordering, we will have more frames than TUs. // How many more? // That depends on the re-ordering scheme used. // Doubling the number of TUs and adding a fixed latency equal to the // maximum number of reference frames we can store should be // sufficient for any reasonable scheme, and keeps this code from // depending too closely on the details of the scheme currently used // by rav1e. 
let nmetrics = (self.reservoir_frame_delay as usize) * 2 + 8; self.frame_metrics.reserve_exact(nmetrics); self.frame_metrics.resize(nmetrics, RCFrameMetrics::new()); } } } pub(crate) fn setup_second_pass(&mut self, s: &RCSummary) { self.ntus_total = s.ntus; self.ntus_left = s.ntus; self.nframes_total = s.nframes; self.nframes_left = s.nframes; self.nframes_total_total = s.nframes.iter().sum(); if self.frame_metrics.is_empty() { self.reservoir_frame_delay = s.ntus; self.scale_window_nframes = self.nframes_total; self.scale_window_sum = s.scale_sum; self.reservoir_max = self.bits_per_tu * (self.reservoir_frame_delay as i64); self.reservoir_target = (self.reservoir_max + 1) >> 1; self.reservoir_fullness = self.reservoir_target; } else { self.reservoir_frame_delay = self.reservoir_frame_delay.min(s.ntus); } self.exp = s.exp; } // Parse the rate control summary // // It returns the amount of data consumed in the process or // an empty error on parsing failure. fn twopass_parse_summary(&mut self, buf: &[u8]) -> Result { let consumed = self.des.buffer_fill(buf, 0, TWOPASS_HEADER_SZ); if self.des.pass2_buffer_fill >= TWOPASS_HEADER_SZ { self.des.pass2_buffer_pos = 0; let s = self.des.parse_summary()?; self.setup_second_pass(&s); // Got a valid header. // Set up pass 2. // Clear the header data from the buffer to make room for the // packet data. self.des.pass2_buffer_fill = 0; } Ok(consumed) } // Return the size of the first buffer twopass_in expects // // It is the summary size (constant) + the number of frame data packets // (variable depending on the configuration) it needs to starts encoding. pub(crate) fn twopass_first_packet_size(&self) -> usize { let frames_needed = if !self.frame_metrics.is_empty() { // If we're not using whole-file buffering, we need at least one // frame per buffer slot. self.reservoir_frame_delay as usize } else { // Otherwise we need just one. 1 }; TWOPASS_HEADER_SZ + frames_needed * TWOPASS_PACKET_SZ } // Return the number of frame data packets to be parsed before // the encoding process can continue. pub(crate) fn twopass_in_frames_needed(&self) -> i32 { if self.target_bitrate <= 0 { return 0; } if self.frame_metrics.is_empty() { return i32::from(!self.pass2_data_ready); } let mut cur_scale_window_nframes = 0; let mut cur_nframes_left = 0; for fti in 0..=FRAME_NSUBTYPES { cur_scale_window_nframes += self.scale_window_nframes[fti]; cur_nframes_left += self.nframes_left[fti]; } (self.reservoir_frame_delay - self.scale_window_ntus) .clamp(0, cur_nframes_left - cur_scale_window_nframes) } pub(crate) fn parse_frame_data_packet( &mut self, buf: &[u8], ) -> Result<(), String> { if buf.len() != TWOPASS_PACKET_SZ { return Err("Incorrect buffer size".to_string()); } self.des.buffer_fill(buf, 0, TWOPASS_PACKET_SZ); self.des.pass2_buffer_pos = 0; let m = self.des.parse_metrics()?; self.des.pass2_buffer_fill = 0; if self.frame_metrics.is_empty() { // We're using a whole-file buffer. self.cur_metrics = m; self.pass2_data_ready = true; } else { // Safety check let frames_needed = self.twopass_in_frames_needed(); if frames_needed > 0 { if self.nframe_metrics >= self.frame_metrics.len() { return Err( "Read too many frames without finding enough TUs".to_string(), ); } let mut fmi = self.frame_metrics_head + self.nframe_metrics; if fmi >= self.frame_metrics.len() { fmi -= self.frame_metrics.len(); } self.nframe_metrics += 1; self.frame_metrics[fmi] = m; // And accumulate the statistics over the window. 
self.scale_window_nframes[m.fti] += 1; if m.fti < FRAME_NSUBTYPES { self.scale_window_sum[m.fti] += bexp_q24(m.log_scale_q24); } if m.show_frame { self.scale_window_ntus += 1; } if frames_needed == 1 { self.pass2_data_ready = true; self.cur_metrics = self.frame_metrics[self.frame_metrics_head]; } } else { return Err("No frames needed".to_string()); } } Ok(()) } // Parse the rate control per-frame data // // If no buffer is passed return the amount of data it expects // to consume next. // // If a properly sized buffer is passed it returns the amount of data // consumed in the process or an empty error on parsing failure. fn twopass_parse_frame_data( &mut self, maybe_buf: Option<&[u8]>, mut consumed: usize, ) -> Result { { if self.frame_metrics.is_empty() { // We're using a whole-file buffer. if let Some(buf) = maybe_buf { consumed = self.des.buffer_fill(buf, consumed, TWOPASS_PACKET_SZ); if self.des.pass2_buffer_fill >= TWOPASS_PACKET_SZ { self.des.pass2_buffer_pos = 0; // Read metrics for the next frame. self.cur_metrics = self.des.parse_metrics()?; // Clear the buffer for the next frame. self.des.pass2_buffer_fill = 0; self.pass2_data_ready = true; } } else { return Ok(TWOPASS_PACKET_SZ - self.des.pass2_buffer_fill); } } else { // We're using a finite buffer. let mut cur_scale_window_nframes = 0; let mut cur_nframes_left = 0; for fti in 0..=FRAME_NSUBTYPES { cur_scale_window_nframes += self.scale_window_nframes[fti]; cur_nframes_left += self.nframes_left[fti]; } let mut frames_needed = self.twopass_in_frames_needed(); while frames_needed > 0 { if let Some(buf) = maybe_buf { consumed = self.des.buffer_fill(buf, consumed, TWOPASS_PACKET_SZ); if self.des.pass2_buffer_fill >= TWOPASS_PACKET_SZ { self.des.pass2_buffer_pos = 0; // Read the metrics for the next frame. let m = self.des.parse_metrics()?; // Add them to the circular buffer. if self.nframe_metrics >= self.frame_metrics.len() { return Err( "Read too many frames without finding enough TUs" .to_string(), ); } let mut fmi = self.frame_metrics_head + self.nframe_metrics; if fmi >= self.frame_metrics.len() { fmi -= self.frame_metrics.len(); } self.nframe_metrics += 1; self.frame_metrics[fmi] = m; // And accumulate the statistics over the window. self.scale_window_nframes[m.fti] += 1; cur_scale_window_nframes += 1; if m.fti < FRAME_NSUBTYPES { self.scale_window_sum[m.fti] += bexp_q24(m.log_scale_q24); } if m.show_frame { self.scale_window_ntus += 1; } frames_needed = (self.reservoir_frame_delay - self.scale_window_ntus) .clamp(0, cur_nframes_left - cur_scale_window_nframes); // Clear the buffer for the next frame. self.des.pass2_buffer_fill = 0; } else { // Go back for more data. break; } } else { return Ok( TWOPASS_PACKET_SZ * (frames_needed as usize) - self.des.pass2_buffer_fill, ); } } // If we've got all the frames we need, fill in the current metrics. // We're ready to go. if frames_needed <= 0 { self.cur_metrics = self.frame_metrics[self.frame_metrics_head]; // Mark us ready for the next frame. self.pass2_data_ready = true; } } } Ok(consumed) } // If called without a buffer it will return the size of the next // buffer it expects. // // If called with a buffer it will consume it fully. // It returns Ok(0) if the buffer had been parsed or Err(()) // if the buffer hadn't been enough or other errors happened. pub(crate) fn twopass_in( &mut self, maybe_buf: Option<&[u8]>, ) -> Result { let mut consumed = 0; self.init_second_pass(); // If we haven't got a valid summary header yet, try to parse one. 
if self.nframes_total[FRAME_SUBTYPE_I] == 0 { self.pass2_data_ready = false; if let Some(buf) = maybe_buf { consumed = self.twopass_parse_summary(buf)? } else { return Ok(self.twopass_first_packet_size()); } } if self.nframes_total[FRAME_SUBTYPE_I] > 0 { if self.nencoded_frames + self.nsef_frames >= self.nframes_total_total as i64 { // We don't want any more data after the last frame, and we don't want // to allow any more frames to be encoded. self.pass2_data_ready = false; } else if !self.pass2_data_ready { return self.twopass_parse_frame_data(maybe_buf, consumed); } } Ok(consumed) } } rav1e-0.7.1/src/rdo.rs000064400000000000000000002440541046102023000126010ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_camel_case_types)] use crate::api::*; use crate::cdef::*; use crate::context::*; use crate::cpu_features::CpuFeatureLevel; use crate::deblock::*; use crate::dist::*; use crate::ec::{Writer, WriterCounter, OD_BITRES}; use crate::encode_block_with_modes; use crate::encoder::{FrameInvariants, IMPORTANCE_BLOCK_SIZE}; use crate::frame::Frame; use crate::frame::*; use crate::header::ReferenceMode; use crate::lrf::*; use crate::mc::MotionVector; use crate::me::estimate_motion; use crate::me::MVSamplingMode; use crate::me::MotionSearchResult; use crate::motion_compensate; use crate::partition::PartitionType::*; use crate::partition::RefType::*; use crate::partition::*; use crate::predict::{ luma_ac, AngleDelta, IntraEdgeFilterParameters, IntraParam, PredictionMode, RAV1E_INTER_COMPOUND_MODES, RAV1E_INTER_MODES_MINIMAL, RAV1E_INTRA_MODES, }; use crate::rdo_tables::*; use crate::tiling::*; use crate::transform::{TxSet, TxSize, TxType, RAV1E_TX_TYPES}; use crate::util::{init_slice_repeat_mut, Aligned, Pixel}; use crate::write_tx_blocks; use crate::write_tx_tree; use crate::Tune; use crate::{encode_block_post_cdef, encode_block_pre_cdef}; use arrayvec::*; use itertools::izip; use std::fmt; use std::mem::MaybeUninit; #[derive(Copy, Clone, PartialEq, Eq)] pub enum RDOType { PixelDistRealRate, TxDistRealRate, TxDistEstRate, } impl RDOType { #[inline] pub const fn needs_tx_dist(self) -> bool { match self { // Pixel-domain distortion and exact ec rate RDOType::PixelDistRealRate => false, // Tx-domain distortion and exact ec rate RDOType::TxDistRealRate => true, // Tx-domain distortion and txdist-based rate RDOType::TxDistEstRate => true, } } #[inline] pub const fn needs_coeff_rate(self) -> bool { match self { RDOType::PixelDistRealRate => true, RDOType::TxDistRealRate => true, RDOType::TxDistEstRate => false, } } } #[derive(Clone)] pub struct PartitionGroupParameters { pub rd_cost: f64, pub part_type: PartitionType, pub part_modes: ArrayVec, } #[derive(Clone, Debug)] pub struct PartitionParameters { pub rd_cost: f64, pub bo: TileBlockOffset, pub bsize: BlockSize, pub pred_mode_luma: PredictionMode, pub pred_mode_chroma: PredictionMode, pub pred_cfl_params: CFLParams, pub angle_delta: AngleDelta, pub ref_frames: [RefType; 2], pub 
mvs: [MotionVector; 2], pub skip: bool, pub has_coeff: bool, pub tx_size: TxSize, pub tx_type: TxType, pub sidx: u8, } impl Default for PartitionParameters { fn default() -> Self { PartitionParameters { rd_cost: std::f64::MAX, bo: TileBlockOffset::default(), bsize: BlockSize::BLOCK_32X32, pred_mode_luma: PredictionMode::default(), pred_mode_chroma: PredictionMode::default(), pred_cfl_params: CFLParams::default(), angle_delta: AngleDelta::default(), ref_frames: [RefType::INTRA_FRAME, RefType::NONE_FRAME], mvs: [MotionVector::default(); 2], skip: false, has_coeff: true, tx_size: TxSize::TX_4X4, tx_type: TxType::DCT_DCT, sidx: 0, } } } pub fn estimate_rate(qindex: u8, ts: TxSize, fast_distortion: u64) -> u64 { let bs_index = ts as usize; let q_bin_idx = (qindex as usize) / RDO_QUANT_DIV; let bin_idx_down = ((fast_distortion) / RATE_EST_BIN_SIZE).min((RDO_NUM_BINS - 2) as u64); let bin_idx_up = (bin_idx_down + 1).min((RDO_NUM_BINS - 1) as u64); let x0 = (bin_idx_down * RATE_EST_BIN_SIZE) as i64; let x1 = (bin_idx_up * RATE_EST_BIN_SIZE) as i64; let y0 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_down as usize] as i64; let y1 = RDO_RATE_TABLE[q_bin_idx][bs_index][bin_idx_up as usize] as i64; let slope = ((y1 - y0) << 8) / (x1 - x0); (y0 + (((fast_distortion as i64 - x0) * slope) >> 8)).max(0) as u64 } #[allow(unused)] pub fn cdef_dist_wxh DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, bit_depth: usize, compute_bias: F, cpu: CpuFeatureLevel, ) -> Distortion { debug_assert!(src1.plane_cfg.xdec == 0); debug_assert!(src1.plane_cfg.ydec == 0); debug_assert!(src2.plane_cfg.xdec == 0); debug_assert!(src2.plane_cfg.ydec == 0); let mut sum = Distortion::zero(); for y in (0..h).step_by(8) { for x in (0..w).step_by(8) { let kernel_h = (h - y).min(8); let kernel_w = (w - x).min(8); let area = Area::StartingAt { x: x as isize, y: y as isize }; let value = RawDistortion(cdef_dist_kernel( &src1.subregion(area), &src2.subregion(area), kernel_w, kernel_h, bit_depth, cpu, ) as u64); // cdef is always called on non-subsampled planes, so BLOCK_8X8 is // correct here. sum += value * compute_bias(area, BlockSize::BLOCK_8X8); } } sum } /// Sum of Squared Error for a wxh block /// Currently limited to w and h of valid blocks pub fn sse_wxh DistortionScale>( src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, w: usize, h: usize, compute_bias: F, bit_depth: usize, cpu: CpuFeatureLevel, ) -> Distortion { // See get_weighted_sse in src/dist.rs. // Provide a scale to get_weighted_sse for each square region of this size. const CHUNK_SIZE: usize = IMPORTANCE_BLOCK_SIZE >> 1; // To bias the distortion correctly, compute it in blocks up to the size // importance block size in a non-subsampled plane. let imp_block_w = CHUNK_SIZE << src1.plane_cfg.xdec; let imp_block_h = CHUNK_SIZE << src1.plane_cfg.ydec; let imp_bsize = BlockSize::from_width_and_height(imp_block_w, imp_block_h); let n_imp_blocks_w = (w + CHUNK_SIZE - 1) / CHUNK_SIZE; let n_imp_blocks_h = (h + CHUNK_SIZE - 1) / CHUNK_SIZE; // TODO: Copying biases into a buffer is slow. It would be best if biases were // passed directly. To do this, we would need different versions of the // weighted sse function for decimated/subsampled data. Also requires // eliminating use of unbiased sse. // It should also be noted that the current copy code does not auto-vectorize. // Copy biases into a buffer. 
let mut buf_storage = Aligned::new( [MaybeUninit::::uninit(); 128 / CHUNK_SIZE * 128 / CHUNK_SIZE], ); let buf_stride = n_imp_blocks_w.next_power_of_two(); let buf = init_slice_repeat_mut( &mut buf_storage.data[..buf_stride * n_imp_blocks_h], 0, ); for block_y in 0..n_imp_blocks_h { for block_x in 0..n_imp_blocks_w { let block = Area::StartingAt { x: (block_x * CHUNK_SIZE) as isize, y: (block_y * CHUNK_SIZE) as isize, }; buf[block_y * buf_stride + block_x] = compute_bias(block, imp_bsize).0; } } Distortion(get_weighted_sse( src1, src2, buf, buf_stride, w, h, bit_depth, cpu, )) } pub const fn clip_visible_bsize( frame_w: usize, frame_h: usize, bsize: BlockSize, x: usize, y: usize, ) -> (usize, usize) { let blk_w = bsize.width(); let blk_h = bsize.height(); let visible_w: usize = if x + blk_w <= frame_w { blk_w } else if x >= frame_w { 0 } else { frame_w - x }; let visible_h: usize = if y + blk_h <= frame_h { blk_h } else if y >= frame_h { 0 } else { frame_h - y }; (visible_w, visible_h) } // Compute the pixel-domain distortion for an encode fn compute_distortion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, luma_only: bool, ) -> ScaledDistortion { let area = Area::BlockStartingAt { bo: tile_bo.0 }; let input_region = ts.input_tile.planes[0].subregion(area); let rec_region = ts.rec.planes[0].subregion(area); // clip a block to have visible pixles only let frame_bo = ts.to_frame_block_offset(tile_bo); let (visible_w, visible_h) = clip_visible_bsize( fi.width, fi.height, bsize, frame_bo.0.x << MI_SIZE_LOG2, frame_bo.0.y << MI_SIZE_LOG2, ); if visible_w == 0 || visible_h == 0 { return ScaledDistortion::zero(); } let mut distortion = match fi.config.tune { Tune::Psychovisual => cdef_dist_wxh( &input_region, &rec_region, visible_w, visible_h, fi.sequence.bit_depth, |bias_area, bsize| { distortion_scale( fi, input_region.subregion(bias_area).frame_block_offset(), bsize, ) }, fi.cpu_feature_level, ), Tune::Psnr => sse_wxh( &input_region, &rec_region, visible_w, visible_h, |bias_area, bsize| { distortion_scale( fi, input_region.subregion(bias_area).frame_block_offset(), bsize, ) }, fi.sequence.bit_depth, fi.cpu_feature_level, ), } * fi.dist_scale[0]; if is_chroma_block && !luma_only && fi.sequence.chroma_sampling != ChromaSampling::Cs400 { let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; let chroma_w = if bsize.width() >= 8 || xdec == 0 { (visible_w + xdec) >> xdec } else { (4 + visible_w + xdec) >> xdec }; let chroma_h = if bsize.height() >= 8 || ydec == 0 { (visible_h + ydec) >> ydec } else { (4 + visible_h + ydec) >> ydec }; for p in 1..3 { let input_region = ts.input_tile.planes[p].subregion(area); let rec_region = ts.rec.planes[p].subregion(area); distortion += sse_wxh( &input_region, &rec_region, chroma_w, chroma_h, |bias_area, bsize| { distortion_scale( fi, input_region.subregion(bias_area).frame_block_offset(), bsize, ) }, fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[p]; } } distortion } // Compute the transform-domain distortion for an encode fn compute_tx_distortion( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, bsize: BlockSize, is_chroma_block: bool, tile_bo: TileBlockOffset, tx_dist: ScaledDistortion, skip: bool, luma_only: bool, ) -> ScaledDistortion { assert!(fi.config.tune == Tune::Psnr); let area = Area::BlockStartingAt { bo: tile_bo.0 }; let input_region = ts.input_tile.planes[0].subregion(area); let rec_region = ts.rec.planes[0].subregion(area); let (visible_w, visible_h) = if !skip { (bsize.width(), bsize.height()) } else { let frame_bo = ts.to_frame_block_offset(tile_bo); clip_visible_bsize( fi.width, fi.height, bsize, frame_bo.0.x << MI_SIZE_LOG2, frame_bo.0.y << MI_SIZE_LOG2, ) }; if visible_w == 0 || visible_h == 0 { return ScaledDistortion::zero(); } let mut distortion = if skip { sse_wxh( &input_region, &rec_region, visible_w, visible_h, |bias_area, bsize| { distortion_scale( fi, input_region.subregion(bias_area).frame_block_offset(), bsize, ) }, fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[0] } else { tx_dist }; if is_chroma_block && !luma_only && skip && fi.sequence.chroma_sampling != ChromaSampling::Cs400 { let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; let chroma_w = if bsize.width() >= 8 || xdec == 0 { (visible_w + xdec) >> xdec } else { (4 + visible_w + xdec) >> xdec }; let chroma_h = if bsize.height() >= 8 || ydec == 0 { (visible_h + ydec) >> ydec } else { (4 + visible_h + ydec) >> ydec }; for p in 1..3 { let input_region = ts.input_tile.planes[p].subregion(area); let rec_region = ts.rec.planes[p].subregion(area); distortion += sse_wxh( &input_region, &rec_region, chroma_w, chroma_h, |bias_area, bsize| { distortion_scale( fi, input_region.subregion(bias_area).frame_block_offset(), bsize, ) }, fi.sequence.bit_depth, fi.cpu_feature_level, ) * fi.dist_scale[p]; } } distortion } /// Compute a scaling factor to multiply the distortion of a block by, /// this factor is determined using temporal RDO. /// /// # Panics /// /// - If called with `bsize` of 8x8 or smaller /// - If the coded frame data doesn't exist on the `FrameInvariants` pub fn distortion_scale( fi: &FrameInvariants, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale { if !fi.config.temporal_rdo() { return DistortionScale::default(); } // EncoderConfig::temporal_rdo() should always return false in situations // where distortion is computed on > 8x8 blocks, so we should never hit this // assert. 
assert!(bsize <= BlockSize::BLOCK_8X8); let x = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; let y = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; let coded_data = fi.coded_frame_data.as_ref().unwrap(); coded_data.distortion_scales[y * coded_data.w_in_imp_b + x] } /// # Panics /// /// - If the coded frame data doesn't exist on the `FrameInvariants` pub fn spatiotemporal_scale( fi: &FrameInvariants, frame_bo: PlaneBlockOffset, bsize: BlockSize, ) -> DistortionScale { if !fi.config.temporal_rdo() && fi.config.tune != Tune::Psychovisual { return DistortionScale::default(); } let coded_data = fi.coded_frame_data.as_ref().unwrap(); let x0 = frame_bo.0.x >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; let y0 = frame_bo.0.y >> IMPORTANCE_BLOCK_TO_BLOCK_SHIFT; let x1 = (x0 + bsize.width_imp_b()).min(coded_data.w_in_imp_b); let y1 = (y0 + bsize.height_imp_b()).min(coded_data.h_in_imp_b); let den = (((x1 - x0) * (y1 - y0)) as u64) << DistortionScale::SHIFT; // calling this on each slice individually improves autovectorization // compared to using `Iterator::take` #[inline(always)] fn take_slice(slice: &[T], n: usize) -> &[T] { slice.get(..n).unwrap_or(slice) } let mut sum = 0; for y in y0..y1 { sum += take_slice( &coded_data.distortion_scales[y * coded_data.w_in_imp_b..][x0..x1], MAX_SB_IN_IMP_B, ) .iter() .zip( take_slice( &coded_data.activity_scales[y * coded_data.w_in_imp_b..][x0..x1], MAX_SB_IN_IMP_B, ) .iter(), ) .map(|(d, a)| d.0 as u64 * a.0 as u64) .sum::(); } DistortionScale(((sum + (den >> 1)) / den) as u32) } pub fn distortion_scale_for( propagate_cost: f64, intra_cost: f64, ) -> DistortionScale { // The mbtree paper \cite{mbtree} uses the following formula: // // QP_delta = -strength * log2(1 + (propagate_cost / intra_cost)) // // Since this is H.264, this corresponds to the following quantizer: // // Q' = Q * 2^(QP_delta/6) // // Since lambda is proportial to Q^2, this means we want to minimize: // // D + lambda' * R // = D + 2^(QP_delta / 3) * lambda * R // // If we want to keep lambda fixed, we can instead scale distortion and // minimize: // // D * scale + lambda * R // // where: // // scale = 2^(QP_delta / -3) // = (1 + (propagate_cost / intra_cost))^(strength / 3) // // The original paper empirically chooses strength = 2.0, but strength = 1.0 // seems to work best in rav1e currently, this may have something to do with // the fact that they use 16x16 blocks whereas our "importance blocks" are // 8x8, but everything should be scale invariant here so that's weird. // // @article{mbtree, // title={A novel macroblock-tree algorithm for high-performance // optimization of dependent video coding in H.264/AVC}, // author={Garrett-Glaser, Jason}, // journal={Tech. Rep.}, // year={2009}, // url={https://pdfs.semanticscholar.org/032f/1ab7d9db385780a02eb2d579af8303b266d2.pdf} // } if intra_cost == 0. { return DistortionScale::default(); // no scaling } let strength = 1.0; // empirical, see comment above let frac = (intra_cost + propagate_cost) / intra_cost; frac.powf(strength / 3.0).into() } /// Fixed point arithmetic version of distortion scale #[repr(transparent)] #[derive(Copy, Clone)] pub struct DistortionScale(pub u32); #[repr(transparent)] pub struct RawDistortion(u64); #[repr(transparent)] pub struct Distortion(pub u64); #[repr(transparent)] pub struct ScaledDistortion(u64); impl DistortionScale { /// Bits past the radix point const SHIFT: u32 = 14; /// Number of bits used. Determines the max value. /// 28 bits is quite excessive. 
const BITS: u32 = 28; /// Maximum internal value const MAX: u64 = (1 << Self::BITS) - 1; #[inline] pub const fn new(num: u64, den: u64) -> Self { let raw = (num << Self::SHIFT).saturating_add(den / 2) / den; let mask = (raw <= Self::MAX) as u64; Self((mask * raw + (1 - mask) * Self::MAX) as u32) } pub fn inv_mean(slice: &[Self]) -> Self { use crate::util::{bexp64, blog32_q11}; let sum = slice.iter().map(|&s| blog32_q11(s.0) as i64).sum::(); let log_inv_mean_q11 = (Self::SHIFT << 11) as i64 - sum / slice.len() as i64; Self( bexp64((log_inv_mean_q11 + (Self::SHIFT << 11) as i64) << (57 - 11)) .clamp(1, (1 << Self::BITS) - 1) as u32, ) } /// Binary logarithm in Q11 #[inline] pub const fn blog16(self) -> i16 { use crate::util::blog32_q11; (blog32_q11(self.0) - ((Self::SHIFT as i32) << 11)) as i16 } /// Binary logarithm in Q57 #[inline] pub const fn blog64(self) -> i64 { use crate::util::{blog64, q57}; blog64(self.0 as i64) - q57(Self::SHIFT as i32) } /// Multiply, round and shift /// Internal implementation, so don't use multiply trait. #[inline] pub const fn mul_u64(self, dist: u64) -> u64 { (self.0 as u64 * dist + (1 << Self::SHIFT >> 1)) >> Self::SHIFT } } impl std::ops::Mul for DistortionScale { type Output = Self; /// Multiply, round and shift #[inline] fn mul(self, rhs: Self) -> Self { Self( (((self.0 as u64 * rhs.0 as u64) + (1 << (Self::SHIFT - 1))) >> Self::SHIFT) .clamp(1, (1 << Self::BITS) - 1) as u32, ) } } impl std::ops::MulAssign for DistortionScale { fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; } } // Default value for DistortionScale is a fixed point 1 impl Default for DistortionScale { #[inline] fn default() -> Self { Self(1 << Self::SHIFT) } } impl fmt::Debug for DistortionScale { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", f64::from(*self)) } } impl From for DistortionScale { #[inline] fn from(scale: f64) -> Self { let den = 1 << (Self::SHIFT + 1); Self::new((scale * den as f64) as u64, den) } } impl From for f64 { #[inline] fn from(scale: DistortionScale) -> Self { scale.0 as f64 / (1 << DistortionScale::SHIFT) as f64 } } impl RawDistortion { #[inline] pub const fn new(dist: u64) -> Self { Self(dist) } } impl std::ops::Mul for RawDistortion { type Output = Distortion; #[inline] fn mul(self, rhs: DistortionScale) -> Distortion { Distortion(rhs.mul_u64(self.0)) } } impl Distortion { #[inline] pub const fn zero() -> Self { Self(0) } } impl std::ops::Mul for Distortion { type Output = ScaledDistortion; #[inline] fn mul(self, rhs: DistortionScale) -> ScaledDistortion { ScaledDistortion(rhs.mul_u64(self.0)) } } impl std::ops::AddAssign for Distortion { #[inline] fn add_assign(&mut self, other: Self) { self.0 += other.0; } } impl ScaledDistortion { #[inline] pub const fn zero() -> Self { Self(0) } } impl std::ops::AddAssign for ScaledDistortion { #[inline] fn add_assign(&mut self, other: Self) { self.0 += other.0; } } pub fn compute_rd_cost( fi: &FrameInvariants, rate: u32, distortion: ScaledDistortion, ) -> f64 { let rate_in_bits = (rate as f64) / ((1 << OD_BITRES) as f64); fi.lambda.mul_add(rate_in_bits, distortion.0 as f64) } pub fn rdo_tx_size_type( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, luma_mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], skip: bool, ) -> (TxSize, TxType) { let is_inter = !luma_mode.is_intra(); let mut tx_size = max_txsize_rect_lookup[bsize as usize]; if fi.enable_inter_txfm_split && is_inter && !skip { tx_size 
= sub_tx_size_map[tx_size as usize]; // Always choose one level split size } let mut best_tx_type = TxType::DCT_DCT; let mut best_tx_size = tx_size; let mut best_rd = std::f64::MAX; let do_rdo_tx_size = fi.tx_mode_select && fi.config.speed_settings.transform.rdo_tx_decision && !is_inter; let rdo_tx_depth = if do_rdo_tx_size { 2 } else { 0 }; let mut cw_checkpoint: Option = None; for _ in 0..=rdo_tx_depth { let tx_set = get_tx_set(tx_size, is_inter, fi.use_reduced_tx_set); let do_rdo_tx_type = tx_set > TxSet::TX_SET_DCTONLY && fi.config.speed_settings.transform.rdo_tx_decision && !is_inter && !skip; if !do_rdo_tx_size && !do_rdo_tx_type { return (best_tx_size, best_tx_type); }; let tx_types = if do_rdo_tx_type { RAV1E_TX_TYPES } else { &[TxType::DCT_DCT] }; // Luma plane transform type decision let (tx_type, rd_cost) = rdo_tx_type_decision( fi, ts, cw, &mut cw_checkpoint, luma_mode, ref_frames, mvs, bsize, tile_bo, tx_size, tx_set, tx_types, best_rd, ); if rd_cost < best_rd { best_tx_size = tx_size; best_tx_type = tx_type; best_rd = rd_cost; } debug_assert!(tx_size.width_log2() <= bsize.width_log2()); debug_assert!(tx_size.height_log2() <= bsize.height_log2()); debug_assert!( tx_size.sqr() <= TxSize::TX_32X32 || tx_type == TxType::DCT_DCT ); let next_tx_size = sub_tx_size_map[tx_size as usize]; if next_tx_size == tx_size { break; } else { tx_size = next_tx_size; }; } (best_tx_size, best_tx_type) } #[inline] const fn dmv_in_range(mv: MotionVector, ref_mv: MotionVector) -> bool { let diff_row = mv.row as i32 - ref_mv.row as i32; let diff_col = mv.col as i32 - ref_mv.col as i32; diff_row >= MV_LOW && diff_row <= MV_UPP && diff_col >= MV_LOW && diff_col <= MV_UPP } #[inline] #[profiling::function] fn luma_chroma_mode_rdo( luma_mode: PredictionMode, fi: &FrameInvariants, bsize: BlockSize, tile_bo: TileBlockOffset, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, rdo_type: RDOType, cw_checkpoint: &ContextWriterCheckpoint, best: &mut PartitionParameters, mvs: [MotionVector; 2], ref_frames: [RefType; 2], mode_set_chroma: &[PredictionMode], luma_mode_is_intra: bool, mode_context: usize, mv_stack: &ArrayVec, angle_delta: AngleDelta, ) { let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; let is_chroma_block = has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); if !luma_mode_is_intra { let ref_mvs = if mv_stack.is_empty() { [MotionVector::default(); 2] } else { [mv_stack[0].this_mv, mv_stack[0].comp_mv] }; if (luma_mode == PredictionMode::NEWMV || luma_mode == PredictionMode::NEW_NEWMV || luma_mode == PredictionMode::NEW_NEARESTMV) && !dmv_in_range(mvs[0], ref_mvs[0]) { return; } if (luma_mode == PredictionMode::NEW_NEWMV || luma_mode == PredictionMode::NEAREST_NEWMV) && !dmv_in_range(mvs[1], ref_mvs[1]) { return; } } // Find the best chroma prediction mode for the current luma prediction mode let mut chroma_rdo = |skip: bool| -> bool { use crate::segmentation::select_segment; let mut zero_distortion = false; for sidx in select_segment(fi, ts, tile_bo, bsize, skip) { cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, sidx); let (tx_size, tx_type) = rdo_tx_size_type( fi, ts, cw, bsize, tile_bo, luma_mode, ref_frames, mvs, skip, ); for &chroma_mode in mode_set_chroma.iter() { let wr = &mut WriterCounter::new(); let tell = wr.tell_frac(); if bsize >= BlockSize::BLOCK_8X8 && bsize.is_sqr() { cw.write_partition( wr, tile_bo, PartitionType::PARTITION_NONE, bsize, ); } // TODO(yushin): luma and chroma would have different decision based on chroma format let need_recon_pixel = luma_mode_is_intra && tx_size.block_size() != bsize; encode_block_pre_cdef(&fi.sequence, ts, cw, wr, bsize, tile_bo, skip); let (has_coeff, tx_dist) = encode_block_post_cdef( fi, ts, cw, wr, luma_mode, chroma_mode, angle_delta, ref_frames, mvs, bsize, tile_bo, skip, CFLParams::default(), tx_size, tx_type, mode_context, mv_stack, rdo_type, need_recon_pixel, None, ); let rate = wr.tell_frac() - tell; let distortion = if fi.use_tx_domain_distortion && !need_recon_pixel { compute_tx_distortion( fi, ts, bsize, is_chroma_block, tile_bo, tx_dist, skip, false, ) } else { compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false) }; let is_zero_dist = distortion.0 == 0; let rd = compute_rd_cost(fi, rate, distortion); if rd < best.rd_cost { //if rd < best.rd_cost || luma_mode == PredictionMode::NEW_NEWMV { best.rd_cost = rd; best.pred_mode_luma = luma_mode; best.pred_mode_chroma = chroma_mode; best.angle_delta = angle_delta; best.ref_frames = ref_frames; best.mvs = mvs; best.skip = skip; best.has_coeff = has_coeff; best.tx_size = tx_size; best.tx_type = tx_type; best.sidx = sidx; zero_distortion = is_zero_dist; } cw.rollback(cw_checkpoint); } } zero_distortion }; // Don't skip when using intra modes let zero_distortion = if !luma_mode_is_intra { chroma_rdo(true) } else { false }; // early skip if !zero_distortion { chroma_rdo(false); } } /// RDO-based mode decision /// /// # Panics /// /// - If the best RD found is negative. /// This should never happen and indicates a development error. #[profiling::function] pub fn rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, ) -> PartitionParameters { let PlaneConfig { xdec, ydec, .. 
} = ts.input.planes[1].cfg; let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let rdo_type = if fi.use_tx_domain_rate { RDOType::TxDistEstRate } else if fi.use_tx_domain_distortion { RDOType::TxDistRealRate } else { RDOType::PixelDistRealRate }; let mut best = if fi.frame_type.has_inter() { assert!(fi.frame_type != FrameType::KEY); inter_frame_rdo_mode_decision( fi, ts, cw, bsize, tile_bo, inter_cfg, &cw_checkpoint, rdo_type, ) } else { PartitionParameters::default() }; let is_chroma_block = has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); if !best.skip { best = intra_frame_rdo_mode_decision( fi, ts, cw, bsize, tile_bo, &cw_checkpoint, rdo_type, best, is_chroma_block, ); } if best.pred_mode_luma.is_intra() && is_chroma_block && bsize.cfl_allowed() { cw.bc.blocks.set_segmentation_idx(tile_bo, bsize, best.sidx); let chroma_mode = PredictionMode::UV_CFL_PRED; let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let mut wr = WriterCounter::new(); let angle_delta = AngleDelta { y: best.angle_delta.y, uv: 0 }; write_tx_blocks( fi, ts, cw, &mut wr, best.pred_mode_luma, best.pred_mode_luma, angle_delta, tile_bo, bsize, best.tx_size, best.tx_type, false, CFLParams::default(), true, rdo_type, true, ); cw.rollback(&cw_checkpoint); if fi.sequence.chroma_sampling != ChromaSampling::Cs400 { if let Some(cfl) = rdo_cfl_alpha(ts, tile_bo, bsize, best.tx_size, fi) { let mut wr = WriterCounter::new(); let tell = wr.tell_frac(); encode_block_pre_cdef( &fi.sequence, ts, cw, &mut wr, bsize, tile_bo, best.skip, ); let (has_coeff, _) = encode_block_post_cdef( fi, ts, cw, &mut wr, best.pred_mode_luma, chroma_mode, angle_delta, best.ref_frames, best.mvs, bsize, tile_bo, best.skip, cfl, best.tx_size, best.tx_type, 0, &[], rdo_type, true, // For CFL, luma should be always reconstructed. None, ); let rate = wr.tell_frac() - tell; // For CFL, tx-domain distortion is not an option. 
let distortion = compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, false); let rd = compute_rd_cost(fi, rate, distortion); if rd < best.rd_cost { best.rd_cost = rd; best.pred_mode_chroma = chroma_mode; best.angle_delta = angle_delta; best.has_coeff = has_coeff; best.pred_cfl_params = cfl; } cw.rollback(&cw_checkpoint); } } } cw.bc.blocks.set_mode(tile_bo, bsize, best.pred_mode_luma); cw.bc.blocks.set_ref_frames(tile_bo, bsize, best.ref_frames); cw.bc.blocks.set_motion_vectors(tile_bo, bsize, best.mvs); assert!(best.rd_cost >= 0_f64); PartitionParameters { bo: tile_bo, bsize, pred_mode_luma: best.pred_mode_luma, pred_mode_chroma: best.pred_mode_chroma, pred_cfl_params: best.pred_cfl_params, angle_delta: best.angle_delta, ref_frames: best.ref_frames, mvs: best.mvs, rd_cost: best.rd_cost, skip: best.skip, has_coeff: best.has_coeff, tx_size: best.tx_size, tx_type: best.tx_type, sidx: best.sidx, } } #[profiling::function] fn inter_frame_rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, ) -> PartitionParameters { let mut best = PartitionParameters::default(); // we can never have more than 7 reference frame sets let mut ref_frames_set = ArrayVec::<_, 7>::new(); // again, max of 7 ref slots let mut ref_slot_set = ArrayVec::<_, 7>::new(); // our implementation never returns more than 3 at the moment let mut mvs_from_me = ArrayVec::<_, 3>::new(); let mut fwdref = None; let mut bwdref = None; for i in inter_cfg.allowed_ref_frames().iter().copied() { // Don't search LAST3 since it's used only for probs if i == LAST3_FRAME { continue; } if !ref_slot_set.contains(&fi.ref_frames[i.to_index()]) { if fwdref.is_none() && i.is_fwd_ref() { fwdref = Some(ref_frames_set.len()); } if bwdref.is_none() && i.is_bwd_ref() { bwdref = Some(ref_frames_set.len()); } ref_frames_set.push([i, NONE_FRAME]); let slot_idx = fi.ref_frames[i.to_index()]; ref_slot_set.push(slot_idx); } } assert!(!ref_frames_set.is_empty()); let mut inter_mode_set = ArrayVec::<(PredictionMode, usize), 20>::new(); let mut mvs_set = ArrayVec::<[MotionVector; 2], 20>::new(); let mut satds = ArrayVec::::new(); let mut mv_stacks = ArrayVec::<_, 20>::new(); let mut mode_contexts = ArrayVec::<_, 7>::new(); for (i, &ref_frames) in ref_frames_set.iter().enumerate() { let mut mv_stack = ArrayVec::::new(); mode_contexts.push(cw.find_mvrefs( tile_bo, ref_frames, &mut mv_stack, bsize, fi, false, )); let mut pmv = [MotionVector::default(); 2]; if !mv_stack.is_empty() { pmv[0] = mv_stack[0].this_mv; } if mv_stack.len() > 1 { pmv[1] = mv_stack[1].this_mv; } let res = estimate_motion( fi, ts, bsize.width(), bsize.height(), tile_bo, ref_frames[0], Some(pmv), MVSamplingMode::CORNER { right: true, bottom: true }, false, 0, None, ) .unwrap_or_else(MotionSearchResult::empty); let b_me = res.mv; mvs_from_me.push([b_me, MotionVector::default()]); for &x in RAV1E_INTER_MODES_MINIMAL { inter_mode_set.push((x, i)); } if !mv_stack.is_empty() { inter_mode_set.push((PredictionMode::NEAR0MV, i)); } if mv_stack.len() >= 2 { inter_mode_set.push((PredictionMode::GLOBALMV, i)); } let include_near_mvs = fi.config.speed_settings.motion.include_near_mvs; if include_near_mvs { if mv_stack.len() >= 3 { inter_mode_set.push((PredictionMode::NEAR1MV, i)); } if mv_stack.len() >= 4 { inter_mode_set.push((PredictionMode::NEAR2MV, i)); } } let same_row_col = |x: &CandidateMV| { x.this_mv.row == 
mvs_from_me[i][0].row && x.this_mv.col == mvs_from_me[i][0].col }; if !mv_stack .iter() .take(if include_near_mvs { 4 } else { 2 }) .any(same_row_col) && (mvs_from_me[i][0].row != 0 || mvs_from_me[i][0].col != 0) { inter_mode_set.push((PredictionMode::NEWMV, i)); } mv_stacks.push(mv_stack); } let sz = bsize.width_mi().min(bsize.height_mi()); // To use non single reference modes, block width and height must be greater than 4. if fi.reference_mode != ReferenceMode::SINGLE && sz >= 2 { // Adding compound candidate if let Some(r0) = fwdref { if let Some(r1) = bwdref { let ref_frames = [ref_frames_set[r0][0], ref_frames_set[r1][0]]; ref_frames_set.push(ref_frames); let mv0 = mvs_from_me[r0][0]; let mv1 = mvs_from_me[r1][0]; mvs_from_me.push([mv0, mv1]); let mut mv_stack = ArrayVec::::new(); mode_contexts.push(cw.find_mvrefs( tile_bo, ref_frames, &mut mv_stack, bsize, fi, true, )); for &x in RAV1E_INTER_COMPOUND_MODES { // exclude any NEAR mode based on speed setting if fi.config.speed_settings.motion.include_near_mvs || !x.has_nearmv() { let mv_stack_idx = ref_frames_set.len() - 1; // exclude NEAR modes if the mv_stack is too short if !(x.has_nearmv() && x.ref_mv_idx() >= mv_stack.len()) { inter_mode_set.push((x, mv_stack_idx)); } } } mv_stacks.push(mv_stack); } } } let num_modes_rdo = if fi.config.speed_settings.prediction.prediction_modes >= PredictionModesSetting::ComplexAll { inter_mode_set.len() } else { 9 // This number is determined by AWCY test }; inter_mode_set.iter().for_each(|&(luma_mode, i)| { let mvs = match luma_mode { PredictionMode::NEWMV | PredictionMode::NEW_NEWMV => mvs_from_me[i], PredictionMode::NEARESTMV | PredictionMode::NEAREST_NEARESTMV => { if !mv_stacks[i].is_empty() { [mv_stacks[i][0].this_mv, mv_stacks[i][0].comp_mv] } else { [MotionVector::default(); 2] } } PredictionMode::NEAR0MV | PredictionMode::NEAR_NEAR0MV => { if mv_stacks[i].len() > 1 { [mv_stacks[i][1].this_mv, mv_stacks[i][1].comp_mv] } else { [MotionVector::default(); 2] } } PredictionMode::NEAR1MV | PredictionMode::NEAR2MV | PredictionMode::NEAR_NEAR1MV | PredictionMode::NEAR_NEAR2MV => [ mv_stacks[i][luma_mode.ref_mv_idx()].this_mv, mv_stacks[i][luma_mode.ref_mv_idx()].comp_mv, ], PredictionMode::NEAREST_NEWMV => { [mv_stacks[i][0].this_mv, mvs_from_me[i][1]] } PredictionMode::NEW_NEARESTMV => { [mvs_from_me[i][0], mv_stacks[i][0].comp_mv] } PredictionMode::GLOBALMV | PredictionMode::GLOBAL_GLOBALMV => { [MotionVector::default(); 2] } _ => { unimplemented!(); } }; mvs_set.push(mvs); // Calculate SATD for each mode if num_modes_rdo != inter_mode_set.len() { let tile_rect = ts.tile_rect(); let rec = &mut ts.rec.planes[0]; let po = tile_bo.plane_offset(rec.plane_cfg); let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); luma_mode.predict_inter( fi, tile_rect, 0, po, &mut rec_region, bsize.width(), bsize.height(), ref_frames_set[i], mvs, &mut ts.inter_compound_buffers, ); let plane_org = ts.input_tile.planes[0] .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); let plane_ref = rec_region.as_const(); let satd = get_satd( &plane_org, &plane_ref, bsize.width(), bsize.height(), fi.sequence.bit_depth, fi.cpu_feature_level, ); satds.push(satd); } else { satds.push(0); } }); let mut sorted = izip!(inter_mode_set, mvs_set, satds).collect::>(); if num_modes_rdo != sorted.len() { sorted.sort_by_key(|((_mode, _i), _mvs, satd)| *satd); } sorted.iter().take(num_modes_rdo).for_each( |&((luma_mode, i), mvs, _satd)| { let mode_set_chroma = ArrayVec::from([luma_mode]); luma_chroma_mode_rdo( 
luma_mode, fi, bsize, tile_bo, ts, cw, rdo_type, cw_checkpoint, &mut best, mvs, ref_frames_set[i], &mode_set_chroma, false, mode_contexts[i], &mv_stacks[i], AngleDelta::default(), ); }, ); best } #[profiling::function] fn intra_frame_rdo_mode_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, cw_checkpoint: &ContextWriterCheckpoint, rdo_type: RDOType, mut best: PartitionParameters, is_chroma_block: bool, ) -> PartitionParameters { let mut modes = ArrayVec::<_, INTRA_MODES>::new(); // Reduce number of prediction modes at higher speed levels let num_modes_rdo = if (fi.frame_type == FrameType::KEY && fi.config.speed_settings.prediction.prediction_modes >= PredictionModesSetting::ComplexKeyframes) || (fi.frame_type.has_inter() && fi.config.speed_settings.prediction.prediction_modes >= PredictionModesSetting::ComplexAll) { 7 } else { 3 }; let intra_mode_set = RAV1E_INTRA_MODES; // Find mode with lowest rate cost { use crate::ec::cdf_to_pdf; let probs_all = cdf_to_pdf(if fi.frame_type.has_inter() { cw.get_cdf_intra_mode(bsize) } else { cw.get_cdf_intra_mode_kf(tile_bo) }); modes.try_extend_from_slice(intra_mode_set).unwrap(); modes.sort_by_key(|&a| !probs_all[a as usize]); } // If tx partition (i.e. fi.tx_mode_select) is enabled, the below intra prediction screening // may be improved by emulating prediction for each tx block. { let satds = { // FIXME: If tx partition is used, this whole sads block should be fixed let tx_size = bsize.tx_size(); let mut edge_buf = Aligned::uninit_array(); let edge_buf = { let rec = &ts.rec.planes[0].as_const(); let po = tile_bo.plane_offset(rec.plane_cfg); // FIXME: If tx partition is used, get_intra_edges() should be called for each tx block get_intra_edges( &mut edge_buf, rec, tile_bo, 0, 0, bsize, po, tx_size, fi.sequence.bit_depth, None, fi.sequence.enable_intra_edge_filter, IntraParam::None, ) }; let ief_params = if fi.sequence.enable_intra_edge_filter { let above_block_info = ts.above_block_info(tile_bo, 0, 0); let left_block_info = ts.left_block_info(tile_bo, 0, 0); Some(IntraEdgeFilterParameters::new( 0, above_block_info, left_block_info, )) } else { None }; let mut satds_all = [0; INTRA_MODES]; for &luma_mode in modes.iter().skip(num_modes_rdo / 2) { let tile_rect = ts.tile_rect(); let rec = &mut ts.rec.planes[0]; let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); // FIXME: If tx partition is used, luma_mode.predict_intra() should be called for each tx block luma_mode.predict_intra( tile_rect, &mut rec_region, tx_size, fi.sequence.bit_depth, &[0i16; 2], IntraParam::None, if luma_mode.is_directional() { ief_params } else { None }, &edge_buf, fi.cpu_feature_level, ); let plane_org = ts.input_tile.planes[0] .subregion(Area::BlockStartingAt { bo: tile_bo.0 }); let plane_ref = rec_region.as_const(); satds_all[luma_mode as usize] = get_satd( &plane_org, &plane_ref, tx_size.width(), tx_size.height(), fi.sequence.bit_depth, fi.cpu_feature_level, ); } satds_all }; modes[num_modes_rdo / 2..].sort_by_key(|&a| satds[a as usize]); } debug_assert!(num_modes_rdo >= 1); modes.iter().take(num_modes_rdo).for_each(|&luma_mode| { let mvs = [MotionVector::default(); 2]; let ref_frames = [INTRA_FRAME, NONE_FRAME]; let mut mode_set_chroma = ArrayVec::<_, 2>::new(); mode_set_chroma.push(luma_mode); if is_chroma_block && luma_mode != PredictionMode::DC_PRED { mode_set_chroma.push(PredictionMode::DC_PRED); } luma_chroma_mode_rdo( luma_mode, fi, bsize, tile_bo, ts, cw, 
rdo_type, cw_checkpoint, &mut best, mvs, ref_frames, &mode_set_chroma, true, 0, &ArrayVec::::new(), AngleDelta::default(), ); }); if fi.config.speed_settings.prediction.fine_directional_intra && bsize >= BlockSize::BLOCK_8X8 { // Find the best angle delta for the current best prediction mode let luma_deltas = best.pred_mode_luma.angle_delta_count(); let chroma_deltas = best.pred_mode_chroma.angle_delta_count(); let mvs = [MotionVector::default(); 2]; let ref_frames = [INTRA_FRAME, NONE_FRAME]; let mode_set_chroma = [best.pred_mode_chroma]; let mv_stack = ArrayVec::<_, 9>::new(); let mut best_angle_delta = best.angle_delta; let mut angle_delta_rdo = |y, uv| -> AngleDelta { if best.angle_delta.y != y || best.angle_delta.uv != uv { luma_chroma_mode_rdo( best.pred_mode_luma, fi, bsize, tile_bo, ts, cw, rdo_type, cw_checkpoint, &mut best, mvs, ref_frames, &mode_set_chroma, true, 0, &mv_stack, AngleDelta { y, uv }, ); } best.angle_delta }; for i in 0..luma_deltas { let angle_delta_y = if luma_deltas == 1 { 0 } else { i - MAX_ANGLE_DELTA as i8 }; best_angle_delta = angle_delta_rdo(angle_delta_y, best_angle_delta.uv); } for j in 0..chroma_deltas { let angle_delta_uv = if chroma_deltas == 1 { 0 } else { j - MAX_ANGLE_DELTA as i8 }; best_angle_delta = angle_delta_rdo(best_angle_delta.y, angle_delta_uv); } } best } /// # Panics /// /// - If the block size is invalid for subsampling. #[profiling::function] pub fn rdo_cfl_alpha( ts: &mut TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, luma_tx_size: TxSize, fi: &FrameInvariants, ) -> Option { let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; let uv_tx_size = bsize.largest_chroma_tx_size(xdec, ydec); debug_assert!( bsize.subsampled_size(xdec, ydec).unwrap() == uv_tx_size.block_size() ); let frame_bo = ts.to_frame_block_offset(tile_bo); let (visible_tx_w, visible_tx_h) = clip_visible_bsize( (fi.width + xdec) >> xdec, (fi.height + ydec) >> ydec, uv_tx_size.block_size(), (frame_bo.0.x << MI_SIZE_LOG2) >> xdec, (frame_bo.0.y << MI_SIZE_LOG2) >> ydec, ); if visible_tx_w == 0 || visible_tx_h == 0 { return None; }; let mut ac = Aligned::<[MaybeUninit; 32 * 32]>::uninit_array(); let ac = luma_ac(&mut ac.data, ts, tile_bo, bsize, luma_tx_size, fi); let best_alpha: ArrayVec = (1..3) .map(|p| { let &PlaneConfig { xdec, ydec, .. } = ts.rec.planes[p].plane_cfg; let tile_rect = ts.tile_rect().decimated(xdec, ydec); let rec = &mut ts.rec.planes[p]; let input = &ts.input_tile.planes[p]; let po = tile_bo.plane_offset(rec.plane_cfg); let mut edge_buf = Aligned::uninit_array(); let edge_buf = get_intra_edges( &mut edge_buf, &rec.as_const(), tile_bo, 0, 0, bsize, po, uv_tx_size, fi.sequence.bit_depth, Some(PredictionMode::UV_CFL_PRED), fi.sequence.enable_intra_edge_filter, IntraParam::None, ); let mut alpha_cost = |alpha: i16| -> u64 { let mut rec_region = rec.subregion_mut(Area::BlockStartingAt { bo: tile_bo.0 }); PredictionMode::UV_CFL_PRED.predict_intra( tile_rect, &mut rec_region, uv_tx_size, fi.sequence.bit_depth, ac, IntraParam::Alpha(alpha), None, &edge_buf, fi.cpu_feature_level, ); sse_wxh( &input.subregion(Area::BlockStartingAt { bo: tile_bo.0 }), &rec_region.as_const(), visible_tx_w, visible_tx_h, |_, _| DistortionScale::default(), // We're not doing RDO here. 
fi.sequence.bit_depth, fi.cpu_feature_level, ) .0 }; let mut best = (alpha_cost(0), 0); let mut count = 2; for alpha in 1i16..=16i16 { let cost = (alpha_cost(alpha), alpha_cost(-alpha)); if cost.0 < best.0 { best = (cost.0, alpha); count += 2; } if cost.1 < best.0 { best = (cost.1, -alpha); count += 2; } if count < alpha { break; } } best.1 }) .collect(); if best_alpha[0] == 0 && best_alpha[1] == 0 { None } else { Some(CFLParams::from_alpha(best_alpha[0], best_alpha[1])) } } /// RDO-based transform type decision /// If `cw_checkpoint` is `None`, a checkpoint for cw's (`ContextWriter`) current /// state is created and stored for later use. /// /// # Panics /// /// - If a writer checkpoint is never created before or within the function. /// This should never happen and indicates a development error. /// - If the best RD found is negative. /// This should never happen and indicates a development error. pub fn rdo_tx_type_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, cw_checkpoint: &mut Option, mode: PredictionMode, ref_frames: [RefType; 2], mvs: [MotionVector; 2], bsize: BlockSize, tile_bo: TileBlockOffset, tx_size: TxSize, tx_set: TxSet, tx_types: &[TxType], cur_best_rd: f64, ) -> (TxType, f64) { let mut best_type = TxType::DCT_DCT; let mut best_rd = std::f64::MAX; let PlaneConfig { xdec, ydec, .. } = ts.input.planes[1].cfg; let is_chroma_block = has_chroma(tile_bo, bsize, xdec, ydec, fi.sequence.chroma_sampling); let is_inter = !mode.is_intra(); if cw_checkpoint.is_none() { // Only run the first call // Prevents creating multiple checkpoints for own version of cw *cw_checkpoint = Some(cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling)); } let rdo_type = if fi.use_tx_domain_distortion { RDOType::TxDistRealRate } else { RDOType::PixelDistRealRate }; let need_recon_pixel = tx_size.block_size() != bsize && !is_inter; let mut first_iteration = true; for &tx_type in tx_types { // Skip unsupported transform types if av1_tx_used[tx_set as usize][tx_type as usize] == 0 { continue; } if is_inter { motion_compensate( fi, ts, cw, mode, ref_frames, mvs, bsize, tile_bo, true, ); } let mut wr = WriterCounter::new(); let tell = wr.tell_frac(); let (_, tx_dist) = if is_inter { write_tx_tree( fi, ts, cw, &mut wr, mode, 0, tile_bo, bsize, tx_size, tx_type, false, true, rdo_type, need_recon_pixel, ) } else { write_tx_blocks( fi, ts, cw, &mut wr, mode, mode, AngleDelta::default(), tile_bo, bsize, tx_size, tx_type, false, CFLParams::default(), // Unused. true, rdo_type, need_recon_pixel, ) }; let rate = wr.tell_frac() - tell; let distortion = if fi.use_tx_domain_distortion { compute_tx_distortion( fi, ts, bsize, is_chroma_block, tile_bo, tx_dist, false, true, ) } else { compute_distortion(fi, ts, bsize, is_chroma_block, tile_bo, true) }; cw.rollback(cw_checkpoint.as_ref().unwrap()); let rd = compute_rd_cost(fi, rate, distortion); if first_iteration { // We use an optimization to early exit after testing the first // transform type if the cost is higher than the existing best. // The idea is that if this transform size is not better than he // previous size, it is not worth testing remaining modes for this size. 
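// For example, with cur_best_rd at 1500.0 from the competing transform size,
// a first candidate scoring 1720.0 ends this loop immediately and the
// remaining entries in tx_types are never coded or measured.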
if rd > cur_best_rd { break; } first_iteration = false; } if rd < best_rd { best_rd = rd; best_type = tx_type; } } assert!(best_rd >= 0_f64); (best_type, best_rd) } pub fn get_sub_partitions( four_partitions: &[TileBlockOffset; 4], partition: PartitionType, ) -> ArrayVec { let mut partition_offsets = ArrayVec::::new(); partition_offsets.push(four_partitions[0]); if partition == PARTITION_NONE { return partition_offsets; } if partition == PARTITION_VERT || partition == PARTITION_SPLIT { partition_offsets.push(four_partitions[1]); }; if partition == PARTITION_HORZ || partition == PARTITION_SPLIT { partition_offsets.push(four_partitions[2]); }; if partition == PARTITION_SPLIT { partition_offsets.push(four_partitions[3]); }; partition_offsets } #[inline(always)] fn rdo_partition_none( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, child_modes: &mut ArrayVec, ) -> f64 { debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); let mode = rdo_mode_decision(fi, ts, cw, bsize, tile_bo, inter_cfg); let cost = mode.rd_cost; child_modes.push(mode); cost } // VERTICAL, HORIZONTAL or simple SPLIT #[inline(always)] fn rdo_partition_simple( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, inter_cfg: &InterConfig, partition: PartitionType, rdo_type: RDOType, best_rd: f64, child_modes: &mut ArrayVec, ) -> Option { debug_assert!(tile_bo.0.x < ts.mi_width && tile_bo.0.y < ts.mi_height); let subsize = bsize.subsize(partition).unwrap(); let cost = if bsize >= BlockSize::BLOCK_8X8 { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; let tell = w.tell_frac(); cw.write_partition(w, tile_bo, partition, bsize); compute_rd_cost(fi, w.tell_frac() - tell, ScaledDistortion::zero()) } else { 0.0 }; let hbsw = subsize.width_mi(); // Half the block size width in blocks let hbsh = subsize.height_mi(); // Half the block size height in blocks let four_partitions = [ tile_bo, TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y }), TileBlockOffset(BlockOffset { x: tile_bo.0.x, y: tile_bo.0.y + hbsh }), TileBlockOffset(BlockOffset { x: tile_bo.0.x + hbsw, y: tile_bo.0.y + hbsh, }), ]; let partitions = get_sub_partitions(&four_partitions, partition); let mut rd_cost_sum = 0.0; for offset in partitions { let hbs = subsize.width_mi() >> 1; let has_cols = offset.0.x + hbs < ts.mi_width; let has_rows = offset.0.y + hbs < ts.mi_height; if has_cols && has_rows { let mode_decision = rdo_mode_decision(fi, ts, cw, subsize, offset, inter_cfg); rd_cost_sum += mode_decision.rd_cost; if fi.enable_early_exit && rd_cost_sum > best_rd { return None; } if subsize >= BlockSize::BLOCK_8X8 && subsize.is_sqr() { let w: &mut W = if cw.bc.cdef_coded { w_post_cdef } else { w_pre_cdef }; cw.write_partition(w, offset, PartitionType::PARTITION_NONE, subsize); } encode_block_with_modes( fi, ts, cw, w_pre_cdef, w_post_cdef, subsize, offset, &mode_decision, rdo_type, None, ); child_modes.push(mode_decision); } else { //rd_cost_sum += std::f64::MAX; return None; } } Some(cost + rd_cost_sum) } /// RDO-based single level partitioning decision /// /// # Panics /// /// - If the best RD found is negative. /// This should never happen, and indicates a development error. 
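// Each candidate PartitionType below is encoded speculatively: its children
// are mode-decided via rdo_mode_decision(), the context writer and the
// pre/post-CDEF writer counters are rolled back before the next candidate,
// and the partition with the lowest total RD cost (including the cached
// result passed in) wins.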
#[profiling::function] pub fn rdo_partition_decision( fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w_pre_cdef: &mut W, w_post_cdef: &mut W, bsize: BlockSize, tile_bo: TileBlockOffset, cached_block: &PartitionGroupParameters, partition_types: &[PartitionType], rdo_type: RDOType, inter_cfg: &InterConfig, ) -> PartitionGroupParameters { let mut best_partition = cached_block.part_type; let mut best_rd = cached_block.rd_cost; let mut best_pred_modes = cached_block.part_modes.clone(); let cw_checkpoint = cw.checkpoint(&tile_bo, fi.sequence.chroma_sampling); let w_pre_checkpoint = w_pre_cdef.checkpoint(); let w_post_checkpoint = w_post_cdef.checkpoint(); for &partition in partition_types { // Do not re-encode results we already have if partition == cached_block.part_type { continue; } let mut child_modes = ArrayVec::<_, 4>::new(); let cost = match partition { PARTITION_NONE if bsize <= BlockSize::BLOCK_64X64 => { Some(rdo_partition_none( fi, ts, cw, bsize, tile_bo, inter_cfg, &mut child_modes, )) } PARTITION_SPLIT | PARTITION_HORZ | PARTITION_VERT => { rdo_partition_simple( fi, ts, cw, w_pre_cdef, w_post_cdef, bsize, tile_bo, inter_cfg, partition, rdo_type, best_rd, &mut child_modes, ) } _ => { unreachable!(); } }; if let Some(rd) = cost { if rd < best_rd { best_rd = rd; best_partition = partition; best_pred_modes = child_modes.clone(); } } cw.rollback(&cw_checkpoint); w_pre_cdef.rollback(&w_pre_checkpoint); w_post_cdef.rollback(&w_post_checkpoint); } assert!(best_rd >= 0_f64); PartitionGroupParameters { rd_cost: best_rd, part_type: best_partition, part_modes: best_pred_modes, } } #[profiling::function] fn rdo_loop_plane_error( base_sbo: TileSuperBlockOffset, offset_sbo: TileSuperBlockOffset, sb_w: usize, sb_h: usize, fi: &FrameInvariants, ts: &TileStateMut<'_, T>, blocks: &TileBlocks<'_>, test: &Frame, src: &Tile<'_, T>, pli: usize, ) -> ScaledDistortion { let sb_w_blocks = if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_w; let sb_h_blocks = if fi.sequence.use_128x128_superblock { 16 } else { 8 } * sb_h; // Each direction block is 8x8 in y, potentially smaller if subsampled in chroma // accumulating in-frame and unpadded let mut err = Distortion::zero(); for by in 0..sb_h_blocks { for bx in 0..sb_w_blocks { let loop_bo = offset_sbo.block_offset(bx << 1, by << 1); if loop_bo.0.x < blocks.cols() && loop_bo.0.y < blocks.rows() { let src_plane = &src.planes[pli]; let test_plane = &test.planes[pli]; let PlaneConfig { xdec, ydec, .. } = *src_plane.plane_cfg; debug_assert_eq!(xdec, test_plane.cfg.xdec); debug_assert_eq!(ydec, test_plane.cfg.ydec); // Unfortunately, our distortion biases are only available via // Frame-absolute addressing, so we need a block offset // relative to the full frame origin (not the tile or analysis // area) let frame_bo = (base_sbo + offset_sbo).block_offset(bx << 1, by << 1); let bias = distortion_scale( fi, ts.to_frame_block_offset(frame_bo), BlockSize::BLOCK_8X8, ); let src_region = src_plane.subregion(Area::BlockStartingAt { bo: loop_bo.0 }); let test_region = test_plane.region(Area::BlockStartingAt { bo: loop_bo.0 }); err += if pli == 0 { // For loop filters, We intentionally use cdef_dist even with // `--tune Psnr`. Using SSE instead gives no PSNR gain but has a // significant negative impact on other metrics and visual quality. 
RawDistortion(cdef_dist_kernel( &src_region, &test_region, 8, 8, fi.sequence.bit_depth, fi.cpu_feature_level, ) as u64) * bias } else { sse_wxh( &src_region, &test_region, 8 >> xdec, 8 >> ydec, |_, _| bias, fi.sequence.bit_depth, fi.cpu_feature_level, ) }; } } } err * fi.dist_scale[pli] } /// Passed in a superblock offset representing the upper left corner of /// the LRU area we're optimizing. This area covers the largest LRU in /// any of the present planes, but may consist of a number of /// superblocks and full, smaller LRUs in the other planes /// /// # Panics /// /// - If both CDEF and LRF are disabled. #[profiling::function] pub fn rdo_loop_decision( base_sbo: TileSuperBlockOffset, fi: &FrameInvariants, ts: &mut TileStateMut<'_, T>, cw: &mut ContextWriter, w: &mut W, deblock_p: bool, ) { let planes = if fi.sequence.chroma_sampling == ChromaSampling::Cs400 { 1 } else { MAX_PLANES }; assert!(fi.sequence.enable_cdef || fi.sequence.enable_restoration); // Determine area of optimization: Which plane has the largest LRUs? // How many LRUs for each? let mut sb_w = 1; // how many superblocks wide the largest LRU // is/how many SBs we're processing (same thing) let mut sb_h = 1; // how many superblocks wide the largest LRU // is/how many SBs we're processing (same thing) let mut lru_w = [0; MAX_PLANES]; // how many LRUs we're processing let mut lru_h = [0; MAX_PLANES]; // how many LRUs we're processing for pli in 0..planes { let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift; let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift; if sb_w < (1 << sb_h_shift) { sb_w = 1 << sb_h_shift; } if sb_h < (1 << sb_v_shift) { sb_h = 1 << sb_v_shift; } } for pli in 0..planes { let sb_h_shift = ts.restoration.planes[pli].rp_cfg.sb_h_shift; let sb_v_shift = ts.restoration.planes[pli].rp_cfg.sb_v_shift; lru_w[pli] = sb_w / (1 << sb_h_shift); lru_h[pli] = sb_h / (1 << sb_v_shift); } // The superblock width/height determinations may be calling for us // to compute over superblocks that do not actually exist in the // frame (off the right or lower edge). Trim sb width/height down // to actual superblocks. Note that these last superblocks on the // right/bottom may themselves still span the edge of the frame, but // they do hold at least some visible pixels. sb_w = sb_w.min(ts.sb_width - base_sbo.0.x); sb_h = sb_h.min(ts.sb_height - base_sbo.0.y); // We have need to know the Y visible pixel limits as well (the // sb_w/sb_h figures above can be used to determine how many // allocated pixels, possibly beyond the visible frame, exist). let crop_w = fi.width - ((ts.sbo.0.x + base_sbo.0.x) << SUPERBLOCK_TO_PLANE_SHIFT); let crop_h = fi.height - ((ts.sbo.0.y + base_sbo.0.y) << SUPERBLOCK_TO_PLANE_SHIFT); let pixel_w = crop_w.min(sb_w << SUPERBLOCK_TO_PLANE_SHIFT); let pixel_h = crop_h.min(sb_h << SUPERBLOCK_TO_PLANE_SHIFT); // Based on `RestorationState::new` const MAX_SB_SHIFT: usize = 4; const MAX_SB_SIZE: usize = 1 << MAX_SB_SHIFT; const MAX_LRU_SIZE: usize = MAX_SB_SIZE; // Static allocation relies on the "minimal LRU area for all N planes" invariant. let mut best_index = [-1; MAX_SB_SIZE * MAX_SB_SIZE]; let mut best_lrf = [[RestorationFilter::None; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; // due to imprecision in the reconstruction parameter solver, we // need to make sure we don't fall into a limit cycle. Track our // best cost at LRF so that we can break if we get a solution that doesn't // improve at the reconstruction stage. 
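// A negative entry in best_lrf_cost means "no cost recorded yet" for that
// LRU/plane; the first candidate evaluated always replaces it, and later
// candidates must strictly improve on the stored cost to be kept.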
let mut best_lrf_cost = [[-1.0; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; // sub-setted region of the TileBlocks for our working frame area. // Note that the size of this subset is what signals CDEF as to the // actual coded size. let mut tileblocks_subset = cw.bc.blocks.subregion_mut( base_sbo.block_offset(0, 0).0.x, base_sbo.block_offset(0, 0).0.y, sb_w << SUPERBLOCK_TO_BLOCK_SHIFT, sb_h << SUPERBLOCK_TO_BLOCK_SHIFT, ); // cdef doesn't run on superblocks that are completely skipped. // Determine which super blocks are marked as skipped so we can avoid running // them. If all blocks are skipped, we can avoid some of the overhead related // to setting up for cdef. let mut cdef_skip = [true; MAX_SB_SIZE * MAX_SB_SIZE]; let mut cdef_skip_all = true; if fi.sequence.enable_cdef { for sby in 0..sb_h { for sbx in 0..sb_w { let blocks = tileblocks_subset.subregion(16 * sbx, 16 * sby, 16, 16); let mut skip = true; for y in 0..blocks.rows() { for block in blocks[y].iter() { skip &= block.skip; } } cdef_skip[sby * MAX_SB_SIZE + sbx] = skip; cdef_skip_all &= skip; } } } // Unlike cdef, loop restoration will run regardless of whether blocks are // skipped or not. At the same time, the most significant improvement will // generally be from un-skipped blocks, so lru is only performed if there are // un-skipped blocks. // This should be the same as `cdef_skip_all`, except when cdef is disabled. let mut lru_skip_all = true; let mut lru_skip = [[true; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; if fi.sequence.enable_restoration { if fi.config.speed_settings.lru_on_skip { lru_skip_all = false; lru_skip = [[false; MAX_PLANES]; MAX_LRU_SIZE * MAX_LRU_SIZE]; } else { for pli in 0..planes { // width, in sb, of an LRU in this plane let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift; // height, in sb, of an LRU in this plane let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift; for lru_y in 0..lru_h[pli] { // number of LRUs vertically for lru_x in 0..lru_w[pli] { // number of LRUs horizontally let loop_sbo = TileSuperBlockOffset(SuperBlockOffset { x: lru_x * lru_sb_w, y: lru_y * lru_sb_h, }); if !ts.restoration.has_restoration_unit( base_sbo + loop_sbo, pli, false, ) { continue; } let start = loop_sbo.block_offset(0, 0).0; let size = TileSuperBlockOffset(SuperBlockOffset { x: lru_sb_w, y: lru_sb_h, }) .block_offset(0, 0) .0; let blocks = tileblocks_subset.subregion(start.x, start.y, size.x, size.y); let mut skip = true; for y in 0..blocks.rows() { for block in blocks[y].iter() { skip &= block.skip; } } lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] = skip; lru_skip_all &= skip; } } } } } // Return early if all blocks are skipped for lru and cdef. if lru_skip_all && cdef_skip_all { return; } // Loop filter RDO is an iterative process and we need temporary // scratch data to hold the results of deblocking, cdef, and the // loop reconstruction filter so that each can be partially updated // without recomputing the entire stack. Construct // largest-LRU-sized frames for each, accounting for padding // required by deblocking, cdef and [optionally] LR. let mut rec_subset = ts .rec .subregion(Area::BlockRect { bo: base_sbo.block_offset(0, 0).0, width: (pixel_w + 7) >> 3 << 3, height: (pixel_h + 7) >> 3 << 3, }) .scratch_copy(); // const, no need to copy, just need the subregion (but do zero the // origin to match the other copies/new backing frames). 
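// Like rec_subset above, src_subset below rounds the visible width/height up
// to the next multiple of 8 (e.g. 71 becomes 72), keeping the scratch
// regions aligned to whole 8x8 units for the error accounting further down.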
let src_subset = ts .input_tile .subregion(Area::BlockRect { bo: base_sbo.block_offset(0, 0).0, width: (pixel_w + 7) >> 3 << 3, height: (pixel_h + 7) >> 3 << 3, }) .home(); if deblock_p { // Find a good deblocking filter solution for the passed in area. // This is not RDO of deblocking itself, merely a solution to get // better results from CDEF/LRF RDO. let deblock_levels = deblock_filter_optimize( fi, &rec_subset.as_tile(), &src_subset, &tileblocks_subset.as_const(), crop_w, crop_h, ); // Deblock the contents of our reconstruction copy. if deblock_levels[0] != 0 || deblock_levels[1] != 0 { // copy ts.deblock because we need to set some of our own values here let mut deblock_copy = *ts.deblock; deblock_copy.levels = deblock_levels; // finally, deblock the temp frame deblock_filter_frame( &deblock_copy, &mut rec_subset.as_tile_mut(), &tileblocks_subset.as_const(), crop_w, crop_h, fi.sequence.bit_depth, planes, ); } } let mut cdef_work = if !cdef_skip_all { Some(rec_subset.clone()) } else { None }; let mut lrf_work = if !lru_skip_all { Some(Frame { planes: { let new_plane = |pli: usize| { let PlaneConfig { xdec, ydec, width, height, .. } = rec_subset.planes[pli].cfg; Plane::new(width, height, xdec, ydec, 0, 0) }; [new_plane(0), new_plane(1), new_plane(2)] }, }) } else { None }; // Precompute directional analysis for CDEF let cdef_data = { if cdef_work.is_some() { Some(( &rec_subset, cdef_analyze_superblock_range( fi, &rec_subset, &tileblocks_subset.as_const(), sb_w, sb_h, ), )) } else { None } }; // CDEF/LRF decision iteration // Start with a default of CDEF 0 and RestorationFilter::None // Try all CDEF options for each sb with current LRF; if new CDEF+LRF choice is better, select it. // Then try all LRF options with current CDEFs; if new CDEFs+LRF choice is better, select it. // If LRF choice changed for any plane, repeat until no changes // Limit iterations and where we break based on speed setting (in the TODO list ;-) let mut cdef_change = true; let mut lrf_change = true; while cdef_change || lrf_change { // search for improved cdef indices, superblock by superblock, if cdef is enabled. if let (Some((rec_copy, cdef_dirs)), Some(cdef_ref)) = (&cdef_data, &mut cdef_work.as_mut()) { for sby in 0..sb_h { for sbx in 0..sb_w { // determine whether this superblock can be skipped if cdef_skip[sby * MAX_SB_SIZE + sbx] { continue; } let prev_best_index = best_index[sby * sb_w + sbx]; let mut best_cost = -1.; let mut best_new_index = -1i8; /* offset of the superblock we're currently testing within the larger analysis area */ let loop_sbo = TileSuperBlockOffset(SuperBlockOffset { x: sbx, y: sby }); /* cdef index testing loop */ for cdef_index in 0..(1 << fi.cdef_bits) { let mut err = ScaledDistortion::zero(); let mut rate = 0; cdef_filter_superblock( fi, &rec_subset, &mut cdef_ref.as_tile_mut(), &tileblocks_subset.as_const(), loop_sbo, cdef_index, &cdef_dirs[sby * sb_w + sbx], ); // apply LRF if any for pli in 0..planes { // We need the cropped-to-visible-frame area of this SB let wh = if fi.sequence.use_128x128_superblock { 128 } else { 64 }; let PlaneConfig { xdec, ydec, .. } = cdef_ref.planes[pli].cfg; let vis_width = (wh >> xdec).min( (crop_w >> xdec) - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).x as usize, ); let vis_height = (wh >> ydec).min( (crop_h >> ydec) - loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg).y as usize, ); // which LRU are we currently testing against? 
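// If this superblock maps to a valid restoration unit, the CDEF output is
// passed through that unit's currently selected LRF before the error is
// measured; otherwise the error is taken directly from the CDEF output.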
if let (Some((lru_x, lru_y)), Some(lrf_ref)) = { let rp = &ts.restoration.planes[pli]; ( rp.restoration_unit_offset(base_sbo, loop_sbo, false), &mut lrf_work, ) } { // We have a valid LRU, apply LRF, compute error match best_lrf[lru_y * lru_w[pli] + lru_x][pli] { RestorationFilter::None {} => { err += rdo_loop_plane_error( base_sbo, loop_sbo, 1, 1, fi, ts, &tileblocks_subset.as_const(), cdef_ref, &src_subset, pli, ); rate += if fi.sequence.enable_restoration { cw.fc.count_lrf_switchable( w, &ts.restoration.as_const(), best_lrf[lru_y * lru_w[pli] + lru_x][pli], pli, ) } else { 0 // no relative cost differeneces to different // CDEF params. If cdef is on, it's a wash. }; } RestorationFilter::Sgrproj { set, xqd } => { // only run on this single superblock let loop_po = loop_sbo.plane_offset(&cdef_ref.planes[pli].cfg); // todo: experiment with borrowing border pixels // rather than edge-extending. Right now this is // hard-clipping to the superblock boundary. setup_integral_image( &mut ts.integral_buffer, SOLVE_IMAGE_STRIDE, vis_width, vis_height, vis_width, vis_height, &cdef_ref.planes[pli].slice(loop_po), &cdef_ref.planes[pli].slice(loop_po), ); sgrproj_stripe_filter( set, xqd, fi, &ts.integral_buffer, SOLVE_IMAGE_STRIDE, &cdef_ref.planes[pli].slice(loop_po), &mut lrf_ref.planes[pli].region_mut(Area::Rect { x: loop_po.x, y: loop_po.y, width: vis_width, height: vis_height, }), ); err += rdo_loop_plane_error( base_sbo, loop_sbo, 1, 1, fi, ts, &tileblocks_subset.as_const(), lrf_ref, &src_subset, pli, ); rate += cw.fc.count_lrf_switchable( w, &ts.restoration.as_const(), best_lrf[lru_y * lru_w[pli] + lru_x][pli], pli, ); } RestorationFilter::Wiener { .. } => unreachable!(), // coming soon } } else { // No actual LRU here, compute error directly from CDEF output. err += rdo_loop_plane_error( base_sbo, loop_sbo, 1, 1, fi, ts, &tileblocks_subset.as_const(), cdef_ref, &src_subset, pli, ); // no relative cost differeneces to different // CDEF params. If cdef is on, it's a wash. // rate += 0; } } let cost = compute_rd_cost(fi, rate, err); if best_cost < 0. || cost < best_cost { best_cost = cost; best_new_index = cdef_index as i8; } } // Did we change any preexisting choices? 
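// A changed CDEF index re-arms the outer CDEF/LRF loop (cdef_change = true),
// so the restoration-filter search further down is re-run against the
// updated CDEF output.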
if best_new_index != prev_best_index { cdef_change = true; best_index[sby * sb_w + sbx] = best_new_index; tileblocks_subset.set_cdef(loop_sbo, best_new_index as u8); } let mut cdef_ref_tm = TileMut::new( cdef_ref, TileRect { x: 0, y: 0, width: cdef_ref.planes[0].cfg.width, height: cdef_ref.planes[0].cfg.height, }, ); // Keep cdef output up to date; we need it for restoration // both below and above (padding) cdef_filter_superblock( fi, rec_copy, &mut cdef_ref_tm, &tileblocks_subset.as_const(), loop_sbo, best_index[sby * sb_w + sbx] as u8, &cdef_dirs[sby * sb_w + sbx], ); } } } if !cdef_change { break; } cdef_change = false; lrf_change = false; // search for improved restoration filter parameters if restoration is enabled if let Some(lrf_ref) = &mut lrf_work.as_mut() { let lrf_input = if cdef_work.is_some() { // When CDEF is enabled, we pull from the CDEF output cdef_work.as_ref().unwrap() } else { // When CDEF is disabled, we pull from the [optionally // deblocked] reconstruction &rec_subset }; for pli in 0..planes { // Nominal size of LRU in pixels before clipping to visible frame let unit_size = ts.restoration.planes[pli].rp_cfg.unit_size; // width, in sb, of an LRU in this plane let lru_sb_w = 1 << ts.restoration.planes[pli].rp_cfg.sb_h_shift; // height, in sb, of an LRU in this plane let lru_sb_h = 1 << ts.restoration.planes[pli].rp_cfg.sb_v_shift; let PlaneConfig { xdec, ydec, .. } = lrf_ref.planes[pli].cfg; for lru_y in 0..lru_h[pli] { // number of LRUs vertically for lru_x in 0..lru_w[pli] { // number of LRUs horizontally // determine whether this lru should be skipped if lru_skip[lru_y * MAX_LRU_SIZE + lru_x][pli] { continue; } let loop_sbo = TileSuperBlockOffset(SuperBlockOffset { x: lru_x * lru_sb_w, y: lru_y * lru_sb_h, }); if ts.restoration.has_restoration_unit( base_sbo + loop_sbo, pli, false, ) { let src_plane = &src_subset.planes[pli]; // uncompressed input for reference let lrf_in_plane = &lrf_input.planes[pli]; let lrf_po = loop_sbo.plane_offset(src_plane.plane_cfg); let mut best_new_lrf = best_lrf[lru_y * lru_w[pli] + lru_x][pli]; let mut best_cost = best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli]; // Check the no filter option { let err = rdo_loop_plane_error( base_sbo, loop_sbo, lru_sb_w, lru_sb_h, fi, ts, &tileblocks_subset.as_const(), lrf_input, &src_subset, pli, ); let rate = cw.fc.count_lrf_switchable( w, &ts.restoration.as_const(), best_new_lrf, pli, ); let cost = compute_rd_cost(fi, rate, err); // Was this choice actually an improvement? if best_cost < 0. || cost < best_cost { best_cost = cost; best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost; best_new_lrf = RestorationFilter::None; } } // Look for a self guided filter // We need the cropped-to-visible-frame computation area of this LRU let vis_width = unit_size.min( (crop_w >> xdec) - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).x as usize, ); let vis_height = unit_size.min( (crop_h >> ydec) - loop_sbo.plane_offset(&lrf_ref.planes[pli].cfg).y as usize, ); // todo: experiment with borrowing border pixels // rather than edge-extending. Right now this is // hard-clipping to the superblock boundary. 
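// For each candidate SGR parameter set, sgrproj_solve() fits the xqd
// coefficients against the source, the stripe filter is run over the visible
// LRU area, and the result is scored with the same rdo_loop_plane_error()
// plus lrf-switchable-rate model used above for the no-filter case.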
setup_integral_image( &mut ts.integral_buffer, SOLVE_IMAGE_STRIDE, vis_width, vis_height, vis_width, vis_height, &lrf_in_plane.slice(lrf_po), &lrf_in_plane.slice(lrf_po), ); for &set in get_sgr_sets(fi.config.speed_settings.sgr_complexity) { let (xqd0, xqd1) = sgrproj_solve( set, fi, &ts.integral_buffer, &src_plane .subregion(Area::StartingAt { x: lrf_po.x, y: lrf_po.y }), &lrf_in_plane.slice(lrf_po), vis_width, vis_height, ); let current_lrf = RestorationFilter::Sgrproj { set, xqd: [xqd0, xqd1] }; if let RestorationFilter::Sgrproj { set, xqd } = current_lrf { sgrproj_stripe_filter( set, xqd, fi, &ts.integral_buffer, SOLVE_IMAGE_STRIDE, &lrf_in_plane.slice(lrf_po), &mut lrf_ref.planes[pli].region_mut(Area::Rect { x: lrf_po.x, y: lrf_po.y, width: vis_width, height: vis_height, }), ); } let err = rdo_loop_plane_error( base_sbo, loop_sbo, lru_sb_w, lru_sb_h, fi, ts, &tileblocks_subset.as_const(), lrf_ref, &src_subset, pli, ); let rate = cw.fc.count_lrf_switchable( w, &ts.restoration.as_const(), current_lrf, pli, ); let cost = compute_rd_cost(fi, rate, err); if cost < best_cost { best_cost = cost; best_lrf_cost[lru_y * lru_w[pli] + lru_x][pli] = cost; best_new_lrf = current_lrf; } } if best_lrf[lru_y * lru_w[pli] + lru_x][pli] .notequal(best_new_lrf) { best_lrf[lru_y * lru_w[pli] + lru_x][pli] = best_new_lrf; lrf_change = true; if let Some(ru) = ts.restoration.planes[pli] .restoration_unit_mut(base_sbo + loop_sbo) { ru.filter = best_new_lrf; } } } } } } } } } #[test] fn estimate_rate_test() { assert_eq!(estimate_rate(0, TxSize::TX_4X4, 0), RDO_RATE_TABLE[0][0][0]); } rav1e-0.7.1/src/rdo_tables.rs000064400000000000000000001614561046102023000141370ustar 00000000000000// Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved // Copyright (c) 2017-2019, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
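// RDO_RATE_TABLE below is indexed as [quantizer bin][transform size][bin],
// i.e. RDO_QUANT_BINS x TxSize::TX_SIZES_ALL x RDO_NUM_BINS entries;
// RATE_EST_BIN_SIZE works out to 100_000 / 50 = 2_000 units per bin, and the
// 99999 entries appear to be placeholders for bins with no collected
// rate statistics.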
pub const RDO_NUM_BINS: usize = 50; #[allow(unused)] pub const RDO_MAX_BIN: usize = 10000; pub const RATE_EST_MAX_BIN: usize = 100_000; pub const RDO_QUANT_BINS: usize = 8; pub const RDO_QUANT_DIV: usize = 256 / RDO_QUANT_BINS; #[allow(unused)] pub const RDO_BIN_SIZE: u64 = (RDO_MAX_BIN / RDO_NUM_BINS) as u64; pub const RATE_EST_BIN_SIZE: u64 = (RATE_EST_MAX_BIN / RDO_NUM_BINS) as u64; use crate::transform::TxSize; pub static RDO_RATE_TABLE: [[[u64; RDO_NUM_BINS]; TxSize::TX_SIZES_ALL]; RDO_QUANT_BINS] = [ [ [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 104, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 707, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 2756, ], [ 13934, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 7628, ], [ 3155, 8704, 12726, 17300, 19395, 20872, 22052, 23156, 24014, 25176, 26269, 27336, 28359, 28967, 29713, 30205, 30847, 31286, 31776, 32355, 32826, 33232, 33367, 33741, 34095, 34472, 34726, 35087, 35195, 35531, 35614, 35728, 35934, 35973, 36165, 36360, 36281, 36651, 36759, 36921, 37014, 37095, 37188, 37012, 37310, 37255, 37157, 37399, 37753, 41014, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 146, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 1159, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 7612, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 4525, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 56, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 446, ], [ 3456, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 1438, ], [ 5300, 12376, 13413, 21251, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 1449, ], [ 468, 2455, 5285, 8433, 10917, 12155, 13651, 14712, 15475, 16484, 17678, 18613, 19609, 20313, 20833, 21383, 21834, 22277, 22758, 23300, 23686, 24012, 24419, 24739, 25112, 25439, 25602, 26011, 26256, 26389, 26539, 26662, 26827, 26946, 27059, 27145, 27358, 27485, 27510, 27509, 27594, 27720, 27792, 28033, 28029, 28087, 28285, 28548, 28657, 31616, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 69, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 2627, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 668, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 3957, 6158, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 955, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 26, ], [ 814, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 217, ], [ 1127, 2725, 4344, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 154, ], [ 345, 1365, 2879, 4614, 6905, 9707, 9334, 9812, 11542, 14962, 22525, 28756, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 717, ], [ 116, 270, 689, 1041, 1618, 2298, 3748, 5051, 5800, 6310, 6909, 7391, 7745, 8030, 8482, 8996, 9557, 10112, 10633, 11148, 11623, 12152, 12614, 13089, 13592, 14000, 14425, 14886, 15182, 15630, 15871, 16282, 16506, 16780, 17018, 17139, 17228, 17411, 17463, 17587, 17723, 17634, 17696, 17944, 18012, 17950, 18006, 18005, 18182, 22725, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 23, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 873, 2245, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 182, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 519, 1820, 3040, 5457, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 154, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 169, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 15, ], [ 463, 859, 1141, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 76, ], [ 295, 923, 1607, 2072, 2388, 3105, 4419, 5573, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 65, ], [ 112, 343, 674, 1102, 1578, 2089, 2648, 3376, 4342, 4869, 5708, 5918, 5901, 6998, 8658, 8258, 7166, 7837, 9781, 10973, 10590, 11659, 11882, 12548, 15440, 19254, 22300, 24169, 25287, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 413, ], [ 60, 105, 217, 365, 586, 798, 1010, 1287, 1563, 1952, 2385, 2825, 3135, 3158, 3796, 4070, 4377, 4574, 4794, 5032, 5350, 5507, 5762, 5986, 6267, 6519, 6741, 7016, 7346, 7645, 8040, 8413, 8750, 9012, 9362, 9687, 9982, 10270, 10558, 10725, 11056, 11204, 11387, 11663, 11752, 11877, 12042, 12104, 12188, 16345, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 154, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 9, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 327, 950, 1500, 1781, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 78, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 125, 384, 795, 1244, 2031, 2451, 3184, 3711, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 75, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 83, 148, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 7, ], [ 170, 381, 574, 822, 1042, 1186, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 28, ], [ 89, 233, 449, 680, 867, 1082, 1319, 1445, 1725, 1907, 2163, 2432, 2636, 3171, 3869, 4433, 4818, 5081, 5327, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 29, ], [ 43, 112, 190, 282, 370, 477, 587, 725, 913, 1105, 1315, 1542, 1773, 1992, 2173, 2337, 2526, 2703, 
2884, 3072, 3276, 3472, 3668, 3886, 4168, 4521, 4923, 5082, 4981, 5031, 5290, 5598, 6069, 6710, 7558, 8505, 9363, 9523, 8687, 7810, 7479, 7697, 8214, 8970, 9562, 9978, 10067, 9798, 9430, 843, ], [ 46, 58, 75, 129, 207, 265, 340, 426, 505, 554, 662, 771, 873, 1023, 1097, 1183, 1270, 1342, 1445, 1561, 1642, 1741, 1852, 1992, 2037, 2170, 2236, 2339, 2387, 2507, 2713, 2837, 2893, 3108, 3262, 3364, 3466, 3592, 3758, 3811, 4050, 4191, 4326, 4493, 4607, 4773, 4846, 5055, 5247, 9882, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 81, 155, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 3, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 120, 275, 521, 750, 987, 1167, 1264, 1373, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 55, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 57, 85, 114, 227, 424, 686, 890, 1071, 1436, 1495, 1521, 1956, 2182, 2247, 2349, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 58, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 22, 47, 62, 81, 106, 116, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 2, ], [ 46, 108, 158, 203, 253, 296, 333, 371, 429, 500, 558, 587, 580, 542, 505, 502, 529, 554, 590, 612, 632, 645, 678, 684, 687, 735, 716, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 12, ], [ 24, 56, 96, 144, 195, 247, 297, 343, 387, 427, 463, 499, 533, 568, 605, 642, 685, 739, 806, 872, 904, 929, 982, 1068, 1168, 1238, 1254, 1246, 1201, 1202, 1257, 1346, 1445, 1572, 1661, 1755, 1851, 1966, 2062, 2201, 2290, 2405, 2448, 2482, 2488, 2474, 2367, 2270, 2085, 17, ], [ 22, 34, 52, 72, 93, 112, 130, 149, 170, 198, 226, 261, 292, 326, 359, 395, 427, 464, 507, 553, 605, 655, 716, 757, 804, 856, 902, 944, 987, 1030, 1064, 1114, 1159, 1192, 1226, 1262, 1308, 1339, 1367, 1395, 1440, 1470, 1504, 1548, 1568, 1604, 1647, 1680, 1715, 1461, ], [ 40, 42, 47, 54, 72, 98, 111, 130, 148, 159, 182, 200, 231, 246, 285, 298, 312, 324, 357, 375, 394, 418, 439, 467, 472, 501, 518, 536, 564, 562, 604, 620, 632, 644, 674, 691, 735, 770, 786, 781, 827, 856, 947, 939, 993, 1008, 1030, 1092, 1120, 3819, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 11, 38, 44, 35, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 1, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 70, 90, 117, 151, 195, 243, 297, 339, 386, 425, 475, 508, 517, 492, 487, 494, 510, 499, 524, 617, 599, 652, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 46, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 43, 47, 47, 51, 65, 97, 120, 149, 181, 229, 277, 314, 371, 449, 480, 499, 435, 616, 671, 604, 596, 586, 679, 832, 927, 876, 960, 923, 1070, 1101, 1148, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 53, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 4, 11, 16, 18, 24, 30, 39, 46, 56, 58, 51, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 0, ], [ 14, 30, 46, 60, 73, 84, 94, 105, 115, 126, 137, 150, 162, 175, 188, 199, 207, 213, 217, 226, 235, 251, 263, 284, 301, 319, 336, 351, 363, 373, 384, 393, 392, 391, 397, 390, 387, 376, 370, 357, 348, 338, 335, 328, 321, 318, 318, 331, 320, 6, ], [ 6, 13, 21, 31, 43, 55, 67, 79, 91, 103, 114, 125, 135, 145, 156, 164, 174, 184, 193, 202, 212, 221, 229, 238, 245, 255, 263, 270, 281, 289, 299, 308, 317, 326, 332, 343, 349, 357, 368, 374, 385, 399, 405, 417, 425, 439, 452, 469, 480, 30, ], [ 16, 17, 20, 23, 27, 31, 35, 39, 43, 48, 52, 57, 62, 68, 73, 78, 84, 90, 96, 102, 108, 113, 120, 124, 129, 137, 143, 150, 157, 162, 170, 177, 186, 195, 201, 207, 221, 225, 233, 244, 248, 259, 270, 274, 283, 292, 302, 311, 320, 624, ], [ 37, 37, 37, 38, 40, 42, 46, 48, 51, 55, 58, 60, 66, 69, 72, 77, 82, 81, 86, 89, 93, 97, 98, 102, 107, 114, 116, 119, 128, 132, 130, 135, 137, 138, 138, 139, 151, 156, 153, 153, 153, 154, 168, 172, 171, 170, 178, 182, 191, 1284, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 1, 3, 7, 14, 17, 19, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 0, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 47, 52, 58, 63, 69, 73, 79, 84, 88, 94, 101, 106, 114, 124, 136, 141, 146, 166, 193, 202, 223, 211, 243, 263, 279, 301, 264, 299, 308, 321, 327, 337, 328, 346, 319, 321, 307, 329, 323, 304, 328, 315, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 33, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 38, 39, 41, 42, 41, 43, 42, 42, 45, 46, 53, 56, 60, 68, 70, 77, 84, 84, 88, 75, 86, 121, 135, 137, 144, 154, 161, 162, 178, 183, 189, 196, 202, 245, 286, 295, 251, 270, 265, 242, 242, 233, 182, 326, 183, 447, 294, 330, 318, 108, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], [ [ 1, 2, 2, 3, 3, 3, 3, 4, 5, 6, 8, 15, 17, 24, 25, 23, 99999, 99999, 99999, 
99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 0, ], [ 9, 16, 22, 26, 30, 34, 38, 42, 46, 49, 53, 55, 58, 61, 64, 66, 69, 71, 74, 75, 78, 78, 83, 85, 87, 88, 88, 92, 94, 97, 97, 101, 102, 105, 107, 106, 110, 113, 115, 119, 118, 117, 122, 126, 129, 129, 126, 131, 134, 6, ], [ 3, 5, 6, 9, 11, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47, 51, 54, 58, 62, 65, 68, 72, 75, 78, 79, 84, 88, 91, 95, 96, 100, 103, 106, 107, 110, 111, 117, 119, 119, 122, 125, 127, 130, 132, 134, 138, 140, 21, ], [ 11, 13, 13, 15, 16, 17, 18, 18, 19, 21, 22, 24, 25, 27, 28, 29, 31, 32, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 48, 49, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62, 63, 64, 66, 67, 69, 181, ], [ 33, 32, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 39, 39, 39, 40, 41, 41, 42, 43, 43, 44, 45, 45, 46, 47, 47, 48, 50, 50, 51, 52, 53, 52, 52, 53, 55, 55, 56, 56, 58, 58, 58, 59, 60, 60, 58, 59, 61, 393, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 0, 1, 1, 1, 2, 1, 2, 1, 2, 1, 0, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 0, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 40, 44, 47, 49, 50, 51, 52, 52, 51, 52, 53, 55, 55, 57, 57, 57, 59, 60, 62, 63, 64, 65, 66, 67, 67, 69, 68, 72, 76, 77, 80, 76, 76, 85, 85, 90, 92, 98, 87, 87, 88, 98, 94, 103, 107, 98, 107, 109, 109, 31, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 36, 37, 38, 38, 37, 37, 37, 37, 37, 36, 37, 37, 37, 37, 38, 37, 38, 38, 40, 40, 41, 40, 38, 44, 47, 44, 43, 44, 45, 50, 44, 44, 46, 49, 49, 54, 58, 56, 64, 63, 62, 60, 54, 54, 59, 59, 65, 57, 60, 78, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 
99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], [ 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, 99999, ], ], ]; rav1e-0.7.1/src/recon_intra.rs000064400000000000000000000415521046102023000143160ustar 00000000000000// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(dead_code)] use crate::context::*; use crate::partition::BlockSize::*; use crate::partition::*; use crate::transform::*; static has_null: &[u8] = &[]; // Tables to store if the top-right reference pixels are available. The flags // are represented with bits, packed into 8-bit integers. E.g., for the 32x32 // blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster // order), so its flag is stored at the 3rd bit of the 2nd entry in the table, // i.e. (table[10 / 8] >> (10 % 8)) & 1. // . . . . // . . . . // . . o . // . . . . 
#[rustfmt::skip] static has_tr_4x4: &[u8] = &[ 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, ]; static has_tr_4x8: &[u8] = &[ 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, ]; #[rustfmt::skip] static has_tr_8x4: &[u8] = &[ 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, ]; #[rustfmt::skip] static has_tr_8x8: &[u8] = &[ 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, ]; static has_tr_8x16: &[u8] = &[ 255, 255, 119, 119, 127, 127, 119, 119, 255, 127, 119, 119, 127, 127, 119, 119, ]; static has_tr_16x8: &[u8] = &[255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0]; static has_tr_16x16: &[u8] = &[255, 85, 119, 85, 127, 85, 119, 85]; static has_tr_16x32: &[u8] = &[255, 119, 127, 119]; static has_tr_32x16: &[u8] = &[15, 5, 7, 5]; static has_tr_32x32: &[u8] = &[95, 87]; static has_tr_32x64: &[u8] = &[127]; static has_tr_64x32: &[u8] = &[19]; static has_tr_64x64: &[u8] = &[7]; static has_tr_64x128: &[u8] = &[3]; static has_tr_128x64: &[u8] = &[1]; static has_tr_128x128: &[u8] = &[1]; static has_tr_4x16: &[u8] = &[ 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, ]; static has_tr_16x4: &[u8] = &[ 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, ]; static has_tr_8x32: &[u8] = &[255, 255, 127, 127, 255, 127, 127, 127]; static has_tr_32x8: &[u8] = &[15, 0, 5, 0, 7, 0, 5, 0]; static has_tr_16x64: &[u8] = &[255, 127]; static has_tr_64x16: &[u8] = &[3, 1]; static has_tr_tables: &[&[u8]] = &[ has_tr_4x4, // 4x4 has_tr_4x8, // 4x8 has_tr_8x4, // 8x4 has_tr_8x8, // 8x8 has_tr_8x16, // 8x16 has_tr_16x8, // 16x8 has_tr_16x16, // 16x16 has_tr_16x32, // 16x32 has_tr_32x16, // 32x16 has_tr_32x32, // 32x32 has_tr_32x64, // 32x64 has_tr_64x32, // 64x32 has_tr_64x64, // 64x64 has_tr_64x128, // 64x128 has_tr_128x64, // 128x64 has_tr_128x128, // 128x128 has_tr_4x16, // 4x16 has_tr_16x4, // 16x4 has_tr_8x32, // 8x32 has_tr_32x8, // 32x8 has_tr_16x64, // 16x64 has_tr_64x16, // 64x16 ]; #[rustfmt::skip] static has_tr_vert_8x8: &[u8] = &[ 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, ]; static has_tr_vert_16x16: &[u8] = &[255, 0, 119, 0, 127, 0, 119, 0]; static has_tr_vert_32x32: &[u8] = &[15, 7]; static has_tr_vert_64x64: &[u8] = &[3]; // The _vert_* tables are like the ordinary tables above, 
but describe the // order we visit square blocks when doing a PARTITION_VERT_A or // PARTITION_VERT_B. This is the same order as normal except for on the last // split where we go vertically (TL, BL, TR, BR). We treat the rectangular block // as a pair of squares, which means that these tables work correctly for both // mixed vertical partition types. // // There are tables for each of the square sizes. Vertical rectangles (like // BLOCK_16X32) use their respective "non-vert" table static has_tr_vert_tables: &[&[u8]] = &[ has_null, // 4X4 has_tr_4x8, // 4X8 has_null, // 8X4 has_tr_vert_8x8, // 8X8 has_tr_8x16, // 8X16 has_null, // 16X8 has_tr_vert_16x16, // 16X16 has_tr_16x32, // 16X32 has_null, // 32X16 has_tr_vert_32x32, // 32X32 has_tr_32x64, // 32X64 has_null, // 64X32 has_tr_vert_64x64, // 64X64 has_tr_64x128, // 64x128 has_null, // 128x64 has_tr_128x128, // 128x128 ]; // TODO: Enable the case for PARTITION_VERT_A/B once they can be encoded by rav1e. pub fn get_has_tr_table( /*partition: PartitionType, */ bsize: BlockSize, ) -> &'static [u8] { let ret: &[u8]; // If this is a mixed vertical partition, look up bsize in orders_vert. /*if partition == PartitionType::PARTITION_VERT_A || partition == PartitionType::PARTITION_VERT_B { debug_assert!(bsize < BlockSize::BLOCK_SIZES); ret = has_tr_vert_tables[bsize as usize]; } else */ { ret = has_tr_tables[bsize as usize]; } //debug_assert!(ret != ptr::has_null()); ret } pub fn has_top_right( bsize: BlockSize, partition_bo: TileBlockOffset, top_available: bool, right_available: bool, tx_size: TxSize, row_off: usize, col_off: usize, ss_x: usize, _ss_y: usize, ) -> bool { if !top_available || !right_available { return false; }; let bw_unit = bsize.width_mi(); let plane_bw_unit = (bw_unit >> ss_x).max(1); let top_right_count_unit = tx_size.width_mi(); let mi_col = partition_bo.0.x; let mi_row = partition_bo.0.y; if row_off > 0 { // Just need to check if enough pixels on the right. // 128x128 SB is not supported yet by rav1e if bsize.width() > BLOCK_64X64.width() { // Special case: For 128x128 blocks, the transform unit whose // top-right corner is at the center of the block does in fact have // pixels available at its top-right corner. if row_off == BLOCK_64X64.height_mi() >> _ss_y && col_off + top_right_count_unit == BLOCK_64X64.width_mi() >> ss_x { return false; } let plane_bw_unit_64 = BLOCK_64X64.width_mi() >> ss_x; let col_off_64 = col_off % plane_bw_unit_64; return col_off_64 + top_right_count_unit < plane_bw_unit_64; } col_off + top_right_count_unit < plane_bw_unit } else { // All top-right pixels are in the block above, which is already available. if col_off + top_right_count_unit < plane_bw_unit { return true; }; let bw_in_mi_log2 = bsize.width_log2() - MI_SIZE_LOG2; let bh_in_mi_log2 = bsize.height_log2() - MI_SIZE_LOG2; let sb_mi_size: usize = 16; // 64x64 let blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; let blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; // Top row of superblock: so top-right pixels are in the top and/or // top-right superblocks, both of which are already available. if blk_row_in_sb == 0 { return true; }; // Rightmost column of superblock (and not the top row): so top-right pixels // fall in the right superblock, which is not available yet. if ((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size { return false; }; // General case (neither top row nor rightmost column): check if the // top-right block is coded before the current block. 
let this_blk_index = (blk_row_in_sb << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb; let idx1 = this_blk_index / 8; let idx2 = this_blk_index % 8; let has_tr_table: &[u8] = get_has_tr_table(/*partition,*/ bsize); ((has_tr_table[idx1] >> idx2) & 1) != 0 } } // Similar to the has_tr_* tables, but store if the bottom-left reference // pixels are available. static has_bl_4x4: &[u8] = &[ 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, ]; static has_bl_4x8: &[u8] = &[ 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, ]; static has_bl_8x4: &[u8] = &[ 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, ]; static has_bl_8x8: &[u8] = &[ 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, ]; static has_bl_8x16: &[u8] = &[16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0]; static has_bl_16x8: &[u8] = &[254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0]; static has_bl_16x16: &[u8] = &[84, 16, 84, 0, 84, 16, 84, 0]; static has_bl_16x32: &[u8] = &[16, 0, 16, 0]; static has_bl_32x16: &[u8] = &[78, 14, 78, 14]; static has_bl_32x32: &[u8] = &[4, 4]; static has_bl_32x64: &[u8] = &[0]; static has_bl_64x32: &[u8] = &[34]; static has_bl_64x64: &[u8] = &[0]; static has_bl_64x128: &[u8] = &[0]; static has_bl_128x64: &[u8] = &[0]; static has_bl_128x128: &[u8] = &[0]; static has_bl_4x16: &[u8] = &[ 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, ]; static has_bl_16x4: &[u8] = &[ 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, ]; static has_bl_8x32: &[u8] = &[0, 1, 0, 0, 0, 1, 0, 0]; static has_bl_32x8: &[u8] = &[238, 78, 238, 14, 238, 78, 238, 14]; static has_bl_16x64: &[u8] = &[0, 0]; static has_bl_64x16: &[u8] = &[42, 42]; static has_bl_tables: &[&[u8]] = &[ has_bl_4x4, // 4x4 has_bl_4x8, // 4x8 has_bl_8x4, // 8x4 has_bl_8x8, // 8x8 has_bl_8x16, // 8x16 has_bl_16x8, // 16x8 has_bl_16x16, // 16x16 has_bl_16x32, // 16x32 has_bl_32x16, // 32x16 has_bl_32x32, // 32x32 has_bl_32x64, // 32x64 has_bl_64x32, // 64x32 has_bl_64x64, // 64x64 has_bl_64x128, // 64x128 has_bl_128x64, // 128x64 has_bl_128x128, // 128x128 has_bl_4x16, // 4x16 has_bl_16x4, // 16x4 has_bl_8x32, // 8x32 has_bl_32x8, // 32x8 has_bl_16x64, // 16x64 has_bl_64x16, // 64x16 ]; #[rustfmt::skip] static has_bl_vert_8x8: &[u8] = &[ 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, ]; static has_bl_vert_16x16: &[u8] = &[254, 16, 254, 0, 254, 16, 254, 0]; static has_bl_vert_32x32: &[u8] = &[14, 14]; static 
has_bl_vert_64x64: &[u8] = &[2]; // The _vert_* tables are like the ordinary tables above, but describe the // order we visit square blocks when doing a PARTITION_VERT_A or // PARTITION_VERT_B. This is the same order as normal except for on the last // split where we go vertically (TL, BL, TR, BR). We treat the rectangular block // as a pair of squares, which means that these tables work correctly for both // mixed vertical partition types. // // There are tables for each of the square sizes. Vertical rectangles (like // BLOCK_16X32) use their respective "non-vert" table static has_bl_vert_tables: &[&[u8]] = &[ has_null, // 4x4 has_bl_4x8, // 4x8 has_null, // 8x4 has_bl_vert_8x8, // 8x8 has_bl_8x16, // 8x16 has_null, // 16x8 has_bl_vert_16x16, // 16x16 has_bl_16x32, // 16x32 has_null, // 32x16 has_bl_vert_32x32, // 32x32 has_bl_32x64, // 32x64 has_null, // 64x32 has_bl_vert_64x64, // 64x64 has_bl_64x128, // 64x128 has_null, // 128x64 has_bl_128x128, // 128x128 ]; pub fn get_has_bl_table( /*partition: PartitionType, */ bsize: BlockSize, ) -> &'static [u8] { let ret: &[u8]; // If this is a mixed vertical partition, look up bsize in orders_vert. /*if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { //assert(bsize < BLOCK_SIZES); ret = has_bl_vert_tables[bsize as usize]; } else*/ { ret = has_bl_tables[bsize as usize]; } //debug_assert!(ret != ptr::has_null()); ret } pub fn has_bottom_left( bsize: BlockSize, partition_bo: TileBlockOffset, bottom_available: bool, left_available: bool, tx_size: TxSize, row_off: usize, col_off: usize, _ss_x: usize, ss_y: usize, ) -> bool { if !bottom_available || !left_available { return false; }; // Special case for 128x* blocks, when col_off is half the block width. // This is needed because 128x* superblocks are divided into 64x* blocks in // raster order // 128x128 SB is not supported yet by rav1e if bsize.width() > BLOCK_64X64.width() && col_off > 0 { let plane_bw_unit_64 = BLOCK_64X64.width_mi() >> _ss_x; let col_off_64 = col_off % plane_bw_unit_64; if col_off_64 == 0 { // We are at the left edge of top-right or bottom-right 64x* block. let plane_bh_unit_64 = BLOCK_64X64.height_mi() >> ss_y; let row_off_64 = row_off % plane_bh_unit_64; let plane_bh_unit = (bsize.height_mi() >> ss_y).min(plane_bh_unit_64); // Check if all bottom-left pixels are in the left 64x* block (which is // already coded). return row_off_64 + tx_size.height_mi() < plane_bh_unit; } } if col_off > 0 { // Bottom-left pixels are in the bottom-left block, which is not available. false } else { let bh_unit = bsize.height_mi(); let plane_bh_unit = (bh_unit >> ss_y).max(1); let bottom_left_count_unit = tx_size.height_mi(); let mi_col = partition_bo.0.x; let mi_row = partition_bo.0.y; // All bottom-left pixels are in the left block, which is already available. if row_off + bottom_left_count_unit < plane_bh_unit { return true; }; let bw_in_mi_log2 = bsize.width_log2() - MI_SIZE_LOG2; let bh_in_mi_log2 = bsize.height_log2() - MI_SIZE_LOG2; let sb_mi_size: usize = 16; // 64x64 let blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; let blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; // Leftmost column of superblock: so bottom-left pixels maybe in the left // and/or bottom-left superblocks. But only the left superblock is // available, so check if all required pixels fall in that superblock. 
if blk_col_in_sb == 0 { let blk_start_row_off = blk_row_in_sb << bh_in_mi_log2 >> ss_y; let row_off_in_sb = blk_start_row_off + row_off; let sb_height_unit = sb_mi_size >> ss_y; return row_off_in_sb + bottom_left_count_unit < sb_height_unit; //return row_off_in_sb + (bottom_left_count_unit << 1) < sb_height_unit; // Doesn't it need the tx height again? } // Bottom row of superblock (and not the leftmost column): so bottom-left // pixels fall in the bottom superblock, which is not available yet. if ((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size { return false; }; // General case (neither leftmost column nor bottom row): check if the // bottom-left block is coded before the current block. let this_blk_index = (blk_row_in_sb << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + blk_col_in_sb; let idx1 = this_blk_index / 8; let idx2 = this_blk_index % 8; let has_bl_table: &[u8] = get_has_bl_table(/*partition,*/ bsize); ((has_bl_table[idx1] >> idx2) & 1) != 0 } } rav1e-0.7.1/src/sad_plane.rs000064400000000000000000000032611046102023000137340ustar 00000000000000// Copyright (c) 2021-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { use crate::asm::x86::sad_plane::*; } else { use self::rust::*; } } use v_frame::plane::Plane; use crate::cpu_features::CpuFeatureLevel; use crate::util::{CastFromPrimitive, Pixel}; pub(crate) mod rust { use super::*; use crate::cpu_features::CpuFeatureLevel; #[inline] pub(crate) fn sad_plane_internal<T: Pixel>( src: &Plane<T>, dst: &Plane<T>, _cpu: CpuFeatureLevel, ) -> u64 { debug_assert!(src.cfg.width == dst.cfg.width); debug_assert!(src.cfg.height == dst.cfg.height); src .rows_iter() .zip(dst.rows_iter()) .map(|(src, dst)| { src .iter() .zip(dst.iter()) .map(|(&p1, &p2)| i32::cast_from(p1).abs_diff(i32::cast_from(p2))) .sum::<u32>() as u64 }) .sum() } } /// Compute the sum of absolute differences (SADs) on 2 rows of pixels /// /// This differs from other SAD functions in that it operates over a row /// (or line) of unknown length rather than a `PlaneRegion`. pub(crate) fn sad_plane<T: Pixel>( src: &Plane<T>, dst: &Plane<T>, cpu: CpuFeatureLevel, ) -> u64 { sad_plane_internal(src, dst, cpu) } rav1e-0.7.1/src/scan_order.rs000064400000000000000000002355131046102023000141320ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent.
#![allow(non_upper_case_globals)] #![allow(dead_code)] #![allow(non_camel_case_types)] const MAX_NEIGHBORS: usize = 2; use crate::transform::*; pub struct SCAN_ORDER { pub scan: &'static [u16], pub iscan: &'static [u16], } // To stop from having to perform an unnecessary transpose in the forward and // inverse transforms, the output of the forward transform is transposed in // relation to how is in the spec. This means that all of our scan orders are // different from what is found the spec. #[rustfmt::skip] static default_scan_4x4 : [u16; 16] = [ 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15, ]; #[rustfmt::skip] static mrow_scan_4x4 : [u16; 16] = [ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, ]; #[rustfmt::skip] static mcol_scan_4x4 : [u16; 16] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ]; #[rustfmt::skip] static default_scan_8x4 : [u16; 32] = [ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31, ]; #[rustfmt::skip] static mrow_scan_8x4 : [u16; 32] = [ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, ]; #[rustfmt::skip] static mcol_scan_8x4 : [u16; 32] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ]; #[rustfmt::skip] static default_scan_4x8 : [u16; 32] = [ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19, 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31, ]; #[rustfmt::skip] static mrow_scan_4x8 : [u16; 32] = [ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, ]; #[rustfmt::skip] static mcol_scan_4x8 : [u16; 32] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ]; #[rustfmt::skip] static default_scan_16x4 : [u16; 64] = [ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14, 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30, 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46, 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63, ]; #[rustfmt::skip] static default_scan_4x16 : [u16; 64] = [ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35, 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39, 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43, 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63, ]; #[rustfmt::skip] static mcol_scan_16x4 : [u16; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; #[rustfmt::skip] static mcol_scan_4x16 : [u16; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; #[rustfmt::skip] static mrow_scan_16x4 : [u16; 64] = [ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, ]; #[rustfmt::skip] static mrow_scan_4x16 : [u16; 64] = [ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 
37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, ]; #[rustfmt::skip] static default_scan_32x8 : [u16; 256] = [ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255, ]; #[rustfmt::skip] static default_scan_8x32 : [u16; 256] = [ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, 255, ]; #[rustfmt::skip] static mcol_scan_32x8 : [u16; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 
195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]; #[rustfmt::skip] static mcol_scan_8x32 : [u16; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]; #[rustfmt::skip] static mrow_scan_32x8 : [u16; 256] = [ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 255, ]; #[rustfmt::skip] static mrow_scan_8x32 : [u16; 256] = [ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, 20, 
52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, ]; #[rustfmt::skip] static default_scan_8x8 : [u16; 64] = [ 0, 8, 1, 2, 9, 16, 24, 17, 10, 3, 4, 11, 18, 25, 32, 40, 33, 26, 19, 12, 5, 6, 13, 20, 27, 34, 41, 48, 56, 49, 42, 35, 28, 21, 14, 7, 15, 22, 29, 36, 43, 50, 57, 58, 51, 44, 37, 30, 23, 31, 38, 45, 52, 59, 60, 53, 46, 39, 47, 54, 61, 62, 55, 63, ]; #[rustfmt::skip] static mrow_scan_8x8 : [u16; 64] = [ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, ]; #[rustfmt::skip] static mcol_scan_8x8 : [u16; 64] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, ]; #[rustfmt::skip] static default_scan_16x8 : [u16; 128] = [ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127, ]; #[rustfmt::skip] static default_scan_8x16 : [u16; 128] = [ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, ]; #[rustfmt::skip] static mrow_scan_16x8 : [u16; 128] = [ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, ]; #[rustfmt::skip] static mrow_scan_8x16 : [u16; 128] = [ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 
43, 59, 75, 91, 107, 123, 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, ]; #[rustfmt::skip] static mcol_scan_16x8 : [u16; 128] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, ]; #[rustfmt::skip] static mcol_scan_8x16 : [u16; 128] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, ]; #[rustfmt::skip] static default_scan_32x16 : [u16; 512] = [ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464, 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, 411, 426, 
441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511, ]; #[rustfmt::skip] static default_scan_16x32 : [u16; 512] = [ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511, ]; #[rustfmt::skip] static mrow_scan_32x16 : [u16; 512] = [ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 
260, 276, 292, 308, 324, 340, 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, 495, 511, ]; #[rustfmt::skip] static mrow_scan_16x32 : [u16; 512] = [ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 22, 54, 86, 118, 150, 182, 214, 246, 278, 
310, 342, 374, 406, 438, 470, 502, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, ]; #[rustfmt::skip] static mcol_scan_32x16 : [u16; 512] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, ]; #[rustfmt::skip] static mcol_scan_16x32 : [u16; 512] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 
66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, ]; #[rustfmt::skip] static default_scan_16x16 : [u16; 256] = [ 0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80, 65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67, 52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114, 99, 84, 69, 54, 39, 24, 9, 10, 25, 40, 55, 70, 85, 100, 115, 130, 145, 160, 176, 161, 146, 131, 116, 101, 86, 71, 56, 41, 26, 11, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 208, 193, 178, 163, 148, 133, 118, 103, 88, 73, 58, 43, 28, 13, 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, 240, 225, 210, 195, 180, 165, 150, 135, 120, 105, 90, 75, 60, 45, 30, 15, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, 241, 242, 227, 212, 197, 182, 167, 152, 137, 122, 107, 92, 77, 62, 47, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, 243, 244, 229, 214, 199, 184, 169, 154, 139, 124, 109, 94, 79, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, 245, 246, 231, 216, 201, 186, 171, 156, 141, 126, 111, 127, 142, 157, 172, 187, 202, 217, 232, 247, 248, 233, 218, 203, 188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190, 175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255, ]; #[rustfmt::skip] static mrow_scan_16x16 : [u16; 256] = [ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 
224, 240, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, ]; #[rustfmt::skip] static mcol_scan_16x16 : [u16; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]; #[rustfmt::skip] static mrow_scan_32x32 : [u16; 1024] = [ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 
455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, 797, 829, 861, 893, 925, 957, 
989, 1021, 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, 991, 1023, ]; #[rustfmt::skip] static mcol_scan_32x32 : [u16; 1024] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 
655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, ]; #[rustfmt::skip] static default_scan_32x32 : [u16; 1024] = [ 0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131, 100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258, 289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448, 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, 15, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419, 450, 481, 512, 544, 513, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, 79, 48, 17, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359, 390, 421, 452, 483, 514, 545, 576, 608, 577, 546, 515, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, 143, 112, 81, 50, 19, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299, 330, 361, 392, 423, 454, 485, 516, 547, 578, 609, 640, 672, 641, 610, 579, 548, 517, 486, 455, 424, 393, 362, 331, 300, 269, 238, 207, 176, 145, 114, 83, 52, 21, 22, 53, 84, 115, 146, 177, 208, 239, 270, 301, 332, 363, 394, 425, 456, 487, 518, 549, 580, 611, 642, 673, 704, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 24, 55, 86, 117, 148, 179, 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 520, 551, 582, 613, 644, 675, 706, 737, 768, 800, 769, 738, 707, 676, 645, 614, 583, 552, 521, 490, 459, 428, 397, 366, 335, 
304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 26, 57, 88, 119, 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 522, 553, 584, 615, 646, 677, 708, 739, 770, 801, 832, 864, 833, 802, 771, 740, 709, 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 28, 59, 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 524, 555, 586, 617, 648, 679, 710, 741, 772, 803, 834, 865, 896, 928, 897, 866, 835, 804, 773, 742, 711, 680, 649, 618, 587, 556, 525, 494, 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464, 495, 526, 557, 588, 619, 650, 681, 712, 743, 774, 805, 836, 867, 898, 929, 960, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435, 466, 497, 528, 559, 590, 621, 652, 683, 714, 745, 776, 807, 838, 869, 900, 931, 962, 993, 994, 963, 932, 901, 870, 839, 808, 777, 746, 715, 684, 653, 622, 591, 560, 529, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, 95, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 530, 561, 592, 623, 654, 685, 716, 747, 778, 809, 840, 871, 902, 933, 964, 995, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 191, 222, 253, 284, 315, 346, 377, 408, 439, 470, 501, 532, 563, 594, 625, 656, 687, 718, 749, 780, 811, 842, 873, 904, 935, 966, 997, 998, 967, 936, 905, 874, 843, 812, 781, 750, 719, 688, 657, 626, 595, 564, 533, 502, 471, 440, 409, 378, 347, 316, 285, 254, 223, 255, 286, 317, 348, 379, 410, 441, 472, 503, 534, 565, 596, 627, 658, 689, 720, 751, 782, 813, 844, 875, 906, 937, 968, 999, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 319, 350, 381, 412, 443, 474, 505, 536, 567, 598, 629, 660, 691, 722, 753, 784, 815, 846, 877, 908, 939, 970, 1001, 1002, 971, 940, 909, 878, 847, 816, 785, 754, 723, 692, 661, 630, 599, 568, 537, 506, 475, 444, 413, 382, 351, 383, 414, 445, 476, 507, 538, 569, 600, 631, 662, 693, 724, 755, 786, 817, 848, 879, 910, 941, 972, 1003, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, 446, 415, 447, 478, 509, 540, 571, 602, 633, 664, 695, 726, 757, 788, 819, 850, 881, 912, 943, 974, 1005, 1006, 975, 944, 913, 882, 851, 820, 789, 758, 727, 696, 665, 634, 603, 572, 541, 510, 479, 511, 542, 573, 604, 635, 666, 697, 728, 759, 790, 821, 852, 883, 914, 945, 976, 1007, 1008, 977, 946, 915, 884, 853, 822, 791, 760, 729, 698, 667, 636, 605, 574, 543, 575, 606, 637, 668, 699, 730, 761, 792, 823, 854, 885, 916, 947, 978, 1009, 1010, 979, 948, 917, 886, 855, 824, 793, 762, 731, 700, 669, 638, 607, 639, 670, 701, 732, 763, 794, 825, 856, 887, 918, 949, 980, 1011, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 703, 734, 765, 796, 827, 858, 889, 920, 951, 982, 1013, 1014, 983, 952, 921, 890, 859, 828, 797, 766, 735, 767, 798, 829, 860, 891, 922, 953, 984, 1015, 1016, 985, 954, 923, 892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023, ]; const fn invert(scan: [u16; N]) -> [u16; N] { let mut iscan = scan; let mut i = 0; while i < N { iscan[scan[i] as usize] = i as u16; i 
+= 1; } iscan } static default_iscan_4x4: [u16; 16] = invert(default_scan_4x4); static mrow_iscan_4x4: [u16; 16] = invert(mrow_scan_4x4); static mcol_iscan_4x4: [u16; 16] = invert(mcol_scan_4x4); static default_iscan_8x4: [u16; 32] = invert(default_scan_8x4); static mrow_iscan_8x4: [u16; 32] = invert(mrow_scan_8x4); static mcol_iscan_8x4: [u16; 32] = invert(mcol_scan_8x4); static default_iscan_4x8: [u16; 32] = invert(default_scan_4x8); static mrow_iscan_4x8: [u16; 32] = invert(mrow_scan_4x8); static mcol_iscan_4x8: [u16; 32] = invert(mcol_scan_4x8); static default_iscan_16x4: [u16; 64] = invert(default_scan_16x4); static default_iscan_4x16: [u16; 64] = invert(default_scan_4x16); static mcol_iscan_16x4: [u16; 64] = invert(mcol_scan_16x4); static mcol_iscan_4x16: [u16; 64] = invert(mcol_scan_4x16); static mrow_iscan_16x4: [u16; 64] = invert(mrow_scan_16x4); static mrow_iscan_4x16: [u16; 64] = invert(mrow_scan_4x16); static default_iscan_32x8: [u16; 256] = invert(default_scan_32x8); static default_iscan_8x32: [u16; 256] = invert(default_scan_8x32); static mcol_iscan_32x8: [u16; 256] = invert(mcol_scan_32x8); static mcol_iscan_8x32: [u16; 256] = invert(mcol_scan_8x32); static mrow_iscan_32x8: [u16; 256] = invert(mrow_scan_32x8); static mrow_iscan_8x32: [u16; 256] = invert(mrow_scan_8x32); static default_iscan_8x8: [u16; 64] = invert(default_scan_8x8); static mrow_iscan_8x8: [u16; 64] = invert(mrow_scan_8x8); static mcol_iscan_8x8: [u16; 64] = invert(mcol_scan_8x8); static default_iscan_16x8: [u16; 128] = invert(default_scan_16x8); static default_iscan_8x16: [u16; 128] = invert(default_scan_8x16); static mrow_iscan_16x8: [u16; 128] = invert(mrow_scan_16x8); static mrow_iscan_8x16: [u16; 128] = invert(mrow_scan_8x16); static mcol_iscan_16x8: [u16; 128] = invert(mcol_scan_16x8); static mcol_iscan_8x16: [u16; 128] = invert(mcol_scan_8x16); static default_iscan_32x16: [u16; 512] = invert(default_scan_32x16); static default_iscan_16x32: [u16; 512] = invert(default_scan_16x32); static mrow_iscan_32x16: [u16; 512] = invert(mrow_scan_32x16); static mrow_iscan_16x32: [u16; 512] = invert(mrow_scan_16x32); static mcol_iscan_32x16: [u16; 512] = invert(mcol_scan_32x16); static mcol_iscan_16x32: [u16; 512] = invert(mcol_scan_16x32); static default_iscan_16x16: [u16; 256] = invert(default_scan_16x16); static mrow_iscan_16x16: [u16; 256] = invert(mrow_scan_16x16); static mcol_iscan_16x16: [u16; 256] = invert(mcol_scan_16x16); static mrow_iscan_32x32: [u16; 1024] = invert(mrow_scan_32x32); static mcol_iscan_32x32: [u16; 1024] = invert(mcol_scan_32x32); static default_iscan_32x32: [u16; 1024] = invert(default_scan_32x32); #[rustfmt::skip] pub static av1_scan_orders: [[SCAN_ORDER; TX_TYPES]; TxSize::TX_SIZES_ALL] = [ [ // TX_4X4 SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &default_scan_4x4, iscan: &default_iscan_4x4 }, SCAN_ORDER { scan: &mrow_scan_4x4, iscan: &mrow_iscan_4x4 }, SCAN_ORDER { scan: &mcol_scan_4x4, iscan: &mcol_iscan_4x4 }, 
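// Illustrative note (a sketch of how these tables are meant to be read, not a
// normative statement): `scan[i]` gives the coefficient position, in the
// transposed layout described at the top of this file, that is visited at scan
// index `i`, and each `iscan` table is the inverse permutation built by the
// `invert` helper above (a `const fn` generic over the array length `N`), so
// `iscan[scan[i] as usize] == i` holds for every index. For example, with the
// 4x4 tables defined earlier:
//
//     let pos = default_scan_4x4[1] as usize; // == 4, i.e. row 1, column 0
//     let idx = default_iscan_4x4[pos];       // == 1: position 4 is coded second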
SCAN_ORDER { scan: &mrow_scan_4x4, iscan: &mrow_iscan_4x4 }, SCAN_ORDER { scan: &mcol_scan_4x4, iscan: &mcol_iscan_4x4 }, SCAN_ORDER { scan: &mrow_scan_4x4, iscan: &mrow_iscan_4x4 }, SCAN_ORDER { scan: &mcol_scan_4x4, iscan: &mcol_iscan_4x4 }, ], [ // TX_8X8 SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &default_scan_8x8, iscan: &default_iscan_8x8 }, SCAN_ORDER { scan: &mrow_scan_8x8, iscan: &mrow_iscan_8x8 }, SCAN_ORDER { scan: &mcol_scan_8x8, iscan: &mcol_iscan_8x8 }, SCAN_ORDER { scan: &mrow_scan_8x8, iscan: &mrow_iscan_8x8 }, SCAN_ORDER { scan: &mcol_scan_8x8, iscan: &mcol_iscan_8x8 }, SCAN_ORDER { scan: &mrow_scan_8x8, iscan: &mrow_iscan_8x8 }, SCAN_ORDER { scan: &mcol_scan_8x8, iscan: &mcol_iscan_8x8 }, ], [ // TX_16X16 SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &default_scan_16x16, iscan: &default_iscan_16x16 }, SCAN_ORDER { scan: &mrow_scan_16x16, iscan: &mrow_iscan_16x16 }, SCAN_ORDER { scan: &mcol_scan_16x16, iscan: &mcol_iscan_16x16 }, SCAN_ORDER { scan: &mrow_scan_16x16, iscan: &mrow_iscan_16x16 }, SCAN_ORDER { scan: &mcol_scan_16x16, iscan: &mcol_iscan_16x16 }, SCAN_ORDER { scan: &mrow_scan_16x16, iscan: &mrow_iscan_16x16 }, SCAN_ORDER { scan: &mcol_scan_16x16, iscan: &mcol_iscan_16x16 }, ], [ // TX_32X32 SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, ], [ // TX_64X64 // Half 
of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, ], [ // TX_4X8 SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &default_scan_4x8, iscan: &default_iscan_4x8 }, SCAN_ORDER { scan: &mrow_scan_4x8, iscan: &mrow_iscan_4x8 }, SCAN_ORDER { scan: &mcol_scan_4x8, iscan: &mcol_iscan_4x8 }, SCAN_ORDER { scan: &mrow_scan_4x8, iscan: &mrow_iscan_4x8 }, SCAN_ORDER { scan: &mcol_scan_4x8, iscan: &mcol_iscan_4x8 }, SCAN_ORDER { scan: &mrow_scan_4x8, iscan: &mrow_iscan_4x8 }, SCAN_ORDER { scan: &mcol_scan_4x8, iscan: &mcol_iscan_4x8 }, ], [ // TX_8X4 SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &default_scan_8x4, iscan: &default_iscan_8x4 }, SCAN_ORDER { scan: &mrow_scan_8x4, iscan: &mrow_iscan_8x4 }, SCAN_ORDER { scan: &mcol_scan_8x4, iscan: &mcol_iscan_8x4 }, SCAN_ORDER { scan: &mrow_scan_8x4, iscan: &mrow_iscan_8x4 }, SCAN_ORDER { scan: &mcol_scan_8x4, iscan: &mcol_iscan_8x4 }, SCAN_ORDER { scan: &mrow_scan_8x4, iscan: &mrow_iscan_8x4 }, SCAN_ORDER { scan: &mcol_scan_8x4, iscan: &mcol_iscan_8x4 }, ], [ // TX_8X16 SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: 
&default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &default_scan_8x16, iscan: &default_iscan_8x16 }, SCAN_ORDER { scan: &mrow_scan_8x16, iscan: &mrow_iscan_8x16 }, SCAN_ORDER { scan: &mcol_scan_8x16, iscan: &mcol_iscan_8x16 }, SCAN_ORDER { scan: &mrow_scan_8x16, iscan: &mrow_iscan_8x16 }, SCAN_ORDER { scan: &mcol_scan_8x16, iscan: &mcol_iscan_8x16 }, SCAN_ORDER { scan: &mrow_scan_8x16, iscan: &mrow_iscan_8x16 }, SCAN_ORDER { scan: &mcol_scan_8x16, iscan: &mcol_iscan_8x16 }, ], [ // TX_16X8 SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &default_scan_16x8, iscan: &default_iscan_16x8 }, SCAN_ORDER { scan: &mrow_scan_16x8, iscan: &mrow_iscan_16x8 }, SCAN_ORDER { scan: &mcol_scan_16x8, iscan: &mcol_iscan_16x8 }, SCAN_ORDER { scan: &mrow_scan_16x8, iscan: &mrow_iscan_16x8 }, SCAN_ORDER { scan: &mcol_scan_16x8, iscan: &mcol_iscan_16x8 }, SCAN_ORDER { scan: &mrow_scan_16x8, iscan: &mrow_iscan_16x8 }, SCAN_ORDER { scan: &mcol_scan_16x8, iscan: &mcol_iscan_16x8 }, ], [ // TX_16X32 SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, ], [ // TX_32X16 SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 
}, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, ], [ // TX_32X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, ], [ // TX_64X32 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
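// Illustrative note (a sketch of the consequence, assuming the usual AV1
// handling of 64-point transforms): with everything outside the low-frequency
// 32x32 region forced to zero, at most 32 * 32 = 1024 coefficients can ever be
// coded for this transform size, which is exactly the range covered by the
// 32x32 tables this row points at:
//
//     let so = &av1_scan_orders[TxSize::TX_64X32 as usize][TxType::DCT_DCT as usize];
//     assert_eq!(so.scan.len(), 32 * 32); // 1024 entries rather than 64 * 32
//
// (`TxSize` and `TxType` come from `crate::transform`; the `DCT_DCT` entry is
// only an example index.)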
SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &default_scan_32x32, iscan: &default_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, SCAN_ORDER { scan: &mrow_scan_32x32, iscan: &mrow_iscan_32x32 }, SCAN_ORDER { scan: &mcol_scan_32x32, iscan: &mcol_iscan_32x32 }, ], [ // TX_4X16 SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &default_scan_4x16, iscan: &default_iscan_4x16 }, SCAN_ORDER { scan: &mrow_scan_4x16, iscan: &mrow_iscan_4x16 }, SCAN_ORDER { scan: &mcol_scan_4x16, iscan: &mcol_iscan_4x16 }, SCAN_ORDER { scan: &mrow_scan_4x16, iscan: &mrow_iscan_4x16 }, SCAN_ORDER { scan: &mcol_scan_4x16, iscan: &mcol_iscan_4x16 }, SCAN_ORDER { scan: &mrow_scan_4x16, iscan: &mrow_iscan_4x16 }, SCAN_ORDER { scan: &mcol_scan_4x16, iscan: &mcol_iscan_4x16 }, ], [ // TX_16X4 SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &default_scan_16x4, iscan: &default_iscan_16x4 }, SCAN_ORDER { scan: &mrow_scan_16x4, iscan: &mrow_iscan_16x4 }, SCAN_ORDER { scan: &mcol_scan_16x4, iscan: &mcol_iscan_16x4 }, SCAN_ORDER { scan: &mrow_scan_16x4, iscan: &mrow_iscan_16x4 }, SCAN_ORDER { scan: &mcol_scan_16x4, iscan: &mcol_iscan_16x4 }, SCAN_ORDER { scan: &mrow_scan_16x4, iscan: &mrow_iscan_16x4 }, SCAN_ORDER { scan: &mcol_scan_16x4, iscan: &mcol_iscan_16x4 }, ], [ // TX_8X32 SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: 
&default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &default_scan_8x32, iscan: &default_iscan_8x32 }, SCAN_ORDER { scan: &mrow_scan_8x32, iscan: &mrow_iscan_8x32 }, SCAN_ORDER { scan: &mcol_scan_8x32, iscan: &mcol_iscan_8x32 }, SCAN_ORDER { scan: &mrow_scan_8x32, iscan: &mrow_iscan_8x32 }, SCAN_ORDER { scan: &mcol_scan_8x32, iscan: &mcol_iscan_8x32 }, SCAN_ORDER { scan: &mrow_scan_8x32, iscan: &mrow_iscan_8x32 }, SCAN_ORDER { scan: &mcol_scan_8x32, iscan: &mcol_iscan_8x32 }, ], [ // TX_32X8 SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &default_scan_32x8, iscan: &default_iscan_32x8 }, SCAN_ORDER { scan: &mrow_scan_32x8, iscan: &mrow_iscan_32x8 }, SCAN_ORDER { scan: &mcol_scan_32x8, iscan: &mcol_iscan_32x8 }, SCAN_ORDER { scan: &mrow_scan_32x8, iscan: &mrow_iscan_32x8 }, SCAN_ORDER { scan: &mcol_scan_32x8, iscan: &mcol_iscan_32x8 }, SCAN_ORDER { scan: &mrow_scan_32x8, iscan: &mrow_iscan_32x8 }, SCAN_ORDER { scan: &mcol_scan_32x8, iscan: &mcol_iscan_32x8 }, ], [ // TX_16X64 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &default_scan_16x32, iscan: &default_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, SCAN_ORDER { scan: &mrow_scan_16x32, iscan: &mrow_iscan_16x32 }, SCAN_ORDER { scan: &mcol_scan_16x32, iscan: &mcol_iscan_16x32 }, ], [ // TX_64X16 // Half of the coefficients of tx64 at higher frequencies are set to // zeros. So tx32's scan order is used. 
SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &default_scan_32x16, iscan: &default_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, SCAN_ORDER { scan: &mrow_scan_32x16, iscan: &mrow_iscan_32x16 }, SCAN_ORDER { scan: &mcol_scan_32x16, iscan: &mcol_iscan_32x16 }, ], ]; rav1e-0.7.1/src/scenechange/fast.rs000064400000000000000000000067421046102023000152150ustar 00000000000000use std::{cmp, sync::Arc}; use crate::{ api::SceneDetectionSpeed, encoder::Sequence, frame::{Frame, Plane}, sad_plane, scenechange::fast_idiv, }; use debug_unreachable::debug_unreachable; use v_frame::pixel::Pixel; use super::{ScaleFunction, SceneChangeDetector, ScenecutResult}; /// Experiments have determined this to be an optimal threshold pub(super) const FAST_THRESHOLD: f64 = 18.0; impl SceneChangeDetector { /// The fast algorithm detects fast cuts using a raw difference /// in pixel values between the scaled frames. #[profiling::function] pub(super) fn fast_scenecut( &mut self, frame1: Arc>, frame2: Arc>, ) -> ScenecutResult { if let Some(scale_func) = &self.scale_func { // downscale both frames for faster comparison if let Some(frame_buffer) = &mut self.downscaled_frame_buffer { frame_buffer.swap(0, 1); (scale_func.downscale_in_place)( &frame2.planes[0], &mut frame_buffer[1], ); } else { self.downscaled_frame_buffer = Some([ (scale_func.downscale)(&frame1.planes[0]), (scale_func.downscale)(&frame2.planes[0]), ]); } if let Some(frame_buffer) = &self.downscaled_frame_buffer { let &[first, second] = &frame_buffer; let delta = self.delta_in_planes(first, second); ScenecutResult { threshold: self.threshold, inter_cost: delta, imp_block_cost: delta, forward_adjusted_cost: delta, backward_adjusted_cost: delta, } } else { // SAFETY: `downscaled_frame_buffer` is always initialized to `Some(..)` with a valid state // before this if/else block is reached. 
unsafe { debug_unreachable!() } } } else { let delta = self.delta_in_planes(&frame1.planes[0], &frame2.planes[0]); ScenecutResult { threshold: self.threshold, inter_cost: delta, imp_block_cost: delta, backward_adjusted_cost: delta, forward_adjusted_cost: delta, } } } /// Calculates the average sum of absolute difference (SAD) per pixel between 2 planes #[profiling::function] fn delta_in_planes(&self, plane1: &Plane, plane2: &Plane) -> f64 { let delta = sad_plane::sad_plane(plane1, plane2, self.cpu_feature_level); delta as f64 / self.pixels as f64 } } /// Scaling factor for frame in scene detection pub(super) fn detect_scale_factor( sequence: &Arc, speed_mode: SceneDetectionSpeed, ) -> Option> { let small_edge = cmp::min(sequence.max_frame_height, sequence.max_frame_width) as usize; let scale_func = if speed_mode == SceneDetectionSpeed::Fast { match small_edge { 0..=240 => None, 241..=480 => Some(ScaleFunction::from_scale::<2>()), 481..=720 => Some(ScaleFunction::from_scale::<4>()), 721..=1080 => Some(ScaleFunction::from_scale::<8>()), 1081..=1600 => Some(ScaleFunction::from_scale::<16>()), 1601..=usize::MAX => Some(ScaleFunction::from_scale::<32>()), _ => None, } } else { None }; if let Some(scale_factor) = scale_func.as_ref().map(|x| x.factor) { debug!( "Scene detection scale factor {}, [{},{}] -> [{},{}]", scale_factor, sequence.max_frame_width, sequence.max_frame_height, fast_idiv(sequence.max_frame_width as usize, scale_factor), fast_idiv(sequence.max_frame_height as usize, scale_factor) ); } scale_func } rav1e-0.7.1/src/scenechange/mod.rs000064400000000000000000000313331046102023000150310ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. mod fast; mod standard; use crate::api::{EncoderConfig, SceneDetectionSpeed}; use crate::cpu_features::CpuFeatureLevel; use crate::encoder::Sequence; use crate::frame::*; use crate::me::RefMEStats; use crate::util::Pixel; use std::collections::BTreeMap; use std::num::NonZeroUsize; use std::sync::Arc; use std::{cmp, u64}; use self::fast::{detect_scale_factor, FAST_THRESHOLD}; /// Experiments have determined this to be an optimal threshold const IMP_BLOCK_DIFF_THRESHOLD: f64 = 7.0; /// Fast integer division where divisor is a nonzero power of 2 #[inline(always)] pub(crate) fn fast_idiv(n: usize, d: NonZeroUsize) -> usize { debug_assert!(d.is_power_of_two()); n >> d.trailing_zeros() } struct ScaleFunction { downscale_in_place: fn(/* &self: */ &Plane, /* in_plane: */ &mut Plane), downscale: fn(/* &self: */ &Plane) -> Plane, factor: NonZeroUsize, } impl ScaleFunction { fn from_scale() -> Self { assert!( SCALE.is_power_of_two(), "Scaling factor needs to be a nonzero power of two" ); Self { downscale: Plane::downscale::, downscale_in_place: Plane::downscale_in_place::, factor: NonZeroUsize::new(SCALE).unwrap(), } } } /// Runs keyframe detection on frames from the lookahead queue. pub struct SceneChangeDetector { /// Minimum average difference between YUV deltas that will trigger a scene change. 
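/// For example, `new` sets this to `FAST_THRESHOLD * bit_depth / 8.0`, so an
/// 8-bit input starts at 18.0 and a 10-bit input at 18.0 * 10 / 8 = 22.5.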
threshold: f64, /// Fast scene cut detection mode, uses simple SAD instead of encoder cost estimates. speed_mode: SceneDetectionSpeed, /// Downscaling function for fast scene detection scale_func: Option>, /// Frame buffer for scaled frames downscaled_frame_buffer: Option<[Plane; 2]>, /// Buffer for FrameMEStats for cost scenecut frame_me_stats_buffer: Option, /// Deque offset for current lookahead_offset: usize, /// Start deque offset based on lookahead deque_offset: usize, /// Scenechange results for adaptive threshold score_deque: Vec, /// Number of pixels in scaled frame for fast mode pixels: usize, /// The bit depth of the video. bit_depth: usize, /// The CPU feature level to be used. cpu_feature_level: CpuFeatureLevel, encoder_config: EncoderConfig, sequence: Arc, /// Calculated intra costs for each input frame. /// These are cached for reuse later in rav1e. pub(crate) intra_costs: BTreeMap>, /// Temporary buffer used by estimate_intra_costs. pub(crate) temp_plane: Option>, } impl SceneChangeDetector { pub fn new( encoder_config: EncoderConfig, cpu_feature_level: CpuFeatureLevel, lookahead_distance: usize, sequence: Arc, ) -> Self { let bit_depth = encoder_config.bit_depth; let speed_mode = if encoder_config.low_latency { SceneDetectionSpeed::Fast } else { encoder_config.speed_settings.scene_detection_mode }; // Downscaling function for fast scene detection let scale_func = detect_scale_factor(&sequence, speed_mode); // Set lookahead offset to 5 if normal lookahead available let lookahead_offset = if lookahead_distance >= 5 { 5 } else { 0 }; let deque_offset = lookahead_offset; let score_deque = Vec::with_capacity(5 + lookahead_distance); // Downscaling factor for fast scenedetect (is currently always a power of 2) let factor = scale_func.as_ref().map_or(NonZeroUsize::new(1).unwrap(), |x| x.factor); let pixels = if speed_mode == SceneDetectionSpeed::Fast { fast_idiv(sequence.max_frame_height as usize, factor) * fast_idiv(sequence.max_frame_width as usize, factor) } else { 1 }; let threshold = FAST_THRESHOLD * (bit_depth as f64) / 8.0; Self { threshold, speed_mode, scale_func, downscaled_frame_buffer: None, frame_me_stats_buffer: None, lookahead_offset, deque_offset, score_deque, pixels, bit_depth, cpu_feature_level, encoder_config, sequence, intra_costs: BTreeMap::new(), temp_plane: None, } } /// Runs keyframe detection on the next frame in the lookahead queue. /// /// This function requires that a subset of input frames /// is passed to it in order, and that `keyframes` is only /// updated from this method. `input_frameno` should correspond /// to the second frame in `frame_set`. /// /// This will gracefully handle the first frame in the video as well. #[profiling::function] pub fn analyze_next_frame( &mut self, frame_set: &[&Arc>], input_frameno: u64, previous_keyframe: u64, ) -> bool { // Use score deque for adaptive threshold for scene cut // Declare score_deque offset based on lookahead for scene change scores // Find the distance to the previous keyframe. 
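// Score deque layout, for reference: `run_comparison` pushes new results at
// index 0, so indices below `deque_offset` hold forward (look-ahead) frames,
// index `deque_offset` is the frame being decided here, and higher indices
// hold frames that were already scored.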
let distance = input_frameno - previous_keyframe; if frame_set.len() <= self.lookahead_offset { // Don't insert keyframes in the last few frames of the video // This is basically a scene flash and a waste of bits return false; } if self.encoder_config.speed_settings.scene_detection_mode == SceneDetectionSpeed::None { if let Some(true) = self.handle_min_max_intervals(distance) { return true; }; return false; } // Initialization of score deque // based on frame set length if self.deque_offset > 0 && frame_set.len() > self.deque_offset + 1 && self.score_deque.is_empty() { self.initialize_score_deque(frame_set, input_frameno, self.deque_offset); } else if self.score_deque.is_empty() { self.initialize_score_deque( frame_set, input_frameno, frame_set.len() - 1, ); self.deque_offset = frame_set.len() - 2; } // Running single frame comparison and adding it to deque // Decrease deque offset if there are no new frames if frame_set.len() > self.deque_offset + 1 { self.run_comparison( frame_set[self.deque_offset].clone(), frame_set[self.deque_offset + 1].clone(), input_frameno + self.deque_offset as u64, ); } else { self.deque_offset -= 1; } // Adaptive scenecut check let (scenecut, score) = self.adaptive_scenecut(); let scenecut = self.handle_min_max_intervals(distance).unwrap_or(scenecut); debug!( "[SC-Detect] Frame {}: Raw={:5.1} ImpBl={:5.1} Bwd={:5.1} Fwd={:5.1} Th={:.1} {}", input_frameno, score.inter_cost, score.imp_block_cost, score.backward_adjusted_cost, score.forward_adjusted_cost, score.threshold, if scenecut { "Scenecut" } else { "No cut" } ); // Keep score deque of 5 backward frames // and forward frames of length of lookahead offset if self.score_deque.len() > 5 + self.lookahead_offset { self.score_deque.pop(); } scenecut } fn handle_min_max_intervals(&mut self, distance: u64) -> Option { // Handle minimum and maximum keyframe intervals. if distance < self.encoder_config.min_key_frame_interval { return Some(false); } if distance >= self.encoder_config.max_key_frame_interval { return Some(true); } None } // Initially fill score deque with frame scores fn initialize_score_deque( &mut self, frame_set: &[&Arc>], input_frameno: u64, init_len: usize, ) { for x in 0..init_len { self.run_comparison( frame_set[x].clone(), frame_set[x + 1].clone(), input_frameno + x as u64, ); } } /// Runs scene change comparison between 2 given frames /// Insert result at the start of the score deque fn run_comparison( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: u64, ) { let mut result = if self.speed_mode == SceneDetectionSpeed::Fast { self.fast_scenecut(frame1, frame2) } else { self.cost_scenecut(frame1, frame2, input_frameno) }; // Subtract the highest metric value of surrounding frames from the current one // It makes the peaks in the metric more distinct if self.speed_mode != SceneDetectionSpeed::Fast && self.deque_offset > 0 { if input_frameno == 1 { // Accounts for the second frame not having a score to adjust against. // It should always be 0 because the first frame of the video is always a keyframe.
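// For later frames (the `else` branch below), the backward-adjusted cost is the
// current inter cost minus the largest inter cost among the `deque_offset` most
// recently scored frames, clamped at zero; e.g. surrounding costs of 10.0 and
// 12.0 against a current cost of 55.0 leave 55.0 - 12.0 = 43.0.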
result.backward_adjusted_cost = 0.0; } else { let mut adjusted_cost = f64::MAX; for other_cost in self.score_deque.iter().take(self.deque_offset).map(|i| i.inter_cost) { let this_cost = result.inter_cost - other_cost; if this_cost < adjusted_cost { adjusted_cost = this_cost; } if adjusted_cost < 0.0 { adjusted_cost = 0.0; break; } } result.backward_adjusted_cost = adjusted_cost; } if !self.score_deque.is_empty() { for i in 0..(cmp::min(self.deque_offset, self.score_deque.len())) { let adjusted_cost = self.score_deque[i].inter_cost - result.inter_cost; if i == 0 || adjusted_cost < self.score_deque[i].forward_adjusted_cost { self.score_deque[i].forward_adjusted_cost = adjusted_cost; } if self.score_deque[i].forward_adjusted_cost < 0.0 { self.score_deque[i].forward_adjusted_cost = 0.0; } } } } self.score_deque.insert(0, result); } /// Compares current scene score to adapted threshold based on previous scores /// Value of current frame is offset by lookahead, if lookahead >=5 /// Returns true if current scene score is higher than adapted threshold fn adaptive_scenecut(&mut self) -> (bool, ScenecutResult) { let score = self.score_deque[self.deque_offset]; // We use the importance block algorithm's cost metrics as a secondary algorithm // because, although it struggles in certain scenarios such as // finding the end of a pan, it is very good at detecting hard scenecuts // or detecting if a pan exists. // Because of this, we only consider a frame for a scenechange if // the importance block algorithm is over the threshold either on this frame (hard scenecut) // or within the past few frames (pan). This helps filter out a few false positives // produced by the cost-based algorithm. let imp_block_threshold = IMP_BLOCK_DIFF_THRESHOLD * (self.bit_depth as f64) / 8.0; if !&self.score_deque[self.deque_offset..] .iter() .any(|result| result.imp_block_cost >= imp_block_threshold) { return (false, score); } let cost = score.forward_adjusted_cost; if cost >= score.threshold { let back_deque = &self.score_deque[self.deque_offset + 1..]; let forward_deque = &self.score_deque[..self.deque_offset]; let back_over_tr_count = back_deque .iter() .filter(|result| result.backward_adjusted_cost >= result.threshold) .count(); let forward_over_tr_count = forward_deque .iter() .filter(|result| result.forward_adjusted_cost >= result.threshold) .count(); // Check for scenecut after the flashes // No frames over threshold forward // and some frames over threshold backward let back_count_req = if self.speed_mode == SceneDetectionSpeed::Fast { // Fast scenecut is more sensitive to false flash detection, // so we want more "evidence" of there being a flash before creating a keyframe. 
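// The post-flash check below only places a scenecut when no forward frame is
// over its threshold and at least `back_count_req` backward frames are, i.e.
// two frames of evidence in fast mode versus one for the cost-based mode.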
2 } else { 1 }; if forward_over_tr_count == 0 && back_over_tr_count >= back_count_req { return (true, score); } // Check for scenecut before flash // If distance longer than max flash length if back_over_tr_count == 0 && forward_over_tr_count == 1 && forward_deque[0].forward_adjusted_cost >= forward_deque[0].threshold { return (true, score); } if back_over_tr_count != 0 || forward_over_tr_count != 0 { return (false, score); } } (cost >= score.threshold, score) } } #[derive(Debug, Clone, Copy)] struct ScenecutResult { inter_cost: f64, imp_block_cost: f64, backward_adjusted_cost: f64, forward_adjusted_cost: f64, threshold: f64, } rav1e-0.7.1/src/scenechange/standard.rs000064400000000000000000000060341046102023000160520ustar 00000000000000use std::sync::Arc; use crate::{ api::lookahead::{ estimate_importance_block_difference, estimate_inter_costs, estimate_intra_costs, }, frame::Frame, me::FrameMEStats, }; use v_frame::{math::Fixed, pixel::Pixel}; use super::{SceneChangeDetector, ScenecutResult}; impl SceneChangeDetector { /// Run a comparison between two frames to determine if they qualify for a scenecut. /// /// We gather both intra and inter costs for the frames, /// as well as an importance-block-based difference, /// and use all three metrics. pub(super) fn cost_scenecut( &mut self, frame1: Arc>, frame2: Arc>, input_frameno: u64, ) -> ScenecutResult { let frame2_inter_ref = Arc::clone(&frame2); let frame1_imp_ref = Arc::clone(&frame1); let frame2_imp_ref = Arc::clone(&frame2); let mut intra_cost = 0.0; let mut mv_inter_cost = 0.0; let mut imp_block_cost = 0.0; let cols = 2 * self.encoder_config.width.align_power_of_two_and_shift(3); let rows = 2 * self.encoder_config.height.align_power_of_two_and_shift(3); let buffer = if let Some(buffer) = &self.frame_me_stats_buffer { Arc::clone(buffer) } else { let frame_me_stats = FrameMEStats::new_arc_array(cols, rows); let clone = Arc::clone(&frame_me_stats); self.frame_me_stats_buffer = Some(frame_me_stats); clone }; rayon::scope(|s| { s.spawn(|_| { let temp_plane = self.temp_plane.get_or_insert_with(|| frame2.planes[0].clone()); let intra_costs = self.intra_costs.entry(input_frameno).or_insert_with(|| { estimate_intra_costs( temp_plane, &*frame2, self.bit_depth, self.cpu_feature_level, ) }); intra_cost = intra_costs.iter().map(|&cost| cost as u64).sum::() as f64 / intra_costs.len() as f64; // If we're not using temporal RDO, we won't need these costs later, // so remove them from the cache to avoid a memory leak if !self.encoder_config.temporal_rdo() { self.intra_costs.remove(&input_frameno); }; }); s.spawn(|_| { mv_inter_cost = estimate_inter_costs( frame2_inter_ref, frame1, self.bit_depth, self.encoder_config.clone(), self.sequence.clone(), buffer, ); }); s.spawn(|_| { imp_block_cost = estimate_importance_block_difference(frame2_imp_ref, frame1_imp_ref); }); }); // `BIAS` determines how likely we are // to choose a keyframe, between 0.0-1.0. // Higher values mean we are more likely to choose a keyframe. // This value was chosen based on trials using the new // adaptive scenecut code. const BIAS: f64 = 0.7; let threshold = intra_cost * (1.0 - BIAS); ScenecutResult { inter_cost: mv_inter_cost, imp_block_cost, threshold, backward_adjusted_cost: 0.0, forward_adjusted_cost: 0.0, } } } rav1e-0.7.1/src/segmentation.rs000064400000000000000000000155511046102023000145100ustar 00000000000000// Copyright (c) 2018-2023, The rav1e contributors. 
All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::context::*; use crate::header::PRIMARY_REF_NONE; use crate::partition::BlockSize; use crate::rdo::spatiotemporal_scale; use crate::rdo::DistortionScale; use crate::tiling::TileStateMut; use crate::util::Pixel; use crate::FrameInvariants; use crate::FrameState; pub const MAX_SEGMENTS: usize = 8; #[profiling::function] pub fn segmentation_optimize( fi: &FrameInvariants, fs: &mut FrameState, ) { assert!(fi.enable_segmentation); fs.segmentation.enabled = true; if fs.segmentation.enabled { fs.segmentation.update_map = true; // We don't change the values between frames. fs.segmentation.update_data = fi.primary_ref_frame == PRIMARY_REF_NONE; // Avoid going into lossless mode by never bringing qidx below 1. // Because base_q_idx changes more frequently than the segmentation // data, it is still possible for a segment to enter lossless, so // enforcement elsewhere is needed. let offset_lower_limit = 1 - fi.base_q_idx as i16; if !fs.segmentation.update_data { let mut min_segment = MAX_SEGMENTS; for i in 0..MAX_SEGMENTS { if fs.segmentation.features[i][SegLvl::SEG_LVL_ALT_Q as usize] && fs.segmentation.data[i][SegLvl::SEG_LVL_ALT_Q as usize] >= offset_lower_limit { min_segment = i; break; } } assert_ne!(min_segment, MAX_SEGMENTS); fs.segmentation.min_segment = min_segment as u8; fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth); return; } segmentation_optimize_inner(fi, fs, offset_lower_limit); /* Figure out parameters */ fs.segmentation.preskip = false; fs.segmentation.last_active_segid = 0; for i in 0..MAX_SEGMENTS { for j in 0..SegLvl::SEG_LVL_MAX as usize { if fs.segmentation.features[i][j] { fs.segmentation.last_active_segid = i as u8; if j >= SegLvl::SEG_LVL_REF_FRAME as usize { fs.segmentation.preskip = true; } } } } } } // Select target quantizers for each segment by fitting to log(scale). fn segmentation_optimize_inner( fi: &FrameInvariants, fs: &mut FrameState, offset_lower_limit: i16, ) { use crate::quantize::{ac_q, select_ac_qi}; use crate::util::kmeans; use arrayvec::ArrayVec; // Minimize the total distance from a small set of values to all scales. 
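// Roughly: run k-means over the sorted log2 scales for each k in 3..=8, keep
// the k whose centroid spacing varies the least, then map every centroid to a
// quantizer-index delta (see `compute_delta` below).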
// Find k-means of log(spatiotemporal scale), k in 3..=8 let c: ([_; 8], [_; 7], [_; 6], [_; 5], [_; 4], [_; 3]) = { let spatiotemporal_scores = &fi.coded_frame_data.as_ref().unwrap().spatiotemporal_scores; let mut log2_scale_q11 = Vec::with_capacity(spatiotemporal_scores.len()); log2_scale_q11.extend(spatiotemporal_scores.iter().map(|&s| s.blog16())); log2_scale_q11.sort_unstable(); let l = &log2_scale_q11; (kmeans(l), kmeans(l), kmeans(l), kmeans(l), kmeans(l), kmeans(l)) }; // Find variance in spacing between successive log(scale) let var = |c: &[i16]| { let delta = ArrayVec::<_, MAX_SEGMENTS>::from_iter( c.iter().skip(1).zip(c).map(|(&a, &b)| b as i64 - a as i64), ); let mean = delta.iter().sum::() / delta.len() as i64; delta.iter().map(|&d| (d - mean).pow(2)).sum::() as u64 }; let variance = [var(&c.0), var(&c.1), var(&c.2), var(&c.3), var(&c.4), var(&c.5)]; // Choose the k value with minimal variance in spacing let min_variance = *variance.iter().min().unwrap(); let position = variance.iter().rposition(|&v| v == min_variance).unwrap(); // For the selected centroids, derive a target quantizer: // scale Q'^2 = Q^2 // See `distortion_scale_for` for more information. let compute_delta = |centroids: &[i16]| { use crate::util::{bexp64, blog64}; let log2_base_ac_q_q57 = blog64(ac_q(fi.base_q_idx, 0, fi.config.bit_depth).get().into()); centroids .iter() .rev() // Rewrite in log form and exponentiate: // scale Q'^2 = Q^2 // Q' = Q / sqrt(scale) // log(Q') = log(Q) - 0.5 log(scale) .map(|&log2_scale_q11| { bexp64(log2_base_ac_q_q57 - ((log2_scale_q11 as i64) << (57 - 11 - 1))) }) // Find the index of the nearest quantizer to the target, // and take the delta from the base quantizer index. .map(|q| { // Avoid going into lossless mode by never bringing qidx below 1. select_ac_qi(q, fi.config.bit_depth).max(1) as i16 - fi.base_q_idx as i16 }) .collect::>() }; // Compute segment deltas for best value of k let seg_delta = match position { 0 => compute_delta(&c.0), 1 => compute_delta(&c.1), 2 => compute_delta(&c.2), 3 => compute_delta(&c.3), 4 => compute_delta(&c.4), _ => compute_delta(&c.5), }; // Update the segmentation data fs.segmentation.min_segment = 0; fs.segmentation.max_segment = seg_delta.len() as u8 - 1; for (&delta, (features, data)) in seg_delta .iter() .zip(fs.segmentation.features.iter_mut().zip(&mut fs.segmentation.data)) { features[SegLvl::SEG_LVL_ALT_Q as usize] = true; data[SegLvl::SEG_LVL_ALT_Q as usize] = delta.max(offset_lower_limit); } fs.segmentation.update_threshold(fi.base_q_idx, fi.config.bit_depth); } #[profiling::function] pub fn select_segment( fi: &FrameInvariants, ts: &TileStateMut<'_, T>, tile_bo: TileBlockOffset, bsize: BlockSize, skip: bool, ) -> std::ops::RangeInclusive { // If skip is true or segmentation is turned off, sidx is not coded. if skip || !fi.enable_segmentation { return 0..=0; } use crate::api::SegmentationLevel; if fi.config.speed_settings.segmentation == SegmentationLevel::Full { return ts.segmentation.min_segment..=ts.segmentation.max_segment; } let frame_bo = ts.to_frame_block_offset(tile_bo); let scale = spatiotemporal_scale(fi, frame_bo, bsize); let sidx = segment_idx_from_distortion(&ts.segmentation.threshold, scale); // Avoid going into lossless mode by never bringing qidx below 1. 
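// `min_segment` was chosen in `segmentation_optimize` so that its qidx delta
// keeps `base_q_idx + delta >= 1`; clamping to it keeps this block out of
// lossless.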
let sidx = sidx.max(ts.segmentation.min_segment); if fi.config.speed_settings.segmentation == SegmentationLevel::Complex { return sidx..=ts.segmentation.max_segment.min(sidx.saturating_add(1)); } sidx..=sidx } fn segment_idx_from_distortion( threshold: &[DistortionScale; MAX_SEGMENTS - 1], s: DistortionScale, ) -> u8 { threshold.partition_point(|&t| s.0 < t.0) as u8 } rav1e-0.7.1/src/stats.rs000064400000000000000000000053161046102023000131470ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::partition::BlockSize; use crate::predict::PREDICTION_MODES; use crate::serialize::{Deserialize, Serialize}; use crate::transform::TX_TYPES; #[cfg(feature = "serialize")] use serde_big_array::BigArray; use std::ops::{Add, AddAssign}; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct EncoderStats { /// Stores count of pixels belonging to each block size in this frame pub block_size_counts: [usize; BlockSize::BLOCK_SIZES_ALL], /// Stores count of pixels belonging to skip blocks in this frame pub skip_block_count: usize, /// Stores count of pixels belonging to each transform type in this frame pub tx_type_counts: [usize; TX_TYPES], /// Stores count of pixels belonging to each luma prediction mode in this frame #[serde(with = "BigArray")] pub luma_pred_mode_counts: [usize; PREDICTION_MODES], /// Stores count of pixels belonging to each chroma prediction mode in this frame #[serde(with = "BigArray")] pub chroma_pred_mode_counts: [usize; PREDICTION_MODES], } impl Default for EncoderStats { fn default() -> Self { let luma_pred_mode_counts = [0; PREDICTION_MODES]; let chroma_pred_mode_counts = [0; PREDICTION_MODES]; EncoderStats { block_size_counts: [0; BlockSize::BLOCK_SIZES_ALL], skip_block_count: 0, tx_type_counts: [0; TX_TYPES], luma_pred_mode_counts, chroma_pred_mode_counts, } } } impl Add<&Self> for EncoderStats { type Output = Self; fn add(self, rhs: &EncoderStats) -> Self::Output { let mut lhs = self; lhs += rhs; lhs } } impl AddAssign<&Self> for EncoderStats { fn add_assign(&mut self, rhs: &EncoderStats) { for (s, v) in self.block_size_counts.iter_mut().zip(rhs.block_size_counts.iter()) { *s += v; } for (s, v) in self .chroma_pred_mode_counts .iter_mut() .zip(rhs.chroma_pred_mode_counts.iter()) { *s += v; } for (s, v) in self .luma_pred_mode_counts .iter_mut() .zip(rhs.luma_pred_mode_counts.iter()) { *s += v; } for (s, v) in self.tx_type_counts.iter_mut().zip(rhs.tx_type_counts.iter()) { *s += v; } self.skip_block_count += rhs.skip_block_count; } } rav1e-0.7.1/src/test_encode_decode/aom.rs000064400000000000000000000117631046102023000163670ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. 
If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::*; use crate::test_encode_decode::{compare_plane, DecodeResult, TestDecoder}; use crate::util::Pixel; use aom_sys::*; use std::collections::VecDeque; use std::ffi::CStr; use std::marker::PhantomData; use std::{mem::MaybeUninit, ptr, slice}; pub(crate) struct AomDecoder { dec: aom_codec_ctx, iter: aom_codec_iter_t, pixel: PhantomData, } impl TestDecoder for AomDecoder { fn setup_decoder(w: usize, h: usize) -> Self { unsafe { let interface = aom_codec_av1_dx(); let cfg = aom_codec_dec_cfg_t { threads: 1, w: w as u32, h: h as u32, allow_lowbitdepth: 1, }; let mut dec = MaybeUninit::uninit(); let ret = aom_codec_dec_init_ver( dec.as_mut_ptr(), interface, &cfg, 0, AOM_DECODER_ABI_VERSION as i32, ); if ret != 0 { panic!("Cannot instantiate the decoder {}", ret); } // Was initialized by aom_codec_dec_init_ver(). let dec = dec.assume_init(); AomDecoder { dec, iter: ptr::null_mut(), pixel: PhantomData } } } fn decode_packet( &mut self, packet: &[u8], rec_fifo: &mut VecDeque>, w: usize, h: usize, chroma_sampling: ChromaSampling, bit_depth: usize, verify: bool, ) -> DecodeResult { let mut corrupted_count = 0; unsafe { let ret = aom_codec_decode( &mut self.dec, packet.as_ptr(), packet.len(), ptr::null_mut(), ); debug!("Decoded. -> {}", ret); if ret != 0 { let error_msg = aom_codec_error(&mut self.dec); debug!( " Decode codec_decode failed: {}", CStr::from_ptr(error_msg).to_string_lossy() ); let detail = aom_codec_error_detail(&mut self.dec); if !detail.is_null() { debug!( " Decode codec_decode failed {}", CStr::from_ptr(detail).to_string_lossy() ); } corrupted_count += 1; } if ret == 0 { loop { debug!("Retrieving frame"); let img = aom_codec_get_frame(&mut self.dec, &mut self.iter); debug!("Retrieved."); if img.is_null() { return DecodeResult::Done; } let mut corrupted = 0; let ret = aom_codec_control( &mut self.dec, aom_dec_control_id::AOMD_GET_FRAME_CORRUPTED as i32, &mut corrupted, ); if ret != 0 { let detail = aom_codec_error_detail(&mut self.dec); panic!( "Decode codec_control failed {}", CStr::from_ptr(detail).to_string_lossy() ); } corrupted_count += corrupted; if verify { let rec = rec_fifo.pop_front().unwrap(); compare_img(img, &rec, bit_depth, w, h, chroma_sampling); } } } } if corrupted_count > 0 { DecodeResult::Corrupted(corrupted_count) } else { DecodeResult::NotDone } } } impl Drop for AomDecoder { fn drop(&mut self) { unsafe { aom_codec_destroy(&mut self.dec) }; } } fn compare_img( img: *const aom_image_t, frame: &Frame, bit_depth: usize, width: usize, height: usize, chroma_sampling: ChromaSampling, ) { let img = unsafe { *img }; let img_iter = img.planes.iter().zip(img.stride.iter()); let planes = if chroma_sampling == ChromaSampling::Cs400 { 1 } else { 3 }; for (pli, (img_plane, frame_plane)) in img_iter.zip(frame.planes.iter()).enumerate().take(planes) { let w = width >> frame_plane.cfg.xdec; let h = height >> frame_plane.cfg.ydec; let rec_stride = frame_plane.cfg.stride; if bit_depth > 8 { let dec_stride = *img_plane.1 as usize / 2; let dec = unsafe { let data = *img_plane.0 as *const u16; let size = dec_stride * h; slice::from_raw_parts(data, size) }; let rec: Vec = frame_plane.data_origin().iter().map(|&v| u16::cast_from(v)).collect(); compare_plane::(&rec[..], rec_stride, dec, dec_stride, w, h, pli); } else { let dec_stride = *img_plane.1 as usize; let dec = unsafe { let data = *img_plane.0 as *const 
u8; let size = dec_stride * h; slice::from_raw_parts(data, size) }; let rec: Vec = frame_plane.data_origin().iter().map(|&v| u8::cast_from(v)).collect(); compare_plane::(&rec[..], rec_stride, dec, dec_stride, w, h, pli); } } } rav1e-0.7.1/src/test_encode_decode/dav1d.rs000064400000000000000000000116111046102023000166020ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::*; use crate::test_encode_decode::{compare_plane, DecodeResult, TestDecoder}; use crate::util::{CastFromPrimitive, Pixel}; use std::collections::VecDeque; use std::marker::PhantomData; use std::os::raw::c_int; use std::{ mem::{self, MaybeUninit}, ptr, slice, }; use dav1d_sys::*; pub(crate) struct Dav1dDecoder { dec: *mut Dav1dContext, pixel: PhantomData, } impl TestDecoder for Dav1dDecoder { fn setup_decoder(_w: usize, _h: usize) -> Self { unsafe { let mut settings = MaybeUninit::uninit(); dav1d_default_settings(settings.as_mut_ptr()); // Was initialized by dav1d_default_settings(). let settings = settings.assume_init(); let mut dec: Dav1dDecoder = Dav1dDecoder { dec: ptr::null_mut(), pixel: PhantomData }; let ret = dav1d_open(&mut dec.dec, &settings); if ret != 0 { panic!("Cannot instantiate the decoder {}", ret); } dec } } fn decode_packet( &mut self, packet: &[u8], rec_fifo: &mut VecDeque>, w: usize, h: usize, chroma_sampling: ChromaSampling, bit_depth: usize, verify: bool, ) -> DecodeResult { let mut corrupted_count = 0; let mut data = SafeDav1dData::new(packet); let ret = data.send(self.dec); debug!("Decoded. 
-> {}", ret); if ret != 0 { corrupted_count += 1; } if ret == 0 { loop { let mut pic = SafeDav1dPicture::default(); debug!("Retrieving frame"); let ret = pic.get(self.dec); debug!("Retrieved."); if ret == DAV1D_ERR_AGAIN { return DecodeResult::Done; } if ret != 0 { panic!("Decode fail"); } if verify { let rec = rec_fifo.pop_front().unwrap(); compare_pic(&pic.0, &rec, bit_depth, w, h, chroma_sampling); } } } if corrupted_count > 0 { DecodeResult::Corrupted(corrupted_count) } else { DecodeResult::NotDone } } } impl Drop for Dav1dDecoder { fn drop(&mut self) { unsafe { dav1d_close(&mut self.dec) }; } } struct SafeDav1dData(Dav1dData); impl SafeDav1dData { fn new(packet: &[u8]) -> Self { unsafe { let mut data = Self { 0: mem::zeroed() }; let ptr = dav1d_data_create(&mut data.0, packet.len()); ptr::copy_nonoverlapping(packet.as_ptr(), ptr, packet.len()); data } } fn send(&mut self, context: *mut Dav1dContext) -> c_int { unsafe { dav1d_send_data(context, &mut self.0) } } } impl Drop for SafeDav1dData { fn drop(&mut self) { unsafe { dav1d_data_unref(&mut self.0) }; } } struct SafeDav1dPicture(Dav1dPicture); impl Default for SafeDav1dPicture { fn default() -> Self { Self { 0: unsafe { mem::zeroed() } } } } impl SafeDav1dPicture { fn get(&mut self, context: *mut Dav1dContext) -> c_int { unsafe { dav1d_get_picture(context, &mut self.0) } } } impl Drop for SafeDav1dPicture { fn drop(&mut self) { unsafe { dav1d_picture_unref(&mut self.0) } } } fn compare_pic( pic: &Dav1dPicture, frame: &Frame, bit_depth: usize, width: usize, height: usize, chroma_sampling: ChromaSampling, ) { use crate::frame::Plane; let cmp_plane = |data, stride, frame_plane: &Plane, pli| { let w = width >> frame_plane.cfg.xdec; let h = height >> frame_plane.cfg.ydec; let rec_stride = frame_plane.cfg.stride; if bit_depth > 8 { let stride = stride / 2; let dec = unsafe { let data = data as *const u16; let size = stride * h; slice::from_raw_parts(data, size) }; let rec: Vec = frame_plane.data_origin().iter().map(|&v| u16::cast_from(v)).collect(); compare_plane::(&rec[..], rec_stride, dec, stride, w, h, pli); } else { let dec = unsafe { let data = data as *const u8; let size = stride * h; slice::from_raw_parts(data, size) }; let rec: Vec = frame_plane.data_origin().iter().map(|&v| u8::cast_from(v)).collect(); compare_plane::(&rec[..], rec_stride, dec, stride, w, h, pli); } }; let lstride = pic.stride[0] as usize; cmp_plane(pic.data[0], lstride, &frame.planes[0], 0); if chroma_sampling != ChromaSampling::Cs400 { let cstride = pic.stride[1] as usize; cmp_plane(pic.data[1], cstride, &frame.planes[1], 1); cmp_plane(pic.data[2], cstride, &frame.planes[2], 2); } } rav1e-0.7.1/src/test_encode_decode/mod.rs000064400000000000000000000537561046102023000164020ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. // Fuzzing only uses a subset of these. 
#![cfg_attr(fuzzing, allow(unused))] use crate::color::ChromaSampling; use crate::api::config::GrainTableSegment; use crate::util::Pixel; use crate::*; use arrayvec::ArrayVec; use interpolate_name::interpolate_test; use rand::{Rng, SeedableRng}; use rand_chacha::ChaChaRng; use std::collections::VecDeque; #[cfg(feature = "decode_test")] mod aom; #[cfg(feature = "decode_test_dav1d")] mod dav1d; #[cfg(feature = "decode_test")] use aom::AomDecoder; #[cfg(feature = "decode_test_dav1d")] use dav1d::Dav1dDecoder; fn fill_frame(ra: &mut ChaChaRng, frame: &mut Frame) { for plane in frame.planes.iter_mut() { let stride = plane.cfg.stride; for row in plane.data.chunks_mut(stride) { for pixel in row { let v: u8 = ra.gen(); *pixel = T::cast_from(v); } } } } fn read_frame_batch( ctx: &mut Context, ra: &mut ChaChaRng, limit: usize, ) { for _ in 0..limit { let mut input = ctx.new_frame(); fill_frame(ra, &mut input); let _ = ctx.send_frame(input); } ctx.flush(); } pub(crate) enum DecodeResult { Done, NotDone, Corrupted(usize), } pub(crate) trait TestDecoder { fn setup_decoder(w: usize, h: usize) -> Self where Self: Sized; fn encode_decode( &mut self, verify: bool, w: usize, h: usize, speed: u8, quantizer: usize, limit: usize, bit_depth: usize, chroma_sampling: ChromaSampling, min_keyint: u64, max_keyint: u64, switch_frame_interval: u64, low_latency: bool, error_resilient: bool, bitrate: i32, tile_cols_log2: usize, tile_rows_log2: usize, still_picture: bool, grain_table: Option>, ) { let mut ra = ChaChaRng::from_seed([0; 32]); let mut ctx: Context = setup_encoder( w, h, speed, quantizer, bit_depth, chroma_sampling, min_keyint, max_keyint, switch_frame_interval, low_latency, error_resilient, bitrate, tile_cols_log2, tile_rows_log2, still_picture, grain_table, ); debug!( "Encoding {}x{} speed {} quantizer {} bit-depth {} bitrate {}", w, h, speed, quantizer, bit_depth, bitrate ); #[cfg(feature = "dump_ivf")] let mut out = std::fs::File::create(&format!( "out-{}x{}-s{}-q{}-r{}-{:?}.ivf", w, h, speed, quantizer, bitrate, chroma_sampling )) .unwrap(); #[cfg(feature = "dump_ivf")] ivf::write_ivf_header(&mut out, w, h, 30, 1); let mut rec_fifo = VecDeque::new(); read_frame_batch(&mut ctx, &mut ra, limit); for _ in 0..limit { let mut corrupted_count = 0; loop { let res = ctx.receive_packet(); if let Ok(pkt) = res { debug!("Encoded packet {}", pkt.input_frameno); #[cfg(feature = "dump_ivf")] ivf::write_ivf_frame(&mut out, pkt.input_frameno, &pkt.data); if let Some(pkt_rec) = pkt.rec { rec_fifo.push_back((*pkt_rec).clone()); } let packet = pkt.data; debug!("Decoding frame {}", pkt.input_frameno); match self.decode_packet( &packet, &mut rec_fifo, w, h, chroma_sampling, bit_depth, verify, ) { DecodeResult::Done => { break; } DecodeResult::NotDone => {} DecodeResult::Corrupted(corrupted) => { corrupted_count += corrupted; } } } else { break; } } assert_eq!(corrupted_count, 0); } } fn decode_packet( &mut self, packet: &[u8], rec_fifo: &mut VecDeque>, w: usize, h: usize, chroma_sampling: ChromaSampling, bit_depth: usize, verify: bool, ) -> DecodeResult; } pub fn compare_plane( rec: &[T], rec_stride: usize, dec: &[T], dec_stride: usize, width: usize, height: usize, pli: usize, ) { for (row, line) in rec.chunks(rec_stride).zip(dec.chunks(dec_stride)).take(height).enumerate() { assert_eq!( &line.0[..width], &line.1[..width], "at row {} of plane {}", row, pli ); } } fn setup_encoder( w: usize, h: usize, speed: u8, quantizer: usize, bit_depth: usize, chroma_sampling: ChromaSampling, min_keyint: u64, max_keyint: u64, 
switch_frame_interval: u64, low_latency: bool, error_resilient: bool, bitrate: i32, tile_cols_log2: usize, tile_rows_log2: usize, still_picture: bool, grain_table: Option>, ) -> Context { assert!(bit_depth == 8 || std::mem::size_of::() > 1); let mut enc = EncoderConfig::with_speed_preset(speed); enc.quantizer = quantizer; enc.min_key_frame_interval = min_keyint; enc.max_key_frame_interval = max_keyint; enc.switch_frame_interval = switch_frame_interval; enc.low_latency = low_latency; enc.error_resilient = error_resilient; enc.width = w; enc.height = h; enc.bit_depth = bit_depth; enc.chroma_sampling = chroma_sampling; enc.bitrate = bitrate; enc.tile_cols = 1 << tile_cols_log2; enc.tile_rows = 1 << tile_rows_log2; enc.still_picture = still_picture; enc.film_grain_params = grain_table; let threads = if cfg!(fuzzing) { 1 } else { 2 }; let cfg = Config::new().with_encoder_config(enc).with_threads(threads); cfg.new_context().unwrap() } // TODO: support non-multiple-of-16 dimensions static DIMENSION_OFFSETS: &[(usize, usize)] = &[(0, 0), (4, 4), (8, 8), (16, 16)]; fn speed(s: u8, decoder: &str) { let quantizer = 100; let limit = 5; let w = 64; let h = 80; for b in DIMENSION_OFFSETS.iter() { let w = w + b.0; let h = h + b.1; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, s, quantizer, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, None, ); } } macro_rules! test_speeds { ($($S:expr),+) => { $( paste::item!{ #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn [](decoder: &str) { speed($S, decoder) } } )* } } test_speeds! { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 } macro_rules! test_dimensions { ($(($W:expr, $H:expr)),+) => { $( paste::item!{ #[cfg_attr(feature = "decode_test", interpolate_name::interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_name::interpolate_test(dav1d, "dav1d"))] fn [](decoder: &str) { super::dimension($W, $H, decoder) } } )* } } #[cfg(not(feature = "quick_test"))] mod large_dimension { test_dimensions! { (512, 512), (1024, 1024), (2048, 2048) } } mod small_dimension { test_dimensions! { (256, 256), (258, 258), (260, 260), (262, 262), (264, 264), (265, 265) } } mod tiny_dimension { test_dimensions! { (1, 1), (2, 2), (4, 4), (8, 8), (16, 16), (32, 32), (64, 64), (128, 128) } } fn dimension(w: usize, h: usize, decoder: &str) { let quantizer = 100; let limit = 1; let speed = 10; let still_picture = w < 16 || h < 16; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, quantizer, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, still_picture, None, ); } fn quantizer(decoder: &str, q: usize) { let limit = 5; let w = 64; let h = 80; let speed = 10; for b in DIMENSION_OFFSETS.iter() { let mut dec = get_decoder::(decoder, b.0, b.1); dec.encode_decode( true, w + b.0, h + b.1, speed, q, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, None, ); } } macro_rules! test_quantizer { ($($Q:expr),+) => { $( paste::item!{ #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn [](decoder: &str) { quantizer(decoder, $Q); } } )* } } test_quantizer! 
{60, 80, 100, 120} #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn bitrate(decoder: &str) { let limit = 5; let w = 64; let h = 80; let speed = 10; for &q in [172, 220, 252, 255].iter() { for &r in [100, 1000, 10_000].iter() { let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), 15, 15, 0, true, false, r, 0, 0, false, None, ); } } } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn keyframes(decoder: &str) { let limit = 12; let w = 64; let h = 80; let speed = 9; let q = 100; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), 6, 6, 0, true, false, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn reordering(decoder: &str) { let limit = 12; let w = 64; let h = 80; let speed = 10; let q = 100; for keyint in &[4, 5, 6] { let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), *keyint, *keyint, 0, false, false, 0, 0, 0, false, None, ); } } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn reordering_short_video(decoder: &str) { // Regression test for https://github.com/xiph/rav1e/issues/890 let limit = 2; let w = 64; let h = 80; let speed = 10; let q = 100; let keyint = 12; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), keyint, keyint, 0, false, false, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn error_resilient(decoder: &str) { let limit = 2; let w = 64; let h = 80; let speed = 10; let q = 100; let keyint = 12; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), keyint, keyint, 0, true, true, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn error_resilient_reordering(decoder: &str) { let limit = 6; let w = 64; let h = 80; let speed = 10; let q = 100; for keyint in &[4, 5, 6] { let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), *keyint, *keyint, 0, false, true, 0, 0, 0, false, None, ); } } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn switch_frame(decoder: &str) { let limit = 3; let w = 64; let h = 80; let speed = 10; let q = 100; let keyint = 12; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), keyint, keyint, 1, true, true, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn odd_size_frame_with_full_rdo(decoder: &str) { let limit = 3; 
let w = 64 + 1; let h = 128 - 1; let speed = 0; let qindex = 100; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, qindex, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn low_bit_depth(decoder: &str) { let quantizer = 100; let limit = 3; // Include inter frames let speed = 0; // Test as many tools as possible let w = 64; let h = 80; // 8-bit let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, quantizer, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, None, ); } fn high_bit_depth(decoder: &str, depth: usize) { let quantizer = 100; let limit = 3; // Include inter frames let speed = 0; // Test as many tools as possible let w = 64; let h = 80; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, quantizer, limit, depth, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, None, ); } macro_rules! test_high_bit_depth { ($($B:expr),+) => { $( paste::item!{ #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn [](decoder: &str) { high_bit_depth(decoder, $B); } } )* } } test_high_bit_depth! {10, 12} fn chroma_sampling(decoder: &str, cs: ChromaSampling) { let quantizer = 100; let limit = 3; // Include inter frames let speed = 0; // Test as many tools as possible let w = 64; let h = 80; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, quantizer, limit, 8, cs, 15, 15, 0, true, false, 0, 0, 0, false, None, ); } macro_rules! test_chroma_sampling { ($(($S:expr, $I:expr)),+) => { $( paste::item!{ #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn [](decoder: &str) { chroma_sampling(decoder, $I); } } )* } } test_chroma_sampling! 
{(400, ChromaSampling::Cs400), (420, ChromaSampling::Cs420), (422, ChromaSampling::Cs422), (444, ChromaSampling::Cs444)} #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn tile_encoding_with_stretched_restoration_units(decoder: &str) { let limit = 5; let w = 256; // the bottom tiles are small (their height is 140-128=12), so they will use stretched // restoration units from their above neighbours let h = 140; let speed = 10; let q = 100; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 2, 2, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] fn still_picture_mode(decoder: &str) { let limit = 1; let w = 480; let h = 304; let speed = 6; let qindex = 100; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, qindex, limit, 8, Default::default(), 0, 0, 0, false, false, 0, 0, 0, true, None, ); } pub(crate) fn get_decoder( decoder: &str, w: usize, h: usize, ) -> Box> { match decoder { #[cfg(feature = "decode_test")] "aom" => Box::new(AomDecoder::::setup_decoder(w, h)), #[cfg(feature = "decode_test_dav1d")] "dav1d" => Box::new(Dav1dDecoder::::setup_decoder(w, h)), _ => unimplemented!(), } } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn rdo_loop_decision_lrf_sanity(decoder: &str) { let limit = 2; let w = 936; let h = 1404; let speed = 9; let q = 240; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), 6, 6, 0, true, false, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn rdo_loop_decision_cdef_sanity(decoder: &str) { let limit = 2; let w = 1404; let h = 936; let speed = 9; let q = 240; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( true, w, h, speed, q, limit, 8, Default::default(), 6, 6, 0, true, false, 0, 0, 0, false, None, ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn film_grain_table_luma_only(decoder: &str) { let quantizer = 100; let limit = 5; // Include inter frames let speed = 10; let w = 64; let h = 80; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( false, w, h, speed, quantizer, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, Some(vec![GrainTableSegment { start_time: 0, end_time: 9223372036854775807, scaling_points_y: ArrayVec::from([ [0, 20], [20, 5], [39, 4], [59, 3], [78, 3], [98, 3], [118, 3], [137, 3], [157, 3], [177, 3], [196, 3], [216, 4], [235, 4], [255, 4], ]), scaling_points_cb: ArrayVec::new(), scaling_points_cr: ArrayVec::new(), scaling_shift: 8, ar_coeff_lag: 0, ar_coeffs_y: ArrayVec::new(), ar_coeffs_cb: ArrayVec::try_from([0].as_slice()).unwrap(), ar_coeffs_cr: ArrayVec::try_from([0].as_slice()).unwrap(), ar_coeff_shift: 6, cb_mult: 0, cb_luma_mult: 0, cb_offset: 0, cr_mult: 0, cr_luma_mult: 0, cr_offset: 0, overlap_flag: true, chroma_scaling_from_luma: false, grain_scale_shift: 0, 
random_seed: 7391, }]), ); } #[cfg_attr(feature = "decode_test", interpolate_test(aom, "aom"))] #[cfg_attr(feature = "decode_test_dav1d", interpolate_test(dav1d, "dav1d"))] #[ignore] fn film_grain_table_chroma(decoder: &str) { let quantizer = 100; let limit = 5; // Include inter frames let speed = 10; let w = 64; let h = 80; let mut dec = get_decoder::(decoder, w as usize, h as usize); dec.encode_decode( false, w, h, speed, quantizer, limit, 8, Default::default(), 15, 15, 0, true, false, 0, 0, 0, false, Some(vec![GrainTableSegment { start_time: 0, end_time: 9223372036854775807, scaling_points_y: ArrayVec::from([ [0, 0], [20, 4], [39, 3], [59, 3], [78, 3], [98, 3], [118, 4], [137, 4], [157, 4], [177, 4], [196, 4], [216, 5], [235, 5], [255, 5], ]), scaling_points_cb: ArrayVec::from([ [0, 0], [28, 0], [57, 0], [85, 0], [113, 0], [142, 0], [170, 0], [198, 0], [227, 0], [255, 1], ]), scaling_points_cr: ArrayVec::from([ [0, 0], [28, 0], [57, 0], [85, 0], [113, 0], [142, 0], [170, 0], [198, 0], [227, 0], [255, 1], ]), scaling_shift: 8, ar_coeff_lag: 0, ar_coeffs_y: ArrayVec::new(), ar_coeffs_cb: ArrayVec::try_from([0].as_slice()).unwrap(), ar_coeffs_cr: ArrayVec::try_from([0].as_slice()).unwrap(), ar_coeff_shift: 6, cb_mult: 128, cb_luma_mult: 192, cb_offset: 256, cr_mult: 128, cr_luma_mult: 192, cr_offset: 256, overlap_flag: true, chroma_scaling_from_luma: false, grain_scale_shift: 0, random_seed: 7391, }]), ); } rav1e-0.7.1/src/tiling/mod.rs000064400000000000000000000015641046102023000140570ustar 00000000000000// Copyright (c) 2019-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. #![allow(unused)] mod plane_region; mod tile; mod tile_blocks; mod tile_motion_stats; mod tile_restoration_state; mod tile_state; mod tiler; pub use self::plane_region::*; pub use self::tile::*; pub use self::tile_blocks::*; pub use self::tile_motion_stats::*; pub use self::tile_restoration_state::*; pub use self::tile_state::*; pub use self::tiler::*; rav1e-0.7.1/src/tiling/plane_region.rs000064400000000000000000000601671046102023000157460ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
#![allow(clippy::iter_nth_zero)] use crate::context::*; use crate::frame::*; use crate::util::*; use std::iter::FusedIterator; use std::marker::PhantomData; use std::ops::{Index, IndexMut}; use std::slice; /// Rectangle of a plane region, in pixels #[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] pub struct Rect { // coordinates relative to the plane origin (xorigin, yorigin) pub x: isize, pub y: isize, pub width: usize, pub height: usize, } impl Rect { #[inline(always)] pub const fn decimated(&self, xdec: usize, ydec: usize) -> Self { Self { x: self.x >> xdec, y: self.y >> ydec, width: self.width >> xdec, height: self.height >> ydec, } } pub const fn to_area(&self) -> Area { Area::Rect { x: self.x, y: self.y, width: self.width, height: self.height } } } // Structure to describe a rectangle area in several ways // // To retrieve a subregion from a region, we need to provide the subregion // bounds, relative to its parent region. The subregion must always be included // in its parent region. // // For that purpose, we could just use a rectangle (x, y, width, height), but // this would be too cumbersome to use in practice. For example, we often need // to pass a subregion from an offset, using the same bottom-right corner as // its parent, or to pass a subregion expressed in block offset instead of // pixel offset. // // Area provides a flexible way to describe a subregion. #[derive(Debug, Clone, Copy)] pub enum Area { /// A well-defined rectangle Rect { x: isize, y: isize, width: usize, height: usize }, /// A rectangle starting at offset (x, y) and ending at the bottom-right /// corner of the parent StartingAt { x: isize, y: isize }, /// A well-defined rectangle with offset expressed in blocks BlockRect { bo: BlockOffset, width: usize, height: usize }, /// a rectangle starting at given block offset until the bottom-right corner /// of the parent BlockStartingAt { bo: BlockOffset }, } impl Area { #[inline(always)] /// Convert to a rectangle of pixels. /// For a `BlockRect` and `BlockStartingAt`, for subsampled chroma planes, /// the returned rect will be aligned to a 4x4 chroma block. /// This is necessary for `compute_distortion` and `rdo_cfl_alpha` as /// the subsampled chroma block covers multiple luma blocks. pub const fn to_rect( &self, xdec: usize, ydec: usize, parent_width: usize, parent_height: usize, ) -> Rect { match *self { Area::Rect { x, y, width, height } => Rect { x, y, width, height }, Area::StartingAt { x, y } => Rect { x, y, width: (parent_width as isize - x) as usize, height: (parent_height as isize - y) as usize, }, Area::BlockRect { bo, width, height } => Rect { x: (bo.x >> xdec << BLOCK_TO_PLANE_SHIFT) as isize, y: (bo.y >> ydec << BLOCK_TO_PLANE_SHIFT) as isize, width, height, }, Area::BlockStartingAt { bo } => { let x = (bo.x >> xdec << BLOCK_TO_PLANE_SHIFT) as isize; let y = (bo.y >> ydec << BLOCK_TO_PLANE_SHIFT) as isize; Rect { x, y, width: (parent_width as isize - x) as usize, height: (parent_height as isize - y) as usize, } } } } } /// Bounded region of a plane /// /// This allows to give access to a rectangular area of a plane without /// giving access to the whole plane. #[derive(Debug)] pub struct PlaneRegion<'a, T: Pixel> { data: *const T, // points to (plane_cfg.x, plane_cfg.y) pub plane_cfg: &'a PlaneConfig, // private to guarantee borrowing rules rect: Rect, phantom: PhantomData<&'a T>, } /// Mutable bounded region of a plane /// /// This allows to give mutable access to a rectangular area of the plane /// without giving access to the whole plane. 
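///
/// # Example
///
/// An illustrative sketch only, written in the same `ignore` style as the
/// other doc examples in this module: it walks the rows of a mutable region
/// and overwrites every pixel, without being able to reach any pixel of the
/// plane outside the region's rectangle.
///
/// ``` ignore
/// # use rav1e::tiling::*;
/// # fn f(region: &mut PlaneRegionMut<'_, u16>) {
/// // Fill the whole region with a constant value, row by row.
/// for row in region.rows_iter_mut() {
///   for pixel in row.iter_mut() {
///     *pixel = 128;
///   }
/// }
/// # }
/// ```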
#[derive(Debug)] pub struct PlaneRegionMut<'a, T: Pixel> { data: *mut T, // points to (plane_cfg.x, plane_cfg.y) pub plane_cfg: &'a PlaneConfig, rect: Rect, phantom: PhantomData<&'a mut T>, } // common impl for PlaneRegion and PlaneRegionMut macro_rules! plane_region_common { // $name: PlaneRegion or PlaneRegionMut // $as_ptr: as_ptr or as_mut_ptr // $opt_mut: nothing or mut ($name:ident, $as_ptr:ident $(,$opt_mut:tt)?) => { impl<'a, T: Pixel> $name<'a, T> { #[inline(always)] pub fn is_null(&self) -> bool { self.data.is_null() } #[cold] pub fn empty(plane_cfg : &'a PlaneConfig) -> Self { return Self { // SAFETY: This is actually pretty unsafe. // This means we need to ensure that no other method on this struct // can access data if the dimensions are 0. data: unsafe { std::ptr::null_mut::() }, plane_cfg, rect: Rect::default(), phantom: PhantomData, } } /// # Panics /// /// - If the configured dimensions are invalid #[inline(always)] pub fn from_slice(data: &'a $($opt_mut)? [T], cfg: &'a PlaneConfig, rect: Rect) -> Self { if cfg.width == 0 || cfg.height == 0 { return Self::empty(&cfg); } assert!(rect.x >= -(cfg.xorigin as isize)); assert!(rect.y >= -(cfg.yorigin as isize)); assert!(cfg.xorigin as isize + rect.x + rect.width as isize <= cfg.stride as isize); assert!(cfg.yorigin as isize + rect.y + rect.height as isize <= cfg.alloc_height as isize); // SAFETY: The above asserts ensure we do not go OOB. unsafe { Self::from_slice_unsafe(data, cfg, rect)} } #[inline(always)] pub unsafe fn from_slice_unsafe(data: &'a $($opt_mut)? [T], cfg: &'a PlaneConfig, rect: Rect) -> Self { debug_assert!(rect.x >= -(cfg.xorigin as isize)); debug_assert!(rect.y >= -(cfg.yorigin as isize)); debug_assert!(cfg.xorigin as isize + rect.x + rect.width as isize <= cfg.stride as isize); debug_assert!(cfg.yorigin as isize + rect.y + rect.height as isize <= cfg.alloc_height as isize); let origin = (cfg.yorigin as isize + rect.y) * cfg.stride as isize + cfg.xorigin as isize + rect.x; Self { data: data.$as_ptr().offset(origin), plane_cfg: cfg, rect, phantom: PhantomData, } } #[inline(always)] pub fn new(plane: &'a $($opt_mut)? Plane, rect: Rect) -> Self { Self::from_slice(& $($opt_mut)? plane.data, &plane.cfg, rect) } #[inline(always)] pub fn new_from_plane(plane: &'a $($opt_mut)? Plane) -> Self { let rect = Area::StartingAt { x: 0, y: 0 }.to_rect( plane.cfg.xdec, plane.cfg.ydec, plane.cfg.stride - plane.cfg.xorigin, plane.cfg.alloc_height - plane.cfg.yorigin, ); // SAFETY: Area::StartingAt{}.to_rect is guaranteed to be the entire plane unsafe { Self::from_slice_unsafe(& $($opt_mut)? 
plane.data, &plane.cfg, rect) } } #[inline(always)] pub fn data_ptr(&self) -> *const T { self.data } #[inline(always)] pub fn rect(&self) -> &Rect { &self.rect } #[inline(always)] pub fn rows_iter(&self) -> RowsIter<'_, T> { RowsIter { data: self.data, stride: self.plane_cfg.stride, width: self.rect.width, remaining: self.rect.height, phantom: PhantomData, } } pub fn vert_windows(&self, h: usize) -> VertWindows<'_, T> { VertWindows { data: self.data, plane_cfg: self.plane_cfg, remaining: (self.rect.height as isize - h as isize + 1).max(0) as usize, output_rect: Rect { x: self.rect.x, y: self.rect.y, width: self.rect.width, height: h } } } pub fn horz_windows(&self, w: usize) -> HorzWindows<'_, T> { HorzWindows { data: self.data, plane_cfg: self.plane_cfg, remaining: (self.rect.width as isize - w as isize + 1).max(0) as usize, output_rect: Rect { x: self.rect.x, y: self.rect.y, width: w, height: self.rect.height } } } /// Return a view to a subregion of the plane /// /// The subregion must be included in (i.e. must not exceed) this region. /// /// It is described by an `Area`, relative to this region. /// /// # Panics /// /// - If the requested dimensions are larger than the plane region size /// /// # Example /// /// ``` ignore /// # use rav1e::tiling::*; /// # fn f(region: &PlaneRegion<'_, u16>) { /// // a subregion from (10, 8) to the end of the region /// let subregion = region.subregion(Area::StartingAt { x: 10, y: 8 }); /// # } /// ``` /// /// ``` ignore /// # use rav1e::context::*; /// # use rav1e::tiling::*; /// # fn f(region: &PlaneRegion<'_, u16>) { /// // a subregion from the top-left of block (2, 3) having size (64, 64) /// let bo = BlockOffset { x: 2, y: 3 }; /// let subregion = region.subregion(Area::BlockRect { bo, width: 64, height: 64 }); /// # } /// ``` #[inline(always)] pub fn subregion(&self, area: Area) -> PlaneRegion<'_, T> { if self.data.is_null() { return PlaneRegion::empty(&self.plane_cfg); } let rect = area.to_rect( self.plane_cfg.xdec, self.plane_cfg.ydec, self.rect.width, self.rect.height, ); assert!(rect.x >= 0 && rect.x as usize <= self.rect.width); assert!(rect.y >= 0 && rect.y as usize <= self.rect.height); // SAFETY: The above asserts ensure we do not go outside the original rectangle. let data = unsafe { self.data.add(rect.y as usize * self.plane_cfg.stride + rect.x as usize) }; let absolute_rect = Rect { x: self.rect.x + rect.x, y: self.rect.y + rect.y, width: rect.width, height: rect.height, }; PlaneRegion { data, plane_cfg: &self.plane_cfg, rect: absolute_rect, phantom: PhantomData, } } // Return an equivalent PlaneRegion with origin homed to 0,0. Data // pointer is not moved (0,0 points to the same pixel previously // pointed to by old x,y). #[inline(always)] pub fn home(&self) -> Self { let home_rect = Rect { x: 0, y: 0, width: self.rect.width, height: self.rect.height, }; Self { data: self.data, plane_cfg: &self.plane_cfg, rect: home_rect, phantom: PhantomData, } } #[inline(always)] pub fn to_frame_plane_offset(&self, tile_po: PlaneOffset) -> PlaneOffset { PlaneOffset { x: self.rect.x + tile_po.x, y: self.rect.y + tile_po.y, } } #[inline(always)] pub fn to_frame_block_offset(&self, tile_bo: TileBlockOffset) -> PlaneBlockOffset { debug_assert!(self.rect.x >= 0); debug_assert!(self.rect.y >= 0); let PlaneConfig { xdec, ydec, .. 
} = self.plane_cfg; debug_assert!(self.rect.x as usize % (MI_SIZE >> xdec) == 0); debug_assert!(self.rect.y as usize % (MI_SIZE >> ydec) == 0); let bx = self.rect.x as usize >> MI_SIZE_LOG2 - xdec; let by = self.rect.y as usize >> MI_SIZE_LOG2 - ydec; PlaneBlockOffset(BlockOffset { x: bx + tile_bo.0.x, y: by + tile_bo.0.y, }) } #[inline(always)] pub fn to_frame_super_block_offset( &self, tile_sbo: TileSuperBlockOffset, sb_size_log2: usize ) -> PlaneSuperBlockOffset { debug_assert!(sb_size_log2 == 6 || sb_size_log2 == 7); debug_assert!(self.rect.x >= 0); debug_assert!(self.rect.y >= 0); let PlaneConfig { xdec, ydec, .. } = self.plane_cfg; debug_assert!(self.rect.x as usize % (1 << sb_size_log2 - xdec) == 0); debug_assert!(self.rect.y as usize % (1 << sb_size_log2 - ydec) == 0); let sbx = self.rect.x as usize >> sb_size_log2 - xdec; let sby = self.rect.y as usize >> sb_size_log2 - ydec; PlaneSuperBlockOffset(SuperBlockOffset { x: sbx + tile_sbo.0.x, y: sby + tile_sbo.0.y, }) } /// Returns the frame block offset of the subregion. #[inline(always)] pub fn frame_block_offset(&self) -> PlaneBlockOffset { self.to_frame_block_offset(TileBlockOffset(BlockOffset { x: 0, y: 0 })) } pub(crate) fn scratch_copy(&self) -> Plane { let &Rect { width, height, .. } = self.rect(); let &PlaneConfig { xdec, ydec, .. } = self.plane_cfg; let mut ret: Plane = Plane::new(width, height, xdec, ydec, 0, 0); let mut dst: PlaneRegionMut = ret.as_region_mut(); for (dst_row, src_row) in dst.rows_iter_mut().zip(self.rows_iter()) { for (out, input) in dst_row.iter_mut().zip(src_row) { *out = *input; } } ret } } unsafe impl Send for $name<'_, T> {} unsafe impl Sync for $name<'_, T> {} impl Index for $name<'_, T> { type Output = [T]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rect.height); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.plane_cfg.stride); slice::from_raw_parts(ptr, self.rect.width) } } } } } plane_region_common!(PlaneRegion, as_ptr); plane_region_common!(PlaneRegionMut, as_mut_ptr, mut); impl<'a, T: Pixel> PlaneRegionMut<'a, T> { #[inline(always)] pub fn data_ptr_mut(&mut self) -> *mut T { self.data } #[inline(always)] pub fn rows_iter_mut(&mut self) -> RowsIterMut<'_, T> { RowsIterMut { data: self.data, stride: self.plane_cfg.stride, width: self.rect.width, remaining: self.rect.height, phantom: PhantomData, } } /// Return a mutable view to a subregion of the plane /// /// The subregion must be included in (i.e. must not exceed) this region. /// /// It is described by an `Area`, relative to this region. /// /// # Panics /// /// - If the targeted `area` is outside of the bounds of this plane region. 
/// /// # Example /// /// ``` ignore /// # use rav1e::tiling::*; /// # fn f(region: &mut PlaneRegionMut<'_, u16>) { /// // a mutable subregion from (10, 8) having size (32, 32) /// let subregion = region.subregion_mut(Area::Rect { x: 10, y: 8, width: 32, height: 32 }); /// # } /// ``` /// /// ``` ignore /// # use rav1e::context::*; /// # use rav1e::tiling::*; /// # fn f(region: &mut PlaneRegionMut<'_, u16>) { /// // a mutable subregion from the top-left of block (2, 3) to the end of the region /// let bo = BlockOffset { x: 2, y: 3 }; /// let subregion = region.subregion_mut(Area::BlockStartingAt { bo }); /// # } /// ``` #[inline(always)] pub fn subregion_mut(&mut self, area: Area) -> PlaneRegionMut<'_, T> { let rect = area.to_rect( self.plane_cfg.xdec, self.plane_cfg.ydec, self.rect.width, self.rect.height, ); assert!(rect.x >= 0 && rect.x as usize <= self.rect.width); assert!(rect.y >= 0 && rect.y as usize <= self.rect.height); // SAFETY: The above asserts ensure we do not go outside the original rectangle. let data = unsafe { self.data.add(rect.y as usize * self.plane_cfg.stride + rect.x as usize) }; let absolute_rect = Rect { x: self.rect.x + rect.x, y: self.rect.y + rect.y, width: rect.width, height: rect.height, }; PlaneRegionMut { data, plane_cfg: self.plane_cfg, rect: absolute_rect, phantom: PhantomData, } } #[inline(always)] pub fn as_const(&self) -> PlaneRegion<'_, T> { PlaneRegion { data: self.data, plane_cfg: self.plane_cfg, rect: self.rect, phantom: PhantomData, } } } impl IndexMut for PlaneRegionMut<'_, T> { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rect.height); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.plane_cfg.stride); slice::from_raw_parts_mut(ptr, self.rect.width) } } } /// Iterator over plane region rows pub struct RowsIter<'a, T: Pixel> { data: *const T, stride: usize, width: usize, remaining: usize, phantom: PhantomData<&'a T>, } /// Mutable iterator over plane region rows pub struct RowsIterMut<'a, T: Pixel> { data: *mut T, stride: usize, width: usize, remaining: usize, phantom: PhantomData<&'a mut T>, } impl<'a, T: Pixel> Iterator for RowsIter<'a, T> { type Item = &'a [T]; #[inline(always)] fn next(&mut self) -> Option { if self.remaining > 0 { // SAFETY: We verified that we have enough data left to not go OOB, // assuming that `self.stride` and `self.width` are set correctly. let row = unsafe { let ptr = self.data; self.data = self.data.add(self.stride); slice::from_raw_parts(ptr, self.width) }; self.remaining -= 1; Some(row) } else { None } } #[inline(always)] fn size_hint(&self) -> (usize, Option) { (self.remaining, Some(self.remaining)) } } impl<'a, T: Pixel> Iterator for RowsIterMut<'a, T> { type Item = &'a mut [T]; #[inline(always)] fn next(&mut self) -> Option { if self.remaining > 0 { // SAFETY: We verified that we have enough data left to not go OOB, // assuming that `self.stride` and `self.width` are set correctly. 
      let row = unsafe {
        let ptr = self.data;
        self.data = self.data.add(self.stride);
        slice::from_raw_parts_mut(ptr, self.width)
      };
      self.remaining -= 1;
      Some(row)
    } else {
      None
    }
  }

  #[inline(always)]
  fn size_hint(&self) -> (usize, Option<usize>) {
    (self.remaining, Some(self.remaining))
  }
}

impl<T: Pixel> ExactSizeIterator for RowsIter<'_, T> {}
impl<T: Pixel> FusedIterator for RowsIter<'_, T> {}
impl<T: Pixel> ExactSizeIterator for RowsIterMut<'_, T> {}
impl<T: Pixel> FusedIterator for RowsIterMut<'_, T> {}

pub struct VertWindows<'a, T: Pixel> {
  data: *const T,
  plane_cfg: &'a PlaneConfig,
  remaining: usize,
  output_rect: Rect,
}

pub struct HorzWindows<'a, T: Pixel> {
  data: *const T,
  plane_cfg: &'a PlaneConfig,
  remaining: usize,
  output_rect: Rect,
}

impl<'a, T: Pixel> Iterator for VertWindows<'a, T> {
  type Item = PlaneRegion<'a, T>;

  #[inline(always)]
  fn next(&mut self) -> Option<Self::Item> {
    self.nth(0)
  }

  #[inline(always)]
  fn size_hint(&self) -> (usize, Option<usize>) {
    (self.remaining, Some(self.remaining))
  }

  #[inline(always)]
  fn nth(&mut self, n: usize) -> Option<Self::Item> {
    if self.remaining > n {
      // SAFETY: We verified that we have enough data left to not go OOB.
      self.data = unsafe { self.data.add(self.plane_cfg.stride * n) };
      self.output_rect.y += n as isize;
      let output = PlaneRegion {
        data: self.data,
        plane_cfg: self.plane_cfg,
        rect: self.output_rect,
        phantom: PhantomData,
      };
      // SAFETY: We verified that we have enough data left to not go OOB.
      self.data = unsafe { self.data.add(self.plane_cfg.stride) };
      self.output_rect.y += 1;
      self.remaining -= (n + 1);
      Some(output)
    } else {
      None
    }
  }
}

impl<'a, T: Pixel> Iterator for HorzWindows<'a, T> {
  type Item = PlaneRegion<'a, T>;

  #[inline(always)]
  fn next(&mut self) -> Option<Self::Item> {
    self.nth(0)
  }

  #[inline(always)]
  fn size_hint(&self) -> (usize, Option<usize>) {
    (self.remaining, Some(self.remaining))
  }

  #[inline(always)]
  fn nth(&mut self, n: usize) -> Option<Self::Item> {
    if self.remaining > n {
      // SAFETY: We verified that we have enough data left to not go OOB.
      self.data = unsafe { self.data.add(n) };
      self.output_rect.x += n as isize;
      let output = PlaneRegion {
        data: self.data,
        plane_cfg: self.plane_cfg,
        rect: self.output_rect,
        phantom: PhantomData,
      };
      // SAFETY: We verified that we have enough data left to not go OOB.
self.data = unsafe { self.data.add(1) }; self.output_rect.x += 1; self.remaining -= (n + 1); Some(output) } else { None } } } impl ExactSizeIterator for VertWindows<'_, T> {} impl FusedIterator for VertWindows<'_, T> {} impl ExactSizeIterator for HorzWindows<'_, T> {} impl FusedIterator for HorzWindows<'_, T> {} #[test] fn area_test() { assert_eq!( (Area::BlockStartingAt { bo: BlockOffset { x: 0, y: 0 } }) .to_rect(0, 0, 100, 100), Rect { x: 0, y: 0, width: 100, height: 100 } ); assert_eq!( (Area::BlockStartingAt { bo: BlockOffset { x: 1, y: 1 } }) .to_rect(0, 0, 100, 100), Rect { x: 4, y: 4, width: 96, height: 96 } ); assert_eq!( (Area::BlockStartingAt { bo: BlockOffset { x: 1, y: 1 } }) .to_rect(1, 1, 50, 50), Rect { x: 0, y: 0, width: 50, height: 50 } ); assert_eq!( (Area::BlockStartingAt { bo: BlockOffset { x: 2, y: 2 } }) .to_rect(1, 1, 50, 50), Rect { x: 4, y: 4, width: 46, height: 46 } ); assert_eq!( (Area::BlockRect { bo: BlockOffset { x: 0, y: 0 }, width: 1, height: 1 }) .to_rect(0, 0, 100, 100), Rect { x: 0, y: 0, width: 1, height: 1 } ); assert_eq!( (Area::BlockRect { bo: BlockOffset { x: 1, y: 1 }, width: 1, height: 1 }) .to_rect(0, 0, 100, 100), Rect { x: 4, y: 4, width: 1, height: 1 } ); assert_eq!( (Area::BlockRect { bo: BlockOffset { x: 1, y: 1 }, width: 1, height: 1 }) .to_rect(1, 1, 50, 50), Rect { x: 0, y: 0, width: 1, height: 1 } ); assert_eq!( (Area::BlockRect { bo: BlockOffset { x: 2, y: 2 }, width: 1, height: 1 }) .to_rect(1, 1, 50, 50), Rect { x: 4, y: 4, width: 1, height: 1 } ); } #[test] fn frame_block_offset() { { let p = Plane::::new(100, 100, 0, 0, 0, 0); let pr = PlaneRegion::new(&p, Rect { x: 0, y: 0, width: 100, height: 100 }); let bo = BlockOffset { x: 0, y: 0 }; assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), PlaneBlockOffset(bo) ); assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), pr.subregion(Area::BlockStartingAt { bo }).frame_block_offset() ); } { let p = Plane::::new(100, 100, 0, 0, 0, 0); let pr = PlaneRegion::new(&p, Rect { x: 0, y: 0, width: 100, height: 100 }); let bo = BlockOffset { x: 1, y: 1 }; assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), PlaneBlockOffset(bo) ); assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), pr.subregion(Area::BlockStartingAt { bo }).frame_block_offset() ); } { let p = Plane::::new(100, 100, 1, 1, 0, 0); let pr = PlaneRegion::new(&p, Rect { x: 0, y: 0, width: 100, height: 100 }); let bo = BlockOffset { x: 1, y: 1 }; assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), PlaneBlockOffset(bo) ); } { let p = Plane::::new(100, 100, 1, 1, 0, 0); let pr = PlaneRegion::new(&p, Rect { x: 0, y: 0, width: 100, height: 100 }); let bo = BlockOffset { x: 2, y: 2 }; assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), PlaneBlockOffset(bo) ); assert_eq!( pr.to_frame_block_offset(TileBlockOffset(bo)), pr.subregion(Area::BlockStartingAt { bo }).frame_block_offset() ); } } rav1e-0.7.1/src/tiling/tile.rs000064400000000000000000000145131046102023000142330ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
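//! Tiled views of a whole frame.
//!
//! The sketch below is illustrative only (same `ignore` style as the doc
//! examples in `plane_region.rs`): it builds a read-only `Tile` over a luma
//! rectangle and reads back a per-plane region; the chroma plane rectangles
//! are decimated automatically from their `PlaneConfig`. The concrete sizes
//! are made up for the example.
//!
//! ``` ignore
//! # use rav1e::frame::*;
//! # use rav1e::tiling::*;
//! # fn f(frame: &Frame<u16>) {
//! let luma_rect = TileRect { x: 0, y: 0, width: 128, height: 128 };
//! let tile = Tile::new(frame, luma_rect);
//! // planes[0] is luma; planes[1] and planes[2] are the (possibly
//! // subsampled) chroma planes.
//! let luma_width = tile.planes[0].rect().width;
//! # }
//! ```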
use super::*; use crate::context::*; use crate::frame::*; use crate::util::*; /// Rectangle of a tile, in pixels /// /// This is similar to Rect, but with unsigned (x, y) for convenience. #[derive(Debug, Clone, Copy)] pub struct TileRect { pub x: usize, pub y: usize, pub width: usize, pub height: usize, } impl TileRect { #[inline(always)] pub const fn decimated(&self, xdec: usize, ydec: usize) -> Self { Self { x: self.x >> xdec, y: self.y >> ydec, width: self.width >> xdec, height: self.height >> ydec, } } #[inline(always)] pub const fn to_frame_plane_offset( &self, tile_po: PlaneOffset, ) -> PlaneOffset { PlaneOffset { x: self.x as isize + tile_po.x, y: self.y as isize + tile_po.y, } } #[inline(always)] pub fn to_frame_block_offset( &self, tile_bo: TileBlockOffset, xdec: usize, ydec: usize, ) -> PlaneBlockOffset { debug_assert!(self.x % (MI_SIZE >> xdec) == 0); debug_assert!(self.y % (MI_SIZE >> ydec) == 0); let bx = self.x >> (MI_SIZE_LOG2 - xdec); let by = self.y >> (MI_SIZE_LOG2 - ydec); PlaneBlockOffset(BlockOffset { x: bx + tile_bo.0.x, y: by + tile_bo.0.y }) } #[inline(always)] pub fn to_frame_super_block_offset( &self, tile_sbo: TileSuperBlockOffset, sb_size_log2: usize, xdec: usize, ydec: usize, ) -> PlaneSuperBlockOffset { debug_assert!(sb_size_log2 == 6 || sb_size_log2 == 7); debug_assert!(self.x % (1 << (sb_size_log2 - xdec)) == 0); debug_assert!(self.y % (1 << (sb_size_log2 - ydec)) == 0); let sbx = self.x >> (sb_size_log2 - xdec); let sby = self.y >> (sb_size_log2 - ydec); PlaneSuperBlockOffset(SuperBlockOffset { x: sbx + tile_sbo.0.x, y: sby + tile_sbo.0.y, }) } } impl From for Rect { #[inline(always)] fn from(tile_rect: TileRect) -> Rect { Rect { x: tile_rect.x as isize, y: tile_rect.y as isize, width: tile_rect.width, height: tile_rect.height, } } } /// Tiled view of a frame #[derive(Debug)] pub struct Tile<'a, T: Pixel> { pub planes: [PlaneRegion<'a, T>; MAX_PLANES], } /// Mutable tiled view of a frame #[derive(Debug)] pub struct TileMut<'a, T: Pixel> { pub planes: [PlaneRegionMut<'a, T>; MAX_PLANES], } // common impl for Tile and TileMut macro_rules! tile_common { // $name: Tile or TileMut // $pr_type: PlaneRegion or PlaneRegionMut // $iter: iter or iter_mut //opt_mut: nothing or mut ($name:ident, $pr_type:ident, $iter:ident $(,$opt_mut:tt)?) => { impl<'a, T: Pixel> $name<'a, T> { #[inline(always)] pub fn new( frame: &'a $($opt_mut)? Frame, luma_rect: TileRect, ) -> Self { let mut planes_iter = frame.planes.$iter(); Self { planes: [ { let plane = planes_iter.next().unwrap(); $pr_type::new(plane, luma_rect.into()) }, { let plane = planes_iter.next().unwrap(); let rect = luma_rect.decimated(plane.cfg.xdec, plane.cfg.ydec); $pr_type::new(plane, rect.into()) }, { let plane = planes_iter.next().unwrap(); let rect = luma_rect.decimated(plane.cfg.xdec, plane.cfg.ydec); $pr_type::new(plane, rect.into()) }, ], } } /// Return a view to a subregion of a Tile /// /// The subregion must be included in (i.e. must not exceed) this Tile. /// /// It is described by an `Area`, relative to the luma plane of /// this region. /// /// # Panics /// /// - If the requested dimensions are larger than the plane size #[inline(always)] pub fn subregion(&self, area: Area) -> Tile<'_, T> { let tile_rect = area.to_rect( 0, 0, self.planes[0].rect().width, self.planes[0].rect().height, ); Tile { planes: { let sub_plane = |pli: usize| { let plane = &self.planes[pli]; let &PlaneConfig { xdec, ydec, .. 
} = plane.plane_cfg; let rect = tile_rect.decimated(xdec, ydec); if !plane.is_null() { assert!(rect.x >= 0 && rect.x as usize <= plane.rect().width); assert!(rect.y >= 0 && rect.y as usize <= plane.rect().height); assert!(rect.x as usize + rect.width <= plane.rect().x as usize + plane.rect().width); assert!(rect.y as usize + rect.height <= plane.rect().y as usize + plane.rect().height); } plane.subregion(rect.to_area()) }; [sub_plane(0), sub_plane(1), sub_plane(2)] }, } } // Return an equivalent Tile with origin homed to 0,0. Data // pointer is not moved (0,0 points to the same pixel previously // pointed to by old x,y). #[inline(always)] pub fn home(&self) -> Self { Self { planes: [ self.planes[0].home(), self.planes[1].home(), self.planes[2].home(), ], } } // Return a copy of this tile's contents in a new backing frame. #[inline(always)] pub(crate) fn scratch_copy(&self) -> Frame { Frame { planes: [ self.planes[0].scratch_copy(), self.planes[1].scratch_copy(), self.planes[2].scratch_copy(), ], } } } } } tile_common!(Tile, PlaneRegion, iter); tile_common!(TileMut, PlaneRegionMut, iter_mut, mut); impl<'a, T: Pixel> TileMut<'a, T> { #[inline(always)] pub fn as_const(&self) -> Tile<'_, T> { Tile { planes: [ self.planes[0].as_const(), self.planes[1].as_const(), self.planes[2].as_const(), ], } } } rav1e-0.7.1/src/tiling/tile_blocks.rs000064400000000000000000000173571046102023000156010ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::context::*; use crate::mc::MotionVector; use crate::partition::*; use crate::predict::PredictionMode; use crate::transform::*; use std::cmp; use std::marker::PhantomData; use std::ops::{Index, IndexMut}; use std::slice; /// Tiled view of `FrameBlocks` #[derive(Debug)] pub struct TileBlocks<'a> { data: *const Block, x: usize, y: usize, cols: usize, rows: usize, frame_cols: usize, frame_rows: usize, phantom: PhantomData<&'a Block>, } /// Mutable tiled view of `FrameBlocks` #[derive(Debug)] pub struct TileBlocksMut<'a> { data: *mut Block, // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, frame_cols: usize, frame_rows: usize, phantom: PhantomData<&'a mut Block>, } // common impl for TileBlocks and TileBlocksMut macro_rules! tile_blocks_common { // $name: TileBlocks or TileBlocksMut // $opt_mut: nothing or mut ($name:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { #[inline(always)] pub fn new( frame_blocks: &'a $($opt_mut)? FrameBlocks, x: usize, y: usize, cols: usize, rows: usize, ) -> Self { Self { data: & $($opt_mut)? 
frame_blocks[y][x], x, y, cols, rows, frame_cols: frame_blocks.cols, frame_rows: frame_blocks.rows, phantom: PhantomData, } } pub fn subregion( &mut self, x: usize, y: usize, cols: usize, rows: usize, ) -> TileBlocks<'_> { TileBlocks { data: &self[y][x], x: self.x+x, y: self.y+y, cols: cmp::min(cols, self.cols - x), rows: cmp::min(rows, self.rows - y), frame_cols: self.frame_cols, frame_rows: self.frame_rows, phantom: PhantomData, } } #[inline(always)] pub const fn x(&self) -> usize { self.x } #[inline(always)] pub const fn y(&self) -> usize { self.y } #[inline(always)] pub const fn cols(&self) -> usize { self.cols } #[inline(always)] pub const fn rows(&self) -> usize { self.rows } #[inline(always)] pub const fn frame_cols(&self) -> usize { self.frame_cols } #[inline(always)] pub const fn frame_rows(&self) -> usize { self.frame_rows } #[inline(always)] pub fn above_of(&self, bo: TileBlockOffset) -> &Block { &self[bo.0.y - 1][bo.0.x] } #[inline(always)] pub fn left_of(&self, bo: TileBlockOffset) -> &Block { &self[bo.0.y][bo.0.x - 1] } #[inline(always)] pub fn above_left_of(&self, bo: TileBlockOffset) -> &Block { &self[bo.0.y - 1][bo.0.x - 1] } pub fn get_cdef(&self, sbo: TileSuperBlockOffset) -> u8 { let bo = sbo.block_offset(0, 0).0; self[bo.y][bo.x].cdef_index } } unsafe impl Send for $name<'_> {} unsafe impl Sync for $name<'_> {} impl Index for $name<'_> { type Output = [Block]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.frame_cols); slice::from_raw_parts(ptr, self.cols) } } } // for convenience, also index by TileBlockOffset impl Index for $name<'_> { type Output = Block; #[inline(always)] fn index(&self, bo: TileBlockOffset) -> &Self::Output { &self[bo.0.y][bo.0.x] } } } } tile_blocks_common!(TileBlocks); tile_blocks_common!(TileBlocksMut, mut); impl TileBlocksMut<'_> { #[inline(always)] pub const fn as_const(&self) -> TileBlocks<'_> { TileBlocks { data: self.data, x: self.x, y: self.y, cols: self.cols, rows: self.rows, frame_cols: self.frame_cols, frame_rows: self.frame_rows, phantom: PhantomData, } } pub fn subregion_mut( &mut self, x: usize, y: usize, cols: usize, rows: usize, ) -> TileBlocksMut<'_> { TileBlocksMut { data: &mut self[y][x], x: self.x + x, y: self.y + y, cols: cmp::min(cols, self.cols - x), rows: cmp::min(rows, self.rows - y), frame_cols: self.frame_cols, frame_rows: self.frame_rows, phantom: PhantomData, } } #[inline(always)] pub fn for_each(&mut self, bo: TileBlockOffset, bsize: BlockSize, f: F) where F: Fn(&mut Block), { let mut bw = bsize.width_mi(); let bh = bsize.height_mi(); if bo.0.x + bw >= self.cols { bw = self.cols - bo.0.x; } for y in 0..bh { if bo.0.y + y >= self.rows { continue; } for block in self[bo.0.y + y][bo.0.x..bo.0.x + bw].iter_mut() { f(block); } } } #[inline(always)] pub fn set_mode( &mut self, bo: TileBlockOffset, bsize: BlockSize, mode: PredictionMode, ) { self.for_each(bo, bsize, |block| block.mode = mode); } #[inline(always)] pub fn set_block_size(&mut self, bo: TileBlockOffset, bsize: BlockSize) { let n4_w = bsize.width_mi() as u8; let n4_h = bsize.height_mi() as u8; self.for_each(bo, bsize, |block| { block.bsize = bsize; block.n4_w = n4_w; block.n4_h = n4_h; }); } #[inline(always)] pub fn set_tx_size( &mut self, bo: TileBlockOffset, bsize: BlockSize, tx_size: TxSize, ) { self.for_each(bo, bsize, |block| block.txsize = tx_size); } #[inline(always)] pub fn set_skip( &mut self, bo: 
TileBlockOffset, bsize: BlockSize, skip: bool, ) { self.for_each(bo, bsize, |block| block.skip = skip); } #[inline(always)] pub fn set_segmentation_idx( &mut self, bo: TileBlockOffset, bsize: BlockSize, idx: u8, ) { self.for_each(bo, bsize, |block| block.segmentation_idx = idx); } #[inline(always)] pub fn set_ref_frames( &mut self, bo: TileBlockOffset, bsize: BlockSize, r: [RefType; 2], ) { self.for_each(bo, bsize, |block| block.ref_frames = r); } #[inline(always)] pub fn set_motion_vectors( &mut self, bo: TileBlockOffset, bsize: BlockSize, mvs: [MotionVector; 2], ) { self.for_each(bo, bsize, |block| block.mv = mvs); } #[inline(always)] pub fn set_cdef(&mut self, sbo: TileSuperBlockOffset, cdef_index: u8) { let bo = sbo.block_offset(0, 0).0; // Checkme: Is 16 still the right block unit for 128x128 superblocks? let bw = cmp::min(bo.x + MIB_SIZE, self.cols); let bh = cmp::min(bo.y + MIB_SIZE, self.rows); for y in bo.y..bh { for x in bo.x..bw { self[y][x].cdef_index = cdef_index; } } } } impl IndexMut for TileBlocksMut<'_> { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.frame_cols); slice::from_raw_parts_mut(ptr, self.cols) } } } impl IndexMut for TileBlocksMut<'_> { #[inline(always)] fn index_mut(&mut self, bo: TileBlockOffset) -> &mut Self::Output { &mut self[bo.0.y][bo.0.x] } } rav1e-0.7.1/src/tiling/tile_motion_stats.rs000064400000000000000000000072431046102023000170400ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::mc::MotionVector; use crate::me::*; use std::marker::PhantomData; use std::ops::{Index, IndexMut}; use std::slice; /// Tiled view of `FrameMEStats` #[derive(Debug)] pub struct TileMEStats<'a> { data: *const MEStats, // expressed in mi blocks // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameMEStats` stride: usize, phantom: PhantomData<&'a MotionVector>, } /// Mutable tiled view of `FrameMEStats` #[derive(Debug)] pub struct TileMEStatsMut<'a> { data: *mut MEStats, // expressed in mi blocks // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameMEStats` stride: usize, phantom: PhantomData<&'a mut MotionVector>, } // common impl for TileMotionVectors and TileMotionVectorsMut macro_rules! tile_me_stats_common { // $name: TileMEStats or TileMEStatsMut // $opt_mut: nothing or mut ($name:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { /// # Panics /// /// - If the requested dimensions are larger than the frame MV size #[inline(always)] pub fn new( frame_mvs: &'a $($opt_mut)? FrameMEStats, x: usize, y: usize, cols: usize, rows: usize, ) -> Self { assert!(x + cols <= frame_mvs.cols); assert!(y + rows <= frame_mvs.rows); Self { data: & $($opt_mut)? 
frame_mvs[y][x], x, y, cols, rows, stride: frame_mvs.cols, phantom: PhantomData, } } #[inline(always)] pub const fn x(&self) -> usize { self.x } #[inline(always)] pub const fn y(&self) -> usize { self.y } #[inline(always)] pub const fn cols(&self) -> usize { self.cols } #[inline(always)] pub const fn rows(&self) -> usize { self.rows } } unsafe impl Send for $name<'_> {} unsafe impl Sync for $name<'_> {} impl Index for $name<'_> { type Output = [MEStats]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts(ptr, self.cols) } } } } } tile_me_stats_common!(TileMEStats); tile_me_stats_common!(TileMEStatsMut, mut); impl TileMEStatsMut<'_> { #[inline(always)] pub const fn as_const(&self) -> TileMEStats<'_> { TileMEStats { data: self.data, x: self.x, y: self.y, cols: self.cols, rows: self.rows, stride: self.stride, phantom: PhantomData, } } } impl IndexMut for TileMEStatsMut<'_> { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts_mut(ptr, self.cols) } } } rav1e-0.7.1/src/tiling/tile_restoration_state.rs000064400000000000000000000331121046102023000200600ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::context::*; use crate::encoder::FrameInvariants; use crate::lrf::*; use crate::util::Pixel; use std::marker::PhantomData; use std::ops::{Index, IndexMut}; use std::ptr; use std::slice; /// Tiled view of `RestorationUnits` #[derive(Debug)] pub struct TileRestorationUnits<'a> { data: *const RestorationUnit, // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameRestorationUnits` stride: usize, phantom: PhantomData<&'a RestorationUnit>, } /// Mutable tiled view of `RestorationUnits` #[derive(Debug)] pub struct TileRestorationUnitsMut<'a> { data: *mut RestorationUnit, // private to guarantee borrowing rules x: usize, y: usize, cols: usize, rows: usize, /// number of cols in the underlying `FrameRestorationUnits` stride: usize, phantom: PhantomData<&'a mut RestorationUnit>, } // common impl for TileRestorationUnits and TileRestorationUnitsMut macro_rules! tile_restoration_units_common { // $name: TileRestorationUnits or TileRestorationUnitsMut // $null: null or null_mut // $opt_mut: nothing or mut ($name:ident, $null:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { #[inline(always)] pub fn new( frame_units: &'a $($opt_mut)? FrameRestorationUnits, x: usize, y: usize, cols: usize, rows: usize, ) -> Self { Self { data: if x < frame_units.cols && y < frame_units.rows { & $($opt_mut)? 
frame_units[y][x] } else { // on edges, a tile may contain no restoration units ptr::$null() }, x, y, cols, rows, stride: frame_units.cols, phantom: PhantomData, } } #[inline(always)] pub const fn x(&self) -> usize { self.x } #[inline(always)] pub const fn y(&self) -> usize { self.y } #[inline(always)] pub const fn cols(&self) -> usize { self.cols } #[inline(always)] pub const fn rows(&self) -> usize { self.rows } } unsafe impl Send for $name<'_> {} unsafe impl Sync for $name<'_> {} impl Index for $name<'_> { type Output = [RestorationUnit]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts(ptr, self.cols) } } } } } tile_restoration_units_common!(TileRestorationUnits, null); tile_restoration_units_common!(TileRestorationUnitsMut, null_mut, mut); impl TileRestorationUnitsMut<'_> { #[inline(always)] pub const fn as_const(&self) -> TileRestorationUnits<'_> { TileRestorationUnits { data: self.data, x: self.x, y: self.y, cols: self.cols, rows: self.rows, stride: self.stride, phantom: PhantomData, } } } impl IndexMut for TileRestorationUnitsMut<'_> { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { assert!(index < self.rows); // SAFETY: The above assert ensures we do not access OOB data. unsafe { let ptr = self.data.add(index * self.stride); slice::from_raw_parts_mut(ptr, self.cols) } } } /// Tiled view of `RestorationPlane` #[derive(Debug)] pub struct TileRestorationPlane<'a> { pub rp_cfg: &'a RestorationPlaneConfig, pub wiener_ref: [[i8; 3]; 2], pub sgrproj_ref: [i8; 2], pub units: TileRestorationUnits<'a>, } /// Mutable tiled view of `RestorationPlane` #[derive(Debug)] pub struct TileRestorationPlaneMut<'a> { pub rp_cfg: &'a RestorationPlaneConfig, pub wiener_ref: [[i8; 3]; 2], pub sgrproj_ref: [i8; 2], pub units: TileRestorationUnitsMut<'a>, } // common impl for TileRestorationPlane and TileRestorationPlaneMut macro_rules! tile_restoration_plane_common { // $name: TileRestorationPlane or TileRestorationPlaneMut // $tru_type: TileRestorationUnits or TileRestorationUnitsMut // $opt_mut: nothing or mut ($name:ident, $tru_type:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { #[inline(always)] pub fn new( rp: &'a $($opt_mut)? RestorationPlane, units_x: usize, units_y: usize, units_cols: usize, units_rows: usize, ) -> Self { Self { rp_cfg: &rp.cfg, wiener_ref: [WIENER_TAPS_MID; 2], sgrproj_ref: SGRPROJ_XQD_MID, units: $tru_type::new(& $($opt_mut)? rp.units, units_x, units_y, units_cols, units_rows), } } // determines the loop restoration unit row and column a // superblock belongs to. The stretch boolean indicates if a // superblock that belongs to a stretched LRU should return an // index (stretch == true) or None (stretch == false). pub const fn restoration_unit_index(&self, sbo: TileSuperBlockOffset, stretch: bool) -> Option<(usize, usize)> { if self.units.rows > 0 && self.units.cols > 0 { // is this a stretch block? 
let x_stretch = sbo.0.x < self.rp_cfg.sb_cols && sbo.0.x >> self.rp_cfg.sb_h_shift >= self.units.cols; let y_stretch = sbo.0.y < self.rp_cfg.sb_rows && sbo.0.y >> self.rp_cfg.sb_v_shift >= self.units.rows; if (x_stretch || y_stretch) && !stretch { None } else { let x = (sbo.0.x >> self.rp_cfg.sb_h_shift) - if x_stretch { 1 } else { 0 }; let y = (sbo.0.y >> self.rp_cfg.sb_v_shift) - if y_stretch { 1 } else { 0 }; if x < self.units.cols && y < self.units.rows { Some((x, y)) } else { None } } } else { None } } pub fn restoration_unit_offset(&self, base: TileSuperBlockOffset, offset: TileSuperBlockOffset, stretch: bool) -> Option<(usize, usize)> { let base_option = self.restoration_unit_index(base, stretch); let delta_option = self.restoration_unit_index(base + offset, stretch); if let (Some((base_x, base_y)), Some((delta_x, delta_y))) = (base_option, delta_option) { Some ((delta_x - base_x, delta_y - base_y)) } else { None } } pub const fn restoration_unit_countable(&self, x: usize, y: usize) -> usize { y * self.units.cols + x } // Is this the last sb (in scan order) in the restoration unit // that we will be considering for RDO? This would be a // straightforward calculation but for stretch; if the LRU // stretches into a different tile, we don't consider those SBs // in the other tile to be part of the LRU for RDO purposes. pub fn restoration_unit_last_sb_for_rdo( &self, fi: &FrameInvariants, global_sbo: PlaneSuperBlockOffset, tile_sbo: TileSuperBlockOffset, ) -> bool { // there is 1 restoration unit for (1 << sb_shift) super-blocks let h_mask = (1 << self.rp_cfg.sb_h_shift) - 1; let v_mask = (1 << self.rp_cfg.sb_v_shift) - 1; // is this a stretch block? let x_stretch = tile_sbo.0.x >> self.rp_cfg.sb_h_shift >= self.units.cols; let y_stretch = tile_sbo.0.y >> self.rp_cfg.sb_v_shift >= self.units.rows; // Need absolute superblock offsets for edge check, not local to the tile. let sbx = global_sbo.0.x + tile_sbo.0.x; let sby = global_sbo.0.y + tile_sbo.0.y; // edge-of-tile check + edge-of-frame check let last_x = (tile_sbo.0.x & h_mask == h_mask && !x_stretch) || sbx == fi.sb_width-1; let last_y = (tile_sbo.0.y & v_mask == v_mask && !y_stretch) || sby == fi.sb_height-1; last_x && last_y } #[inline(always)] pub fn restoration_unit(&self, sbo: TileSuperBlockOffset, stretch: bool) -> Option<&RestorationUnit> { self.restoration_unit_index(sbo, stretch).map(|(x, y)| &self.units[y][x]) } } } } tile_restoration_plane_common!(TileRestorationPlane, TileRestorationUnits); tile_restoration_plane_common!( TileRestorationPlaneMut, TileRestorationUnitsMut, mut ); impl<'a> TileRestorationPlaneMut<'a> { #[inline(always)] pub fn restoration_unit_mut( &mut self, sbo: TileSuperBlockOffset, ) -> Option<&mut RestorationUnit> { // cannot use map() due to lifetime constraints if let Some((x, y)) = self.restoration_unit_index(sbo, true) { Some(&mut self.units[y][x]) } else { None } } #[inline(always)] pub const fn as_const(&self) -> TileRestorationPlane<'_> { TileRestorationPlane { rp_cfg: self.rp_cfg, wiener_ref: self.wiener_ref, sgrproj_ref: self.sgrproj_ref, units: self.units.as_const(), } } } /// Tiled view of `RestorationState` #[derive(Debug)] pub struct TileRestorationState<'a> { pub planes: [TileRestorationPlane<'a>; MAX_PLANES], } /// Mutable tiled view of `RestorationState` #[derive(Debug)] pub struct TileRestorationStateMut<'a> { pub planes: [TileRestorationPlaneMut<'a>; MAX_PLANES], } // common impl for TileRestorationState and TileRestorationStateMut macro_rules! 
tile_restoration_state_common { // $name: TileRestorationState or TileRestorationStateMut // $trp_type: TileRestorationPlane or TileRestorationPlaneMut // $iter: iter or iter_mut // $opt_mut: nothing or mut ($name:ident, $trp_type:ident, $iter:ident $(,$opt_mut:tt)?) => { impl<'a> $name<'a> { #[inline(always)] pub fn new( rs: &'a $($opt_mut)? RestorationState, sbo: PlaneSuperBlockOffset, sb_width: usize, sb_height: usize, ) -> Self { let (units_x0, units_y0, units_cols0, units_rows0) = Self::get_units_region(rs, sbo, sb_width, sb_height, 0); let (units_x1, units_y1, units_cols1, units_rows1) = Self::get_units_region(rs, sbo, sb_width, sb_height, 1); let (units_x2, units_y2, units_cols2, units_rows2) = Self::get_units_region(rs, sbo, sb_width, sb_height, 2); // we cannot retrieve &mut of slice items directly and safely let mut planes_iter = rs.planes.$iter(); Self { planes: [ { let plane = planes_iter.next().unwrap(); $trp_type::new(plane, units_x0, units_y0, units_cols0, units_rows0) }, { let plane = planes_iter.next().unwrap(); $trp_type::new(plane, units_x1, units_y1, units_cols1, units_rows1) }, { let plane = planes_iter.next().unwrap(); $trp_type::new(plane, units_x2, units_y2, units_cols2, units_rows2) }, ], } } #[inline(always)] fn get_units_region( rs: &RestorationState, sbo: PlaneSuperBlockOffset, sb_width: usize, sb_height: usize, pli: usize, ) -> (usize, usize, usize, usize) { let sb_h_shift = rs.planes[pli].cfg.sb_h_shift; let sb_v_shift = rs.planes[pli].cfg.sb_v_shift; // there may be several super-blocks per restoration unit // the given super-block offset must match the start of a restoration unit debug_assert!(sbo.0.x % (1 << sb_h_shift) == 0); debug_assert!(sbo.0.y % (1 << sb_v_shift) == 0); let units_x = sbo.0.x >> sb_h_shift; let units_y = sbo.0.y >> sb_v_shift; let units_cols = sb_width + (1 << sb_h_shift) - 1 >> sb_h_shift; let units_rows = sb_height + (1 << sb_v_shift) - 1 >> sb_v_shift; let FrameRestorationUnits { cols: rs_cols, rows: rs_rows, .. } = rs.planes[pli].units; // +1 because the last super-block may use the "stretched" restoration unit // from its neighbours // debug_assert!(units_x < rs_cols + 1); debug_assert!(units_y < rs_rows + 1); debug_assert!(units_x + units_cols <= rs_cols + 1); debug_assert!(units_y + units_rows <= rs_rows + 1); let units_x = units_x.min(rs_cols); let units_y = units_y.min(rs_rows); let units_cols = units_cols.min(rs_cols - units_x); let units_rows = units_rows.min(rs_rows - units_y); (units_x, units_y, units_cols, units_rows) } #[inline(always)] pub fn has_restoration_unit(&self, sbo: TileSuperBlockOffset, pli: usize, stretch: bool) -> bool { self.planes[pli].restoration_unit(sbo, stretch).is_some() } } } } tile_restoration_state_common!( TileRestorationState, TileRestorationPlane, iter ); tile_restoration_state_common!( TileRestorationStateMut, TileRestorationPlaneMut, iter_mut, mut ); impl<'a> TileRestorationStateMut<'a> { #[inline(always)] pub const fn as_const(&self) -> TileRestorationState { TileRestorationState { planes: [ self.planes[0].as_const(), self.planes[1].as_const(), self.planes[2].as_const(), ], } } } rav1e-0.7.1/src/tiling/tile_state.rs000064400000000000000000000174671046102023000154460ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::*; use crate::context::*; use crate::encoder::*; use crate::frame::*; use crate::lrf::{IntegralImageBuffer, SOLVE_IMAGE_SIZE}; use crate::mc::MotionVector; use crate::me::FrameMEStats; use crate::me::WriteGuardMEStats; use crate::partition::{RefType, REF_FRAMES}; use crate::predict::{InterCompoundBuffers, PredictionMode}; use crate::quantize::*; use crate::rdo::*; use crate::stats::EncoderStats; use crate::util::*; use std::ops::{Index, IndexMut}; use std::sync::Arc; /// Tiled view of `FrameState` /// /// Contrary to `PlaneRegionMut` and `TileMut`, there is no const version: /// - in practice, we don't need it; /// - it would require to instantiate a const version of every of its inner /// tiled views recursively. /// /// # `TileState` fields /// /// The way the `FrameState` fields are mapped depend on how they are accessed /// tile-wise and frame-wise. /// /// Some fields (like `qc`) are only used during tile-encoding, so they are only /// stored in `TileState`. /// /// Some other fields (like `input` or `segmentation`) are not written /// tile-wise, so they just reference the matching field in `FrameState`. /// /// Some others (like `rec`) are written tile-wise, but must be accessible /// frame-wise once the tile views vanish (e.g. for deblocking). #[derive(Debug)] pub struct TileStateMut<'a, T: Pixel> { pub sbo: PlaneSuperBlockOffset, pub sb_size_log2: usize, pub sb_width: usize, pub sb_height: usize, pub mi_width: usize, pub mi_height: usize, pub width: usize, pub height: usize, pub input: &'a Frame, // the whole frame pub input_tile: Tile<'a, T>, // the current tile pub input_hres: &'a Plane, pub input_qres: &'a Plane, pub deblock: &'a DeblockState, pub rec: TileMut<'a, T>, pub qc: QuantizationContext, pub segmentation: &'a SegmentationState, pub restoration: TileRestorationStateMut<'a>, pub me_stats: Vec>, pub coded_block_info: MiTileState, pub integral_buffer: IntegralImageBuffer, pub inter_compound_buffers: InterCompoundBuffers, } /// Contains information for a coded block that is /// useful to persist. For example, the intra edge /// filter requires surrounding coded block information. 
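///
/// # Example
///
/// A hedged sketch in the `ignore` style of the other tiling doc examples:
/// it looks up the block above a given position through `TileStateMut` and
/// reads its luma mode; the offsets and decimation factors are made up for
/// the example.
///
/// ``` ignore
/// # use rav1e::context::*;
/// # use rav1e::tiling::*;
/// # fn f(ts: &TileStateMut<'_, u16>) {
/// let bo = TileBlockOffset(BlockOffset { x: 2, y: 3 });
/// // `None` when the block sits on the top edge of the tile.
/// if let Some(info) = ts.above_block_info(bo, 0, 0) {
///   let _mode = info.luma_mode;
/// }
/// # }
/// ```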
#[derive(Debug, Clone, Copy)] pub struct CodedBlockInfo { pub luma_mode: PredictionMode, pub chroma_mode: PredictionMode, pub reference_types: [RefType; 2], } impl Default for CodedBlockInfo { fn default() -> Self { CodedBlockInfo { luma_mode: PredictionMode::DC_PRED, chroma_mode: PredictionMode::DC_PRED, reference_types: [RefType::INTRA_FRAME, RefType::NONE_FRAME], } } } #[derive(Debug, Clone)] pub struct MiTileState { mi_width: usize, mi_height: usize, mi_block_info: Vec, } impl MiTileState { pub fn new(mi_width: usize, mi_height: usize) -> Self { MiTileState { mi_width, mi_height, mi_block_info: vec![CodedBlockInfo::default(); mi_width * mi_height], } } } impl Index for MiTileState { type Output = [CodedBlockInfo]; #[inline(always)] fn index(&self, index: usize) -> &Self::Output { &self.mi_block_info[index * self.mi_width..(index + 1) * self.mi_width] } } impl IndexMut for MiTileState { #[inline(always)] fn index_mut(&mut self, index: usize) -> &mut Self::Output { &mut self.mi_block_info[index * self.mi_width..(index + 1) * self.mi_width] } } impl<'a, T: Pixel> TileStateMut<'a, T> { pub fn new( fs: &'a mut FrameState, sbo: PlaneSuperBlockOffset, sb_size_log2: usize, width: usize, height: usize, frame_me_stats: &'a mut [FrameMEStats], ) -> Self { debug_assert!( width % MI_SIZE == 0, "Tile width must be a multiple of MI_SIZE" ); debug_assert!( height % MI_SIZE == 0, "Tile width must be a multiple of MI_SIZE" ); let sb_rounded_width = width.align_power_of_two(sb_size_log2); let sb_rounded_height = height.align_power_of_two(sb_size_log2); let luma_rect = TileRect { x: sbo.0.x << sb_size_log2, y: sbo.0.y << sb_size_log2, width: sb_rounded_width, height: sb_rounded_height, }; let sb_width = width.align_power_of_two_and_shift(sb_size_log2); let sb_height = height.align_power_of_two_and_shift(sb_size_log2); Self { sbo, sb_size_log2, sb_width, sb_height, mi_width: width >> MI_SIZE_LOG2, mi_height: height >> MI_SIZE_LOG2, width, height, input: &fs.input, input_tile: Tile::new(&fs.input, luma_rect), input_hres: &fs.input_hres, input_qres: &fs.input_qres, deblock: &fs.deblock, rec: TileMut::new(Arc::make_mut(&mut fs.rec), luma_rect), qc: Default::default(), segmentation: &fs.segmentation, restoration: TileRestorationStateMut::new( &mut fs.restoration, sbo, sb_width, sb_height, ), me_stats: frame_me_stats .iter_mut() .map(|fmvs| { TileMEStatsMut::new( fmvs, sbo.0.x << (sb_size_log2 - MI_SIZE_LOG2), sbo.0.y << (sb_size_log2 - MI_SIZE_LOG2), width >> MI_SIZE_LOG2, height >> MI_SIZE_LOG2, ) }) .collect(), coded_block_info: MiTileState::new( width >> MI_SIZE_LOG2, height >> MI_SIZE_LOG2, ), integral_buffer: IntegralImageBuffer::zeroed(SOLVE_IMAGE_SIZE), inter_compound_buffers: InterCompoundBuffers::default(), } } #[inline(always)] pub fn tile_rect(&self) -> TileRect { TileRect { x: self.sbo.0.x << self.sb_size_log2, y: self.sbo.0.y << self.sb_size_log2, width: self.width, height: self.height, } } #[inline(always)] pub fn to_frame_block_offset( &self, tile_bo: TileBlockOffset, ) -> PlaneBlockOffset { let bx = self.sbo.0.x << (self.sb_size_log2 - MI_SIZE_LOG2); let by = self.sbo.0.y << (self.sb_size_log2 - MI_SIZE_LOG2); PlaneBlockOffset(BlockOffset { x: bx + tile_bo.0.x, y: by + tile_bo.0.y }) } #[inline(always)] pub fn to_frame_super_block_offset( &self, tile_sbo: TileSuperBlockOffset, ) -> PlaneSuperBlockOffset { PlaneSuperBlockOffset(SuperBlockOffset { x: self.sbo.0.x + tile_sbo.0.x, y: self.sbo.0.y + tile_sbo.0.y, }) } /// Returns above block information for context during prediction. 
rav1e-0.7.1/src/tiling/tiler.rs000064400000000000000000000646141046102023000144240ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved
//
// This source code is subject to the terms of the BSD 2 Clause License and
// the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
// was not distributed with this source code in the LICENSE file, you can
// obtain it at www.aomedia.org/license/software. If the Alliance for Open
// Media Patent License 1.0 was not distributed with this source code in the
// PATENTS file, you can obtain it at www.aomedia.org/license/patent.

use super::*;
use crate::context::*;
use crate::encoder::*;
use crate::me::WriteGuardMEStats;
use crate::util::*;
use std::iter::FusedIterator;
use std::marker::PhantomData;
use std::ops::DerefMut;

pub const MAX_TILE_WIDTH: usize = 4096;
pub const MAX_TILE_AREA: usize = 4096 * 2304;
pub const MAX_TILE_COLS: usize = 64;
pub const MAX_TILE_ROWS: usize = 64;
pub const MAX_TILE_RATE: f64 = 4096f64 * 2176f64 * 60f64 * 1.1;

/// Tiling information
///
/// This stores everything necessary to split a frame into tiles, and write
/// header fields into the bitstream.
///
/// The method `tile_iter_mut()` actually provides tiled views of `FrameState`
/// and `FrameBlocks`.
#[derive(Debug, Clone, Copy)]
pub struct TilingInfo {
  pub frame_width: usize,
  pub frame_height: usize,
  pub tile_width_sb: usize,
  pub tile_height_sb: usize,
  pub cols: usize, // number of columns of tiles within the whole frame
  pub rows: usize, // number of rows of tiles within the whole frame
  pub tile_cols_log2: usize,
  pub tile_rows_log2: usize,
  pub min_tile_cols_log2: usize,
  pub max_tile_cols_log2: usize,
  pub min_tile_rows_log2: usize,
  pub max_tile_rows_log2: usize,
  pub sb_size_log2: usize,
  pub min_tiles_log2: usize,
}

impl TilingInfo {
  /// # Panics
  ///
  /// Panics if the resulting tile sizes would be too large.
pub fn from_target_tiles( sb_size_log2: usize, frame_width: usize, frame_height: usize, frame_rate: f64, tile_cols_log2: usize, tile_rows_log2: usize, is_422_p: bool, ) -> Self { // // Frame::new() aligns to the next multiple of 8 let frame_width = frame_width.align_power_of_two(3); let frame_height = frame_height.align_power_of_two(3); let frame_width_sb = frame_width.align_power_of_two_and_shift(sb_size_log2); let frame_height_sb = frame_height.align_power_of_two_and_shift(sb_size_log2); let sb_cols = frame_width.align_power_of_two_and_shift(sb_size_log2); let sb_rows = frame_height.align_power_of_two_and_shift(sb_size_log2); // these are bitstream-defined values and must not be changed let max_tile_width_sb = MAX_TILE_WIDTH >> sb_size_log2; let max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); let min_tile_cols_log2 = Self::tile_log2(max_tile_width_sb, sb_cols).unwrap(); let max_tile_cols_log2 = Self::tile_log2(1, sb_cols.min(MAX_TILE_COLS)).unwrap(); let max_tile_rows_log2 = Self::tile_log2(1, sb_rows.min(MAX_TILE_ROWS)).unwrap(); let min_tiles_log2 = min_tile_cols_log2 .max(Self::tile_log2(max_tile_area_sb, sb_cols * sb_rows).unwrap()); // Implements restriction in Annex A of the spec. // Unlike the other restrictions, this one does not change // the header coding of the tile rows/cols. let min_tiles_ratelimit_log2 = min_tiles_log2.max( ((frame_width * frame_height) as f64 * frame_rate / MAX_TILE_RATE) .ceil() .log2() .ceil() as usize, ); let tile_cols_log2 = tile_cols_log2.clamp(min_tile_cols_log2, max_tile_cols_log2); let tile_width_sb_pre = sb_cols.align_power_of_two_and_shift(tile_cols_log2); // If this is 4:2:2, our UV horizontal is subsampled but not our // vertical. Loop Restoration Units must be square, so they // will always have an even number of horizontal superblocks. For // tiles and LRUs to align, tile_width_sb must be even in 4:2:2 // video. // This is only relevant when doing loop restoration RDO inline // with block/superblock encoding, that is, where tiles are // relevant. If (when) we introduce optionally delaying loop-filter // encode to after the partitioning loop, we won't need to make // any 4:2:2 adjustment. let tile_width_sb = if is_422_p { (tile_width_sb_pre + 1) >> 1 << 1 } else { tile_width_sb_pre }; let cols = (frame_width_sb + tile_width_sb - 1) / tile_width_sb; // Adjust tile_cols_log2 in case of rounding tile_width_sb to even. let tile_cols_log2 = Self::tile_log2(1, cols).unwrap(); assert!(tile_cols_log2 >= min_tile_cols_log2); let min_tile_rows_log2 = if min_tiles_log2 > tile_cols_log2 { min_tiles_log2 - tile_cols_log2 } else { 0 }; let min_tile_rows_ratelimit_log2 = if min_tiles_ratelimit_log2 > tile_cols_log2 { min_tiles_ratelimit_log2 - tile_cols_log2 } else { 0 }; let tile_rows_log2 = tile_rows_log2 .max(min_tile_rows_log2) .clamp(min_tile_rows_ratelimit_log2, max_tile_rows_log2); let tile_height_sb = sb_rows.align_power_of_two_and_shift(tile_rows_log2); let rows = (frame_height_sb + tile_height_sb - 1) / tile_height_sb; Self { frame_width, frame_height, tile_width_sb, tile_height_sb, cols, rows, tile_cols_log2, tile_rows_log2, min_tile_cols_log2, max_tile_cols_log2, min_tile_rows_log2, max_tile_rows_log2, sb_size_log2, min_tiles_log2, } } /// Return the smallest value for `k` such that `blkSize << k` is greater than /// or equal to `target`. /// /// pub fn tile_log2(blk_size: usize, target: usize) -> Option { let mut k = 0; while (blk_size.checked_shl(k)?) 
< target { k += 1; } Some(k as usize) } #[inline(always)] pub const fn tile_count(&self) -> usize { self.cols * self.rows } /// Split frame-level structures into tiles /// /// Provide mutable tiled views of frame-level structures. pub fn tile_iter_mut<'a, T: Pixel>( &self, fs: &'a mut FrameState, fb: &'a mut FrameBlocks, ) -> TileContextIterMut<'a, T> { let afs = fs as *mut _; let afb = fb as *mut _; let frame_me_stats = fs.frame_me_stats.write().expect("poisoned lock"); TileContextIterMut { ti: *self, fs: afs, fb: afb, next: 0, frame_me_stats } } } /// Container for all tiled views pub struct TileContextMut<'a, T: Pixel> { pub ts: TileStateMut<'a, T>, pub tb: TileBlocksMut<'a>, } /// Iterator over tiled views pub struct TileContextIterMut<'a, T: Pixel> { ti: TilingInfo, fs: *mut FrameState, fb: *mut FrameBlocks, frame_me_stats: WriteGuardMEStats<'a>, next: usize, } impl<'a, T: Pixel> Iterator for TileContextIterMut<'a, T> { type Item = TileContextMut<'a, T>; fn next(&mut self) -> Option { if self.next < self.ti.rows * self.ti.cols { let tile_col = self.next % self.ti.cols; let tile_row = self.next / self.ti.cols; let ctx = TileContextMut { ts: { // SAFETY: Multiple tiles mutably access this struct. // The dimensions must be configured correctly to ensure // the tiles do not overlap. let fs = unsafe { &mut *self.fs }; // SAFETY: ditto let frame_me_stats = unsafe { let len = self.frame_me_stats.len(); let ptr = self.frame_me_stats.as_mut_ptr(); std::slice::from_raw_parts_mut(ptr, len) }; let sbo = PlaneSuperBlockOffset(SuperBlockOffset { x: tile_col * self.ti.tile_width_sb, y: tile_row * self.ti.tile_height_sb, }); let x = sbo.0.x << self.ti.sb_size_log2; let y = sbo.0.y << self.ti.sb_size_log2; let tile_width = self.ti.tile_width_sb << self.ti.sb_size_log2; let tile_height = self.ti.tile_height_sb << self.ti.sb_size_log2; let width = tile_width.min(self.ti.frame_width - x); let height = tile_height.min(self.ti.frame_height - y); TileStateMut::new( fs, sbo, self.ti.sb_size_log2, width, height, frame_me_stats, ) }, tb: { // SAFETY: Multiple tiles mutably access this struct. // The dimensions must be configured correctly to ensure // the tiles do not overlap. 
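// Each iteration covers a distinct (tile_col, tile_row) pair, and the
// region computed below is clipped to that tile, so the yielded
// TileBlocksMut views cover disjoint block ranges and never alias.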
let fb = unsafe { &mut *self.fb }; let tile_width_mi = self.ti.tile_width_sb << (self.ti.sb_size_log2 - MI_SIZE_LOG2); let tile_height_mi = self.ti.tile_height_sb << (self.ti.sb_size_log2 - MI_SIZE_LOG2); let x = tile_col * tile_width_mi; let y = tile_row * tile_height_mi; let cols = tile_width_mi.min(fb.cols - x); let rows = tile_height_mi.min(fb.rows - y); TileBlocksMut::new(fb, x, y, cols, rows) }, }; self.next += 1; Some(ctx) } else { None } } fn size_hint(&self) -> (usize, Option) { let remaining = self.ti.cols * self.ti.rows - self.next; (remaining, Some(remaining)) } } impl ExactSizeIterator for TileContextIterMut<'_, T> {} impl FusedIterator for TileContextIterMut<'_, T> {} #[cfg(test)] pub mod test { use super::*; use crate::api::*; use crate::lrf::*; use crate::mc::MotionVector; use crate::predict::PredictionMode; use std::sync::Arc; #[test] fn test_tiling_info_from_tile_count() { let sb_size_log2 = 6; let (width, height) = (160, 144); let frame_rate = 25f64; let ti = TilingInfo::from_target_tiles( sb_size_log2, width, height, frame_rate, 0, 0, false, ); assert_eq!(1, ti.cols); assert_eq!(1, ti.rows); assert_eq!(3, ti.tile_width_sb); assert_eq!(3, ti.tile_height_sb); let ti = TilingInfo::from_target_tiles( sb_size_log2, width, height, frame_rate, 1, 1, false, ); assert_eq!(2, ti.cols); assert_eq!(2, ti.rows); assert_eq!(2, ti.tile_width_sb); assert_eq!(2, ti.tile_height_sb); let ti = TilingInfo::from_target_tiles( sb_size_log2, width, height, frame_rate, 2, 2, false, ); assert_eq!(3, ti.cols); assert_eq!(3, ti.rows); assert_eq!(1, ti.tile_width_sb); assert_eq!(1, ti.tile_height_sb); // cannot split more than superblocks let ti = TilingInfo::from_target_tiles( sb_size_log2, width, height, frame_rate, 10, 8, false, ); assert_eq!(3, ti.cols); assert_eq!(3, ti.rows); assert_eq!(1, ti.tile_width_sb); assert_eq!(1, ti.tile_height_sb); let ti = TilingInfo::from_target_tiles( sb_size_log2, 1024, 1024, frame_rate, 0, 0, false, ); assert_eq!(1, ti.cols); assert_eq!(1, ti.rows); assert_eq!(16, ti.tile_width_sb); assert_eq!(16, ti.tile_height_sb); } fn setup( width: usize, height: usize, ) -> (FrameInvariants, FrameState, FrameBlocks, f64) { // FrameInvariants aligns to the next multiple of 8, so using other values could make tests confusing assert!(width & 7 == 0); assert!(height & 7 == 0); // We test only for 420 for now let chroma_sampling = ChromaSampling::Cs420; let config = Arc::new(EncoderConfig { width, height, bit_depth: 8, chroma_sampling, ..Default::default() }); let mut sequence = Sequence::new(&config); // These tests are all assuming SB-sized LRUs, so set that. 
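// (With large LRUs disabled, each loop-restoration unit spans exactly one
// superblock, which the restoration-unit counts asserted in the tests below
// rely on.)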
sequence.enable_large_lru = false; let frame_rate = config.frame_rate(); let fi = FrameInvariants::new(config, Arc::new(sequence)); let fs = FrameState::new(&fi); let fb = FrameBlocks::new(fi.w_in_b, fi.h_in_b); (fi, fs, fb, frame_rate) } #[test] fn test_tile_iter_len() { // frame size 160x144, 40x36 in 4x4-blocks let (fi, mut fs, mut fb, frame_rate) = setup(160, 144); { // 2x2 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 1, 1, false, ); let mut iter = ti.tile_iter_mut(&mut fs, &mut fb); assert_eq!(4, iter.len()); assert!(iter.next().is_some()); assert_eq!(3, iter.len()); assert!(iter.next().is_some()); assert_eq!(2, iter.len()); assert!(iter.next().is_some()); assert_eq!(1, iter.len()); assert!(iter.next().is_some()); assert_eq!(0, iter.len()); assert!(iter.next().is_none()); } { // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let mut iter = ti.tile_iter_mut(&mut fs, &mut fb); assert_eq!(9, iter.len()); assert!(iter.next().is_some()); assert_eq!(8, iter.len()); assert!(iter.next().is_some()); assert_eq!(7, iter.len()); assert!(iter.next().is_some()); assert_eq!(6, iter.len()); assert!(iter.next().is_some()); assert_eq!(5, iter.len()); assert!(iter.next().is_some()); assert_eq!(4, iter.len()); assert!(iter.next().is_some()); assert_eq!(3, iter.len()); assert!(iter.next().is_some()); assert_eq!(2, iter.len()); assert!(iter.next().is_some()); assert_eq!(1, iter.len()); assert!(iter.next().is_some()); assert_eq!(0, iter.len()); assert!(iter.next().is_none()); } } #[inline] fn rect( region: &PlaneRegionMut<'_, T>, ) -> (isize, isize, usize, usize) { let &Rect { x, y, width, height } = region.rect(); (x, y, width, height) } #[test] fn test_tile_area() { let (fi, mut fs, mut fb, frame_rate) = setup(160, 144); // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let tile_states = iter.map(|ctx| ctx.ts).collect::>(); // the frame must be split into 9 tiles: // // luma (Y) chroma (U) chroma (V) // 64x64 64x64 32x64 32x32 32x32 16x32 32x32 32x32 16x32 // 64x64 64x64 32x64 32x32 32x32 16x32 32x32 32x32 16x32 // 64x16 64x16 32x16 32x 8 32x 8 16x 8 32x 8 32x 8 16x 8 assert_eq!(9, tile_states.len()); let tile = &tile_states[0].rec; // the top-left tile assert_eq!((0, 0, 64, 64), rect(&tile.planes[0])); assert_eq!((0, 0, 32, 32), rect(&tile.planes[1])); assert_eq!((0, 0, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[1].rec; // the top-middle tile assert_eq!((64, 0, 64, 64), rect(&tile.planes[0])); assert_eq!((32, 0, 32, 32), rect(&tile.planes[1])); assert_eq!((32, 0, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[2].rec; // the top-right tile assert_eq!((128, 0, 64, 64), rect(&tile.planes[0])); assert_eq!((64, 0, 32, 32), rect(&tile.planes[1])); assert_eq!((64, 0, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[3].rec; // the middle-left tile assert_eq!((0, 64, 64, 64), rect(&tile.planes[0])); assert_eq!((0, 32, 32, 32), rect(&tile.planes[1])); assert_eq!((0, 32, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[4].rec; // the center tile assert_eq!((64, 64, 64, 64), rect(&tile.planes[0])); assert_eq!((32, 32, 32, 32), rect(&tile.planes[1])); assert_eq!((32, 32, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[5].rec; // the middle-right tile 
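// (tile column 2, row 1: the luma origin is (2 * 64, 1 * 64) = (128, 64);
// with 4:2:0 subsampling, the chroma rects are halved to 32x32 at (64, 32))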
assert_eq!((128, 64, 64, 64), rect(&tile.planes[0])); assert_eq!((64, 32, 32, 32), rect(&tile.planes[1])); assert_eq!((64, 32, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[6].rec; // the bottom-left tile assert_eq!((0, 128, 64, 64), rect(&tile.planes[0])); assert_eq!((0, 64, 32, 32), rect(&tile.planes[1])); assert_eq!((0, 64, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[7].rec; // the bottom-middle tile assert_eq!((64, 128, 64, 64), rect(&tile.planes[0])); assert_eq!((32, 64, 32, 32), rect(&tile.planes[1])); assert_eq!((32, 64, 32, 32), rect(&tile.planes[2])); let tile = &tile_states[8].rec; // the bottom-right tile assert_eq!((128, 128, 64, 64), rect(&tile.planes[0])); assert_eq!((64, 64, 32, 32), rect(&tile.planes[1])); assert_eq!((64, 64, 32, 32), rect(&tile.planes[2])); } #[inline] const fn b_area(region: &TileBlocksMut<'_>) -> (usize, usize, usize, usize) { (region.x(), region.y(), region.cols(), region.rows()) } #[test] fn test_tile_blocks_area() { let (fi, mut fs, mut fb, frame_rate) = setup(160, 144); // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let tbs = iter.map(|ctx| ctx.tb).collect::>(); // the FrameBlocks must be split into 9 TileBlocks: // // 16x16 16x16 8x16 // 16x16 16x16 8x16 // 16x 4 16x4 8x 4 assert_eq!(9, tbs.len()); assert_eq!((0, 0, 16, 16), b_area(&tbs[0])); assert_eq!((16, 0, 16, 16), b_area(&tbs[1])); assert_eq!((32, 0, 8, 16), b_area(&tbs[2])); assert_eq!((0, 16, 16, 16), b_area(&tbs[3])); assert_eq!((16, 16, 16, 16), b_area(&tbs[4])); assert_eq!((32, 16, 8, 16), b_area(&tbs[5])); assert_eq!((0, 32, 16, 4), b_area(&tbs[6])); assert_eq!((16, 32, 16, 4), b_area(&tbs[7])); assert_eq!((32, 32, 8, 4), b_area(&tbs[8])); } #[test] fn test_tile_write() { let (fi, mut fs, mut fb, frame_rate) = setup(160, 144); { // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let mut tile_states = iter.map(|ctx| ctx.ts).collect::>(); { // row 12 of Y-plane of the top-left tile let tile_plane = &mut tile_states[0].rec.planes[0]; let row = &mut tile_plane[12]; assert_eq!(64, row.len()); row[35..41].copy_from_slice(&[4, 42, 12, 18, 15, 31]); } { // row 8 of U-plane of the middle-right tile let tile_plane = &mut tile_states[5].rec.planes[1]; let row = &mut tile_plane[8]; assert_eq!(32, row.len()); row[..4].copy_from_slice(&[14, 121, 1, 3]); } { // row 1 of V-plane of the bottom-middle tile let tile_plane = &mut tile_states[7].rec.planes[2]; let row = &mut tile_plane[1]; assert_eq!(32, row.len()); row[11..16].copy_from_slice(&[6, 5, 2, 11, 8]); } } // check that writes on tiles correctly affected the underlying frame let plane = &fs.rec.planes[0]; let y = plane.cfg.yorigin + 12; let x = plane.cfg.xorigin + 35; let idx = y * plane.cfg.stride + x; assert_eq!(&[4, 42, 12, 18, 15, 31], &plane.data[idx..idx + 6]); let plane = &fs.rec.planes[1]; let offset = (64, 32); // middle-right tile, chroma plane let y = plane.cfg.yorigin + offset.1 + 8; let x = plane.cfg.xorigin + offset.0; let idx = y * plane.cfg.stride + x; assert_eq!(&[14, 121, 1, 3], &plane.data[idx..idx + 4]); let plane = &fs.rec.planes[2]; let offset = (32, 64); // bottom-middle tile, chroma plane let y = plane.cfg.yorigin + offset.1 + 1; let x = plane.cfg.xorigin + offset.0 + 11; let idx = y * 
plane.cfg.stride + x; assert_eq!(&[6, 5, 2, 11, 8], &plane.data[idx..idx + 5]); } #[test] fn test_tile_restoration_edges() { let (fi, mut fs, mut fb, frame_rate) = setup(64, 80); let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let mut tile_states = iter.map(|ctx| ctx.ts).collect::>(); assert_eq!(tile_states.len(), 2); { let trs = &mut tile_states[0].restoration; let units = &trs.planes[0].units; assert_eq!(units.x(), 0); assert_eq!(units.y(), 0); assert_eq!(units.cols(), 1); assert_eq!(units.rows(), 1); } { let trs = &mut tile_states[1].restoration; let units = &trs.planes[0].units; assert_eq!(units.x(), 0); assert_eq!(units.y(), 1); // no units, the tile is too small (less than 1/2 super-block) assert_eq!(units.cols() * units.rows(), 0); } } #[test] fn test_tile_restoration_write() { let (fi, mut fs, mut fb, frame_rate) = setup(256, 256); { // 2x2 tiles, each one containing 2×2 restoration units (1 super-block per restoration unit) let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 1, 1, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let mut tile_states = iter.map(|ctx| ctx.ts).collect::>(); { // unit (1, 0) of Y-plane of the top-left tile let units = &mut tile_states[0].restoration.planes[0].units; units[0][1].filter = RestorationFilter::Wiener { coeffs: [[1, 2, 3], [4, 5, 6]] }; } { // unit (0, 1) of U-plane of the bottom-right tile let units = &mut tile_states[3].restoration.planes[1].units; units[1][0].filter = RestorationFilter::Sgrproj { set: 42, xqd: [10, 20] }; } { // unit (1, 1) of V-plane of the bottom-left tile let units = &mut tile_states[2].restoration.planes[2].units; units[1][1].filter = RestorationFilter::Sgrproj { set: 5, xqd: [1, 2] }; } } // check that writes on tiles correctly affected the underlying restoration units let units = &mut fs.restoration.planes[0].units; assert_eq!( units[0][1].filter, RestorationFilter::Wiener { coeffs: [[1, 2, 3], [4, 5, 6]] } ); let units = &mut fs.restoration.planes[1].units; assert_eq!( units[3][2].filter, RestorationFilter::Sgrproj { set: 42, xqd: [10, 20] } ); let units = &mut fs.restoration.planes[2].units; assert_eq!( units[3][1].filter, RestorationFilter::Sgrproj { set: 5, xqd: [1, 2] } ); } #[test] fn test_tile_motion_vectors_write() { let (fi, mut fs, mut fb, frame_rate) = setup(160, 144); { // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let mut tile_states = iter.map(|ctx| ctx.ts).collect::>(); { // block (8, 5) of the top-left tile (of the first ref frame) let me_stats = &mut tile_states[0].me_stats[0]; me_stats[5][8].mv = MotionVector { col: 42, row: 38 }; println!("{:?}", me_stats[5][8].mv); } { // block (4, 2) of the middle-right tile (of ref frame 2) let me_stats = &mut tile_states[5].me_stats[2]; me_stats[2][3].mv = MotionVector { col: 2, row: 14 }; } } // check that writes on tiled views affected the underlying motion vectors let me_stats = &fs.frame_me_stats.read().unwrap()[0]; assert_eq!(MotionVector { col: 42, row: 38 }, me_stats[5][8].mv); let me_stats = &fs.frame_me_stats.read().unwrap()[2]; let mix = (128 >> MI_SIZE_LOG2) + 3; let miy = (64 >> MI_SIZE_LOG2) + 2; assert_eq!(MotionVector { col: 2, row: 14 }, me_stats[miy][mix].mv); } #[test] fn test_tile_blocks_write() { let (fi, mut fs, mut fb, 
frame_rate) = setup(160, 144); { // 4x4 tiles requested, will actually get 3x3 tiles let ti = TilingInfo::from_target_tiles( fi.sb_size_log2(), fi.width, fi.height, frame_rate, 2, 2, false, ); let iter = ti.tile_iter_mut(&mut fs, &mut fb); let mut tbs = iter.map(|ctx| ctx.tb).collect::>(); { // top-left tile let tb = &mut tbs[0]; // block (4, 3) tb[3][4].n4_w = 42; // block (8, 5) tb[5][8].segmentation_idx = 14; } { // middle-right tile let tb = &mut tbs[5]; // block (0, 1) tb[1][0].n4_h = 11; // block (7, 5) tb[5][7].cdef_index = 3; } { // bottom-middle tile let tb = &mut tbs[7]; // block (3, 2) tb[2][3].mode = PredictionMode::PAETH_PRED; // block (1, 1) tb[1][1].n4_w = 8; } } // check that writes on tiles correctly affected the underlying blocks assert_eq!(42, fb[3][4].n4_w); assert_eq!(14, fb[5][8].segmentation_idx); assert_eq!(11, fb[17][32].n4_h); assert_eq!(3, fb[21][39].cdef_index); assert_eq!(PredictionMode::PAETH_PRED, fb[34][19].mode); assert_eq!(8, fb[33][17].n4_w); } #[test] fn tile_log2_overflow() { assert_eq!(TilingInfo::tile_log2(1, usize::MAX), None); } #[test] fn from_target_tiles_422() { let sb_size_log2 = 6; let is_422_p = true; let frame_rate = 60.; let sb_size = 1 << sb_size_log2; for frame_height in (sb_size..4352).step_by(sb_size) { for tile_rows_log2 in 0..=TilingInfo::tile_log2(1, frame_height >> sb_size_log2).unwrap() { for frame_width in (sb_size..7680).step_by(sb_size) { for tile_cols_log2 in 0..=TilingInfo::tile_log2(1, frame_width >> sb_size_log2).unwrap() { let ti = TilingInfo::from_target_tiles( sb_size_log2, frame_width, frame_height, frame_rate, tile_cols_log2, tile_rows_log2, is_422_p, ); assert_eq!( ti.tile_cols_log2, TilingInfo::tile_log2(1, ti.cols).unwrap() ); assert_eq!( ti.tile_rows_log2, TilingInfo::tile_log2(1, ti.rows).unwrap() ); } } } } } } rav1e-0.7.1/src/token_cdfs.rs000064400000000000000000003070421046102023000141310ustar 00000000000000// Copyright (c) 2018-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
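// The tables below hold default symbol CDFs for the entropy coder, indexed as
// spelled out in each static's type (quantizer context, transform size, plane
// type, per-symbol context). The raw numbers are on AV1's 15-bit probability
// scale (out of 32768), so a binary-symbol entry of 16384 is an even,
// maximally uninformative split; the `cdf_4d`/`cdf_5d` helpers from
// `crate::util` build the statics from these nested literals.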
#![allow(non_upper_case_globals)] use crate::context::*; use crate::transform::*; use crate::util::*; const TOKEN_CDF_Q_CTXS: usize = 4; pub static av1_default_dc_sign_cdfs: [[[[u16; 2]; DC_SIGN_CONTEXTS]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [[128 * 125], [128 * 102], [128 * 147]], [[128 * 119], [128 * 101], [128 * 135]], ], [ [[128 * 125], [128 * 102], [128 * 147]], [[128 * 119], [128 * 101], [128 * 135]], ], [ [[128 * 125], [128 * 102], [128 * 147]], [[128 * 119], [128 * 101], [128 * 135]], ], [ [[128 * 125], [128 * 102], [128 * 147]], [[128 * 119], [128 * 101], [128 * 135]], ], ]); pub static av1_default_txb_skip_cdfs: [[[[u16; 2]; TXB_SKIP_CONTEXTS]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [ [31849], [5892], [12112], [21935], [20289], [27473], [32487], [7654], [19473], [29984], [9961], [30242], [32117], ], [ [31548], [1549], [10130], [16656], [18591], [26308], [32537], [5403], [18096], [30003], [16384], [16384], [16384], ], [ [29957], [5391], [18039], [23566], [22431], [25822], [32197], [3778], [15336], [28981], [16384], [16384], [16384], ], [ [17920], [1818], [7282], [25273], [10923], [31554], [32624], [1366], [15628], [30462], [146], [5132], [31657], ], [ [6308], [117], [1638], [2161], [16384], [10923], [30247], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [30371], [7570], [13155], [20751], [20969], [27067], [32013], [5495], [17942], [28280], [16384], [16384], [16384], ], [ [31782], [1836], [10689], [17604], [21622], [27518], [32399], [4419], [16294], [28345], [16384], [16384], [16384], ], [ [31901], [10311], [18047], [24806], [23288], [27914], [32296], [4215], [15756], [28341], [16384], [16384], [16384], ], [ [26726], [1045], [11703], [20590], [18554], [25970], [31938], [5583], [21313], [29390], [641], [22265], [31452], ], [ [26584], [188], [8847], [24519], [22938], [30583], [32608], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [29614], [9068], [12924], [19538], [17737], [24619], [30642], [4119], [16026], [25657], [16384], [16384], [16384], ], [ [31957], [3230], [11153], [18123], [20143], [26536], [31986], [3050], [14603], [25155], [16384], [16384], [16384], ], [ [32363], [10692], [19090], [24357], [24442], [28312], [32169], [3648], [15690], [26815], [16384], [16384], [16384], ], [ [30669], [3832], [11663], [18889], [19782], [23313], [31330], [5124], [18719], [28468], [3082], [20982], [29443], ], [ [28573], [3183], [17802], [25977], [26677], [27832], [32387], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [26887], [6729], [10361], [17442], [15045], [22478], [29072], [2713], [11861], [20773], [16384], [16384], [16384], ], [ [31903], [2044], [7528], [14618], [16182], [24168], [31037], [2786], [11194], [20155], [16384], [16384], [16384], ], [ [32510], [8430], [17318], [24154], [23674], [28789], [32139], [3440], [13117], [22702], [16384], [16384], [16384], ], [ [31671], [2056], [11746], [16852], [18635], [24715], [31484], [4656], [16074], [24704], [1806], [14645], [25336], ], [ [31539], [8433], [20576], [27904], [27852], [30026], [32441], [16384], [16384], [16384], [16384], [16384], [16384], ], ], ]); pub static av1_default_eob_extra_cdfs: [[[[[u16; 2]; EOB_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = cdf_5d([ [ [ [ [16961], [17223], [7621], [16384], [16384], [16384], [16384], [16384], [16384], ], [ [19069], [22525], [13377], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [20401], [17025], [12845], [12873], [14094], [16384], [16384], [16384], [16384], ], [ [20681], [20701], 
[15250], [15017], [14928], [16384], [16384], [16384], [16384], ], ], [ [ [23905], [17194], [16170], [17695], [13826], [15810], [12036], [16384], [16384], ], [ [23959], [20799], [19021], [16203], [17886], [14144], [12010], [16384], [16384], ], ], [ [ [27399], [16327], [18071], [19584], [20721], [18432], [19560], [10150], [8805], ], [ [24932], [20833], [12027], [16670], [19914], [15106], [17662], [13783], [28756], ], ], [ [ [23406], [21845], [18432], [16384], [17096], [12561], [17320], [22395], [21370], ], [ [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], ], ], ], [ [ [ [17471], [20223], [11357], [16384], [16384], [16384], [16384], [16384], [16384], ], [ [20335], [21667], [14818], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [20430], [20662], [15367], [16970], [14657], [16384], [16384], [16384], [16384], ], [ [22117], [22028], [18650], [16042], [15885], [16384], [16384], [16384], [16384], ], ], [ [ [22409], [21012], [15650], [17395], [15469], [20205], [19511], [16384], [16384], ], [ [24220], [22480], [17737], [18916], [19268], [18412], [18844], [16384], [16384], ], ], [ [ [25991], [20314], [17731], [19678], [18649], [17307], [21798], [17549], [15630], ], [ [26585], [21469], [20432], [17735], [19280], [15235], [20297], [22471], [28997], ], ], [ [ [26605], [11304], [16726], [16560], [20866], [23524], [19878], [13469], [23084], ], [ [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], ], ], ], [ [ [ [18983], [20512], [14885], [16384], [16384], [16384], [16384], [16384], [16384], ], [ [20090], [19444], [17286], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [19139], [21487], [18959], [20910], [19089], [16384], [16384], [16384], [16384], ], [ [20536], [20664], [20625], [19123], [14862], [16384], [16384], [16384], [16384], ], ], [ [ [19833], [21502], [17485], [20267], [18353], [23329], [21478], [16384], [16384], ], [ [22041], [23434], [20001], [20554], [20951], [20145], [15562], [16384], [16384], ], ], [ [ [23312], [21607], [16526], [18957], [18034], [18934], [24247], [16921], [17080], ], [ [26579], [24910], [18637], [19800], [20388], [9887], [15642], [30198], [24721], ], ], [ [ [26998], [16737], [17838], [18922], [19515], [18636], [17333], [15776], [22658], ], [ [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], ], ], ], [ [ [ [20177], [20789], [20262], [16384], [16384], [16384], [16384], [16384], [16384], ], [ [21416], [20855], [23410], [16384], [16384], [16384], [16384], [16384], [16384], ], ], [ [ [20238], [21057], [19159], [22337], [20159], [16384], [16384], [16384], [16384], ], [ [20125], [20559], [21707], [22296], [17333], [16384], [16384], [16384], [16384], ], ], [ [ [19941], [20527], [21470], [22487], [19558], [22354], [20331], [16384], [16384], ], [ [22752], [25006], [22075], [21576], [17740], [21690], [19211], [16384], [16384], ], ], [ [ [21442], [22358], [18503], [20291], [19945], [21294], [21178], [19400], [10556], ], [ [24648], [24949], [20708], [23905], [20501], [9558], [9423], [30365], [19253], ], ], [ [ [26064], [22098], [19613], [20525], [17595], [16618], [20497], [18989], [15513], ], [ [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], [16384], ], ], ], ]); pub static av1_default_eob_multi16_cdfs: [[[[u16; 5]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [[840, 1039, 1980, 4895], [370, 671, 1883, 4471]], [[3247, 4950, 9688, 14563], [1904, 3354, 7763, 14647]], ], [ [[2125, 2551, 5165, 8946], [513, 765, 1859, 6339]], [[7637, 9498, 
14259, 19108], [2497, 4096, 8866, 16993]], ], [ [[4016, 4897, 8881, 14968], [716, 1105, 2646, 10056]], [[11139, 13270, 18241, 23566], [3192, 5032, 10297, 19755]], ], [ [[6708, 8958, 14746, 22133], [1222, 2074, 4783, 15410]], [[19575, 21766, 26044, 29709], [7297, 10767, 19273, 28194]], ], ]); pub static av1_default_eob_multi32_cdfs: [[[[u16; 6]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [[400, 520, 977, 2102, 6542], [210, 405, 1315, 3326, 7537]], [[2636, 4273, 7588, 11794, 20401], [1786, 3179, 6902, 11357, 19054]], ], [ [[989, 1249, 2019, 4151, 10785], [313, 441, 1099, 2917, 8562]], [[8394, 10352, 13932, 18855, 26014], [2578, 4124, 8181, 13670, 24234]], ], [ [[2515, 3003, 4452, 8162, 16041], [574, 821, 1836, 5089, 13128]], [[13468, 16303, 20361, 25105, 29281], [3542, 5502, 10415, 16760, 25644]], ], [ [[4617, 5709, 8446, 13584, 23135], [1156, 1702, 3675, 9274, 20539]], [[22086, 24282, 27010, 29770, 31743], [7699, 10897, 20891, 26926, 31628]], ], ]); pub static av1_default_eob_multi64_cdfs: [[[[u16; 7]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [[329, 498, 1101, 1784, 3265, 7758], [335, 730, 1459, 5494, 8755, 12997]], [ [3505, 5304, 10086, 13814, 17684, 23370], [1563, 2700, 4876, 10911, 14706, 22480], ], ], [ [ [1260, 1446, 2253, 3712, 6652, 13369], [401, 605, 1029, 2563, 5845, 12626], ], [ [8609, 10612, 14624, 18714, 22614, 29024], [1923, 3127, 5867, 9703, 14277, 27100], ], ], [ [ [2374, 2772, 4583, 7276, 12288, 19706], [497, 810, 1315, 3000, 7004, 15641], ], [ [15050, 17126, 21410, 24886, 28156, 30726], [4034, 6290, 10235, 14982, 21214, 28491], ], ], [ [ [6307, 7541, 12060, 16358, 22553, 27865], [1289, 2320, 3971, 7926, 14153, 24291], ], [ [24212, 25708, 28268, 30035, 31307, 32049], [8726, 12378, 19409, 26450, 30038, 32462], ], ], ]); pub static av1_default_eob_multi128_cdfs: [[[[u16; 8]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [ [219, 482, 1140, 2091, 3680, 6028, 12586], [371, 699, 1254, 4830, 9479, 12562, 17497], ], [ [5245, 7456, 12880, 15852, 20033, 23932, 27608], [2054, 3472, 5869, 14232, 18242, 20590, 26752], ], ], [ [ [685, 933, 1488, 2714, 4766, 8562, 19254], [217, 352, 618, 2303, 5261, 9969, 17472], ], [ [8045, 11200, 15497, 19595, 23948, 27408, 30938], [2310, 4160, 7471, 14997, 17931, 20768, 30240], ], ], [ [ [1366, 1738, 2527, 5016, 9355, 15797, 24643], [354, 558, 944, 2760, 7287, 14037, 21779], ], [ [13627, 16246, 20173, 24429, 27948, 30415, 31863], [6275, 9889, 14769, 23164, 27988, 30493, 32272], ], ], [ [ [3472, 4885, 7489, 12481, 18517, 24536, 29635], [886, 1731, 3271, 8469, 15569, 22126, 28383], ], [ [24313, 26062, 28385, 30107, 31217, 31898, 32345], [9165, 13282, 21150, 30286, 31894, 32571, 32712], ], ], ]); pub static av1_default_eob_multi256_cdfs: [[[[u16; 9]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [ [310, 584, 1887, 3589, 6168, 8611, 11352, 15652], [998, 1850, 2998, 5604, 17341, 19888, 22899, 25583], ], [ [2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168], [2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261], ], ], [ [ [1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300], [399, 1019, 1749, 3038, 10444, 15546, 22739, 27294], ], [ [6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377], [1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940], ], ], [ [ [3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176], [1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799], ], [ [11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277], [6571, 9610, 15516, 21826, 29092, 30829, 31842, 32708], ], ], [ [ [5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757], 
[2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547], ], [ [23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094], [9998, 17661, 25178, 28097, 31308, 32038, 32403, 32695], ], ], ]); pub static av1_default_eob_multi512_cdfs: [[[[u16; 10]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [ [641, 983, 3707, 5430, 10234, 14958, 18788, 23412, 26061], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], [ [5095, 6446, 9996, 13354, 16017, 17986, 20919, 26129, 29140], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], ], [ [ [1230, 2278, 5035, 7776, 11871, 15346, 19590, 24584, 28749], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], [ [7265, 9979, 15819, 19250, 21780, 23846, 26478, 28396, 31811], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], ], [ [ [2624, 3936, 6480, 9686, 13979, 17726, 23267, 28410, 31078], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], [ [12015, 14769, 19588, 22052, 24222, 25812, 27300, 29219, 32114], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], ], [ [ [5927, 7809, 10923, 14597, 19439, 24135, 28456, 31142, 32060], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], [ [21093, 23043, 25742, 27658, 29097, 29716, 30073, 30820, 31956], [3277, 6554, 9830, 13107, 16384, 19661, 22938, 26214, 29491], ], ], ]); pub static av1_default_eob_multi1024_cdfs: [[[[u16; 11]; 2]; PLANE_TYPES]; TOKEN_CDF_Q_CTXS] = cdf_4d([ [ [ [393, 421, 751, 1623, 3160, 6352, 13345, 18047, 22571, 25830], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], [ [1865, 1988, 2930, 4242, 10533, 16538, 21354, 27255, 28546, 31784], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], ], [ [ [696, 948, 3145, 5702, 9706, 13217, 17851, 21856, 25692, 28034], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], [ [2672, 3591, 9330, 17084, 22725, 24284, 26527, 28027, 28377, 30876], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], ], [ [ [2784, 3831, 7041, 10521, 14847, 18844, 23155, 26682, 29229, 31045], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], [ [9577, 12466, 17739, 20750, 22061, 23215, 24601, 25483, 25843, 32056], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], ], [ [ [6698, 8334, 11961, 15762, 20186, 23862, 27434, 29326, 31082, 32050], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], [ [20569, 22426, 25569, 26859, 28053, 28913, 29486, 29724, 29807, 32570], [2979, 5958, 8937, 11916, 14895, 17873, 20852, 23831, 26810, 29789], ], ], ]); pub static av1_default_coeff_lps_multi_cdfs: [[[[[u16; BR_CDF_SIZE]; LEVEL_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = cdf_5d([ [ [ [ [14298, 20718, 24174], [12536, 19601, 23789], [8712, 15051, 19503], [6170, 11327, 15434], [4742, 8926, 12538], [3803, 7317, 10546], [1696, 3317, 4871], [14392, 19951, 22756], [15978, 23218, 26818], [12187, 19474, 23889], [9176, 15640, 20259], [7068, 12655, 17028], [5656, 10442, 14472], [2580, 4992, 7244], [12136, 18049, 21426], [13784, 20721, 24481], [10836, 17621, 21900], [8372, 14444, 18847], [6523, 11779, 16000], [5337, 9898, 13760], [3034, 5860, 8462], ], [ [15967, 22905, 26286], [13534, 20654, 24579], [9504, 16092, 20535], [6975, 12568, 16903], [5364, 10091, 14020], [4357, 8370, 11857], [2506, 4934, 7218], [23032, 28815, 30936], [19540, 26704, 29719], [15158, 22969, 27097], [11408, 18865, 23650], [8885, 15448, 20250], [7108, 12853, 17416], [4231, 8041, 11480], [19823, 
26490, 29156], [18890, 25929, 28932], [15660, 23491, 27433], [12147, 19776, 24488], [9728, 16774, 21649], [7919, 14277, 19066], [5440, 10170, 14185], ], ], [ [ [14406, 20862, 24414], [11824, 18907, 23109], [8257, 14393, 18803], [5860, 10747, 14778], [4475, 8486, 11984], [3606, 6954, 10043], [1736, 3410, 5048], [14430, 20046, 22882], [15593, 22899, 26709], [12102, 19368, 23811], [9059, 15584, 20262], [6999, 12603, 17048], [5684, 10497, 14553], [2822, 5438, 7862], [15785, 21585, 24359], [18347, 25229, 28266], [14974, 22487, 26389], [11423, 18681, 23271], [8863, 15350, 20008], [7153, 12852, 17278], [3707, 7036, 9982], ], [ [15460, 21696, 25469], [12170, 19249, 23191], [8723, 15027, 19332], [6428, 11704, 15874], [4922, 9292, 13052], [4139, 7695, 11010], [2291, 4508, 6598], [19856, 26920, 29828], [17923, 25289, 28792], [14278, 21968, 26297], [10910, 18136, 22950], [8423, 14815, 19627], [6771, 12283, 16774], [4074, 7750, 11081], [19852, 26074, 28672], [19371, 26110, 28989], [16265, 23873, 27663], [12758, 20378, 24952], [10095, 17098, 21961], [8250, 14628, 19451], [5205, 9745, 13622], ], ], [ [ [10563, 16233, 19763], [9794, 16022, 19804], [6750, 11945, 15759], [4963, 9186, 12752], [3845, 7435, 10627], [3051, 6085, 8834], [1311, 2596, 3830], [11246, 16404, 19689], [12315, 18911, 22731], [10557, 17095, 21289], [8136, 14006, 18249], [6348, 11474, 15565], [5196, 9655, 13400], [2349, 4526, 6587], [13337, 18730, 21569], [19306, 26071, 28882], [15952, 23540, 27254], [12409, 19934, 24430], [9760, 16706, 21389], [8004, 14220, 18818], [4138, 7794, 10961], ], [ [10870, 16684, 20949], [9664, 15230, 18680], [6886, 12109, 15408], [4825, 8900, 12305], [3630, 7162, 10314], [3036, 6429, 9387], [1671, 3296, 4940], [13819, 19159, 23026], [11984, 19108, 23120], [10690, 17210, 21663], [7984, 14154, 18333], [6868, 12294, 16124], [5274, 8994, 12868], [2988, 5771, 8424], [19736, 26647, 29141], [18933, 26070, 28984], [15779, 23048, 27200], [12638, 20061, 24532], [10692, 17545, 22220], [9217, 15251, 20054], [5078, 9284, 12594], ], ], [ [ [2331, 3662, 5244], [2891, 4771, 6145], [4598, 7623, 9729], [3520, 6845, 9199], [3417, 6119, 9324], [2601, 5412, 7385], [600, 1173, 1744], [7672, 13286, 17469], [4232, 7792, 10793], [2915, 5317, 7397], [2318, 4356, 6152], [2127, 4000, 5554], [1850, 3478, 5275], [977, 1933, 2843], [18280, 24387, 27989], [15852, 22671, 26185], [13845, 20951, 24789], [11055, 17966, 22129], [9138, 15422, 19801], [7454, 13145, 17456], [3370, 6393, 9013], ], [ [5842, 9229, 10838], [2313, 3491, 4276], [2998, 6104, 7496], [2420, 7447, 9868], [3034, 8495, 10923], [4076, 8937, 10975], [1086, 2370, 3299], [9714, 17254, 20444], [8543, 13698, 17123], [4918, 9007, 11910], [4129, 7532, 10553], [2364, 5533, 8058], [1834, 3546, 5563], [1473, 2908, 4133], [15405, 21193, 25619], [15691, 21952, 26561], [12962, 19194, 24165], [10272, 17855, 22129], [8588, 15270, 20718], [8682, 14669, 19500], [4870, 9636, 13205], ], ], [ [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 
24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [14995, 21341, 24749], [13158, 20289, 24601], [8941, 15326, 19876], [6297, 11541, 15807], [4817, 9029, 12776], [3731, 7273, 10627], [1847, 3617, 5354], [14472, 19659, 22343], [16806, 24162, 27533], [12900, 20404, 24713], [9411, 16112, 20797], [7056, 12697, 17148], [5544, 10339, 14460], [2954, 5704, 8319], [12464, 18071, 21354], [15482, 22528, 26034], [12070, 19269, 23624], [8953, 15406, 20106], [7027, 12730, 17220], [5887, 10913, 15140], [3793, 7278, 10447], ], [ [15571, 22232, 25749], [14506, 21575, 25374], [10189, 17089, 21569], [7316, 13301, 17915], [5783, 10912, 15190], [4760, 9155, 13088], [2993, 5966, 8774], [23424, 28903, 30778], [20775, 27666, 30290], [16474, 24410, 28299], [12471, 20180, 24987], [9410, 16487, 21439], [7536, 13614, 18529], [5048, 9586, 13549], [21090, 27290, 29756], [20796, 27402, 30026], [17819, 25485, 28969], [13860, 21909, 26462], [11002, 18494, 23529], [8953, 15929, 20897], [6448, 11918, 16454], ], ], [ [ [15999, 22208, 25449], [13050, 19988, 24122], [8594, 14864, 19378], [6033, 11079, 15238], [4554, 8683, 12347], [3672, 7139, 10337], [1900, 3771, 5576], [15788, 21340, 23949], [16825, 24235, 27758], [12873, 20402, 24810], [9590, 16363, 21094], [7352, 13209, 17733], [5960, 10989, 15184], [3232, 6234, 9007], [15761, 20716, 23224], [19318, 25989, 28759], [15529, 23094, 26929], [11662, 18989, 23641], [8955, 15568, 20366], [7281, 13106, 17708], [4248, 8059, 11440], ], [ [14899, 21217, 24503], [13519, 20283, 24047], [9429, 15966, 20365], [6700, 12355, 16652], [5088, 9704, 13716], [4243, 8154, 11731], [2702, 5364, 7861], [22745, 28388, 30454], [20235, 27146, 29922], [15896, 23715, 27637], [11840, 19350, 24131], [9122, 15932, 20880], [7488, 13581, 18362], [5114, 9568, 13370], [20845, 26553, 28932], [20981, 27372, 29884], [17781, 25335, 28785], [13760, 21708, 26297], [10975, 18415, 23365], [9045, 15789, 20686], [6130, 11199, 15423], ], ], [ [ [13549, 19724, 23158], [11844, 18382, 22246], [7919, 13619, 17773], [5486, 10143, 13946], [4166, 7983, 11324], [3364, 6506, 9427], [1598, 3160, 4674], [15281, 20979, 23781], [14939, 22119, 25952], [11363, 18407, 22812], [8609, 14857, 19370], [6737, 12184, 16480], [5506, 10263, 14262], [2990, 5786, 8380], [20249, 25253, 27417], [21070, 27518, 30001], [16854, 24469, 28074], [12864, 20486, 25000], [9962, 16978, 21778], [8074, 14338, 19048], [4494, 8479, 11906], ], [ [13960, 19617, 22829], [11150, 17341, 21228], [7150, 12964, 17190], [5331, 10002, 13867], [4167, 7744, 11057], [3480, 6629, 9646], [1883, 3784, 5686], [18752, 25660, 28912], [16968, 24586, 28030], [13520, 21055, 25313], [10453, 17626, 22280], [8386, 14505, 19116], [6742, 12595, 17008], [4273, 8140, 11499], [22120, 27827, 30233], [20563, 27358, 29895], [17076, 24644, 28153], [13362, 20942, 25309], [10794, 17965, 22695], [9014, 15652, 20319], [5708, 10512, 14497], ], ], [ [ [5705, 10930, 15725], [7946, 12765, 16115], [6801, 12123, 16226], [5462, 10135, 14200], [4189, 8011, 11507], [3191, 6229, 9408], [1057, 2137, 3212], [10018, 17067, 21491], [7380, 12582, 16453], [6068, 10845, 14339], [5098, 9198, 12555], [4312, 8010, 11119], [3700, 6966, 9781], [1693, 3326, 4887], [18757, 24930, 27774], [17648, 24596, 27817], [14707, 22052, 26026], [11720, 18852, 
23292], [9357, 15952, 20525], [7810, 13753, 18210], [3879, 7333, 10328], ], [ [8278, 13242, 15922], [10547, 15867, 18919], [9106, 15842, 20609], [6833, 13007, 17218], [4811, 9712, 13923], [3985, 7352, 11128], [1688, 3458, 5262], [12951, 21861, 26510], [9788, 16044, 20276], [6309, 11244, 14870], [5183, 9349, 12566], [4389, 8229, 11492], [3633, 6945, 10620], [3600, 6847, 9907], [21748, 28137, 30255], [19436, 26581, 29560], [16359, 24201, 27953], [13961, 21693, 25871], [11544, 18686, 23322], [9372, 16462, 20952], [6138, 11210, 15390], ], ], [ [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [16138, 22223, 25509], [15347, 22430, 26332], [9614, 16736, 21332], [6600, 12275, 16907], [4811, 9424, 13547], [3748, 7809, 11420], [2254, 4587, 6890], [15196, 20284, 23177], [18317, 25469, 28451], [13918, 21651, 25842], [10052, 17150, 21995], [7499, 13630, 18587], [6158, 11417, 16003], [4014, 7785, 11252], [15048, 21067, 24384], [18202, 25346, 28553], [14302, 22019, 26356], [10839, 18139, 23166], [8715, 15744, 20806], [7536, 13576, 18544], [5413, 10335, 14498], ], [ [17394, 24501, 27895], [15889, 23420, 27185], [11561, 19133, 23870], [8285, 14812, 19844], [6496, 12043, 16550], [4771, 9574, 13677], [3603, 6830, 10144], [21656, 27704, 30200], [21324, 27915, 30511], [17327, 25336, 28997], [13417, 21381, 26033], [10132, 17425, 22338], [8580, 15016, 19633], [5694, 11477, 16411], [24116, 29780, 31450], [23853, 29695, 31591], [20085, 27614, 30428], [15326, 24335, 28575], [11814, 19472, 24810], [10221, 18611, 24767], [7689, 14558, 20321], ], ], [ [ [16214, 22380, 25770], [14213, 21304, 25295], [9213, 15823, 20455], [6395, 11758, 16139], [4779, 9187, 13066], [3821, 7501, 10953], [2293, 4567, 6795], [15859, 21283, 23820], [18404, 25602, 28726], [14325, 21980, 26206], [10669, 17937, 22720], [8297, 14642, 19447], [6746, 12389, 16893], [4324, 8251, 11770], [16532, 21631, 24475], [20667, 27150, 29668], [16728, 24510, 28175], [12861, 20645, 25332], [10076, 17361, 22417], [8395, 14940, 19963], [5731, 10683, 14912], ], [ [14433, 21155, 24938], [14658, 21716, 25545], [9923, 16824, 21557], [6982, 13052, 17721], [5419, 10503, 15050], [4852, 9162, 13014], [3271, 6395, 9630], [22210, 27833, 30109], [20750, 27368, 29821], [16894, 24828, 28573], [13247, 21276, 25757], [10038, 17265, 22563], [8587, 14947, 20327], [5645, 11371, 15252], [22027, 27526, 29714], [23098, 29146, 31221], [19886, 27341, 30272], [15609, 23747, 28046], [11993, 20065, 24939], [9637, 18267, 23671], [7625, 13801, 19144], ], ], [ [ [14438, 20798, 24089], [12621, 19203, 23097], [8177, 14125, 18402], [5674, 10501, 14456], [4236, 8239, 11733], [3447, 6750, 9806], [1986, 3950, 5864], 
[16208, 22099, 24930], [16537, 24025, 27585], [12780, 20381, 24867], [9767, 16612, 21416], [7686, 13738, 18398], [6333, 11614, 15964], [3941, 7571, 10836], [22819, 27422, 29202], [22224, 28514, 30721], [17660, 25433, 28913], [13574, 21482, 26002], [10629, 17977, 22938], [8612, 15298, 20265], [5607, 10491, 14596], ], [ [13569, 19800, 23206], [13128, 19924, 23869], [8329, 14841, 19403], [6130, 10976, 15057], [4682, 8839, 12518], [3656, 7409, 10588], [2577, 5099, 7412], [22427, 28684, 30585], [20913, 27750, 30139], [15840, 24109, 27834], [12308, 20029, 24569], [10216, 16785, 21458], [8309, 14203, 19113], [6043, 11168, 15307], [23166, 28901, 30998], [21899, 28405, 30751], [18413, 26091, 29443], [15233, 23114, 27352], [12683, 20472, 25288], [10702, 18259, 23409], [8125, 14464, 19226], ], ], [ [ [9040, 14786, 18360], [9979, 15718, 19415], [7913, 13918, 18311], [5859, 10889, 15184], [4593, 8677, 12510], [3820, 7396, 10791], [1730, 3471, 5192], [11803, 18365, 22709], [11419, 18058, 22225], [9418, 15774, 20243], [7539, 13325, 17657], [6233, 11317, 15384], [5137, 9656, 13545], [2977, 5774, 8349], [21207, 27246, 29640], [19547, 26578, 29497], [16169, 23871, 27690], [12820, 20458, 25018], [10224, 17332, 22214], [8526, 15048, 19884], [5037, 9410, 13118], ], [ [12339, 17329, 20140], [13505, 19895, 23225], [9847, 16944, 21564], [7280, 13256, 18348], [4712, 10009, 14454], [4361, 7914, 12477], [2870, 5628, 7995], [20061, 25504, 28526], [15235, 22878, 26145], [12985, 19958, 24155], [9782, 16641, 21403], [9456, 16360, 20760], [6855, 12940, 18557], [5661, 10564, 15002], [25656, 30602, 31894], [22570, 29107, 31092], [18917, 26423, 29541], [15940, 23649, 27754], [12803, 20581, 25219], [11082, 18695, 23376], [7939, 14373, 19005], ], ], [ [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [18315, 24289, 27551], [16854, 24068, 27835], [10140, 17927, 23173], [6722, 12982, 18267], [4661, 9826, 14706], [3832, 8165, 12294], [2795, 6098, 9245], [17145, 23326, 26672], [20733, 27680, 30308], [16032, 24461, 28546], [11653, 20093, 25081], [9290, 16429, 22086], [7796, 14598, 19982], [6502, 12378, 17441], [21681, 27732, 30320], [22389, 29044, 31261], [19027, 26731, 30087], [14739, 23755, 28624], [11358, 20778, 25511], [10995, 18073, 24190], [9162, 14990, 20617], ], [ [21425, 27952, 30388], [18062, 25838, 29034], [11956, 19881, 24808], [7718, 15000, 20980], [5702, 11254, 16143], [4898, 9088, 16864], [3679, 6776, 11907], [23294, 30160, 31663], [24397, 29896, 31836], [19245, 27128, 30593], [13202, 19825, 26404], [11578, 19297, 23957], [8073, 13297, 21370], [5461, 10923, 19745], [27367, 30521, 31934], [24904, 30671, 31940], [23075, 28460, 31299], 
[14400, 23658, 30417], [13885, 23882, 28325], [14746, 22938, 27853], [5461, 16384, 27307], ], ], [ [ [18274, 24813, 27890], [15537, 23149, 27003], [9449, 16740, 21827], [6700, 12498, 17261], [4988, 9866, 14198], [4236, 8147, 11902], [2867, 5860, 8654], [17124, 23171, 26101], [20396, 27477, 30148], [16573, 24629, 28492], [12749, 20846, 25674], [10233, 17878, 22818], [8525, 15332, 20363], [6283, 11632, 16255], [20466, 26511, 29286], [23059, 29174, 31191], [19481, 27263, 30241], [15458, 23631, 28137], [12416, 20608, 25693], [10261, 18011, 23261], [8016, 14655, 19666], ], [ [17616, 24586, 28112], [15809, 23299, 27155], [10767, 18890, 23793], [7727, 14255, 18865], [6129, 11926, 16882], [4482, 9704, 14861], [3277, 7452, 11522], [22956, 28551, 30730], [22724, 28937, 30961], [18467, 26324, 29580], [13234, 20713, 25649], [11181, 17592, 22481], [8291, 18358, 24576], [7568, 11881, 14984], [24948, 29001, 31147], [25674, 30619, 32151], [20841, 26793, 29603], [14669, 24356, 28666], [11334, 23593, 28219], [8922, 14762, 22873], [8301, 13544, 20535], ], ], [ [ [17113, 23733, 27081], [14139, 21406, 25452], [8552, 15002, 19776], [5871, 11120, 15378], [4455, 8616, 12253], [3469, 6910, 10386], [2255, 4553, 6782], [18224, 24376, 27053], [19290, 26710, 29614], [14936, 22991, 27184], [11238, 18951, 23762], [8786, 15617, 20588], [7317, 13228, 18003], [5101, 9512, 13493], [22639, 28222, 30210], [23216, 29331, 31307], [19075, 26762, 29895], [15014, 23113, 27457], [11938, 19857, 24752], [9942, 17280, 22282], [7167, 13144, 17752], ], [ [15820, 22738, 26488], [13530, 20885, 25216], [8395, 15530, 20452], [6574, 12321, 16380], [5353, 10419, 14568], [4613, 8446, 12381], [3440, 7158, 9903], [24247, 29051, 31224], [22118, 28058, 30369], [16498, 24768, 28389], [12920, 21175, 26137], [10730, 18619, 25352], [10187, 16279, 22791], [9310, 14631, 22127], [24970, 30558, 32057], [24801, 29942, 31698], [22432, 28453, 30855], [19054, 25680, 29580], [14392, 23036, 28109], [12495, 20947, 26650], [12442, 20326, 26214], ], ], [ [ [12162, 18785, 22648], [12749, 19697, 23806], [8580, 15297, 20346], [6169, 11749, 16543], [4836, 9391, 13448], [3821, 7711, 11613], [2228, 4601, 7070], [16319, 24725, 28280], [15698, 23277, 27168], [12726, 20368, 25047], [9912, 17015, 21976], [7888, 14220, 19179], [6777, 12284, 17018], [4492, 8590, 12252], [23249, 28904, 30947], [21050, 27908, 30512], [17440, 25340, 28949], [14059, 22018, 26541], [11288, 18903, 23898], [9411, 16342, 21428], [6278, 11588, 15944], ], [ [13981, 20067, 23226], [16922, 23580, 26783], [11005, 19039, 24487], [7389, 14218, 19798], [5598, 11505, 17206], [6090, 11213, 15659], [3820, 7371, 10119], [21082, 26925, 29675], [21262, 28627, 31128], [18392, 26454, 30437], [14870, 22910, 27096], [12620, 19484, 24908], [9290, 16553, 22802], [6668, 14288, 20004], [27704, 31055, 31949], [24709, 29978, 31788], [21668, 29264, 31657], [18295, 26968, 30074], [16399, 24422, 29313], [14347, 23026, 28104], [12370, 19806, 24477], ], ], [ [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], 
[8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], ]); pub static av1_default_coeff_base_multi_cdfs: [[[[[u16; NUM_BASE_LEVELS + 2]; SIG_COEF_CONTEXTS]; PLANE_TYPES]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = cdf_5d([ [ [ [ [4034, 8930, 12727], [18082, 29741, 31877], [12596, 26124, 30493], [9446, 21118, 27005], [6308, 15141, 21279], [2463, 6357, 9783], [20667, 30546, 31929], [13043, 26123, 30134], [8151, 18757, 24778], [5255, 12839, 18632], [2820, 7206, 11161], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [15736, 27553, 30604], [11210, 23794, 28787], [5947, 13874, 19701], [4215, 9323, 13891], [2833, 6462, 10059], [19605, 30393, 31582], [13523, 26252, 30248], [8446, 18622, 24512], [3818, 10343, 15974], [1481, 4117, 6796], [22649, 31302, 32190], [14829, 27127, 30449], [8313, 17702, 23304], [3022, 8301, 12786], [1536, 4412, 7184], [22354, 29774, 31372], [14723, 25472, 29214], [6673, 13745, 18662], [2068, 5766, 9322], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [6302, 16444, 21761], [23040, 31538, 32475], [15196, 28452, 31496], [10020, 22946, 28514], [6533, 16862, 23501], [3538, 9816, 15076], [24444, 31875, 32525], [15881, 28924, 31635], [9922, 22873, 28466], [6527, 16966, 23691], [4114, 11303, 17220], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [20201, 30770, 32209], [14754, 28071, 31258], [8378, 20186, 26517], [5916, 15299, 21978], [4268, 11583, 17901], [24361, 32025, 32581], [18673, 30105, 31943], [10196, 22244, 27576], [5495, 14349, 20417], [2676, 7415, 11498], [24678, 31958, 32585], [18629, 29906, 31831], [9364, 20724, 26315], [4641, 12318, 18094], [2758, 7387, 11579], [25433, 31842, 32469], [18795, 29289, 31411], [7644, 17584, 23592], [3408, 9014, 15047], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [4536, 10072, 14001], [25459, 31416, 32206], [16605, 28048, 30818], [11008, 22857, 27719], [6915, 16268, 22315], [2625, 6812, 10537], [24257, 31788, 32499], [16880, 29454, 31879], [11958, 25054, 29778], [7916, 18718, 25084], [3383, 8777, 13446], [22720, 31603, 32393], [14960, 28125, 31335], [9731, 22210, 27928], [6304, 15832, 22277], [2910, 7818, 12166], [20375, 30627, 32131], [13904, 27284, 30887], [9368, 21558, 27144], [5937, 14966, 21119], [2667, 7225, 11319], [23970, 31470, 32378], [17173, 29734, 32018], [12795, 25441, 29965], [8981, 19680, 25893], [4728, 11372, 16902], [24287, 31797, 32439], [16703, 29145, 31696], [10833, 23554, 28725], [6468, 16566, 23057], [2415, 6562, 10278], [26610, 32395, 32659], [18590, 30498, 32117], [12420, 25756, 29950], [7639, 18746, 24710], [3001, 8086, 12347], [25076, 32064, 32580], [17946, 30128, 32028], [12024, 24985, 29378], [7517, 18390, 24304], [3243, 8781, 13331], [8192, 16384, 24576], ], [ [6037, 16771, 21957], [24774, 31704, 32426], [16830, 28589, 31056], [10602, 22828, 27760], [6733, 16829, 23071], [3250, 8914, 13556], [25582, 32220, 32668], [18659, 30342, 32223], [12546, 26149, 30515], 
[8420, 20451, 26801], [4636, 12420, 18344], [27581, 32362, 32639], [18987, 30083, 31978], [11327, 24248, 29084], [7264, 17719, 24120], [3995, 10768, 16169], [25893, 31831, 32487], [16577, 28587, 31379], [10189, 22748, 28182], [6832, 17094, 23556], [3708, 10110, 15334], [25904, 32282, 32656], [19721, 30792, 32276], [12819, 26243, 30411], [8572, 20614, 26891], [5364, 14059, 20467], [26580, 32438, 32677], [20852, 31225, 32340], [12435, 25700, 29967], [8691, 20825, 26976], [4446, 12209, 17269], [27350, 32429, 32696], [21372, 30977, 32272], [12673, 25270, 29853], [9208, 20925, 26640], [5018, 13351, 18732], [27351, 32479, 32713], [21398, 31209, 32387], [12162, 25047, 29842], [7896, 18691, 25319], [4670, 12882, 18881], [8192, 16384, 24576], ], ], [ [ [5487, 10460, 13708], [21597, 28303, 30674], [11037, 21953, 26476], [8147, 17962, 22952], [5242, 13061, 18532], [1889, 5208, 8182], [26774, 32133, 32590], [17844, 29564, 31767], [11690, 24438, 29171], [7542, 18215, 24459], [2993, 8050, 12319], [28023, 32328, 32591], [18651, 30126, 31954], [12164, 25146, 29589], [7762, 18530, 24771], [3492, 9183, 13920], [27591, 32008, 32491], [17149, 28853, 31510], [11485, 24003, 28860], [7697, 18086, 24210], [3075, 7999, 12218], [28268, 32482, 32654], [19631, 31051, 32404], [13860, 27260, 31020], [9605, 21613, 27594], [4876, 12162, 17908], [27248, 32316, 32576], [18955, 30457, 32075], [11824, 23997, 28795], [7346, 18196, 24647], [3403, 9247, 14111], [29711, 32655, 32735], [21169, 31394, 32417], [13487, 27198, 30957], [8828, 21683, 27614], [4270, 11451, 17038], [28708, 32578, 32731], [20120, 31241, 32482], [13692, 27550, 31321], [9418, 22514, 28439], [4999, 13283, 19462], [8192, 16384, 24576], ], [ [5673, 14302, 19711], [26251, 30701, 31834], [12782, 23783, 27803], [9127, 20657, 25808], [6368, 16208, 21462], [2465, 7177, 10822], [29961, 32563, 32719], [18318, 29891, 31949], [11361, 24514, 29357], [7900, 19603, 25607], [4002, 10590, 15546], [29637, 32310, 32595], [18296, 29913, 31809], [10144, 21515, 26871], [5358, 14322, 20394], [3067, 8362, 13346], [28652, 32470, 32676], [17538, 30771, 32209], [13924, 26882, 30494], [10496, 22837, 27869], [7236, 16396, 21621], [30743, 32687, 32746], [23006, 31676, 32489], [14494, 27828, 31120], [10174, 22801, 28352], [6242, 15281, 21043], [25817, 32243, 32720], [18618, 31367, 32325], [13997, 28318, 31878], [12255, 26534, 31383], [9561, 21588, 28450], [28188, 32635, 32724], [22060, 32365, 32728], [18102, 30690, 32528], [14196, 28864, 31999], [12262, 25792, 30865], [24176, 32109, 32628], [18280, 29681, 31963], [10205, 23703, 29664], [7889, 20025, 27676], [6060, 16743, 23970], [8192, 16384, 24576], ], ], [ [ [5141, 7096, 8260], [27186, 29022, 29789], [6668, 12568, 15682], [2172, 6181, 8638], [1126, 3379, 4531], [443, 1361, 2254], [26083, 31153, 32436], [13486, 24603, 28483], [6508, 14840, 19910], [3386, 8800, 13286], [1530, 4322, 7054], [29639, 32080, 32548], [15897, 27552, 30290], [8588, 20047, 25383], [4889, 13339, 19269], [2240, 6871, 10498], [28165, 32197, 32517], [20735, 30427, 31568], [14325, 24671, 27692], [5119, 12554, 17805], [1810, 5441, 8261], [31212, 32724, 32748], [23352, 31766, 32545], [14669, 27570, 31059], [8492, 20894, 27272], [3644, 10194, 15204], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 
24576], [8192, 16384, 24576], ], [ [2461, 7013, 9371], [24749, 29600, 30986], [9466, 19037, 22417], [3584, 9280, 14400], [1505, 3929, 5433], [677, 1500, 2736], [23987, 30702, 32117], [13554, 24571, 29263], [6211, 14556, 21155], [3135, 10972, 15625], [2435, 7127, 11427], [31300, 32532, 32550], [14757, 30365, 31954], [4405, 11612, 18553], [580, 4132, 7322], [1695, 10169, 14124], [30008, 32282, 32591], [19244, 30108, 31748], [11180, 24158, 29555], [5650, 14972, 19209], [2114, 5109, 8456], [31856, 32716, 32748], [23012, 31664, 32572], [13694, 26656, 30636], [8142, 19508, 26093], [4253, 10955, 16724], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [601, 983, 1311], [18725, 23406, 28087], [5461, 8192, 10923], [3781, 15124, 21425], [2587, 7761, 12072], [106, 458, 810], [22282, 29710, 31894], [8508, 20926, 25984], [3726, 12713, 18083], [1620, 7112, 10893], [729, 2236, 3495], [30163, 32474, 32684], [18304, 30464, 32000], [11443, 26526, 29647], [6007, 15292, 21299], [2234, 6703, 8937], [30954, 32177, 32571], [17363, 29562, 31076], [9686, 22464, 27410], [8192, 16384, 21390], [1755, 8046, 11264], [31168, 32734, 32748], [22486, 31441, 32471], [12833, 25627, 29738], [6980, 17379, 23122], [3111, 8887, 13479], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [6041, 11854, 15927], [20326, 30905, 32251], [14164, 26831, 30725], [9760, 20647, 26585], [6416, 14953, 21219], [2966, 7151, 10891], [23567, 31374, 32254], [14978, 27416, 30946], [9434, 20225, 26254], [6658, 14558, 20535], [3916, 8677, 12989], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [18088, 29545, 31587], [13062, 25843, 30073], [8940, 16827, 22251], [7654, 13220, 17973], [5733, 10316, 14456], [22879, 31388, 32114], [15215, 27993, 30955], [9397, 19445, 24978], [3442, 9813, 15344], [1368, 3936, 6532], [25494, 32033, 32406], [16772, 27963, 
30718], [9419, 18165, 23260], [2677, 7501, 11797], [1516, 4344, 7170], [26556, 31454, 32101], [17128, 27035, 30108], [8324, 15344, 20249], [1903, 5696, 9469], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8455, 19003, 24368], [23563, 32021, 32604], [16237, 29446, 31935], [10724, 23999, 29358], [6725, 17528, 24416], [3927, 10927, 16825], [26313, 32288, 32634], [17430, 30095, 32095], [11116, 24606, 29679], [7195, 18384, 25269], [4726, 12852, 19315], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [22822, 31648, 32483], [16724, 29633, 31929], [10261, 23033, 28725], [7029, 17840, 24528], [4867, 13886, 21502], [25298, 31892, 32491], [17809, 29330, 31512], [9668, 21329, 26579], [4774, 12956, 18976], [2322, 7030, 11540], [25472, 31920, 32543], [17957, 29387, 31632], [9196, 20593, 26400], [4680, 12705, 19202], [2917, 8456, 13436], [26471, 32059, 32574], [18458, 29783, 31909], [8400, 19464, 25956], [3812, 10973, 17206], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [6779, 13743, 17678], [24806, 31797, 32457], [17616, 29047, 31372], [11063, 23175, 28003], [6521, 16110, 22324], [2764, 7504, 11654], [25266, 32367, 32637], [19054, 30553, 32175], [12139, 25212, 29807], [7311, 18162, 24704], [3397, 9164, 14074], [25988, 32208, 32522], [16253, 28912, 31526], [9151, 21387, 27372], [5688, 14915, 21496], [2717, 7627, 12004], [23144, 31855, 32443], [16070, 28491, 31325], [8702, 20467, 26517], [5243, 13956, 20367], [2621, 7335, 11567], [26636, 32340, 32630], [19990, 31050, 32341], [13243, 26105, 30315], [8588, 19521, 25918], [4717, 11585, 17304], [25844, 32292, 32582], [19090, 30635, 32097], [11963, 24546, 28939], [6218, 16087, 22354], [2340, 6608, 10426], [28046, 32576, 32694], [21178, 31313, 32296], [13486, 26184, 29870], [7149, 17871, 23723], [2833, 7958, 12259], [27710, 32528, 32686], [20674, 31076, 32268], [12413, 24955, 29243], [6676, 16927, 23097], [2966, 8333, 12919], [8192, 16384, 24576], ], [ [8639, 19339, 24429], [24404, 31837, 32525], [16997, 29425, 31784], [11253, 24234, 29149], [6751, 17394, 24028], [3490, 9830, 15191], [26283, 32471, 32714], [19599, 31168, 32442], [13146, 26954, 30893], [8214, 20588, 26890], [4699, 13081, 19300], [28212, 32458, 32669], [18594, 30316, 32100], [11219, 24408, 29234], [6865, 17656, 24149], [3678, 10362, 16006], [25825, 32136, 32616], [17313, 29853, 32021], [11197, 24471, 29472], [6947, 17781, 24405], [3768, 10660, 16261], [27352, 32500, 32706], [20850, 31468, 32469], [14021, 27707, 31133], [8964, 21748, 27838], [5437, 14665, 21187], [26304, 32492, 32698], [20409, 31380, 32385], [13682, 27222, 30632], [8974, 21236, 26685], [4234, 11665, 16934], [26273, 32357, 32711], [20672, 31242, 32441], [14172, 27254, 30902], [9870, 21898, 27275], [5164, 13506, 19270], [26725, 32459, 32728], [20991, 31442, 32527], [13071, 26434, 30811], [8184, 20090, 26742], [4803, 13255, 19895], [8192, 16384, 24576], ], ], [ [ [7555, 14942, 18501], [24410, 31178, 32287], [14394, 26738, 30253], [8413, 19554, 25195], [4766, 12924, 18785], [2029, 5806, 9207], [26776, 32364, 32663], [18732, 29967, 31931], [11005, 23786, 28852], [6466, 16909, 23510], [3044, 8638, 13419], [29208, 32582, 32704], [20068, 30857, 32208], [12003, 25085, 29595], [6947, 17750, 24189], [3245, 9103, 14007], [27359, 32465, 32669], [19421, 30614, 32174], [11915, 25010, 29579], [6950, 17676, 24074], [3007, 8473, 13096], [29002, 32676, 32735], [22102, 
31849, 32576], [14408, 28009, 31405], [9027, 21679, 27931], [4694, 12678, 18748], [28216, 32528, 32682], [20849, 31264, 32318], [12756, 25815, 29751], [7565, 18801, 24923], [3509, 9533, 14477], [30133, 32687, 32739], [23063, 31910, 32515], [14588, 28051, 31132], [9085, 21649, 27457], [4261, 11654, 17264], [29518, 32691, 32748], [22451, 31959, 32613], [14864, 28722, 31700], [9695, 22964, 28716], [4932, 13358, 19502], [8192, 16384, 24576], ], [ [6465, 16958, 21688], [25199, 31514, 32360], [14774, 27149, 30607], [9257, 21438, 26972], [5723, 15183, 21882], [3150, 8879, 13731], [26989, 32262, 32682], [17396, 29937, 32085], [11387, 24901, 29784], [7289, 18821, 25548], [3734, 10577, 16086], [29728, 32501, 32695], [17431, 29701, 31903], [9921, 22826, 28300], [5896, 15434, 22068], [3430, 9646, 14757], [28614, 32511, 32705], [19364, 30638, 32263], [13129, 26254, 30402], [8754, 20484, 26440], [4378, 11607, 17110], [30292, 32671, 32744], [21780, 31603, 32501], [14314, 27829, 31291], [9611, 22327, 28263], [4890, 13087, 19065], [25862, 32567, 32733], [20794, 32050, 32567], [17243, 30625, 32254], [13283, 27628, 31474], [9669, 22532, 28918], [27435, 32697, 32748], [24922, 32390, 32714], [21449, 31504, 32536], [16392, 29729, 31832], [11692, 24884, 29076], [24193, 32290, 32735], [18909, 31104, 32563], [12236, 26841, 31403], [8171, 21840, 29082], [7224, 17280, 25275], [8192, 16384, 24576], ], ], [ [ [3078, 6839, 9890], [13837, 20450, 24479], [5914, 14222, 19328], [3866, 10267, 14762], [2612, 7208, 11042], [1067, 2991, 4776], [25817, 31646, 32529], [13708, 26338, 30385], [7328, 18585, 24870], [4691, 13080, 19276], [1825, 5253, 8352], [29386, 32315, 32624], [17160, 29001, 31360], [9602, 21862, 27396], [5915, 15772, 22148], [2786, 7779, 12047], [29246, 32450, 32663], [18696, 29929, 31818], [10510, 23369, 28560], [6229, 16499, 23125], [2608, 7448, 11705], [30753, 32710, 32748], [21638, 31487, 32503], [12937, 26854, 30870], [8182, 20596, 26970], [3637, 10269, 15497], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [5244, 12150, 16906], [20486, 26858, 29701], [7756, 18317, 23735], [3452, 9256, 13146], [2020, 5206, 8229], [1801, 4993, 7903], [27051, 31858, 32531], [15988, 27531, 30619], [9188, 21484, 26719], [6273, 17186, 23800], [3108, 9355, 14764], [31076, 32520, 32680], [18119, 30037, 31850], [10244, 22969, 27472], [4692, 14077, 19273], [3694, 11677, 17556], [30060, 32581, 32720], [21011, 30775, 32120], [11931, 24820, 29289], [7119, 17662, 24356], [3833, 10706, 16304], [31954, 32731, 32748], [23913, 31724, 32489], [15520, 28060, 31286], [11517, 23008, 28571], [6193, 14508, 20629], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [1035, 2807, 4156], [13162, 18138, 20939], [2696, 6633, 8755], [1373, 4161, 6853], [1099, 2746, 4716], [340, 1021, 1599], [22826, 30419, 32135], [10395, 21762, 26942], [4726, 12407, 17361], [2447, 7080, 10593], [1227, 3717, 6011], [28156, 31424, 31934], [16915, 27754, 30373], 
[9148, 20990, 26431], [5950, 15515, 21148], [2492, 7327, 11526], [30602, 32477, 32670], [20026, 29955, 31568], [11220, 23628, 28105], [6652, 17019, 22973], [3064, 8536, 13043], [31769, 32724, 32748], [22230, 30887, 32373], [12234, 25079, 29731], [7326, 18816, 25353], [3933, 10907, 16616], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [8896, 16227, 20630], [23629, 31782, 32527], [15173, 27755, 31321], [10158, 21233, 27382], [6420, 14857, 21558], [3269, 8155, 12646], [24835, 32009, 32496], [16509, 28421, 31579], [10957, 21514, 27418], [7881, 15930, 22096], [5388, 10960, 15918], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [20745, 30773, 32093], [15200, 27221, 30861], [13032, 20873, 25667], [12285, 18663, 23494], [11563, 17481, 21489], [26260, 31982, 32320], [15397, 28083, 31100], [9742, 19217, 24824], [3261, 9629, 15362], [1480, 4322, 7499], [27599, 32256, 32460], [16857, 27659, 30774], [9551, 18290, 23748], [3052, 8933, 14103], [2021, 5910, 9787], [29005, 32015, 32392], [17677, 27694, 30863], [9204, 17356, 23219], [2403, 7516, 12814], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [10808, 22056, 26896], [25739, 32313, 32676], [17288, 30203, 32221], [11359, 24878, 29896], [6949, 17767, 24893], [4287, 11796, 18071], [27880, 32521, 32705], [19038, 31004, 32414], [12564, 26345, 30768], [8269, 19947, 26779], [5674, 14657, 21674], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [25742, 32319, 32671], [19557, 31164, 32454], [13381, 26381, 30755], [10101, 21466, 26722], [9209, 19650, 26825], [27107, 31917, 32432], [18056, 28893, 31203], [10200, 21434, 26764], [4660, 12913, 19502], [2368, 6930, 12504], [26960, 32158, 32613], [18628, 30005, 32031], [10233, 22442, 28232], [5471, 14630, 21516], [3235, 10767, 17109], [27696, 32440, 32692], [20032, 31167, 32438], [8700, 21341, 28442], [5662, 14831, 21795], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [9704, 17294, 21132], [26762, 32278, 32633], [18382, 29620, 31819], [10891, 
23475, 28723], [6358, 16583, 23309], [3248, 9118, 14141], [27204, 32573, 32699], [19818, 30824, 32329], [11772, 25120, 30041], [6995, 18033, 25039], [3752, 10442, 16098], [27222, 32256, 32559], [15356, 28399, 31475], [8821, 20635, 27057], [5511, 14404, 21239], [2935, 8222, 13051], [24875, 32120, 32529], [15233, 28265, 31445], [8605, 20570, 26932], [5431, 14413, 21196], [2994, 8341, 13223], [28201, 32604, 32700], [21041, 31446, 32456], [13221, 26213, 30475], [8255, 19385, 26037], [4930, 12585, 18830], [28768, 32448, 32627], [19705, 30561, 32021], [11572, 23589, 28220], [5532, 15034, 21446], [2460, 7150, 11456], [29874, 32619, 32699], [21621, 31071, 32201], [12511, 24747, 28992], [6281, 16395, 22748], [3246, 9278, 14497], [29715, 32625, 32712], [20958, 31011, 32283], [11233, 23671, 28806], [6012, 16128, 22868], [3427, 9851, 15414], [8192, 16384, 24576], ], [ [11016, 22111, 26794], [25946, 32357, 32677], [17890, 30452, 32252], [11678, 25142, 29816], [6720, 17534, 24584], [4230, 11665, 17820], [28400, 32623, 32747], [21164, 31668, 32575], [13572, 27388, 31182], [8234, 20750, 27358], [5065, 14055, 20897], [28981, 32547, 32705], [18681, 30543, 32239], [10919, 24075, 29286], [6431, 17199, 24077], [3819, 10464, 16618], [26870, 32467, 32693], [19041, 30831, 32347], [11794, 25211, 30016], [6888, 18019, 24970], [4370, 12363, 18992], [29578, 32670, 32744], [23159, 32007, 32613], [15315, 28669, 31676], [9298, 22607, 28782], [6144, 15913, 22968], [28110, 32499, 32669], [21574, 30937, 32015], [12759, 24818, 28727], [6545, 16761, 23042], [3649, 10597, 16833], [28163, 32552, 32728], [22101, 31469, 32464], [13160, 25472, 30143], [7303, 18684, 25468], [5241, 13975, 20955], [28400, 32631, 32744], [22104, 31793, 32603], [13557, 26571, 30846], [7749, 19861, 26675], [4873, 14030, 21234], [8192, 16384, 24576], ], ], [ [ [9800, 17635, 21073], [26153, 31885, 32527], [15038, 27852, 31006], [8718, 20564, 26486], [5128, 14076, 20514], [2636, 7566, 11925], [27551, 32504, 32701], [18310, 30054, 32100], [10211, 23420, 29082], [6222, 16876, 23916], [3462, 9954, 15498], [29991, 32633, 32721], [19883, 30751, 32201], [11141, 24184, 29285], [6420, 16940, 23774], [3392, 9753, 15118], [28465, 32616, 32712], [19850, 30702, 32244], [10983, 24024, 29223], [6294, 16770, 23582], [3244, 9283, 14509], [30023, 32717, 32748], [22940, 32032, 32626], [14282, 27928, 31473], [8562, 21327, 27914], [4846, 13393, 19919], [29981, 32590, 32695], [20465, 30963, 32166], [11479, 23579, 28195], [5916, 15648, 22073], [3031, 8605, 13398], [31146, 32691, 32739], [23106, 31724, 32444], [13783, 26738, 30439], [7852, 19468, 25807], [3860, 11124, 16853], [31014, 32724, 32748], [23629, 32109, 32628], [14747, 28115, 31403], [8545, 21242, 27478], [4574, 12781, 19067], [8192, 16384, 24576], ], [ [9185, 19694, 24688], [26081, 31985, 32621], [16015, 29000, 31787], [10542, 23690, 29206], [6732, 17945, 24677], [3916, 11039, 16722], [28224, 32566, 32744], [19100, 31138, 32485], [12528, 26620, 30879], [7741, 20277, 26885], [4566, 12845, 18990], [29933, 32593, 32718], [17670, 30333, 32155], [10385, 23600, 28909], [6243, 16236, 22407], [3976, 10389, 16017], [28377, 32561, 32738], [19366, 31175, 32482], [13327, 27175, 31094], [8258, 20769, 27143], [4703, 13198, 19527], [31086, 32706, 32748], [22853, 31902, 32583], [14759, 28186, 31419], [9284, 22382, 28348], [5585, 15192, 21868], [28291, 32652, 32746], [19849, 32107, 32571], [14834, 26818, 29214], [10306, 22594, 28672], [6615, 17384, 23384], [28947, 32604, 32745], [25625, 32289, 32646], [18758, 28672, 31403], [10017, 
23430, 28523], [6862, 15269, 22131], [23933, 32509, 32739], [19927, 31495, 32631], [11903, 26023, 30621], [7026, 20094, 27252], [5998, 18106, 24437], [8192, 16384, 24576], ], ], [ [ [4456, 11274, 15533], [21219, 29079, 31616], [11173, 23774, 28567], [7282, 18293, 24263], [4890, 13286, 19115], [1890, 5508, 8659], [26651, 32136, 32647], [14630, 28254, 31455], [8716, 21287, 27395], [5615, 15331, 22008], [2675, 7700, 12150], [29954, 32526, 32690], [16126, 28982, 31633], [9030, 21361, 27352], [5411, 14793, 21271], [2943, 8422, 13163], [29539, 32601, 32730], [18125, 30385, 32201], [10422, 24090, 29468], [6468, 17487, 24438], [2970, 8653, 13531], [30912, 32715, 32748], [20666, 31373, 32497], [12509, 26640, 30917], [8058, 20629, 27290], [4231, 12006, 18052], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [10202, 20633, 25484], [27336, 31445, 32352], [12420, 24384, 28552], [7648, 18115, 23856], [5662, 14341, 19902], [3611, 10328, 15390], [30945, 32616, 32736], [18682, 30505, 32253], [11513, 25336, 30203], [7449, 19452, 26148], [4482, 13051, 18886], [32022, 32690, 32747], [18578, 30501, 32146], [11249, 23368, 28631], [5645, 16958, 22158], [5009, 11444, 16637], [31357, 32710, 32748], [21552, 31494, 32504], [13891, 27677, 31340], [9051, 22098, 28172], [5190, 13377, 19486], [32364, 32740, 32748], [24839, 31907, 32551], [17160, 28779, 31696], [12452, 24137, 29602], [6165, 15389, 22477], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [2575, 7281, 11077], [14002, 20866, 25402], [6343, 15056, 19658], [4474, 11858, 17041], [2865, 8299, 12534], [1344, 3949, 6391], [24720, 31239, 32459], [12585, 25356, 29968], [7181, 18246, 24444], [5025, 13667, 19885], [2521, 7304, 11605], [29908, 32252, 32584], [17421, 29156, 31575], [9889, 22188, 27782], [5878, 15647, 22123], [2814, 8665, 13323], [30183, 32568, 32713], [18528, 30195, 32049], [10982, 24606, 29657], [6957, 18165, 25231], [3508, 10118, 15468], [31761, 32736, 32748], [21041, 31328, 32546], [12568, 26732, 31166], [8052, 20720, 27733], [4336, 12192, 18396], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], 
[8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], [ [ [ [7062, 16472, 22319], [24538, 32261, 32674], [13675, 28041, 31779], [8590, 20674, 27631], [5685, 14675, 22013], [3655, 9898, 15731], [26493, 32418, 32658], [16376, 29342, 32090], [10594, 22649, 28970], [8176, 17170, 24303], [5605, 12694, 19139], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [23888, 31902, 32542], [18612, 29687, 31987], [16245, 24852, 29249], [15765, 22608, 27559], [19895, 24699, 27510], [28401, 32212, 32457], [15274, 27825, 30980], [9364, 18128, 24332], [2283, 8193, 15082], [1228, 3972, 7881], [29455, 32469, 32620], [17981, 28245, 31388], [10921, 20098, 26240], [3743, 11829, 18657], [2374, 9593, 15715], [31068, 32466, 32635], [20321, 29572, 31971], [10771, 20255, 27119], [2795, 10410, 17361], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [9320, 22102, 27840], [27057, 32464, 32724], [16331, 30268, 32309], [10319, 23935, 29720], [6189, 16448, 24106], [3589, 10884, 18808], [29026, 32624, 32748], [19226, 31507, 32587], [12692, 26921, 31203], [7049, 19532, 27635], [7727, 15669, 23252], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [28056, 32625, 32748], [22383, 32075, 32669], [15417, 27098, 31749], [18127, 26493, 27190], [5461, 16384, 21845], [27982, 32091, 32584], [19045, 29868, 31972], [10397, 22266, 27932], [5990, 13697, 21500], [1792, 6912, 15104], [28198, 32501, 32718], [21534, 31521, 32569], [11109, 25217, 30017], [5671, 15124, 26151], [4681, 14043, 18725], [28688, 32580, 32741], [22576, 32079, 32661], [10627, 22141, 28340], [9362, 14043, 28087], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [7754, 16948, 22142], [25670, 32330, 32691], [15663, 29225, 31994], [9878, 23288, 29158], [6419, 17088, 24336], [3859, 11003, 17039], [27562, 32595, 32725], [17575, 30588, 32399], [10819, 24838, 30309], [7124, 18686, 25916], [4479, 12688, 19340], [28385, 32476, 32673], [15306, 29005, 31938], [8937, 21615, 28322], [5982, 15603, 22786], [3620, 10267, 16136], [27280, 32464, 32667], [15607, 29160, 32004], [9091, 22135, 28740], [6232, 16632, 24020], [4047, 11377, 17672], [29220, 32630, 32718], [19650, 31220, 32462], [13050, 26312, 30827], [9228, 20870, 27468], [6146, 15149, 21971], [30169, 32481, 32623], [17212, 29311, 31554], [9911, 21311, 26882], [4487, 13314, 20372], [2570, 7772, 12889], [30924, 32613, 32708], [19490, 30206, 32107], [11232, 23998, 29276], [6769, 17955, 25035], [4398, 12623, 19214], [30609, 32627, 32722], [19370, 30582, 32287], [10457, 23619, 29409], [6443, 17637, 24834], [4645, 13236, 20106], [8192, 16384, 24576], ], [ [8626, 20271, 26216], [26707, 32406, 32711], [16999, 30329, 32286], [11445, 25123, 30286], [6411, 18828, 25601], [6801, 12458, 20248], [29918, 32682, 32748], [20649, 31739, 32618], [12879, 27773, 31581], [7896, 21751, 28244], [5260, 14870, 23698], [29252, 32593, 32731], [17072, 30460, 32294], [10653, 24143, 29365], [6536, 
17490, 23983], [4929, 13170, 20085], [28137, 32518, 32715], [18171, 30784, 32407], [11437, 25436, 30459], [7252, 18534, 26176], [4126, 13353, 20978], [31162, 32726, 32748], [23017, 32222, 32701], [15629, 29233, 32046], [9387, 22621, 29480], [6922, 17616, 25010], [28838, 32265, 32614], [19701, 30206, 31920], [11214, 22410, 27933], [5320, 14177, 23034], [5049, 12881, 17827], [27484, 32471, 32734], [21076, 31526, 32561], [12707, 26303, 31211], [8169, 21722, 28219], [6045, 19406, 27042], [27753, 32572, 32745], [20832, 31878, 32653], [13250, 27356, 31674], [7718, 21508, 29858], [7209, 18350, 25559], [8192, 16384, 24576], ], ], [ [ [7876, 16901, 21741], [24001, 31898, 32625], [14529, 27959, 31451], [8273, 20818, 27258], [5278, 14673, 21510], [2983, 8843, 14039], [28016, 32574, 32732], [17471, 30306, 32301], [10224, 24063, 29728], [6602, 17954, 25052], [4002, 11585, 17759], [30190, 32634, 32739], [17497, 30282, 32270], [10229, 23729, 29538], [6344, 17211, 24440], [3849, 11189, 17108], [28570, 32583, 32726], [17521, 30161, 32238], [10153, 23565, 29378], [6455, 17341, 24443], [3907, 11042, 17024], [30689, 32715, 32748], [21546, 31840, 32610], [13547, 27581, 31459], [8912, 21757, 28309], [5548, 15080, 22046], [30783, 32540, 32685], [17540, 29528, 31668], [10160, 21468, 26783], [4724, 13393, 20054], [2702, 8174, 13102], [31648, 32686, 32742], [20954, 31094, 32337], [12420, 25698, 30179], [7304, 19320, 26248], [4366, 12261, 18864], [31581, 32723, 32748], [21373, 31586, 32525], [12744, 26625, 30885], [7431, 20322, 26950], [4692, 13323, 20111], [8192, 16384, 24576], ], [ [7833, 18369, 24095], [26650, 32273, 32702], [16371, 29961, 32191], [11055, 24082, 29629], [6892, 18644, 25400], [5006, 13057, 19240], [29834, 32666, 32748], [19577, 31335, 32570], [12253, 26509, 31122], [7991, 20772, 27711], [5677, 15910, 23059], [30109, 32532, 32720], [16747, 30166, 32252], [10134, 23542, 29184], [5791, 16176, 23556], [4362, 10414, 17284], [29492, 32626, 32748], [19894, 31402, 32525], [12942, 27071, 30869], [8346, 21216, 27405], [6572, 17087, 23859], [32035, 32735, 32748], [22957, 31838, 32618], [14724, 28572, 31772], [10364, 23999, 29553], [7004, 18433, 25655], [27528, 32277, 32681], [16959, 31171, 32096], [10486, 23593, 27962], [8192, 16384, 23211], [8937, 17873, 20852], [27715, 32002, 32615], [15073, 29491, 31676], [11264, 24576, 28672], [2341, 18725, 23406], [7282, 18204, 25486], [28547, 32213, 32657], [20788, 29773, 32239], [6780, 21469, 30508], [5958, 14895, 23831], [16384, 21845, 27307], [8192, 16384, 24576], ], ], [ [ [5992, 14304, 19765], [22612, 31238, 32456], [13456, 27162, 31087], [8001, 20062, 26504], [5168, 14105, 20764], [2632, 7771, 12385], [27034, 32344, 32709], [15850, 29415, 31997], [9494, 22776, 28841], [6151, 16830, 23969], [3461, 10039, 15722], [30134, 32569, 32731], [15638, 29422, 31945], [9150, 21865, 28218], [5647, 15719, 22676], [3402, 9772, 15477], [28530, 32586, 32735], [17139, 30298, 32292], [10200, 24039, 29685], [6419, 17674, 24786], [3544, 10225, 15824], [31333, 32726, 32748], [20618, 31487, 32544], [12901, 27217, 31232], [8624, 21734, 28171], [5104, 14191, 20748], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [11206, 21090, 26561], [28759, 32279, 32671], [14171, 27952, 31569], 
[9743, 22907, 29141], [6871, 17886, 24868], [4960, 13152, 19315], [31077, 32661, 32748], [19400, 31195, 32515], [12752, 26858, 31040], [8370, 22098, 28591], [5457, 15373, 22298], [31697, 32706, 32748], [17860, 30657, 32333], [12510, 24812, 29261], [6180, 19124, 24722], [5041, 13548, 17959], [31552, 32716, 32748], [21908, 31769, 32623], [14470, 28201, 31565], [9493, 22982, 28608], [6858, 17240, 24137], [32543, 32752, 32756], [24286, 32097, 32666], [15958, 29217, 32024], [10207, 24234, 29958], [6929, 18305, 25652], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], [ [ [4137, 10847, 15682], [17824, 27001, 30058], [10204, 22796, 28291], [6076, 15935, 22125], [3852, 10937, 16816], [2252, 6324, 10131], [25840, 32016, 32662], [15109, 28268, 31531], [9385, 22231, 28340], [6082, 16672, 23479], [3318, 9427, 14681], [30594, 32574, 32718], [16836, 29552, 31859], [9556, 22542, 28356], [6305, 16725, 23540], [3376, 9895, 15184], [29383, 32617, 32745], [18891, 30809, 32401], [11688, 25942, 30687], [7468, 19469, 26651], [3909, 11358, 17012], [31564, 32736, 32748], [20906, 31611, 32600], [13191, 27621, 31537], [8768, 22029, 28676], [5079, 14109, 20906], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], [ [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], [8192, 16384, 24576], ], ], ], ]); pub static av1_default_coeff_base_eob_multi_cdfs: [[[[[u16; NUM_BASE_LEVELS + 1]; SIG_COEF_CONTEXTS_EOB]; PLANE_TYPES]; TxSize::TX_SIZES]; TOKEN_CDF_Q_CTXS] = cdf_5d([ [ [ [[17837, 29055], [29600, 31446], [30844, 31878], [24926, 28948]], [[21365, 30026], [30512, 32423], [31658, 32621], [29630, 31881]], ], [ [[5717, 26477], [30491, 31703], [31550, 32158], [29648, 31491]], [[12608, 27820], [30680, 32225], [30809, 32335], [31299, 32423]], ], [ [[1786, 12612], [30663, 31625], [32339, 32468], [31148, 31833]], [[18857, 23865], [31428, 32428], [31744, 32373], [31775, 32526]], ], [ [[1787, 2532], [30832, 31662], [31824, 32682], [32133, 32569]], [[13751, 22235], [32089, 32409], [27084, 27920], [29291, 32594]], ], [ [[1725, 3449], [31102, 31935], [32457, 32613], [32412, 32649]], [[10923, 21845], 
[10923, 21845], [10923, 21845], [10923, 21845]], ], ], [ [ [[17560, 29888], [29671, 31549], [31007, 32056], [27286, 30006]], [[26594, 31212], [31208, 32582], [31835, 32637], [30595, 32206]], ], [ [[15239, 29932], [31315, 32095], [32130, 32434], [30864, 31996]], [[26279, 30968], [31142, 32495], [31713, 32540], [31929, 32594]], ], [ [[2644, 25198], [32038, 32451], [32639, 32695], [32166, 32518]], [[17187, 27668], [31714, 32550], [32283, 32678], [31930, 32563]], ], [ [[1044, 2257], [30755, 31923], [32208, 32693], [32244, 32615]], [[21317, 26207], [29133, 30868], [29311, 31231], [29657, 31087]], ], [ [[478, 1834], [31005, 31987], [32317, 32724], [30865, 32648]], [[10923, 21845], [10923, 21845], [10923, 21845], [10923, 21845]], ], ], [ [ [[20092, 30774], [30695, 32020], [31131, 32103], [28666, 30870]], [[27258, 31095], [31804, 32623], [31763, 32528], [31438, 32506]], ], [ [[18049, 30489], [31706, 32286], [32163, 32473], [31550, 32184]], [[27116, 30842], [31971, 32598], [32088, 32576], [32067, 32664]], ], [ [[12854, 29093], [32272, 32558], [32667, 32729], [32306, 32585]], [[25476, 30366], [32169, 32687], [32479, 32689], [31673, 32634]], ], [ [[2809, 19301], [32205, 32622], [32338, 32730], [31786, 32616]], [[22737, 29105], [30810, 32362], [30014, 32627], [30528, 32574]], ], [ [[935, 3382], [30789, 31909], [32466, 32756], [30860, 32513]], [[10923, 21845], [10923, 21845], [10923, 21845], [10923, 21845]], ], ], [ [ [[22497, 31198], [31715, 32495], [31606, 32337], [30388, 31990]], [[27877, 31584], [32170, 32728], [32155, 32688], [32219, 32702]], ], [ [[21457, 31043], [31951, 32483], [32153, 32562], [31473, 32215]], [[27558, 31151], [32020, 32640], [32097, 32575], [32242, 32719]], ], [ [[19980, 30591], [32219, 32597], [32581, 32706], [31803, 32287]], [[26473, 30507], [32431, 32723], [32196, 32611], [31588, 32528]], ], [ [[24647, 30463], [32412, 32695], [32468, 32720], [31269, 32523]], [[28482, 31505], [32152, 32701], [31732, 32598], [31767, 32712]], ], [ [[12358, 24977], [31331, 32385], [32634, 32756], [30411, 32548]], [[10923, 21845], [10923, 21845], [10923, 21845], [10923, 21845]], ], ], ]); rav1e-0.7.1/src/transform/forward.rs000064400000000000000000000122651046102023000154710ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use crate::cpu_features::CpuFeatureLevel; use crate::util::*; use super::TxType; cfg_if::cfg_if! 
{ if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::transform::forward::*; } else if #[cfg(asm_neon)] { pub use crate::asm::aarch64::transform::forward::*; } else { pub use self::rust::*; } } pub mod rust { use super::*; use std::mem::MaybeUninit; use crate::transform::forward_shared::*; use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize}; use simd_helpers::cold_for_target_arch; type TxfmFunc = fn(&mut [i32]); impl_1d_tx!(); impl TxOperations for i32 { fn zero() -> Self { 0 } fn tx_mul(self, mul: i32) -> Self { ((self * mul) + (1 << SHIFT >> 1)) >> SHIFT } fn rshift1(self) -> Self { (self + i32::from(self < 0)) >> 1 } fn add(self, b: Self) -> Self { self + b } fn sub(self, b: Self) -> Self { self - b } fn add_avg(self, b: Self) -> Self { (self + b) >> 1 } fn sub_avg(self, b: Self) -> Self { (self - b) >> 1 } } /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` #[cold_for_target_arch("x86_64")] pub fn forward_transform( input: &[i16], output: &mut [MaybeUninit], stride: usize, tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to // accurately perform rectangular transforms. When the transform is // rectangular, the number of columns will be the same as the // txfm_size stored in the row cfg struct. It will make no difference // for square transforms. let txfm_size_col = tx_size.width(); let txfm_size_row = tx_size.height(); let mut buf = Aligned::<[MaybeUninit; 64 * 64]>::uninit_array(); let buf = &mut buf.data[..txfm_size_col * txfm_size_row]; let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); let txfm_func_col = get_func(cfg.txfm_type_col); let txfm_func_row = get_func(cfg.txfm_type_row); // Columns for c in 0..txfm_size_col { let mut col_coeffs = Aligned::<[MaybeUninit; 64]>::uninit_array(); let col_coeffs = &mut col_coeffs.data[..txfm_size_row]; if cfg.ud_flip { // flip upside down for r in 0..txfm_size_row { col_coeffs[r] .write((input[(txfm_size_row - r - 1) * stride + c]).into()); } } else { for r in 0..txfm_size_row { col_coeffs[r].write((input[r * stride + c]).into()); } } // SAFETY: The loops above have initialized all txfm_size_row elements let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) }; av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]); txfm_func_col(col_coeffs); av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[1]); if cfg.lr_flip { for r in 0..txfm_size_row { // flip from left to right buf[r * txfm_size_col + (txfm_size_col - c - 1)] .write(col_coeffs[r]); } } else { for r in 0..txfm_size_row { buf[r * txfm_size_col + c].write(col_coeffs[r]); } } } // SAFETY: The loops above have initialized the entire buf let buf = unsafe { slice_assume_init_mut(buf) }; // Rows for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() { txfm_func_row(row_coeffs); av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]); // Store output in at most 32x32 chunks so that the first 32x32 // coefficients are stored first. When we don't have 64 rows, there is no // change in order. With 64 rows, the chunks are in this order // - First 32 rows and first 32 cols // - Last 32 rows and first 32 cols // - First 32 rows and last 32 cols // - Last 32 rows and last 32 cols // Output is grouped into 32x32 chunks so a stride of at most 32 is // used for each chunk. 
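// Worked example (illustrative, derived from the indexing below): for a
// 64x64 transform, `output_stride` is 32 and the coefficient computed for
// row `r` and overall column `c` is written at
//   ((r >= 32) as usize) * 1024 + ((c >= 32) as usize) * 2048
//     + (c % 32) * 32 + (r % 32)
// which produces the four 32x32 chunks in exactly the order listed above,
// with each chunk stored transposed (column index major, stride 32).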
let output_stride = txfm_size_row.min(32); // Split the first 32 rows from the last 32 rows let output = &mut output [(r >= 32) as usize * output_stride * txfm_size_col.min(32)..]; for cg in (0..txfm_size_col).step_by(32) { // Split the first 32 cols from the last 32 cols let output = &mut output[txfm_size_row * cg..]; for c in 0..txfm_size_col.min(32) { output[c * output_stride + (r & 31)] .write(T::cast_from(row_coeffs[c + cg])); } } } } } rav1e-0.7.1/src/transform/forward_shared.rs000064400000000000000000001756651046102023000170350ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use super::TxSize; use super::TxType; use super::HTX_TAB; use super::VTX_TAB; pub type TxfmShift = [i8; 3]; pub type TxfmShifts = [TxfmShift; 3]; // Shift so that the first shift is 4 - (bd - 8) to align with the initial // design of daala_tx // 8 bit 4x4 is an exception and only shifts by 3 in the first stage const FWD_SHIFT_4X4: TxfmShifts = [[3, 0, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_8X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_16X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_32X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]]; const FWD_SHIFT_64X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]]; const FWD_SHIFT_4X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_8X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_8X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_16X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_16X32: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]]; const FWD_SHIFT_32X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]]; const FWD_SHIFT_32X64: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]]; const FWD_SHIFT_64X32: TxfmShifts = [[4, -1, -2], [2, 0, -1], [0, 0, 1]]; const FWD_SHIFT_4X16: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_16X4: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_8X32: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_32X8: TxfmShifts = [[4, -1, 0], [2, 0, 1], [0, 0, 3]]; const FWD_SHIFT_16X64: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]]; const FWD_SHIFT_64X16: TxfmShifts = [[4, -2, 0], [2, 0, 0], [0, 0, 2]]; const FWD_SHIFT_4X4_WHT: TxfmShift = [0, 0, 2]; pub const FWD_TXFM_SHIFT_LS: [TxfmShifts; TxSize::TX_SIZES_ALL] = [ FWD_SHIFT_4X4, FWD_SHIFT_8X8, FWD_SHIFT_16X16, FWD_SHIFT_32X32, FWD_SHIFT_64X64, FWD_SHIFT_4X8, FWD_SHIFT_8X4, FWD_SHIFT_8X16, FWD_SHIFT_16X8, FWD_SHIFT_16X32, FWD_SHIFT_32X16, FWD_SHIFT_32X64, FWD_SHIFT_64X32, FWD_SHIFT_4X16, FWD_SHIFT_16X4, FWD_SHIFT_8X32, FWD_SHIFT_32X8, FWD_SHIFT_16X64, FWD_SHIFT_64X16, ]; #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TxfmType { DCT4, DCT8, DCT16, DCT32, DCT64, ADST4, ADST8, ADST16, Identity4, Identity8, Identity16, Identity32, WHT4, } impl TxfmType { const TX_TYPES_1D: usize = 5; const AV1_TXFM_TYPE_LS: [[Option; Self::TX_TYPES_1D]; 5] = [ [ Some(TxfmType::DCT4), Some(TxfmType::ADST4), Some(TxfmType::ADST4), 
Some(TxfmType::Identity4), Some(TxfmType::WHT4), ], [ Some(TxfmType::DCT8), Some(TxfmType::ADST8), Some(TxfmType::ADST8), Some(TxfmType::Identity8), None, ], [ Some(TxfmType::DCT16), Some(TxfmType::ADST16), Some(TxfmType::ADST16), Some(TxfmType::Identity16), None, ], [Some(TxfmType::DCT32), None, None, Some(TxfmType::Identity32), None], [Some(TxfmType::DCT64), None, None, None, None], ]; } #[derive(Debug, Clone, Copy)] pub struct Txfm2DFlipCfg { pub tx_size: TxSize, /// Flip upside down pub ud_flip: bool, /// Flip left to right pub lr_flip: bool, pub shift: TxfmShift, pub txfm_type_col: TxfmType, pub txfm_type_row: TxfmType, } impl Txfm2DFlipCfg { /// # Panics /// /// - If called with an invalid combination of `tx_size` and `tx_type` pub fn fwd(tx_type: TxType, tx_size: TxSize, bd: usize) -> Self { let tx_type_1d_col = VTX_TAB[tx_type as usize]; let tx_type_1d_row = HTX_TAB[tx_type as usize]; let txw_idx = tx_size.width_index(); let txh_idx = tx_size.height_index(); let txfm_type_col = TxfmType::AV1_TXFM_TYPE_LS[txh_idx][tx_type_1d_col as usize].unwrap(); let txfm_type_row = TxfmType::AV1_TXFM_TYPE_LS[txw_idx][tx_type_1d_row as usize].unwrap(); let (ud_flip, lr_flip) = Self::get_flip_cfg(tx_type); let shift = if tx_type == TxType::WHT_WHT { FWD_SHIFT_4X4_WHT } else { FWD_TXFM_SHIFT_LS[tx_size as usize][(bd - 8) / 2] }; Txfm2DFlipCfg { tx_size, ud_flip, lr_flip, shift, txfm_type_col, txfm_type_row, } } /// Determine the flip config, returning `(ud_flip, lr_flip)` const fn get_flip_cfg(tx_type: TxType) -> (bool, bool) { use self::TxType::*; match tx_type { DCT_DCT | ADST_DCT | DCT_ADST | ADST_ADST | IDTX | V_DCT | H_DCT | V_ADST | H_ADST | WHT_WHT => (false, false), FLIPADST_DCT | FLIPADST_ADST | V_FLIPADST => (true, false), DCT_FLIPADST | ADST_FLIPADST | H_FLIPADST => (false, true), FLIPADST_FLIPADST => (true, true), } } } macro_rules! store_coeffs { ( $arr:expr, $( $x:expr ),* ) => { { let mut i: i32 = -1; $( i += 1; $arr[i as usize] = $x; )* } }; } macro_rules! impl_1d_tx { () => { impl_1d_tx! 
{allow(unused_attributes), } }; ($m:meta, $($s:ident),*) => { pub trait TxOperations: Copy { $($s)* fn zero() -> Self; $($s)* fn tx_mul(self, mul: i32) -> Self; $($s)* fn rshift1(self) -> Self; $($s)* fn add(self, b: Self) -> Self; $($s)* fn sub(self, b: Self) -> Self; $($s)* fn add_avg(self, b: Self) -> Self; $($s)* fn sub_avg(self, b: Self) -> Self; $($s)* fn copy_fn(self) -> Self { self } } #[inline] fn get_func(t: TxfmType) -> TxfmFunc { use self::TxfmType::*; match t { DCT4 => daala_fdct4, DCT8 => daala_fdct8, DCT16 => daala_fdct16, DCT32 => daala_fdct32, DCT64 => daala_fdct64, ADST4 => daala_fdst_vii_4, ADST8 => daala_fdst8, ADST16 => daala_fdst16, Identity4 => fidentity, Identity8 => fidentity, Identity16 => fidentity, Identity32 => fidentity, WHT4 => fwht4, } } trait RotateKernelPi4 { const ADD: $($s)* fn(T, T) -> T; const SUB: $($s)* fn(T, T) -> T; #[$m] $($s)* fn kernel(p0: T, p1: T, m: (i32, i32)) -> (T, T) { let t = Self::ADD(p1, p0); let (a, out0) = (p0.tx_mul::(m.0), t.tx_mul::(m.1)); let out1 = Self::SUB(a, out0); (out0, out1) } } struct RotatePi4Add; struct RotatePi4AddAvg; struct RotatePi4Sub; struct RotatePi4SubAvg; impl RotateKernelPi4 for RotatePi4Add { const ADD: $($s)* fn(T, T) -> T = T::add; const SUB: $($s)* fn(T, T) -> T = T::sub; } impl RotateKernelPi4 for RotatePi4AddAvg { const ADD: $($s)* fn(T, T) -> T = T::add_avg; const SUB: $($s)* fn(T, T) -> T = T::sub; } impl RotateKernelPi4 for RotatePi4Sub { const ADD: $($s)* fn(T, T) -> T = T::sub; const SUB: $($s)* fn(T, T) -> T = T::add; } impl RotateKernelPi4 for RotatePi4SubAvg { const ADD: $($s)* fn(T, T) -> T = T::sub_avg; const SUB: $($s)* fn(T, T) -> T = T::add; } trait RotateKernel { const ADD: $($s)* fn(T, T) -> T; const SUB: $($s)* fn(T, T) -> T; const SHIFT: $($s)* fn(T) -> T; #[$m] $($s)* fn half_kernel( p0: (T, T), p1: T, m: (i32, i32, i32), ) -> (T, T) { let t = Self::ADD(p1, p0.0); let (a, b, c) = (p0.1.tx_mul::(m.0), p1.tx_mul::(m.1), t.tx_mul::(m.2)); let out0 = b.add(c); let shifted = Self::SHIFT(c); let out1 = Self::SUB(a, shifted); (out0, out1) } #[$m] $($s)* fn kernel(p0: T, p1: T, m: (i32, i32, i32)) -> (T, T) { Self::half_kernel::((p0, p0), p1, m) } } trait RotateKernelNeg { const ADD: $($s)* fn(T, T) -> T; #[$m] $($s)* fn kernel(p0: T, p1: T, m: (i32, i32, i32)) -> (T, T) { let t = Self::ADD(p0, p1); let (a, b, c) = (p0.tx_mul::(m.0), p1.tx_mul::(m.1), t.tx_mul::(m.2)); let out0 = b.sub(c); let out1 = c.sub(a); (out0, out1) } } struct RotateAdd; struct RotateAddAvg; struct RotateAddShift; struct RotateSub; struct RotateSubAvg; struct RotateSubShift; struct RotateNeg; struct RotateNegAvg; impl RotateKernel for RotateAdd { const ADD: $($s)* fn(T, T) -> T = T::add; const SUB: $($s)* fn(T, T) -> T = T::sub; const SHIFT: $($s)* fn(T) -> T = T::copy_fn; } impl RotateKernel for RotateAddAvg { const ADD: $($s)* fn(T, T) -> T = T::add_avg; const SUB: $($s)* fn(T, T) -> T = T::sub; const SHIFT: $($s)* fn(T) -> T = T::copy_fn; } impl RotateKernel for RotateAddShift { const ADD: $($s)* fn(T, T) -> T = T::add; const SUB: $($s)* fn(T, T) -> T = T::sub; const SHIFT: $($s)* fn(T) -> T = T::rshift1; } impl RotateKernel for RotateSub { const ADD: $($s)* fn(T, T) -> T = T::sub; const SUB: $($s)* fn(T, T) -> T = T::add; const SHIFT: $($s)* fn(T) -> T = T::copy_fn; } impl RotateKernel for RotateSubAvg { const ADD: $($s)* fn(T, T) -> T = T::sub_avg; const SUB: $($s)* fn(T, T) -> T = T::add; const SHIFT: $($s)* fn(T) -> T = T::copy_fn; } impl RotateKernel for RotateSubShift { const ADD: $($s)* fn(T, T) -> T = T::sub; 
const SUB: $($s)* fn(T, T) -> T = T::add; const SHIFT: $($s)* fn(T) -> T = T::rshift1; } impl RotateKernelNeg for RotateNeg { const ADD: $($s)* fn(T, T) -> T = T::sub; } impl RotateKernelNeg for RotateNegAvg { const ADD: $($s)* fn(T, T) -> T = T::sub_avg; } #[inline] #[$m] $($s)* fn butterfly_add(p0: T, p1: T) -> ((T, T), T) { let p0 = p0.add(p1); let p0h = p0.rshift1(); let p1h = p1.sub(p0h); ((p0h, p0), p1h) } #[inline] #[$m] $($s)* fn butterfly_sub(p0: T, p1: T) -> ((T, T), T) { let p0 = p0.sub(p1); let p0h = p0.rshift1(); let p1h = p1.add(p0h); ((p0h, p0), p1h) } #[inline] #[$m] $($s)* fn butterfly_neg(p0: T, p1: T) -> (T, (T, T)) { let p1 = p0.sub(p1); let p1h = p1.rshift1(); let p0h = p0.sub(p1h); (p0h, (p1h, p1)) } #[inline] #[$m] $($s)* fn butterfly_add_asym(p0: (T, T), p1h: T) -> (T, T) { let p1 = p1h.add(p0.0); let p0 = p0.1.sub(p1); (p0, p1) } #[inline] #[$m] $($s)* fn butterfly_sub_asym(p0: (T, T), p1h: T) -> (T, T) { let p1 = p1h.sub(p0.0); let p0 = p0.1.add(p1); (p0, p1) } #[inline] #[$m] $($s)* fn butterfly_neg_asym(p0h: T, p1: (T, T)) -> (T, T) { let p0 = p0h.add(p1.0); let p1 = p0.sub(p1.1); (p0, p1) } #[$m] $($s)* fn daala_fdct_ii_2_asym(p0h: T, p1: (T, T)) -> (T, T) { butterfly_neg_asym(p0h, p1) } #[$m] $($s)* fn daala_fdst_iv_2_asym(p0: (T, T), p1h: T) -> (T, T) { // 473/512 = (Sin[3*Pi/8] + Cos[3*Pi/8])/Sqrt[2] = 0.9238795325112867 // 3135/4096 = (Sin[3*Pi/8] - Cos[3*Pi/8])*Sqrt[2] = 0.7653668647301795 // 4433/8192 = Cos[3*Pi/8]*Sqrt[2] = 0.5411961001461971 RotateAdd::half_kernel::<9, 12, 13>(p0, p1h, (473, 3135, 4433)) } #[$m] $($s)* fn daala_fdct_ii_4( q0: T, q1: T, q2: T, q3: T, output: &mut [T], ) { // +/- Butterflies with asymmetric output. let (q0h, q3) = butterfly_neg(q0, q3); let (q1, q2h) = butterfly_add(q1, q2); // Embedded 2-point transforms with asymmetric input. 
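// Note: this is the usual recursive DCT-II split. The butterflies above
// feed the sum half (q0h, q1) into a 2-point DCT-II and the difference
// half (q3, q2h) into a 2-point DST-IV; the tuple values carry both a
// halved and a full-precision copy of a butterfly output (see
// `butterfly_add`/`butterfly_neg`) so the `_asym` sub-transforms can use
// whichever form each step needs.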
let (q0, q1) = daala_fdct_ii_2_asym(q0h, q1); let (q3, q2) = daala_fdst_iv_2_asym(q3, q2h); store_coeffs!(output, q0, q1, q2, q3); } #[$m] $($s)* fn daala_fdct4(coeffs: &mut [T]) { assert!(coeffs.len() >= 4); let mut temp_out: [T; 4] = [T::zero(); 4]; daala_fdct_ii_4(coeffs[0], coeffs[1], coeffs[2], coeffs[3], &mut temp_out); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[2]; coeffs[2] = temp_out[1]; coeffs[3] = temp_out[3]; } #[$m] $($s)* fn daala_fdst_vii_4(coeffs: &mut [T]) { assert!(coeffs.len() >= 4); let q0 = coeffs[0]; let q1 = coeffs[1]; let q2 = coeffs[2]; let q3 = coeffs[3]; let t0 = q1.add(q3); // t1 = (q0 + q1 - q3)/2 let t1 = q1.add(q0.sub_avg(t0)); let t2 = q0.sub(q1); let t3 = q2; let t4 = q0.add(q3); // 7021/16384 ~= 2*Sin[2*Pi/9]/3 ~= 0.428525073124360 let t0 = t0.tx_mul::<14>(7021); // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 let t1 = t1.tx_mul::<15>(37837); // 21513/32768 ~= 2*Sin[4*Pi/9]/3 ~= 0.656538502008139 let t2 = t2.tx_mul::<15>(21513); // 37837/32768 ~= 4*Sin[3*Pi/9]/3 ~= 1.154700538379252 let t3 = t3.tx_mul::<15>(37837); // 467/2048 ~= 2*Sin[1*Pi/9]/3 ~= 0.228013428883779 let t4 = t4.tx_mul::<11>(467); let t3h = t3.rshift1(); let u4 = t4.add(t3h); coeffs[0] = t0.add(u4); coeffs[1] = t1; coeffs[2] = t0.add(t2.sub(t3h)); coeffs[3] = t2.add(t3.sub(u4)); } #[$m] $($s)* fn daala_fdct_ii_2(p0: T, p1: T) -> (T, T) { // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (p1, p0) = RotatePi4SubAvg::kernel::<13, 13>(p1, p0, (11585, 11585)); (p0, p1) } #[$m] $($s)* fn daala_fdst_iv_2(p0: T, p1: T) -> (T, T) { // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461971 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 RotateAddAvg::kernel::<13, 14, 12>(p0, p1, (10703, 8867, 3135)) } #[$m] $($s)* fn daala_fdct_ii_4_asym( q0h: T, q1: (T, T), q2h: T, q3: (T, T), output: &mut [T], ) { // +/- Butterflies with asymmetric input. let (q0, q3) = butterfly_neg_asym(q0h, q3); let (q1, q2) = butterfly_sub_asym(q1, q2h); // Embedded 2-point orthonormal transforms. let (q0, q1) = daala_fdct_ii_2(q0, q1); let (q3, q2) = daala_fdst_iv_2(q3, q2); store_coeffs!(output, q0, q1, q2, q3); } #[$m] $($s)* fn daala_fdst_iv_4_asym( q0: (T, T), q1h: T, q2: (T, T), q3h: T, output: &mut [T], ) { // Stage 0 // 9633/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/2 = 0.5879378012096793 // 12873/8192 = (Sin[7*Pi/16] - Cos[7*Pi/16])*2 = 1.5713899167742045 // 12785/32768 = Cos[7*Pi/16]*2 = 0.3901806440322565 let (q0, q3) = RotateAddShift::half_kernel::<14, 13, 15>( q0, q3h, (9633, 12873, 12785), ); // 11363/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/2 = 0.6935199226610738 // 18081/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*2 = 0.5517987585658861 // 4551/4096 = Cos[5*Pi/16]*2 = 1.1111404660392044 let (q2, q1) = RotateSubShift::half_kernel::<14, 15, 12>( q2, q1h, (11363, 18081, 4551), ); // Stage 1 let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3); let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1); // Stage 2 // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (q2, q1) = RotatePi4AddAvg::kernel::<13, 13>(q2, q1, (11585, 11585)); store_coeffs!(output, q0, q1, q2, q3); } #[$m] $($s)* fn daala_fdct_ii_8( r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T], ) { // +/- Butterflies with asymmetric output. 
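// Same recursive pattern as the 4-point case above: the +/- butterflies
// below split the eight inputs into a sum half and a difference half, a
// 4-point DCT-II handles the sums (output[0..4]) and a 4-point DST-IV
// handles the differences (output[4..8], reversed afterwards).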
let (r0h, r7) = butterfly_neg(r0, r7); let (r1, r6h) = butterfly_add(r1, r6); let (r2h, r5) = butterfly_neg(r2, r5); let (r3, r4h) = butterfly_add(r3, r4); // Embedded 4-point transforms with asymmetric input. daala_fdct_ii_4_asym(r0h, r1, r2h, r3, &mut output[0..4]); daala_fdst_iv_4_asym(r7, r6h, r5, r4h, &mut output[4..8]); output[4..8].reverse(); } #[$m] $($s)* fn daala_fdct8(coeffs: &mut [T]) { assert!(coeffs.len() >= 8); let mut temp_out: [T; 8] = [T::zero(); 8]; daala_fdct_ii_8( coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4], coeffs[5], coeffs[6], coeffs[7], &mut temp_out, ); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[4]; coeffs[2] = temp_out[2]; coeffs[3] = temp_out[6]; coeffs[4] = temp_out[1]; coeffs[5] = temp_out[5]; coeffs[6] = temp_out[3]; coeffs[7] = temp_out[7]; } #[$m] $($s)* fn daala_fdst_iv_8( r0: T, r1: T, r2: T, r3: T, r4: T, r5: T, r6: T, r7: T, output: &mut [T], ) { // Stage 0 // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 // 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 let (r0, r7) = RotateAdd::kernel::<14, 14, 13>(r0, r7, (17911, 14699, 803)); // 20435/16384 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.24722501298667123 // 21845/32768 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.66665565847774650 // 1189/4096 = Cos[13*Pi/32] = 0.29028467725446233 let (r6, r1) = RotateSub::kernel::<14, 15, 12>(r6, r1, (20435, 21845, 1189)); // 22173/16384 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 // 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 // 15447/32768 = Cos[11*Pi/32] = 0.47139673682599764 let (r2, r5) = RotateAdd::kernel::<14, 13, 15>(r2, r5, (22173, 3363, 15447)); // 23059/16384 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 // 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 // 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 let (r4, r3) = RotateSub::kernel::<14, 14, 13>(r4, r3, (23059, 2271, 5197)); // Stage 1 let (r0, r3h) = butterfly_add(r0, r3); let (r2, r1h) = butterfly_sub(r2, r1); let (r5, r6h) = butterfly_add(r5, r6); let (r7, r4h) = butterfly_sub(r7, r4); // Stage 2 let (r7, r6) = butterfly_add_asym(r7, r6h); let (r5, r3) = butterfly_add_asym(r5, r3h); let (r2, r4) = butterfly_add_asym(r2, r4h); let (r0, r1) = butterfly_sub_asym(r0, r1h); // Stage 3 // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 let (r3, r4) = RotateSubAvg::kernel::<13, 14, 12>(r3, r4, (10703, 8867, 3135)); // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 let (r2, r5) = RotateNegAvg::kernel::<13, 14, 12>(r2, r5, (10703, 8867, 3135)); // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (r1, r6) = RotatePi4SubAvg::kernel::<13, 13>(r1, r6, (11585, 11585)); store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7); } #[$m] $($s)* fn daala_fdst8(coeffs: &mut [T]) { assert!(coeffs.len() >= 8); let mut temp_out: [T; 8] = [T::zero(); 8]; daala_fdst_iv_8( coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4], coeffs[5], coeffs[6], coeffs[7], &mut temp_out, ); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[4]; coeffs[2] = temp_out[2]; coeffs[3] = temp_out[6]; coeffs[4] = temp_out[1]; coeffs[5] = temp_out[5]; coeffs[6] = temp_out[3]; coeffs[7] = temp_out[7]; } #[$m] $($s)* fn 
daala_fdst_iv_4( q0: T, q1: T, q2: T, q3: T, output: &mut [T], ) { // Stage 0 // 13623/16384 = (Sin[7*Pi/16] + Cos[7*Pi/16])/Sqrt[2] = 0.831469612302545 // 4551/4096 = (Sin[7*Pi/16] - Cos[7*Pi/16])*Sqrt[2] = 1.111140466039204 // 9041/32768 = Cos[7*Pi/16]*Sqrt[2] = 0.275899379282943 let (q0, q3) = RotateAddShift::kernel::<14, 12, 11>(q0, q3, (13623, 4551, 565)); // 16069/16384 = (Sin[5*Pi/16] + Cos[5*Pi/16])/Sqrt[2] = 0.9807852804032304 // 12785/32768 = (Sin[5*Pi/16] - Cos[5*Pi/16])*Sqrt[2] = 0.3901806440322566 // 1609/2048 = Cos[5*Pi/16]*Sqrt[2] = 0.7856949583871021 let (q2, q1) = RotateSubShift::kernel::<14, 15, 11>(q2, q1, (16069, 12785, 1609)); // Stage 1 let (q2, q3) = butterfly_sub_asym((q2.rshift1(), q2), q3); let (q0, q1) = butterfly_sub_asym((q0.rshift1(), q0), q1); // Stage 2 // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (q2, q1) = RotatePi4AddAvg::kernel::<13, 13>(q2, q1, (11585, 11585)); store_coeffs!(output, q0, q1, q2, q3); } #[$m] $($s)* fn daala_fdct_ii_8_asym( r0h: T, r1: (T, T), r2h: T, r3: (T, T), r4h: T, r5: (T, T), r6h: T, r7: (T, T), output: &mut [T], ) { // +/- Butterflies with asymmetric input. let (r0, r7) = butterfly_neg_asym(r0h, r7); let (r1, r6) = butterfly_sub_asym(r1, r6h); let (r2, r5) = butterfly_neg_asym(r2h, r5); let (r3, r4) = butterfly_sub_asym(r3, r4h); // Embedded 4-point orthonormal transforms. daala_fdct_ii_4(r0, r1, r2, r3, &mut output[0..4]); daala_fdst_iv_4(r7, r6, r5, r4, &mut output[4..8]); output[4..8].reverse(); } #[$m] $($s)* fn daala_fdst_iv_8_asym( r0: (T, T), r1h: T, r2: (T, T), r3h: T, r4: (T, T), r5h: T, r6: (T, T), r7h: T, output: &mut [T], ) { // Stage 0 // 12665/16384 = (Sin[15*Pi/32] + Cos[15*Pi/32])/Sqrt[2] = 0.77301045336274 // 5197/4096 = (Sin[15*Pi/32] - Cos[15*Pi/32])*Sqrt[2] = 1.26878656832729 // 2271/16384 = Cos[15*Pi/32]*Sqrt[2] = 0.13861716919909 let (r0, r7) = RotateAdd::half_kernel::<14, 12, 14>(r0, r7h, (12665, 5197, 2271)); // 14449/16384 = Sin[13*Pi/32] + Cos[13*Pi/32])/Sqrt[2] = 0.881921264348355 // 30893/32768 = Sin[13*Pi/32] - Cos[13*Pi/32])*Sqrt[2] = 0.942793473651995 // 3363/8192 = Cos[13*Pi/32]*Sqrt[2] = 0.410524527522357 let (r6, r1) = RotateSub::half_kernel::<14, 15, 13>(r6, r1h, (14449, 30893, 3363)); // 15679/16384 = Sin[11*Pi/32] + Cos[11*Pi/32])/Sqrt[2] = 0.956940335732209 // 1189/2048 = Sin[11*Pi/32] - Cos[11*Pi/32])*Sqrt[2] = 0.580569354508925 // 5461/8192 = Cos[11*Pi/32]*Sqrt[2] = 0.666655658477747 let (r2, r5) = RotateAdd::half_kernel::<14, 11, 13>(r2, r5h, (15679, 1189, 5461)); // 16305/16384 = (Sin[9*Pi/32] + Cos[9*Pi/32])/Sqrt[2] = 0.9951847266721969 // 803/4096 = (Sin[9*Pi/32] - Cos[9*Pi/32])*Sqrt[2] = 0.1960342806591213 // 14699/16384 = Cos[9*Pi/32]*Sqrt[2] = 0.8971675863426364 let (r4, r3) = RotateSub::half_kernel::<14, 12, 14>(r4, r3h, (16305, 803, 14699)); // Stage 1 let (r0, r3h) = butterfly_add(r0, r3); let (r2, r1h) = butterfly_sub(r2, r1); let (r5, r6h) = butterfly_add(r5, r6); let (r7, r4h) = butterfly_sub(r7, r4); // Stage 2 let (r7, r6) = butterfly_add_asym(r7, r6h); let (r5, r3) = butterfly_add_asym(r5, r3h); let (r2, r4) = butterfly_add_asym(r2, r4h); let (r0, r1) = butterfly_sub_asym(r0, r1h); // Stage 3 // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 let (r3, r4) = RotateSubAvg::kernel::<9, 14, 12>(r3, r4, (669, 8867, 3135)); // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 
8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 let (r2, r5) = RotateNegAvg::kernel::<9, 14, 12>(r2, r5, (669, 8867, 3135)); // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (r1, r6) = RotatePi4SubAvg::kernel::<12, 13>(r1, r6, (5793, 11585)); store_coeffs!(output, r0, r1, r2, r3, r4, r5, r6, r7); } #[$m] $($s)* fn daala_fdct_ii_16( s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T, sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T], ) { // +/- Butterflies with asymmetric output. let (s0h, sf) = butterfly_neg(s0, sf); let (s1, seh) = butterfly_add(s1, se); let (s2h, sd) = butterfly_neg(s2, sd); let (s3, sch) = butterfly_add(s3, sc); let (s4h, sb) = butterfly_neg(s4, sb); let (s5, sah) = butterfly_add(s5, sa); let (s6h, s9) = butterfly_neg(s6, s9); let (s7, s8h) = butterfly_add(s7, s8); // Embedded 8-point transforms with asymmetric input. daala_fdct_ii_8_asym(s0h, s1, s2h, s3, s4h, s5, s6h, s7, &mut output[0..8]); daala_fdst_iv_8_asym(sf, seh, sd, sch, sb, sah, s9, s8h, &mut output[8..16]); output[8..16].reverse(); } #[$m] $($s)* fn daala_fdct16(coeffs: &mut [T]) { assert!(coeffs.len() >= 16); let mut temp_out: [T; 16] = [T::zero(); 16]; daala_fdct_ii_16( coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4], coeffs[5], coeffs[6], coeffs[7], coeffs[8], coeffs[9], coeffs[10], coeffs[11], coeffs[12], coeffs[13], coeffs[14], coeffs[15], &mut temp_out, ); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[8]; coeffs[2] = temp_out[4]; coeffs[3] = temp_out[12]; coeffs[4] = temp_out[2]; coeffs[5] = temp_out[10]; coeffs[6] = temp_out[6]; coeffs[7] = temp_out[14]; coeffs[8] = temp_out[1]; coeffs[9] = temp_out[9]; coeffs[10] = temp_out[5]; coeffs[11] = temp_out[13]; coeffs[12] = temp_out[3]; coeffs[13] = temp_out[11]; coeffs[14] = temp_out[7]; coeffs[15] = temp_out[15]; } #[$m] $($s)* fn daala_fdst_iv_16( s0: T, s1: T, s2: T, s3: T, s4: T, s5: T, s6: T, s7: T, s8: T, s9: T, sa: T, sb: T, sc: T, sd: T, se: T, sf: T, output: &mut [T], ) { // Stage 0 // 24279/32768 = (Sin[31*Pi/64] + Cos[31*Pi/64])/Sqrt[2] = 0.74095112535496 // 11003/8192 = (Sin[31*Pi/64] - Cos[31*Pi/64])*Sqrt[2] = 1.34311790969404 // 1137/16384 = Cos[31*Pi/64]*Sqrt[2] = 0.06939217050794 let (s0, sf) = RotateAddShift::kernel::<15, 13, 14>(s0, sf, (24279, 11003, 1137)); // 1645/2048 = (Sin[29*Pi/64] + Cos[29*Pi/64])/Sqrt[2] = 0.8032075314806449 // 305/256 = (Sin[29*Pi/64] - Cos[29*Pi/64])*Sqrt[2] = 1.1913986089848667 // 425/2048 = Cos[29*Pi/64]*Sqrt[2] = 0.2075082269882116 let (se, s1) = RotateSubShift::kernel::<11, 8, 11>(se, s1, (1645, 305, 425)); // 14053/32768 = (Sin[27*Pi/64] + Cos[27*Pi/64])/Sqrt[2] = 0.85772861000027 // 8423/8192 = (Sin[27*Pi/64] - Cos[27*Pi/64])*Sqrt[2] = 1.02820548838644 // 2815/8192 = Cos[27*Pi/64]*Sqrt[2] = 0.34362586580705 let (s2, sd) = RotateAddShift::kernel::<14, 13, 13>(s2, sd, (14053, 8423, 2815)); // 14811/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/Sqrt[2] = 0.90398929312344 // 7005/8192 = (Sin[25*Pi/64] - Cos[25*Pi/64])*Sqrt[2] = 0.85511018686056 // 3903/8192 = Cos[25*Pi/64]*Sqrt[2] = 0.47643419969316 let (sc, s3) = RotateSubShift::kernel::<14, 13, 13>(sc, s3, (14811, 7005, 3903)); // 30853/32768 = (Sin[23*Pi/64] + Cos[23*Pi/64])/Sqrt[2] = 0.94154406518302 // 11039/16384 = (Sin[23*Pi/64] - Cos[23*Pi/64])*Sqrt[2] = 0.67377970678444 // 9907/16384 = Cos[23*Pi/64]*Sqrt[2] = 0.60465421179080 let (s4, sb) = RotateAddShift::kernel::<15, 14, 14>(s4, sb, (30853, 11039, 
9907)); // 15893/16384 = (Sin[21*Pi/64] + Cos[21*Pi/64])/Sqrt[2] = 0.97003125319454 // 3981/8192 = (Sin[21*Pi/64] - Cos[21*Pi/64])*Sqrt[2] = 0.89716758634264 // 1489/2048 = Cos[21*Pi/64]*Sqrt[2] = 0.72705107329128 let (sa, s5) = RotateSubShift::kernel::<14, 13, 11>(sa, s5, (15893, 3981, 1489)); // 32413/32768 = (Sin[19*Pi/64] + Cos[19*Pi/64])/Sqrt[2] = 0.98917650996478 // 601/2048 = (Sin[19*Pi/64] - Cos[19*Pi/64])*Sqrt[2] = 0.29346094891072 // 13803/16384 = Cos[19*Pi/64]*Sqrt[2] = 0.84244603550942 let (s6, s9) = RotateAddShift::kernel::<15, 11, 14>(s6, s9, (32413, 601, 13803)); // 32729/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/Sqrt[2] = 0.99879545620517 // 201/2048 = (Sin[17*Pi/64] - Cos[17*Pi/64])*Sqrt[2] = 0.09813534865484 // 1945/2048 = Cos[17*Pi/64]*Sqrt[2] = 0.94972778187775 let (s8, s7) = RotateSubShift::kernel::<15, 11, 11>(s8, s7, (32729, 201, 1945)); // Stage 1 let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7); let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf); let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3); let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb); let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5); let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd); let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1); let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9); // Stage 2 let ((_s8h, s8), s4h) = butterfly_add(s8, s4); let ((_s7h, s7), sbh) = butterfly_add(s7, sb); let ((_sah, sa), s6h) = butterfly_sub(sa, s6); let ((_s5h, s5), s9h) = butterfly_sub(s5, s9); let (s0, s3h) = butterfly_add(s0, s3); let (sd, seh) = butterfly_add(sd, se); let (s2, s1h) = butterfly_sub(s2, s1); let (sf, sch) = butterfly_sub(sf, sc); // Stage 3 // 301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 // 1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 12785/32768 = 2*Cos[7*Pi/16] = 0.3901806440322565 let (s8, s7) = RotateAddAvg::kernel::<8, 11, 15>(s8, s7, (301, 1609, 12785)); // 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 let (s9, s6) = RotateAdd::kernel::<13, 15, 13>(s9h, s6h, (11363, 9041, 4551)); // 5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/4096 = 2*Cos[5*Pi/16] = 1.1111404660392044 let (s5, sa) = RotateNegAvg::kernel::<12, 15, 12>(s5, sa, (5681, 9041, 4551)); // 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 let (s4, sb) = RotateNeg::kernel::<13, 14, 15>(s4h, sbh, (9633, 12873, 6393)); // Stage 4 let (s2, sc) = butterfly_add_asym(s2, sch); let (s0, s1) = butterfly_sub_asym(s0, s1h); let (sf, se) = butterfly_add_asym(sf, seh); let (sd, s3) = butterfly_add_asym(sd, s3h); let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6); let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9); let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb); let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4); // Stage 5 // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/4096 = 2*Cos[7*Pi/8] = 0.7653668647301796 let (sc, s3) = RotateAddAvg::kernel::<9, 14, 12>(sc, s3, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 
0.5411961001461969 // 3135/4096 = 2*Cos[3*Pi/8] = 0.7653668647301796 let (s2, sd) = RotateNegAvg::kernel::<9, 14, 12>(s2, sd, (669, 8867, 3135)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (sa, s5) = RotatePi4AddAvg::kernel::<12, 13>(sa, s5, (5793, 11585)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (s6, s9) = RotatePi4AddAvg::kernel::<12, 13>(s6, s9, (5793, 11585)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 11585/8192 = 2*Cos[Pi/4] = 1.4142135623730951 let (se, s1) = RotatePi4AddAvg::kernel::<12, 13>(se, s1, (5793, 11585)); store_coeffs!( output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf ); } #[$m] $($s)* fn daala_fdst16(coeffs: &mut [T]) { assert!(coeffs.len() >= 16); let mut temp_out: [T; 16] = [T::zero(); 16]; daala_fdst_iv_16( coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4], coeffs[5], coeffs[6], coeffs[7], coeffs[8], coeffs[9], coeffs[10], coeffs[11], coeffs[12], coeffs[13], coeffs[14], coeffs[15], &mut temp_out, ); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[8]; coeffs[2] = temp_out[4]; coeffs[3] = temp_out[12]; coeffs[4] = temp_out[2]; coeffs[5] = temp_out[10]; coeffs[6] = temp_out[6]; coeffs[7] = temp_out[14]; coeffs[8] = temp_out[1]; coeffs[9] = temp_out[9]; coeffs[10] = temp_out[5]; coeffs[11] = temp_out[13]; coeffs[12] = temp_out[3]; coeffs[13] = temp_out[11]; coeffs[14] = temp_out[7]; coeffs[15] = temp_out[15]; } #[$m] $($s)* fn daala_fdct_ii_16_asym( s0h: T, s1: (T, T), s2h: T, s3: (T, T), s4h: T, s5: (T, T), s6h: T, s7: (T, T), s8h: T, s9: (T, T), sah: T, sb: (T, T), sch: T, sd: (T, T), seh: T, sf: (T, T), output: &mut [T], ) { // +/- Butterflies with asymmetric input. let (s0, sf) = butterfly_neg_asym(s0h, sf); let (s1, se) = butterfly_sub_asym(s1, seh); let (s2, sd) = butterfly_neg_asym(s2h, sd); let (s3, sc) = butterfly_sub_asym(s3, sch); let (s4, sb) = butterfly_neg_asym(s4h, sb); let (s5, sa) = butterfly_sub_asym(s5, sah); let (s6, s9) = butterfly_neg_asym(s6h, s9); let (s7, s8) = butterfly_sub_asym(s7, s8h); // Embedded 8-point orthonormal transforms. 
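// Structure note (hedged): as the calls just below show, each N-point DCT-II in
// this file is built recursively. The +/- butterflies split the input into sums
// and differences of x[n] and x[N-1-n]; the sums go through an (N/2)-point
// DCT-II, the differences through an (N/2)-point DST-IV, and the second half of
// the output is reversed. Up to the sign and reversal conventions used here,
// this matches the textbook decimation (index sketch only, not code from this
// crate):
//
//     X[2k]   ~ DCT-II_{N/2}( x[n] + x[N-1-n] )[k]
//     X[2k+1] ~ DST-IV_{N/2}( x[n] - x[N-1-n] )[k]   (with a reversal)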
daala_fdct_ii_8(s0, s1, s2, s3, s4, s5, s6, s7, &mut output[0..8]); daala_fdst_iv_8(sf, se, sd, sc, sb, sa, s9, s8, &mut output[8..16]); output[8..16].reverse(); } #[$m] $($s)* fn daala_fdst_iv_16_asym( s0: (T, T), s1h: T, s2: (T, T), s3h: T, s4: (T, T), s5h: T, s6: (T, T), s7h: T, s8: (T, T), s9h: T, sa: (T, T), sbh: T, sc: (T, T), sdh: T, se: (T, T), sfh: T, output: &mut [T], ) { // Stage 0 // 1073/2048 = (Sin[31*Pi/64] + Cos[31*Pi/64])/2 = 0.5239315652662953 // 62241/32768 = (Sin[31*Pi/64] - Cos[31*Pi/64])*2 = 1.8994555637555088 // 201/16384 = Cos[31*Pi/64]*2 = 0.0981353486548360 let (s0, sf) = RotateAddShift::half_kernel::<11, 15, 11>(s0, sfh, (1073, 62241, 201)); // 18611/32768 = (Sin[29*Pi/64] + Cos[29*Pi/64])/2 = 0.5679534922100714 // 55211/32768 = (Sin[29*Pi/64] - Cos[29*Pi/64])*2 = 1.6848920710188384 // 601/2048 = Cos[29*Pi/64]*2 = 0.2934609489107235 let (se, s1) = RotateSubShift::half_kernel::<15, 15, 11>( se, s1h, (18611, 55211, 601), ); // 9937/16384 = (Sin[27*Pi/64] + Cos[27*Pi/64])/2 = 0.6065057165489039 // 1489/1024 = (Sin[27*Pi/64] - Cos[27*Pi/64])*2 = 1.4541021465825602 // 3981/8192 = Cos[27*Pi/64]*2 = 0.4859603598065277 let (s2, sd) = RotateAddShift::half_kernel::<14, 10, 13>(s2, sdh, (9937, 1489, 3981)); // 10473/16384 = (Sin[25*Pi/64] + Cos[25*Pi/64])/2 = 0.6392169592876205 // 39627/32768 = (Sin[25*Pi/64] - Cos[25*Pi/64])*2 = 1.2093084235816014 // 11039/16384 = Cos[25*Pi/64]*2 = 0.6737797067844401 let (sc, s3) = RotateSubShift::half_kernel::<14, 15, 14>( sc, s3h, (10473, 39627, 11039), ); // 2727/4096 = (Sin[23*Pi/64] + Cos[23*Pi/64])/2 = 0.6657721932768628 // 3903/4096 = (Sin[23*Pi/64] - Cos[23*Pi/64])*2 = 0.9528683993863225 // 7005/8192 = Cos[23*Pi/64]*2 = 0.8551101868605642 let (s4, sb) = RotateAddShift::half_kernel::<12, 12, 13>(s4, sbh, (2727, 3903, 7005)); // 5619/8192 = (Sin[21*Pi/64] + Cos[21*Pi/64])/2 = 0.6859156770967569 // 2815/4096 = (Sin[21*Pi/64] - Cos[21*Pi/64])*2 = 0.6872517316141069 // 8423/8192 = Cos[21*Pi/64]*2 = 1.0282054883864433 let (sa, s5) = RotateSubShift::half_kernel::<13, 12, 13>(sa, s5h, (5619, 2815, 8423)); // 2865/4096 = (Sin[19*Pi/64] + Cos[19*Pi/64])/2 = 0.6994534179865391 // 13588/32768 = (Sin[19*Pi/64] - Cos[19*Pi/64])*2 = 0.4150164539764232 // 305/256 = Cos[19*Pi/64]*2 = 1.1913986089848667 let (s6, s9) = RotateAddShift::half_kernel::<12, 15, 8>(s6, s9h, (2865, 13599, 305)); // 23143/32768 = (Sin[17*Pi/64] + Cos[17*Pi/64])/2 = 0.7062550401009887 // 1137/8192 = (Sin[17*Pi/64] - Cos[17*Pi/64])*2 = 0.1387843410158816 // 11003/8192 = Cos[17*Pi/64]*2 = 1.3431179096940367 let (s8, s7) = RotateSubShift::half_kernel::<15, 13, 13>( s8, s7h, (23143, 1137, 11003), ); // Stage 1 let (s0, s7) = butterfly_sub_asym((s0.rshift1(), s0), s7); let (s8, sf) = butterfly_sub_asym((s8.rshift1(), s8), sf); let (s4, s3) = butterfly_add_asym((s4.rshift1(), s4), s3); let (sc, sb) = butterfly_add_asym((sc.rshift1(), sc), sb); let (s2, s5) = butterfly_sub_asym((s2.rshift1(), s2), s5); let (sa, sd) = butterfly_sub_asym((sa.rshift1(), sa), sd); let (s6, s1) = butterfly_add_asym((s6.rshift1(), s6), s1); let (se, s9) = butterfly_add_asym((se.rshift1(), se), s9); // Stage 2 let ((_s8h, s8), s4h) = butterfly_add(s8, s4); let ((_s7h, s7), sbh) = butterfly_add(s7, sb); let ((_sah, sa), s6h) = butterfly_sub(sa, s6); let ((_s5h, s5), s9h) = butterfly_sub(s5, s9); let (s0, s3h) = butterfly_add(s0, s3); let (sd, seh) = butterfly_add(sd, se); let (s2, s1h) = butterfly_sub(s2, s1); let (sf, sch) = butterfly_sub(sf, sc); // Stage 3 // 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 
1.1758756024193586 // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 let (s8, s7) = RotateAdd::kernel::<13, 14, 15>(s8, s7, (9633, 12873, 6393)); // 22725/16384 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 let (s9, s6) = RotateAdd::kernel::<14, 15, 13>(s9h, s6h, (22725, 9041, 4551)); // 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 let (s5, sa) = RotateNeg::kernel::<13, 15, 13>(s5, sa, (11363, 9041, 4551)); // 9633/32768 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 let (s4, sb) = RotateNeg::kernel::<13, 14, 15>(s4h, sbh, (9633, 12873, 6393)); // Stage 4 let (s2, sc) = butterfly_add_asym(s2, sch); let (s0, s1) = butterfly_sub_asym(s0, s1h); let (sf, se) = butterfly_add_asym(sf, seh); let (sd, s3) = butterfly_add_asym(sd, s3h); let (s7, s6) = butterfly_add_asym((s7.rshift1(), s7), s6); let (s8, s9) = butterfly_sub_asym((s8.rshift1(), s8), s9); let (sa, sb) = butterfly_sub_asym((sa.rshift1(), sa), sb); let (s5, s4) = butterfly_add_asym((s5.rshift1(), s5), s4); // Stage 5 // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (sc, s3) = RotateAdd::kernel::<13, 14, 13>(sc, s3, (10703, 8867, 3135)); // 10703/8192 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3870398453221475 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (s2, sd) = RotateNeg::kernel::<13, 14, 13>(s2, sd, (10703, 8867, 3135)); // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (sa, s5) = RotatePi4Add::kernel::<13, 13>(sa, s5, (11585, 5793)); // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (s6, s9) = RotatePi4Add::kernel::<13, 13>(s6, s9, (11585, 5793)); // 11585/8192 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (se, s1) = RotatePi4Add::kernel::<13, 13>(se, s1, (11585, 5793)); store_coeffs!( output, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf ); } #[$m] $($s)* fn daala_fdct_ii_32( t0: T, t1: T, t2: T, t3: T, t4: T, t5: T, t6: T, t7: T, t8: T, t9: T, ta: T, tb: T, tc: T, td: T, te: T, tf: T, tg: T, th: T, ti: T, tj: T, tk: T, tl: T, tm: T, tn: T, to: T, tp: T, tq: T, tr: T, ts: T, tt: T, tu: T, tv: T, output: &mut [T], ) { // +/- Butterflies with asymmetric output. let (t0h, tv) = butterfly_neg(t0, tv); let (t1, tuh) = butterfly_add(t1, tu); let (t2h, tt) = butterfly_neg(t2, tt); let (t3, tsh) = butterfly_add(t3, ts); let (t4h, tr) = butterfly_neg(t4, tr); let (t5, tqh) = butterfly_add(t5, tq); let (t6h, tp) = butterfly_neg(t6, tp); let (t7, toh) = butterfly_add(t7, to); let (t8h, tn) = butterfly_neg(t8, tn); let (t9, tmh) = butterfly_add(t9, tm); let (tah, tl) = butterfly_neg(ta, tl); let (tb, tkh) = butterfly_add(tb, tk); let (tch, tj) = butterfly_neg(tc, tj); let (td, tih) = butterfly_add(td, ti); let (teh, th) = butterfly_neg(te, th); let (tf, tgh) = butterfly_add(tf, tg); // Embedded 16-point transforms with asymmetric input. 
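// Index note (verifiable from the code): the `coeffs[i] = temp_out[j]`
// reorderings in daala_fdct16 above and daala_fdct32 below follow the
// bit-reversal permutation of the index, e.g. for the 16-point case
// 3 = 0b0011 reverses to 0b1100 = 12, matching `coeffs[3] = temp_out[12]`.
// A minimal helper expressing the same mapping (illustrative only, not used by
// this crate):
//
//     fn bit_reverse(i: u32, bits: u32) -> u32 {
//       i.reverse_bits() >> (32 - bits)
//     }
//     // bit_reverse(3, 4) == 12;  bit_reverse(3, 5) == 24 (cf. daala_fdct32)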
daala_fdct_ii_16_asym( t0h, t1, t2h, t3, t4h, t5, t6h, t7, t8h, t9, tah, tb, tch, td, teh, tf, &mut output[0..16], ); daala_fdst_iv_16_asym( tv, tuh, tt, tsh, tr, tqh, tp, toh, tn, tmh, tl, tkh, tj, tih, th, tgh, &mut output[16..32], ); output[16..32].reverse(); } #[$m] $($s)* fn daala_fdct32(coeffs: &mut [T]) { assert!(coeffs.len() >= 32); let mut temp_out: [T; 32] = [T::zero(); 32]; daala_fdct_ii_32( coeffs[0], coeffs[1], coeffs[2], coeffs[3], coeffs[4], coeffs[5], coeffs[6], coeffs[7], coeffs[8], coeffs[9], coeffs[10], coeffs[11], coeffs[12], coeffs[13], coeffs[14], coeffs[15], coeffs[16], coeffs[17], coeffs[18], coeffs[19], coeffs[20], coeffs[21], coeffs[22], coeffs[23], coeffs[24], coeffs[25], coeffs[26], coeffs[27], coeffs[28], coeffs[29], coeffs[30], coeffs[31], &mut temp_out, ); coeffs[0] = temp_out[0]; coeffs[1] = temp_out[16]; coeffs[2] = temp_out[8]; coeffs[3] = temp_out[24]; coeffs[4] = temp_out[4]; coeffs[5] = temp_out[20]; coeffs[6] = temp_out[12]; coeffs[7] = temp_out[28]; coeffs[8] = temp_out[2]; coeffs[9] = temp_out[18]; coeffs[10] = temp_out[10]; coeffs[11] = temp_out[26]; coeffs[12] = temp_out[6]; coeffs[13] = temp_out[22]; coeffs[14] = temp_out[14]; coeffs[15] = temp_out[30]; coeffs[16] = temp_out[1]; coeffs[17] = temp_out[17]; coeffs[18] = temp_out[9]; coeffs[19] = temp_out[25]; coeffs[20] = temp_out[5]; coeffs[21] = temp_out[21]; coeffs[22] = temp_out[13]; coeffs[23] = temp_out[29]; coeffs[24] = temp_out[3]; coeffs[25] = temp_out[19]; coeffs[26] = temp_out[11]; coeffs[27] = temp_out[27]; coeffs[28] = temp_out[7]; coeffs[29] = temp_out[23]; coeffs[30] = temp_out[15]; coeffs[31] = temp_out[31]; } #[$m] $($s)* fn daala_fdct_ii_32_asym( t0h: T, t1: (T, T), t2h: T, t3: (T, T), t4h: T, t5: (T, T), t6h: T, t7: (T, T), t8h: T, t9: (T, T), tah: T, tb: (T, T), tch: T, td: (T, T), teh: T, tf: (T, T), tgh: T, th: (T, T), tih: T, tj: (T, T), tkh: T, tl: (T, T), tmh: T, tn: (T, T), toh: T, tp: (T, T), tqh: T, tr: (T, T), tsh: T, tt: (T, T), tuh: T, tv: (T, T), output: &mut [T], ) { // +/- Butterflies with asymmetric input. let (t0, tv) = butterfly_neg_asym(t0h, tv); let (t1, tu) = butterfly_sub_asym(t1, tuh); let (t2, tt) = butterfly_neg_asym(t2h, tt); let (t3, ts) = butterfly_sub_asym(t3, tsh); let (t4, tr) = butterfly_neg_asym(t4h, tr); let (t5, tq) = butterfly_sub_asym(t5, tqh); let (t6, tp) = butterfly_neg_asym(t6h, tp); let (t7, to) = butterfly_sub_asym(t7, toh); let (t8, tn) = butterfly_neg_asym(t8h, tn); let (t9, tm) = butterfly_sub_asym(t9, tmh); let (ta, tl) = butterfly_neg_asym(tah, tl); let (tb, tk) = butterfly_sub_asym(tb, tkh); let (tc, tj) = butterfly_neg_asym(tch, tj); let (td, ti) = butterfly_sub_asym(td, tih); let (te, th) = butterfly_neg_asym(teh, th); let (tf, tg) = butterfly_sub_asym(tf, tgh); // Embedded 16-point orthonormal transforms. 
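// Convention note (a hedged reading of the signatures above): the `_asym`
// variants take inputs that a previous butterfly stage has already split into a
// halved value (the `*h` arguments, produced by halving helpers such as
// `rshift1`) or a `(halved, full)` pair, and they fold the missing factor of 2
// into their own rotation constants -- hence comments such as
// `(Sin[...] + Cos[...])/2` and `(Sin[...] - Cos[...])*2` on the `half_kernel`
// calls. The plain variants called below operate on ordinary, orthonormally
// scaled coefficients.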
daala_fdct_ii_16( t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf, &mut output[0..16], ); daala_fdst_iv_16( tv, tu, tt, ts, tr, tq, tp, to, tn, tm, tl, tk, tj, ti, th, tg, &mut output[16..32], ); output[16..32].reverse(); } #[$m] $($s)* fn daala_fdst_iv_32_asym( t0: (T, T), t1h: T, t2: (T, T), t3h: T, t4: (T, T), t5h: T, t6: (T, T), t7h: T, t8: (T, T), t9h: T, ta: (T, T), tbh: T, tc: (T, T), tdh: T, te: (T, T), tfh: T, tg: (T, T), thh: T, ti: (T, T), tjh: T, tk: (T, T), tlh: T, tm: (T, T), tnh: T, to: (T, T), tph: T, tq: (T, T), trh: T, ts: (T, T), tth: T, tu: (T, T), tvh: T, output: &mut [T], ) { // Stage 0 // 5933/8192 = (Sin[63*Pi/128] + Cos[63*Pi/128])/Sqrt[2] = 0.72424708295147 // 22595/16384 = (Sin[63*Pi/128] - Cos[63*Pi/128])*Sqrt[2] = 1.37908108947413 // 1137/32768 = Cos[63*Pi/128]*Sqrt[2] = 0.03470653821440 let (t0, tv) = RotateAdd::half_kernel::<13, 14, 15>(t0, tvh, (5933, 22595, 1137)); // 6203/8192 = (Sin[61*Pi/128] + Cos[61*Pi/128])/Sqrt[2] = 0.75720884650648 // 21403/16384 = (Sin[61*Pi/128] - Cos[61*Pi/128])*Sqrt[2] = 1.30634568590755 // 3409/32768 = Cos[61*Pi/128]*Sqrt[2] = 0.10403600355271 let (tu, t1) = RotateSub::half_kernel::<13, 14, 15>(tu, t1h, (6203, 21403, 3409)); // 25833/32768 = (Sin[59*Pi/128] + Cos[59*Pi/128])/Sqrt[2] = 0.78834642762661 // 315/256 = (Sin[59*Pi/128] - Cos[59*Pi/128])*Sqrt[2] = 1.23046318116125 // 5673/32768 = Cos[59*Pi/128]*Sqrt[2] = 0.17311483704598 let (t2, tt) = RotateAdd::half_kernel::<15, 8, 15>(t2, tth, (25833, 315, 5673)); // 26791/32768 = (Sin[57*Pi/128] + Cos[57*Pi/128])/Sqrt[2] = 0.81758481315158 // 4717/4096 = (Sin[57*Pi/128] - Cos[57*Pi/128])*Sqrt[2] = 1.15161638283569 // 7923/32768 = Cos[57*Pi/128]*Sqrt[2] = 0.24177662173374 let (ts, t3) = RotateSub::half_kernel::<15, 12, 15>(ts, t3h, (26791, 4717, 7923)); // 6921/8192 = (Sin[55*Pi/128] + Cos[55*Pi/128])/Sqrt[2] = 0.84485356524971 // 17531/16384 = (Sin[55*Pi/128] - Cos[55*Pi/128])*Sqrt[2] = 1.06999523977419 // 10153/32768 = Cos[55*Pi/128]*Sqrt[2] = 0.30985594536261 let (t4, tr) = RotateAdd::half_kernel::<13, 14, 15>(t4, trh, (6921, 17531, 10153)); // 28511/32768 = (Sin[53*Pi/128] + Cos[53*Pi/128])/Sqrt[2] = 0.87008699110871 // 32303/32768 = (Sin[53*Pi/128] - Cos[53*Pi/128])*Sqrt[2] = 0.98579638445957 // 1545/4096 = Cos[53*Pi/128]*Sqrt[2] = 0.37718879887893 let (tq, t5) = RotateSub::half_kernel::<15, 15, 12>(tq, t5h, (28511, 32303, 1545)); // 29269/32768 = (Sin[51*Pi/128] + Cos[51*Pi/128])/Sqrt[2] = 0.89322430119552 // 14733/16384 = (Sin[51*Pi/128] - Cos[51*Pi/128])*Sqrt[2] = 0.89922265930921 // 1817/4096 = Cos[51*Pi/128]*Sqrt[2] = 0.44361297154091 let (t6, tp) = RotateAdd::half_kernel::<15, 14, 12>(t6, tph, (29269, 14733, 1817)); // 29957/32768 = (Sin[49*Pi/128] + Cos[49*Pi/128])/Sqrt[2] = 0.91420975570353 // 13279/16384 = (Sin[49*Pi/128] - Cos[49*Pi/128])*Sqrt[2] = 0.81048262800998 // 8339/16384 = Cos[49*Pi/128]*Sqrt[2] = 0.50896844169854 let (to, t7) = RotateSub::half_kernel::<15, 14, 14>(to, t7h, (29957, 13279, 8339)); // 7643/8192 = (Sin[47*Pi/128] + Cos[47*Pi/128])/Sqrt[2] = 0.93299279883474 // 11793/16384 = (Sin[47*Pi/128] - Cos[47*Pi/128])*Sqrt[2] = 0.71979007306998 // 18779/32768 = Cos[47*Pi/128]*Sqrt[2] = 0.57309776229975 let (t8, tn) = RotateAdd::half_kernel::<13, 14, 15>(t8, tnh, (7643, 11793, 18779)); // 15557/16384 = (Sin[45*Pi/128] + Cos[45*Pi/128])/Sqrt[2] = 0.94952818059304 // 20557/32768 = (Sin[45*Pi/128] - Cos[45*Pi/128])*Sqrt[2] = 0.62736348079778 // 20835/32768 = Cos[45*Pi/128]*Sqrt[2] = 0.63584644019415 let (tm, t9) = RotateSub::half_kernel::<14, 
15, 15>(tm, t9h, (15557, 20557, 20835)); // 31581/32768 = (Sin[43*Pi/128] + Cos[43*Pi/128])/Sqrt[2] = 0.96377606579544 // 17479/32768 = (Sin[43*Pi/128] - Cos[43*Pi/128])*Sqrt[2] = 0.53342551494980 // 22841/32768 = Cos[43*Pi/128]*Sqrt[2] = 0.69706330832054 let (ta, tl) = RotateAdd::half_kernel::<15, 15, 15>(ta, tlh, (31581, 17479, 22841)); // 7993/8192 = (Sin[41*Pi/128] + Cos[41*Pi/128])/Sqrt[2] = 0.97570213003853 // 14359/32768 = (Sin[41*Pi/128] - Cos[41*Pi/128])*Sqrt[2] = 0.43820248031374 // 3099/4096 = Cos[41*Pi/128]*Sqrt[2] = 0.75660088988166 let (tk, tb) = RotateSub::half_kernel::<13, 15, 12>(tk, tbh, (7993, 14359, 3099)); // 16143/16384 = (Sin[39*Pi/128] + Cos[39*Pi/128])/Sqrt[2] = 0.98527764238894 // 2801/8192 = (Sin[39*Pi/128] - Cos[39*Pi/128])*Sqrt[2] = 0.34192377752060 // 26683/32768 = Cos[39*Pi/128]*Sqrt[2] = 0.81431575362864 let (tc, tj) = RotateAdd::half_kernel::<14, 13, 15>(tc, tjh, (16143, 2801, 26683)); // 16261/16384 = (Sin[37*Pi/128] + Cos[37*Pi/128])/Sqrt[2] = 0.99247953459871 // 4011/16384 = (Sin[37*Pi/128] - Cos[37*Pi/128])*Sqrt[2] = 0.24482135039843 // 14255/16384 = Cos[37*Pi/128]*Sqrt[2] = 0.87006885939949 let (ti, td) = RotateSub::half_kernel::<14, 14, 14>(ti, tdh, (16261, 4011, 14255)); // 32679/32768 = (Sin[35*Pi/128] + Cos[35*Pi/128])/Sqrt[2] = 0.99729045667869 // 4821/32768 = (Sin[35*Pi/128] - Cos[35*Pi/128])*Sqrt[2] = 0.14712912719933 // 30269/32768 = Cos[35*Pi/128]*Sqrt[2] = 0.92372589307902 let (te, th) = RotateAdd::half_kernel::<15, 15, 15>(te, thh, (32679, 4821, 30269)); // 16379/16384 = (Sin[33*Pi/128] + Cos[33*Pi/128])/Sqrt[2] = 0.99969881869620 // 201/4096 = (Sin[33*Pi/128] - Cos[33*Pi/128])*Sqrt[2] = 0.04908245704582 // 15977/16384 = Cos[33*Pi/128]*Sqrt[2] = 0.97515759017329 let (tg, tf) = RotateSub::half_kernel::<14, 12, 14>(tg, tfh, (16379, 201, 15977)); // Stage 1 let (t0, tfh) = butterfly_add(t0, tf); let (tv, tgh) = butterfly_sub(tv, tg); let (th, tuh) = butterfly_add(th, tu); let (te, t1h) = butterfly_sub(te, t1); let (t2, tdh) = butterfly_add(t2, td); let (tt, tih) = butterfly_sub(tt, ti); let (tj, tsh) = butterfly_add(tj, ts); let (tc, t3h) = butterfly_sub(tc, t3); let (t4, tbh) = butterfly_add(t4, tb); let (tr, tkh) = butterfly_sub(tr, tk); let (tl, tqh) = butterfly_add(tl, tq); let (ta, t5h) = butterfly_sub(ta, t5); let (t6, t9h) = butterfly_add(t6, t9); let (tp, tmh) = butterfly_sub(tp, tm); let (tn, toh) = butterfly_add(tn, to); let (t8, t7h) = butterfly_sub(t8, t7); // Stage 2 let (t0, t7) = butterfly_sub_asym(t0, t7h); let (tv, to) = butterfly_add_asym(tv, toh); let (tp, tu) = butterfly_sub_asym(tp, tuh); let (t6, t1) = butterfly_add_asym(t6, t1h); let (t2, t5) = butterfly_sub_asym(t2, t5h); let (tt, tq) = butterfly_add_asym(tt, tqh); let (tr, ts) = butterfly_sub_asym(tr, tsh); let (t4, t3) = butterfly_add_asym(t4, t3h); let (t8, tg) = butterfly_add_asym(t8, tgh); let (te, tm) = butterfly_sub_asym(te, tmh); let (tn, tf) = butterfly_add_asym(tn, tfh); let (th, t9) = butterfly_sub_asym(th, t9h); let (ta, ti) = butterfly_add_asym(ta, tih); let (tc, tk) = butterfly_sub_asym(tc, tkh); let (tl, td) = butterfly_add_asym(tl, tdh); let (tj, tb) = butterfly_sub_asym(tj, tbh); // Stage 3 // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 // 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 let (tf, tg) = RotateSub::kernel::<14, 14, 13>(tf, tg, (17911, 14699, 803)); // 10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712 // 5461/8192 = Sin[13*Pi/32] - 
Cos[13*Pi/32] = 0.6666556584777465 // 1189/4096 = Cos[13*Pi/32] = 0.2902846772544623 let (th, te) = RotateAdd::kernel::<13, 13, 12>(th, te, (10217, 5461, 1189)); // 5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 // 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 // 7723/16384 = Cos[11*Pi/32] = 0.4713967368259976 let (ti, td) = RotateAdd::kernel::<12, 13, 14>(ti, td, (5543, 3363, 7723)); // 11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 // 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 // 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 let (tc, tj) = RotateSub::kernel::<13, 14, 13>(tc, tj, (11529, 2271, 5197)); // 11529/8192 = Sin[9*Pi/32] + Cos[9*Pi/32] = 1.4074037375263826 // 2271/16384 = Sin[9*Pi/32] - Cos[9*Pi/32] = 0.1386171691990915 // 5197/8192 = Cos[9*Pi/32] = 0.6343932841636455 let (tb, tk) = RotateNeg::kernel::<13, 14, 13>(tb, tk, (11529, 2271, 5197)); // 5543/4096 = Sin[11*Pi/32] + Cos[11*Pi/32] = 1.3533180011743526 // 3363/8192 = Sin[11*Pi/32] - Cos[11*Pi/32] = 0.4105245275223574 // 7723/16384 = Cos[11*Pi/32] = 0.4713967368259976 let (ta, tl) = RotateNeg::kernel::<12, 13, 14>(ta, tl, (5543, 3363, 7723)); // 10217/8192 = Sin[13*Pi/32] + Cos[13*Pi/32] = 1.2472250129866712 // 5461/8192 = Sin[13*Pi/32] - Cos[13*Pi/32] = 0.6666556584777465 // 1189/4096 = Cos[13*Pi/32] = 0.2902846772544623 let (t9, tm) = RotateNeg::kernel::<13, 13, 12>(t9, tm, (10217, 5461, 1189)); // 17911/16384 = Sin[15*Pi/32] + Cos[15*Pi/32] = 1.0932018670017576 // 14699/16384 = Sin[15*Pi/32] - Cos[15*Pi/32] = 0.8971675863426363 // 803/8192 = Cos[15*Pi/32] = 0.0980171403295606 let (t8, tn) = RotateNeg::kernel::<14, 14, 13>(t8, tn, (17911, 14699, 803)); // Stage 4 let (t3, t0h) = butterfly_sub(t3, t0); let (ts, tvh) = butterfly_add(ts, tv); let (tu, tth) = butterfly_sub(tu, tt); let (t1, t2h) = butterfly_add(t1, t2); let ((_toh, to), t4h) = butterfly_add(to, t4); let ((_tqh, tq), t6h) = butterfly_sub(tq, t6); let ((_t7h, t7), trh) = butterfly_add(t7, tr); let ((_t5h, t5), tph) = butterfly_sub(t5, tp); let (tb, t8h) = butterfly_sub(tb, t8); let (tk, tnh) = butterfly_add(tk, tn); let (tm, tlh) = butterfly_sub(tm, tl); let (t9, tah) = butterfly_add(t9, ta); let (tf, tch) = butterfly_sub(tf, tc); let (tg, tjh) = butterfly_add(tg, tj); let (ti, thh) = butterfly_sub(ti, th); let (td, teh) = butterfly_add(td, te); // Stage 5 // 301/256 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 // 1609/2048 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 let (to, t7) = RotateAdd::kernel::<8, 11, 15>(to, t7, (301, 1609, 6393)); // 11363/8192 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 let (tph, t6h) = RotateAdd::kernel::<13, 15, 13>(tph, t6h, (11363, 9041, 4551)); // 5681/4096 = Sin[5*Pi/16] + Cos[5*Pi/16] = 1.3870398453221475 // 9041/32768 = Sin[5*Pi/16] - Cos[5*Pi/16] = 0.2758993792829431 // 4551/8192 = Cos[5*Pi/16] = 0.5555702330196022 let (t5, tq) = RotateNeg::kernel::<12, 15, 13>(t5, tq, (5681, 9041, 4551)); // 9633/8192 = Sin[7*Pi/16] + Cos[7*Pi/16] = 1.1758756024193586 // 12873/16384 = Sin[7*Pi/16] - Cos[7*Pi/16] = 0.7856949583871022 // 6393/32768 = Cos[7*Pi/16] = 0.1950903220161283 let (t4h, trh) = RotateNeg::kernel::<13, 14, 15>(t4h, trh, (9633, 12873, 6393)); // Stage 6 let (t1, t0) = butterfly_add_asym(t1, t0h); let (tu, tv) = butterfly_sub_asym(tu, tvh); let (ts, t2) = butterfly_sub_asym(ts, 
t2h); let (t3, tt) = butterfly_sub_asym(t3, tth); let (t5, t4) = butterfly_add_asym((t5.rshift1(), t5), t4h); let (tq, tr) = butterfly_sub_asym((tq.rshift1(), tq), trh); let (t7, t6) = butterfly_add_asym((t7.rshift1(), t7), t6h); let (to, tp) = butterfly_sub_asym((to.rshift1(), to), tph); let (t9, t8) = butterfly_add_asym(t9, t8h); let (tm, tn) = butterfly_sub_asym(tm, tnh); let (tk, ta) = butterfly_sub_asym(tk, tah); let (tb, tl) = butterfly_sub_asym(tb, tlh); let (ti, tc) = butterfly_add_asym(ti, tch); let (td, tj) = butterfly_add_asym(td, tjh); let (tf, te) = butterfly_add_asym(tf, teh); let (tg, th) = butterfly_sub_asym(tg, thh); // Stage 7 // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (t2, tt) = RotateNeg::kernel::<9, 14, 13>(t2, tt, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (ts, t3) = RotateAdd::kernel::<9, 14, 13>(ts, t3, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (ta, tl) = RotateNeg::kernel::<9, 14, 13>(ta, tl, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (tk, tb) = RotateAdd::kernel::<9, 14, 13>(tk, tb, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (tc, tj) = RotateAdd::kernel::<9, 14, 13>(tc, tj, (669, 8867, 3135)); // 669/512 = Sin[3*Pi/8] + Cos[3*Pi/8] = 1.3065629648763766 // 8867/16384 = Sin[3*Pi/8] - Cos[3*Pi/8] = 0.5411961001461969 // 3135/8192 = Cos[3*Pi/8] = 0.3826834323650898 let (ti, td) = RotateNeg::kernel::<9, 14, 13>(ti, td, (669, 8867, 3135)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (tu, t1) = RotatePi4Add::kernel::<12, 13>(tu, t1, (5793, 5793)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (tq, t5) = RotatePi4Add::kernel::<12, 13>(tq, t5, (5793, 5793)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (tp, t6) = RotatePi4Sub::kernel::<12, 13>(tp, t6, (5793, 5793)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (tm, t9) = RotatePi4Add::kernel::<12, 13>(tm, t9, (5793, 5793)); // 5793/4096 = Sin[Pi/4] + Cos[Pi/4] = 1.4142135623730951 // 5793/8192 = Cos[Pi/4] = 0.7071067811865475 let (te, th) = RotatePi4Add::kernel::<12, 13>(te, th, (5793, 5793)); store_coeffs!( output, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, ta, tb, tc, td, te, tf, tg, th, ti, tj, tk, tl, tm, tn, to, tp, tq, tr, ts, tt, tu, tv ); } #[allow(clippy::identity_op)] #[$m] $($s)* fn daala_fdct64(coeffs: &mut [T]) { assert!(coeffs.len() >= 64); // Use arrays to avoid ridiculous variable names let mut asym: [(T, T); 32] = [(T::zero(), T::zero()); 32]; let mut half: [T; 32] = [T::zero(); 32]; // +/- Butterflies with asymmetric output. 
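// Overview note (a hedged reading of the code below): `butterfly_pair` pairs
// input[j] with input[63 - j], collecting the halved butterfly outputs in
// `half` and the (halved, full) pairs in `asym`; these feed the 32-point
// asymmetric DCT-II / DST-IV. `reorder_4` then writes the results back four at
// a time in the same bit-reversed ordering as the smaller transforms, e.g.
// `reorder_4(coeffs, 1, temp_out, 8)` stores
//
//     coeffs[4..8] = [temp_out[8], temp_out[40], temp_out[24], temp_out[56]]
//
// and 8, 40, 24, 56 are the 6-bit bit reversals of 4, 5, 6, 7.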
{ #[$m] #[inline] $($s)* fn butterfly_pair( half: &mut [T; 32], asym: &mut [(T, T); 32], input: &[T], i: usize ) { let j = i * 2; let (ah, c) = butterfly_neg(input[j], input[63 - j]); let (b, dh) = butterfly_add(input[j + 1], input[63 - j - 1]); half[i] = ah; half[31 - i] = dh; asym[i] = b; asym[31 - i] = c; } butterfly_pair(&mut half, &mut asym, coeffs, 0); butterfly_pair(&mut half, &mut asym, coeffs, 1); butterfly_pair(&mut half, &mut asym, coeffs, 2); butterfly_pair(&mut half, &mut asym, coeffs, 3); butterfly_pair(&mut half, &mut asym, coeffs, 4); butterfly_pair(&mut half, &mut asym, coeffs, 5); butterfly_pair(&mut half, &mut asym, coeffs, 6); butterfly_pair(&mut half, &mut asym, coeffs, 7); butterfly_pair(&mut half, &mut asym, coeffs, 8); butterfly_pair(&mut half, &mut asym, coeffs, 9); butterfly_pair(&mut half, &mut asym, coeffs, 10); butterfly_pair(&mut half, &mut asym, coeffs, 11); butterfly_pair(&mut half, &mut asym, coeffs, 12); butterfly_pair(&mut half, &mut asym, coeffs, 13); butterfly_pair(&mut half, &mut asym, coeffs, 14); butterfly_pair(&mut half, &mut asym, coeffs, 15); } let mut temp_out: [T; 64] = [T::zero(); 64]; // Embedded 2-point transforms with asymmetric input. daala_fdct_ii_32_asym( half[0], asym[0], half[1], asym[1], half[2], asym[2], half[3], asym[3], half[4], asym[4], half[5], asym[5], half[6], asym[6], half[7], asym[7], half[8], asym[8], half[9], asym[9], half[10], asym[10], half[11], asym[11], half[12], asym[12], half[13], asym[13], half[14], asym[14], half[15], asym[15], &mut temp_out[0..32], ); daala_fdst_iv_32_asym( asym[31], half[31], asym[30], half[30], asym[29], half[29], asym[28], half[28], asym[27], half[27], asym[26], half[26], asym[25], half[25], asym[24], half[24], asym[23], half[23], asym[22], half[22], asym[21], half[21], asym[20], half[20], asym[19], half[19], asym[18], half[18], asym[17], half[17], asym[16], half[16], &mut temp_out[32..64], ); temp_out[32..64].reverse(); // Store a reordered version of output in temp_out #[$m] #[inline] $($s)* fn reorder_4( output: &mut [T], i: usize, tmp: [T; 64], j: usize ) { output[0 + i * 4] = tmp[0 + j]; output[1 + i * 4] = tmp[32 + j]; output[2 + i * 4] = tmp[16 + j]; output[3 + i * 4] = tmp[48 + j]; } reorder_4(coeffs, 0, temp_out, 0); reorder_4(coeffs, 1, temp_out, 8); reorder_4(coeffs, 2, temp_out, 4); reorder_4(coeffs, 3, temp_out, 12); reorder_4(coeffs, 4, temp_out, 2); reorder_4(coeffs, 5, temp_out, 10); reorder_4(coeffs, 6, temp_out, 6); reorder_4(coeffs, 7, temp_out, 14); reorder_4(coeffs, 8, temp_out, 1); reorder_4(coeffs, 9, temp_out, 9); reorder_4(coeffs, 10, temp_out, 5); reorder_4(coeffs, 11, temp_out, 13); reorder_4(coeffs, 12, temp_out, 3); reorder_4(coeffs, 13, temp_out, 11); reorder_4(coeffs, 14, temp_out, 7); reorder_4(coeffs, 15, temp_out, 15); } #[$m] $($s)* fn fidentity(_coeffs: &mut [T]) {} #[$m] $($s)* fn fwht4(coeffs: &mut [T]) { assert!(coeffs.len() >= 4); let x0 = coeffs[0]; let x1 = coeffs[1]; let x2 = coeffs[2]; let x3 = coeffs[3]; let s0 = x0.add(x1); let s1 = x3.sub(x2); let s2 = s0.sub_avg(s1); let q1 = s2.sub(x2); let q0 = s0.sub(q1); let q3 = s2.sub(x1); let q2 = s1.add(q3); store_coeffs!(coeffs, q0, q1, q2, q3); } } } rav1e-0.7.1/src/transform/inverse.rs000064400000000000000000002032011046102023000154700ustar 00000000000000// Copyright (c) 2018-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. cfg_if::cfg_if! { if #[cfg(nasm_x86_64)] { pub use crate::asm::x86::transform::inverse::*; } else if #[cfg(asm_neon)] { pub use crate::asm::aarch64::transform::inverse::*; } else { pub use self::rust::*; } } use crate::tiling::PlaneRegionMut; use crate::util::*; // TODO: move 1d txfm code to rust module. use super::clamp_value; use super::consts::*; use super::get_1d_tx_types; use super::get_rect_tx_log_ratio; use super::half_btf; use super::TxSize; use super::TxType; /// # Panics /// /// - If `input` or `output` have fewer than 4 items. pub fn av1_iwht4(input: &[i32], output: &mut [i32], _range: usize) { assert!(input.len() >= 4); assert!(output.len() >= 4); // let x0 = input[0]; let x1 = input[1]; let x2 = input[2]; let x3 = input[3]; let s0 = x0 + x1; let s2 = x2 - x3; let s4 = (s0 - s2) >> 1; let s3 = s4 - x3; let s1 = s4 - x1; output[0] = s0 - s3; output[1] = s3; output[2] = s1; output[3] = s2 + s1; } static COSPI_INV: [i32; 64] = [ 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973, 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564, 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896, 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019, 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995, 897, 799, 700, 601, 501, 401, 301, 201, 101, ]; static SINPI_INV: [i32; 5] = [0, 1321, 2482, 3344, 3803]; const INV_COS_BIT: usize = 12; /// # Panics /// /// - If `input` or `output` have fewer than 4 items. pub fn av1_idct4(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 4); assert!(output.len() >= 4); // stage 1 let stg1 = [input[0], input[2], input[1], input[3]]; // stage 2 let stg2 = [ half_btf(COSPI_INV[32], stg1[0], COSPI_INV[32], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[32], stg1[0], -COSPI_INV[32], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[48], stg1[2], -COSPI_INV[16], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[16], stg1[2], COSPI_INV[48], stg1[3], INV_COS_BIT), ]; // stage 3 output[0] = clamp_value(stg2[0] + stg2[3], range); output[1] = clamp_value(stg2[1] + stg2[2], range); output[2] = clamp_value(stg2[1] - stg2[2], range); output[3] = clamp_value(stg2[0] - stg2[3], range); } pub fn av1_iflipadst4(input: &[i32], output: &mut [i32], range: usize) { av1_iadst4(input, output, range); output[..4].reverse(); } /// # Panics /// /// - If `input` or `output` have fewer than 4 items. 
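// Background note (hedged): COSPI_INV[i] above is, to within rounding, the
// 12-bit fixed-point value 4096 * Cos[i*Pi/128] (INV_COS_BIT == 12), e.g.
// COSPI_INV[32] == 2896 ~= 4096 * Cos[Pi/4]. `half_btf`, defined elsewhere in
// this crate, is assumed here to be the usual half butterfly,
//
//     half_btf(w0, in0, w1, in1, bit) ~= round_shift(w0 * in0 + w1 * in1, bit)
//
// so a stage-2 line in av1_idct4 above such as
// `half_btf(COSPI_INV[32], stg1[0], COSPI_INV[32], stg1[1], INV_COS_BIT)`
// works out to roughly (stg1[0] + stg1[1]) * Cos[Pi/4].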
#[inline(always)] pub fn av1_iadst4(input: &[i32], output: &mut [i32], _range: usize) { assert!(input.len() >= 4); assert!(output.len() >= 4); let bit = 12; let x0 = input[0]; let x1 = input[1]; let x2 = input[2]; let x3 = input[3]; // stage 1 let s0 = SINPI_INV[1] * x0; let s1 = SINPI_INV[2] * x0; let s2 = SINPI_INV[3] * x1; let s3 = SINPI_INV[4] * x2; let s4 = SINPI_INV[1] * x2; let s5 = SINPI_INV[2] * x3; let s6 = SINPI_INV[4] * x3; // stage 2 let s7 = (x0 - x2) + x3; // stage 3 let s0 = s0 + s3; let s1 = s1 - s4; let s3 = s2; let s2 = SINPI_INV[3] * s7; // stage 4 let s0 = s0 + s5; let s1 = s1 - s6; // stage 5 let x0 = s0 + s3; let x1 = s1 + s3; let x2 = s2; let x3 = s0 + s1; // stage 6 let x3 = x3 - s3; output[0] = round_shift(x0, bit); output[1] = round_shift(x1, bit); output[2] = round_shift(x2, bit); output[3] = round_shift(x3, bit); } pub fn av1_iidentity4(input: &[i32], output: &mut [i32], _range: usize) { output[..4] .iter_mut() .zip(input[..4].iter()) .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * *inp, 12)); } /// # Panics /// /// - If `input` or `output` have fewer than 8 items. pub fn av1_idct8(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 8); assert!(output.len() >= 8); // call idct4 let temp_in = [input[0], input[2], input[4], input[6]]; let mut temp_out: [i32; 4] = [0; 4]; av1_idct4(&temp_in, &mut temp_out, range); // stage 0 // stage 1 let stg1 = [input[1], input[5], input[3], input[7]]; // stage 2 let stg2 = [ half_btf(COSPI_INV[56], stg1[0], -COSPI_INV[8], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[24], stg1[1], -COSPI_INV[40], stg1[2], INV_COS_BIT), half_btf(COSPI_INV[40], stg1[1], COSPI_INV[24], stg1[2], INV_COS_BIT), half_btf(COSPI_INV[8], stg1[0], COSPI_INV[56], stg1[3], INV_COS_BIT), ]; // stage 3 let stg3 = [ clamp_value(stg2[0] + stg2[1], range), clamp_value(stg2[0] - stg2[1], range), clamp_value(-stg2[2] + stg2[3], range), clamp_value(stg2[2] + stg2[3], range), ]; // stage 4 let stg4 = [ stg3[0], half_btf(-COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT), half_btf(COSPI_INV[32], stg3[1], COSPI_INV[32], stg3[2], INV_COS_BIT), stg3[3], ]; // stage 5 output[0] = clamp_value(temp_out[0] + stg4[3], range); output[1] = clamp_value(temp_out[1] + stg4[2], range); output[2] = clamp_value(temp_out[2] + stg4[1], range); output[3] = clamp_value(temp_out[3] + stg4[0], range); output[4] = clamp_value(temp_out[3] - stg4[0], range); output[5] = clamp_value(temp_out[2] - stg4[1], range); output[6] = clamp_value(temp_out[1] - stg4[2], range); output[7] = clamp_value(temp_out[0] - stg4[3], range); } pub fn av1_iflipadst8(input: &[i32], output: &mut [i32], range: usize) { av1_iadst8(input, output, range); output[..8].reverse(); } /// # Panics /// /// - If `input` or `output` have fewer than 8 items. 
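// Scaling note (hedged): the av1_iidentity* helpers only rescale. Assuming the
// crate's SQRT2 constant is the usual 12-bit value 5793 ~= 4096 * Sqrt[2] and
// that `round_shift` rounds to nearest, av1_iidentity4 above scales each
// coefficient by ~Sqrt[2], av1_iidentity8 below scales by exactly 2, and
// av1_iidentity16 scales by ~2*Sqrt[2]:
//
//     // round_shift(5793 * 100, 12) == 141 ~= 100 * Sqrt[2]
//
// i.e. the Sqrt[2], 2, 2*Sqrt[2] ladder visible in the three implementations.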
#[inline(always)] pub fn av1_iadst8(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 8); assert!(output.len() >= 8); // stage 1 let stg1 = [ input[7], input[0], input[5], input[2], input[3], input[4], input[1], input[6], ]; // stage 2 let stg2 = [ half_btf(COSPI_INV[4], stg1[0], COSPI_INV[60], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[36], stg1[4], COSPI_INV[28], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[28], stg1[4], -COSPI_INV[36], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[52], stg1[6], COSPI_INV[12], stg1[7], INV_COS_BIT), half_btf(COSPI_INV[12], stg1[6], -COSPI_INV[52], stg1[7], INV_COS_BIT), ]; // stage 3 let stg3 = [ clamp_value(stg2[0] + stg2[4], range), clamp_value(stg2[1] + stg2[5], range), clamp_value(stg2[2] + stg2[6], range), clamp_value(stg2[3] + stg2[7], range), clamp_value(stg2[0] - stg2[4], range), clamp_value(stg2[1] - stg2[5], range), clamp_value(stg2[2] - stg2[6], range), clamp_value(stg2[3] - stg2[7], range), ]; // stage 4 let stg4 = [ stg3[0], stg3[1], stg3[2], stg3[3], half_btf(COSPI_INV[16], stg3[4], COSPI_INV[48], stg3[5], INV_COS_BIT), half_btf(COSPI_INV[48], stg3[4], -COSPI_INV[16], stg3[5], INV_COS_BIT), half_btf(-COSPI_INV[48], stg3[6], COSPI_INV[16], stg3[7], INV_COS_BIT), half_btf(COSPI_INV[16], stg3[6], COSPI_INV[48], stg3[7], INV_COS_BIT), ]; // stage 5 let stg5 = [ clamp_value(stg4[0] + stg4[2], range), clamp_value(stg4[1] + stg4[3], range), clamp_value(stg4[0] - stg4[2], range), clamp_value(stg4[1] - stg4[3], range), clamp_value(stg4[4] + stg4[6], range), clamp_value(stg4[5] + stg4[7], range), clamp_value(stg4[4] - stg4[6], range), clamp_value(stg4[5] - stg4[7], range), ]; // stage 6 let stg6 = [ stg5[0], stg5[1], half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[3], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[2], -COSPI_INV[32], stg5[3], INV_COS_BIT), stg5[4], stg5[5], half_btf(COSPI_INV[32], stg5[6], COSPI_INV[32], stg5[7], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[6], -COSPI_INV[32], stg5[7], INV_COS_BIT), ]; // stage 7 output[0] = stg6[0]; output[1] = -stg6[4]; output[2] = stg6[6]; output[3] = -stg6[2]; output[4] = stg6[3]; output[5] = -stg6[7]; output[6] = stg6[5]; output[7] = -stg6[1]; } pub fn av1_iidentity8(input: &[i32], output: &mut [i32], _range: usize) { output[..8] .iter_mut() .zip(input[..8].iter()) .for_each(|(outp, inp)| *outp = 2 * *inp); } fn av1_idct16(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 16); assert!(output.len() >= 16); // call idct8 let temp_in = [ input[0], input[2], input[4], input[6], input[8], input[10], input[12], input[14], ]; let mut temp_out: [i32; 8] = [0; 8]; av1_idct8(&temp_in, &mut temp_out, range); // stage 1 let stg1 = [ input[1], input[9], input[5], input[13], input[3], input[11], input[7], input[15], ]; // stage 2 let stg2 = [ half_btf(COSPI_INV[60], stg1[0], -COSPI_INV[4], stg1[7], INV_COS_BIT), half_btf(COSPI_INV[28], stg1[1], -COSPI_INV[36], stg1[6], INV_COS_BIT), half_btf(COSPI_INV[44], stg1[2], -COSPI_INV[20], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[12], stg1[3], -COSPI_INV[52], stg1[4], INV_COS_BIT), half_btf(COSPI_INV[52], stg1[3], COSPI_INV[12], stg1[4], INV_COS_BIT), half_btf(COSPI_INV[20], stg1[2], COSPI_INV[44], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[36], stg1[1], COSPI_INV[28], stg1[6], INV_COS_BIT), half_btf(COSPI_INV[4], 
stg1[0], COSPI_INV[60], stg1[7], INV_COS_BIT), ]; // stage 3 let stg3 = [ clamp_value(stg2[0] + stg2[1], range), clamp_value(stg2[0] - stg2[1], range), clamp_value(-stg2[2] + stg2[3], range), clamp_value(stg2[2] + stg2[3], range), clamp_value(stg2[4] + stg2[5], range), clamp_value(stg2[4] - stg2[5], range), clamp_value(-stg2[6] + stg2[7], range), clamp_value(stg2[6] + stg2[7], range), ]; // stage 4 let stg4 = [ stg3[0], half_btf(-COSPI_INV[16], stg3[1], COSPI_INV[48], stg3[6], INV_COS_BIT), half_btf(-COSPI_INV[48], stg3[2], -COSPI_INV[16], stg3[5], INV_COS_BIT), stg3[3], stg3[4], half_btf(-COSPI_INV[16], stg3[2], COSPI_INV[48], stg3[5], INV_COS_BIT), half_btf(COSPI_INV[48], stg3[1], COSPI_INV[16], stg3[6], INV_COS_BIT), stg3[7], ]; // stage 5 let stg5 = [ clamp_value(stg4[0] + stg4[3], range), clamp_value(stg4[1] + stg4[2], range), clamp_value(stg4[1] - stg4[2], range), clamp_value(stg4[0] - stg4[3], range), clamp_value(-stg4[4] + stg4[7], range), clamp_value(-stg4[5] + stg4[6], range), clamp_value(stg4[5] + stg4[6], range), clamp_value(stg4[4] + stg4[7], range), ]; // stage 6 let stg6 = [ stg5[0], stg5[1], half_btf(-COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT), half_btf(-COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[3], COSPI_INV[32], stg5[4], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[2], COSPI_INV[32], stg5[5], INV_COS_BIT), stg5[6], stg5[7], ]; // stage 7 output[0] = clamp_value(temp_out[0] + stg6[7], range); output[1] = clamp_value(temp_out[1] + stg6[6], range); output[2] = clamp_value(temp_out[2] + stg6[5], range); output[3] = clamp_value(temp_out[3] + stg6[4], range); output[4] = clamp_value(temp_out[4] + stg6[3], range); output[5] = clamp_value(temp_out[5] + stg6[2], range); output[6] = clamp_value(temp_out[6] + stg6[1], range); output[7] = clamp_value(temp_out[7] + stg6[0], range); output[8] = clamp_value(temp_out[7] - stg6[0], range); output[9] = clamp_value(temp_out[6] - stg6[1], range); output[10] = clamp_value(temp_out[5] - stg6[2], range); output[11] = clamp_value(temp_out[4] - stg6[3], range); output[12] = clamp_value(temp_out[3] - stg6[4], range); output[13] = clamp_value(temp_out[2] - stg6[5], range); output[14] = clamp_value(temp_out[1] - stg6[6], range); output[15] = clamp_value(temp_out[0] - stg6[7], range); } pub fn av1_iflipadst16(input: &[i32], output: &mut [i32], range: usize) { av1_iadst16(input, output, range); output[..16].reverse(); } #[inline(always)] fn av1_iadst16(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 16); assert!(output.len() >= 16); // stage 1 let stg1 = [ input[15], input[0], input[13], input[2], input[11], input[4], input[9], input[6], input[7], input[8], input[5], input[10], input[3], input[12], input[1], input[14], ]; // stage 2 let stg2 = [ half_btf(COSPI_INV[2], stg1[0], COSPI_INV[62], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[62], stg1[0], -COSPI_INV[2], stg1[1], INV_COS_BIT), half_btf(COSPI_INV[10], stg1[2], COSPI_INV[54], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[54], stg1[2], -COSPI_INV[10], stg1[3], INV_COS_BIT), half_btf(COSPI_INV[18], stg1[4], COSPI_INV[46], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[46], stg1[4], -COSPI_INV[18], stg1[5], INV_COS_BIT), half_btf(COSPI_INV[26], stg1[6], COSPI_INV[38], stg1[7], INV_COS_BIT), half_btf(COSPI_INV[38], stg1[6], -COSPI_INV[26], stg1[7], INV_COS_BIT), half_btf(COSPI_INV[34], stg1[8], COSPI_INV[30], stg1[9], INV_COS_BIT), half_btf(COSPI_INV[30], stg1[8], -COSPI_INV[34], stg1[9], INV_COS_BIT), 
half_btf(COSPI_INV[42], stg1[10], COSPI_INV[22], stg1[11], INV_COS_BIT), half_btf(COSPI_INV[22], stg1[10], -COSPI_INV[42], stg1[11], INV_COS_BIT), half_btf(COSPI_INV[50], stg1[12], COSPI_INV[14], stg1[13], INV_COS_BIT), half_btf(COSPI_INV[14], stg1[12], -COSPI_INV[50], stg1[13], INV_COS_BIT), half_btf(COSPI_INV[58], stg1[14], COSPI_INV[6], stg1[15], INV_COS_BIT), half_btf(COSPI_INV[6], stg1[14], -COSPI_INV[58], stg1[15], INV_COS_BIT), ]; // stage 3 let stg3 = [ clamp_value(stg2[0] + stg2[8], range), clamp_value(stg2[1] + stg2[9], range), clamp_value(stg2[2] + stg2[10], range), clamp_value(stg2[3] + stg2[11], range), clamp_value(stg2[4] + stg2[12], range), clamp_value(stg2[5] + stg2[13], range), clamp_value(stg2[6] + stg2[14], range), clamp_value(stg2[7] + stg2[15], range), clamp_value(stg2[0] - stg2[8], range), clamp_value(stg2[1] - stg2[9], range), clamp_value(stg2[2] - stg2[10], range), clamp_value(stg2[3] - stg2[11], range), clamp_value(stg2[4] - stg2[12], range), clamp_value(stg2[5] - stg2[13], range), clamp_value(stg2[6] - stg2[14], range), clamp_value(stg2[7] - stg2[15], range), ]; // stage 4 let stg4 = [ stg3[0], stg3[1], stg3[2], stg3[3], stg3[4], stg3[5], stg3[6], stg3[7], half_btf(COSPI_INV[8], stg3[8], COSPI_INV[56], stg3[9], INV_COS_BIT), half_btf(COSPI_INV[56], stg3[8], -COSPI_INV[8], stg3[9], INV_COS_BIT), half_btf(COSPI_INV[40], stg3[10], COSPI_INV[24], stg3[11], INV_COS_BIT), half_btf(COSPI_INV[24], stg3[10], -COSPI_INV[40], stg3[11], INV_COS_BIT), half_btf(-COSPI_INV[56], stg3[12], COSPI_INV[8], stg3[13], INV_COS_BIT), half_btf(COSPI_INV[8], stg3[12], COSPI_INV[56], stg3[13], INV_COS_BIT), half_btf(-COSPI_INV[24], stg3[14], COSPI_INV[40], stg3[15], INV_COS_BIT), half_btf(COSPI_INV[40], stg3[14], COSPI_INV[24], stg3[15], INV_COS_BIT), ]; // stage 5 let stg5 = [ clamp_value(stg4[0] + stg4[4], range), clamp_value(stg4[1] + stg4[5], range), clamp_value(stg4[2] + stg4[6], range), clamp_value(stg4[3] + stg4[7], range), clamp_value(stg4[0] - stg4[4], range), clamp_value(stg4[1] - stg4[5], range), clamp_value(stg4[2] - stg4[6], range), clamp_value(stg4[3] - stg4[7], range), clamp_value(stg4[8] + stg4[12], range), clamp_value(stg4[9] + stg4[13], range), clamp_value(stg4[10] + stg4[14], range), clamp_value(stg4[11] + stg4[15], range), clamp_value(stg4[8] - stg4[12], range), clamp_value(stg4[9] - stg4[13], range), clamp_value(stg4[10] - stg4[14], range), clamp_value(stg4[11] - stg4[15], range), ]; // stage 6 let stg6 = [ stg5[0], stg5[1], stg5[2], stg5[3], half_btf(COSPI_INV[16], stg5[4], COSPI_INV[48], stg5[5], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[4], -COSPI_INV[16], stg5[5], INV_COS_BIT), half_btf(-COSPI_INV[48], stg5[6], COSPI_INV[16], stg5[7], INV_COS_BIT), half_btf(COSPI_INV[16], stg5[6], COSPI_INV[48], stg5[7], INV_COS_BIT), stg5[8], stg5[9], stg5[10], stg5[11], half_btf(COSPI_INV[16], stg5[12], COSPI_INV[48], stg5[13], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[12], -COSPI_INV[16], stg5[13], INV_COS_BIT), half_btf(-COSPI_INV[48], stg5[14], COSPI_INV[16], stg5[15], INV_COS_BIT), half_btf(COSPI_INV[16], stg5[14], COSPI_INV[48], stg5[15], INV_COS_BIT), ]; // stage 7 let stg7 = [ clamp_value(stg6[0] + stg6[2], range), clamp_value(stg6[1] + stg6[3], range), clamp_value(stg6[0] - stg6[2], range), clamp_value(stg6[1] - stg6[3], range), clamp_value(stg6[4] + stg6[6], range), clamp_value(stg6[5] + stg6[7], range), clamp_value(stg6[4] - stg6[6], range), clamp_value(stg6[5] - stg6[7], range), clamp_value(stg6[8] + stg6[10], range), clamp_value(stg6[9] + stg6[11], range), 
clamp_value(stg6[8] - stg6[10], range), clamp_value(stg6[9] - stg6[11], range), clamp_value(stg6[12] + stg6[14], range), clamp_value(stg6[13] + stg6[15], range), clamp_value(stg6[12] - stg6[14], range), clamp_value(stg6[13] - stg6[15], range), ]; // stage 8 let stg8 = [ stg7[0], stg7[1], half_btf(COSPI_INV[32], stg7[2], COSPI_INV[32], stg7[3], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[2], -COSPI_INV[32], stg7[3], INV_COS_BIT), stg7[4], stg7[5], half_btf(COSPI_INV[32], stg7[6], COSPI_INV[32], stg7[7], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[6], -COSPI_INV[32], stg7[7], INV_COS_BIT), stg7[8], stg7[9], half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[11], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[10], -COSPI_INV[32], stg7[11], INV_COS_BIT), stg7[12], stg7[13], half_btf(COSPI_INV[32], stg7[14], COSPI_INV[32], stg7[15], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[14], -COSPI_INV[32], stg7[15], INV_COS_BIT), ]; // stage 9 output[0] = stg8[0]; output[1] = -stg8[8]; output[2] = stg8[12]; output[3] = -stg8[4]; output[4] = stg8[6]; output[5] = -stg8[14]; output[6] = stg8[10]; output[7] = -stg8[2]; output[8] = stg8[3]; output[9] = -stg8[11]; output[10] = stg8[15]; output[11] = -stg8[7]; output[12] = stg8[5]; output[13] = -stg8[13]; output[14] = stg8[9]; output[15] = -stg8[1]; } fn av1_iidentity16(input: &[i32], output: &mut [i32], _range: usize) { output[..16] .iter_mut() .zip(input[..16].iter()) .for_each(|(outp, inp)| *outp = round_shift(SQRT2 * 2 * *inp, 12)); } fn av1_idct32(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 32); assert!(output.len() >= 32); // stage 1; let stg1 = [ input[0], input[16], input[8], input[24], input[4], input[20], input[12], input[28], input[2], input[18], input[10], input[26], input[6], input[22], input[14], input[30], input[1], input[17], input[9], input[25], input[5], input[21], input[13], input[29], input[3], input[19], input[11], input[27], input[7], input[23], input[15], input[31], ]; // stage 2 let stg2 = [ stg1[0], stg1[1], stg1[2], stg1[3], stg1[4], stg1[5], stg1[6], stg1[7], stg1[8], stg1[9], stg1[10], stg1[11], stg1[12], stg1[13], stg1[14], stg1[15], half_btf(COSPI_INV[62], stg1[16], -COSPI_INV[2], stg1[31], INV_COS_BIT), half_btf(COSPI_INV[30], stg1[17], -COSPI_INV[34], stg1[30], INV_COS_BIT), half_btf(COSPI_INV[46], stg1[18], -COSPI_INV[18], stg1[29], INV_COS_BIT), half_btf(COSPI_INV[14], stg1[19], -COSPI_INV[50], stg1[28], INV_COS_BIT), half_btf(COSPI_INV[54], stg1[20], -COSPI_INV[10], stg1[27], INV_COS_BIT), half_btf(COSPI_INV[22], stg1[21], -COSPI_INV[42], stg1[26], INV_COS_BIT), half_btf(COSPI_INV[38], stg1[22], -COSPI_INV[26], stg1[25], INV_COS_BIT), half_btf(COSPI_INV[6], stg1[23], -COSPI_INV[58], stg1[24], INV_COS_BIT), half_btf(COSPI_INV[58], stg1[23], COSPI_INV[6], stg1[24], INV_COS_BIT), half_btf(COSPI_INV[26], stg1[22], COSPI_INV[38], stg1[25], INV_COS_BIT), half_btf(COSPI_INV[42], stg1[21], COSPI_INV[22], stg1[26], INV_COS_BIT), half_btf(COSPI_INV[10], stg1[20], COSPI_INV[54], stg1[27], INV_COS_BIT), half_btf(COSPI_INV[50], stg1[19], COSPI_INV[14], stg1[28], INV_COS_BIT), half_btf(COSPI_INV[18], stg1[18], COSPI_INV[46], stg1[29], INV_COS_BIT), half_btf(COSPI_INV[34], stg1[17], COSPI_INV[30], stg1[30], INV_COS_BIT), half_btf(COSPI_INV[2], stg1[16], COSPI_INV[62], stg1[31], INV_COS_BIT), ]; // stage 3 let stg3 = [ stg2[0], stg2[1], stg2[2], stg2[3], stg2[4], stg2[5], stg2[6], stg2[7], half_btf(COSPI_INV[60], stg2[8], -COSPI_INV[4], stg2[15], INV_COS_BIT), half_btf(COSPI_INV[28], stg2[9], -COSPI_INV[36], 
stg2[14], INV_COS_BIT), half_btf(COSPI_INV[44], stg2[10], -COSPI_INV[20], stg2[13], INV_COS_BIT), half_btf(COSPI_INV[12], stg2[11], -COSPI_INV[52], stg2[12], INV_COS_BIT), half_btf(COSPI_INV[52], stg2[11], COSPI_INV[12], stg2[12], INV_COS_BIT), half_btf(COSPI_INV[20], stg2[10], COSPI_INV[44], stg2[13], INV_COS_BIT), half_btf(COSPI_INV[36], stg2[9], COSPI_INV[28], stg2[14], INV_COS_BIT), half_btf(COSPI_INV[4], stg2[8], COSPI_INV[60], stg2[15], INV_COS_BIT), clamp_value(stg2[16] + stg2[17], range), clamp_value(stg2[16] - stg2[17], range), clamp_value(-stg2[18] + stg2[19], range), clamp_value(stg2[18] + stg2[19], range), clamp_value(stg2[20] + stg2[21], range), clamp_value(stg2[20] - stg2[21], range), clamp_value(-stg2[22] + stg2[23], range), clamp_value(stg2[22] + stg2[23], range), clamp_value(stg2[24] + stg2[25], range), clamp_value(stg2[24] - stg2[25], range), clamp_value(-stg2[26] + stg2[27], range), clamp_value(stg2[26] + stg2[27], range), clamp_value(stg2[28] + stg2[29], range), clamp_value(stg2[28] - stg2[29], range), clamp_value(-stg2[30] + stg2[31], range), clamp_value(stg2[30] + stg2[31], range), ]; // stage 4 let stg4 = [ stg3[0], stg3[1], stg3[2], stg3[3], half_btf(COSPI_INV[56], stg3[4], -COSPI_INV[8], stg3[7], INV_COS_BIT), half_btf(COSPI_INV[24], stg3[5], -COSPI_INV[40], stg3[6], INV_COS_BIT), half_btf(COSPI_INV[40], stg3[5], COSPI_INV[24], stg3[6], INV_COS_BIT), half_btf(COSPI_INV[8], stg3[4], COSPI_INV[56], stg3[7], INV_COS_BIT), clamp_value(stg3[8] + stg3[9], range), clamp_value(stg3[8] - stg3[9], range), clamp_value(-stg3[10] + stg3[11], range), clamp_value(stg3[10] + stg3[11], range), clamp_value(stg3[12] + stg3[13], range), clamp_value(stg3[12] - stg3[13], range), clamp_value(-stg3[14] + stg3[15], range), clamp_value(stg3[14] + stg3[15], range), stg3[16], half_btf(-COSPI_INV[8], stg3[17], COSPI_INV[56], stg3[30], INV_COS_BIT), half_btf(-COSPI_INV[56], stg3[18], -COSPI_INV[8], stg3[29], INV_COS_BIT), stg3[19], stg3[20], half_btf(-COSPI_INV[40], stg3[21], COSPI_INV[24], stg3[26], INV_COS_BIT), half_btf(-COSPI_INV[24], stg3[22], -COSPI_INV[40], stg3[25], INV_COS_BIT), stg3[23], stg3[24], half_btf(-COSPI_INV[40], stg3[22], COSPI_INV[24], stg3[25], INV_COS_BIT), half_btf(COSPI_INV[24], stg3[21], COSPI_INV[40], stg3[26], INV_COS_BIT), stg3[27], stg3[28], half_btf(-COSPI_INV[8], stg3[18], COSPI_INV[56], stg3[29], INV_COS_BIT), half_btf(COSPI_INV[56], stg3[17], COSPI_INV[8], stg3[30], INV_COS_BIT), stg3[31], ]; // stage 5 let stg5 = [ half_btf(COSPI_INV[32], stg4[0], COSPI_INV[32], stg4[1], INV_COS_BIT), half_btf(COSPI_INV[32], stg4[0], -COSPI_INV[32], stg4[1], INV_COS_BIT), half_btf(COSPI_INV[48], stg4[2], -COSPI_INV[16], stg4[3], INV_COS_BIT), half_btf(COSPI_INV[16], stg4[2], COSPI_INV[48], stg4[3], INV_COS_BIT), clamp_value(stg4[4] + stg4[5], range), clamp_value(stg4[4] - stg4[5], range), clamp_value(-stg4[6] + stg4[7], range), clamp_value(stg4[6] + stg4[7], range), stg4[8], half_btf(-COSPI_INV[16], stg4[9], COSPI_INV[48], stg4[14], INV_COS_BIT), half_btf(-COSPI_INV[48], stg4[10], -COSPI_INV[16], stg4[13], INV_COS_BIT), stg4[11], stg4[12], half_btf(-COSPI_INV[16], stg4[10], COSPI_INV[48], stg4[13], INV_COS_BIT), half_btf(COSPI_INV[48], stg4[9], COSPI_INV[16], stg4[14], INV_COS_BIT), stg4[15], clamp_value(stg4[16] + stg4[19], range), clamp_value(stg4[17] + stg4[18], range), clamp_value(stg4[17] - stg4[18], range), clamp_value(stg4[16] - stg4[19], range), clamp_value(-stg4[20] + stg4[23], range), clamp_value(-stg4[21] + stg4[22], range), clamp_value(stg4[21] + stg4[22], range), 
clamp_value(stg4[20] + stg4[23], range), clamp_value(stg4[24] + stg4[27], range), clamp_value(stg4[25] + stg4[26], range), clamp_value(stg4[25] - stg4[26], range), clamp_value(stg4[24] - stg4[27], range), clamp_value(-stg4[28] + stg4[31], range), clamp_value(-stg4[29] + stg4[30], range), clamp_value(stg4[29] + stg4[30], range), clamp_value(stg4[28] + stg4[31], range), ]; // stage 6 let stg6 = [ clamp_value(stg5[0] + stg5[3], range), clamp_value(stg5[1] + stg5[2], range), clamp_value(stg5[1] - stg5[2], range), clamp_value(stg5[0] - stg5[3], range), stg5[4], half_btf(-COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[5], COSPI_INV[32], stg5[6], INV_COS_BIT), stg5[7], clamp_value(stg5[8] + stg5[11], range), clamp_value(stg5[9] + stg5[10], range), clamp_value(stg5[9] - stg5[10], range), clamp_value(stg5[8] - stg5[11], range), clamp_value(-stg5[12] + stg5[15], range), clamp_value(-stg5[13] + stg5[14], range), clamp_value(stg5[13] + stg5[14], range), clamp_value(stg5[12] + stg5[15], range), stg5[16], stg5[17], half_btf(-COSPI_INV[16], stg5[18], COSPI_INV[48], stg5[29], INV_COS_BIT), half_btf(-COSPI_INV[16], stg5[19], COSPI_INV[48], stg5[28], INV_COS_BIT), half_btf(-COSPI_INV[48], stg5[20], -COSPI_INV[16], stg5[27], INV_COS_BIT), half_btf(-COSPI_INV[48], stg5[21], -COSPI_INV[16], stg5[26], INV_COS_BIT), stg5[22], stg5[23], stg5[24], stg5[25], half_btf(-COSPI_INV[16], stg5[21], COSPI_INV[48], stg5[26], INV_COS_BIT), half_btf(-COSPI_INV[16], stg5[20], COSPI_INV[48], stg5[27], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[19], COSPI_INV[16], stg5[28], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[18], COSPI_INV[16], stg5[29], INV_COS_BIT), stg5[30], stg5[31], ]; // stage 7 let stg7 = [ clamp_value(stg6[0] + stg6[7], range), clamp_value(stg6[1] + stg6[6], range), clamp_value(stg6[2] + stg6[5], range), clamp_value(stg6[3] + stg6[4], range), clamp_value(stg6[3] - stg6[4], range), clamp_value(stg6[2] - stg6[5], range), clamp_value(stg6[1] - stg6[6], range), clamp_value(stg6[0] - stg6[7], range), stg6[8], stg6[9], half_btf(-COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT), half_btf(-COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT), half_btf(COSPI_INV[32], stg6[11], COSPI_INV[32], stg6[12], INV_COS_BIT), half_btf(COSPI_INV[32], stg6[10], COSPI_INV[32], stg6[13], INV_COS_BIT), stg6[14], stg6[15], clamp_value(stg6[16] + stg6[23], range), clamp_value(stg6[17] + stg6[22], range), clamp_value(stg6[18] + stg6[21], range), clamp_value(stg6[19] + stg6[20], range), clamp_value(stg6[19] - stg6[20], range), clamp_value(stg6[18] - stg6[21], range), clamp_value(stg6[17] - stg6[22], range), clamp_value(stg6[16] - stg6[23], range), clamp_value(-stg6[24] + stg6[31], range), clamp_value(-stg6[25] + stg6[30], range), clamp_value(-stg6[26] + stg6[29], range), clamp_value(-stg6[27] + stg6[28], range), clamp_value(stg6[27] + stg6[28], range), clamp_value(stg6[26] + stg6[29], range), clamp_value(stg6[25] + stg6[30], range), clamp_value(stg6[24] + stg6[31], range), ]; // stage 8 let stg8 = [ clamp_value(stg7[0] + stg7[15], range), clamp_value(stg7[1] + stg7[14], range), clamp_value(stg7[2] + stg7[13], range), clamp_value(stg7[3] + stg7[12], range), clamp_value(stg7[4] + stg7[11], range), clamp_value(stg7[5] + stg7[10], range), clamp_value(stg7[6] + stg7[9], range), clamp_value(stg7[7] + stg7[8], range), clamp_value(stg7[7] - stg7[8], range), clamp_value(stg7[6] - stg7[9], range), clamp_value(stg7[5] - stg7[10], range), clamp_value(stg7[4] - stg7[11], range), 
clamp_value(stg7[3] - stg7[12], range), clamp_value(stg7[2] - stg7[13], range), clamp_value(stg7[1] - stg7[14], range), clamp_value(stg7[0] - stg7[15], range), stg7[16], stg7[17], stg7[18], stg7[19], half_btf(-COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT), half_btf(-COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT), half_btf(-COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT), half_btf(-COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[23], COSPI_INV[32], stg7[24], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[22], COSPI_INV[32], stg7[25], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[21], COSPI_INV[32], stg7[26], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[20], COSPI_INV[32], stg7[27], INV_COS_BIT), stg7[28], stg7[29], stg7[30], stg7[31], ]; // stage 9 output[0] = clamp_value(stg8[0] + stg8[31], range); output[1] = clamp_value(stg8[1] + stg8[30], range); output[2] = clamp_value(stg8[2] + stg8[29], range); output[3] = clamp_value(stg8[3] + stg8[28], range); output[4] = clamp_value(stg8[4] + stg8[27], range); output[5] = clamp_value(stg8[5] + stg8[26], range); output[6] = clamp_value(stg8[6] + stg8[25], range); output[7] = clamp_value(stg8[7] + stg8[24], range); output[8] = clamp_value(stg8[8] + stg8[23], range); output[9] = clamp_value(stg8[9] + stg8[22], range); output[10] = clamp_value(stg8[10] + stg8[21], range); output[11] = clamp_value(stg8[11] + stg8[20], range); output[12] = clamp_value(stg8[12] + stg8[19], range); output[13] = clamp_value(stg8[13] + stg8[18], range); output[14] = clamp_value(stg8[14] + stg8[17], range); output[15] = clamp_value(stg8[15] + stg8[16], range); output[16] = clamp_value(stg8[15] - stg8[16], range); output[17] = clamp_value(stg8[14] - stg8[17], range); output[18] = clamp_value(stg8[13] - stg8[18], range); output[19] = clamp_value(stg8[12] - stg8[19], range); output[20] = clamp_value(stg8[11] - stg8[20], range); output[21] = clamp_value(stg8[10] - stg8[21], range); output[22] = clamp_value(stg8[9] - stg8[22], range); output[23] = clamp_value(stg8[8] - stg8[23], range); output[24] = clamp_value(stg8[7] - stg8[24], range); output[25] = clamp_value(stg8[6] - stg8[25], range); output[26] = clamp_value(stg8[5] - stg8[26], range); output[27] = clamp_value(stg8[4] - stg8[27], range); output[28] = clamp_value(stg8[3] - stg8[28], range); output[29] = clamp_value(stg8[2] - stg8[29], range); output[30] = clamp_value(stg8[1] - stg8[30], range); output[31] = clamp_value(stg8[0] - stg8[31], range); } fn av1_iidentity32(input: &[i32], output: &mut [i32], _range: usize) { output[..32] .iter_mut() .zip(input[..32].iter()) .for_each(|(outp, inp)| *outp = 4 * *inp); } fn av1_idct64(input: &[i32], output: &mut [i32], range: usize) { assert!(input.len() >= 64); assert!(output.len() >= 64); // stage 1; let stg1 = [ input[0], input[32], input[16], input[48], input[8], input[40], input[24], input[56], input[4], input[36], input[20], input[52], input[12], input[44], input[28], input[60], input[2], input[34], input[18], input[50], input[10], input[42], input[26], input[58], input[6], input[38], input[22], input[54], input[14], input[46], input[30], input[62], input[1], input[33], input[17], input[49], input[9], input[41], input[25], input[57], input[5], input[37], input[21], input[53], input[13], input[45], input[29], input[61], input[3], input[35], input[19], input[51], input[11], input[43], input[27], input[59], input[7], input[39], input[23], input[55], input[15], input[47], input[31], input[63], ]; // 
stage 2 let stg2 = [ stg1[0], stg1[1], stg1[2], stg1[3], stg1[4], stg1[5], stg1[6], stg1[7], stg1[8], stg1[9], stg1[10], stg1[11], stg1[12], stg1[13], stg1[14], stg1[15], stg1[16], stg1[17], stg1[18], stg1[19], stg1[20], stg1[21], stg1[22], stg1[23], stg1[24], stg1[25], stg1[26], stg1[27], stg1[28], stg1[29], stg1[30], stg1[31], half_btf(COSPI_INV[63], stg1[32], -COSPI_INV[1], stg1[63], INV_COS_BIT), half_btf(COSPI_INV[31], stg1[33], -COSPI_INV[33], stg1[62], INV_COS_BIT), half_btf(COSPI_INV[47], stg1[34], -COSPI_INV[17], stg1[61], INV_COS_BIT), half_btf(COSPI_INV[15], stg1[35], -COSPI_INV[49], stg1[60], INV_COS_BIT), half_btf(COSPI_INV[55], stg1[36], -COSPI_INV[9], stg1[59], INV_COS_BIT), half_btf(COSPI_INV[23], stg1[37], -COSPI_INV[41], stg1[58], INV_COS_BIT), half_btf(COSPI_INV[39], stg1[38], -COSPI_INV[25], stg1[57], INV_COS_BIT), half_btf(COSPI_INV[7], stg1[39], -COSPI_INV[57], stg1[56], INV_COS_BIT), half_btf(COSPI_INV[59], stg1[40], -COSPI_INV[5], stg1[55], INV_COS_BIT), half_btf(COSPI_INV[27], stg1[41], -COSPI_INV[37], stg1[54], INV_COS_BIT), half_btf(COSPI_INV[43], stg1[42], -COSPI_INV[21], stg1[53], INV_COS_BIT), half_btf(COSPI_INV[11], stg1[43], -COSPI_INV[53], stg1[52], INV_COS_BIT), half_btf(COSPI_INV[51], stg1[44], -COSPI_INV[13], stg1[51], INV_COS_BIT), half_btf(COSPI_INV[19], stg1[45], -COSPI_INV[45], stg1[50], INV_COS_BIT), half_btf(COSPI_INV[35], stg1[46], -COSPI_INV[29], stg1[49], INV_COS_BIT), half_btf(COSPI_INV[3], stg1[47], -COSPI_INV[61], stg1[48], INV_COS_BIT), half_btf(COSPI_INV[61], stg1[47], COSPI_INV[3], stg1[48], INV_COS_BIT), half_btf(COSPI_INV[29], stg1[46], COSPI_INV[35], stg1[49], INV_COS_BIT), half_btf(COSPI_INV[45], stg1[45], COSPI_INV[19], stg1[50], INV_COS_BIT), half_btf(COSPI_INV[13], stg1[44], COSPI_INV[51], stg1[51], INV_COS_BIT), half_btf(COSPI_INV[53], stg1[43], COSPI_INV[11], stg1[52], INV_COS_BIT), half_btf(COSPI_INV[21], stg1[42], COSPI_INV[43], stg1[53], INV_COS_BIT), half_btf(COSPI_INV[37], stg1[41], COSPI_INV[27], stg1[54], INV_COS_BIT), half_btf(COSPI_INV[5], stg1[40], COSPI_INV[59], stg1[55], INV_COS_BIT), half_btf(COSPI_INV[57], stg1[39], COSPI_INV[7], stg1[56], INV_COS_BIT), half_btf(COSPI_INV[25], stg1[38], COSPI_INV[39], stg1[57], INV_COS_BIT), half_btf(COSPI_INV[41], stg1[37], COSPI_INV[23], stg1[58], INV_COS_BIT), half_btf(COSPI_INV[9], stg1[36], COSPI_INV[55], stg1[59], INV_COS_BIT), half_btf(COSPI_INV[49], stg1[35], COSPI_INV[15], stg1[60], INV_COS_BIT), half_btf(COSPI_INV[17], stg1[34], COSPI_INV[47], stg1[61], INV_COS_BIT), half_btf(COSPI_INV[33], stg1[33], COSPI_INV[31], stg1[62], INV_COS_BIT), half_btf(COSPI_INV[1], stg1[32], COSPI_INV[63], stg1[63], INV_COS_BIT), ]; // stage 3 let stg3 = [ stg2[0], stg2[1], stg2[2], stg2[3], stg2[4], stg2[5], stg2[6], stg2[7], stg2[8], stg2[9], stg2[10], stg2[11], stg2[12], stg2[13], stg2[14], stg2[15], half_btf(COSPI_INV[62], stg2[16], -COSPI_INV[2], stg2[31], INV_COS_BIT), half_btf(COSPI_INV[30], stg2[17], -COSPI_INV[34], stg2[30], INV_COS_BIT), half_btf(COSPI_INV[46], stg2[18], -COSPI_INV[18], stg2[29], INV_COS_BIT), half_btf(COSPI_INV[14], stg2[19], -COSPI_INV[50], stg2[28], INV_COS_BIT), half_btf(COSPI_INV[54], stg2[20], -COSPI_INV[10], stg2[27], INV_COS_BIT), half_btf(COSPI_INV[22], stg2[21], -COSPI_INV[42], stg2[26], INV_COS_BIT), half_btf(COSPI_INV[38], stg2[22], -COSPI_INV[26], stg2[25], INV_COS_BIT), half_btf(COSPI_INV[6], stg2[23], -COSPI_INV[58], stg2[24], INV_COS_BIT), half_btf(COSPI_INV[58], stg2[23], COSPI_INV[6], stg2[24], INV_COS_BIT), half_btf(COSPI_INV[26], stg2[22], 
COSPI_INV[38], stg2[25], INV_COS_BIT), half_btf(COSPI_INV[42], stg2[21], COSPI_INV[22], stg2[26], INV_COS_BIT), half_btf(COSPI_INV[10], stg2[20], COSPI_INV[54], stg2[27], INV_COS_BIT), half_btf(COSPI_INV[50], stg2[19], COSPI_INV[14], stg2[28], INV_COS_BIT), half_btf(COSPI_INV[18], stg2[18], COSPI_INV[46], stg2[29], INV_COS_BIT), half_btf(COSPI_INV[34], stg2[17], COSPI_INV[30], stg2[30], INV_COS_BIT), half_btf(COSPI_INV[2], stg2[16], COSPI_INV[62], stg2[31], INV_COS_BIT), clamp_value(stg2[32] + stg2[33], range), clamp_value(stg2[32] - stg2[33], range), clamp_value(-stg2[34] + stg2[35], range), clamp_value(stg2[34] + stg2[35], range), clamp_value(stg2[36] + stg2[37], range), clamp_value(stg2[36] - stg2[37], range), clamp_value(-stg2[38] + stg2[39], range), clamp_value(stg2[38] + stg2[39], range), clamp_value(stg2[40] + stg2[41], range), clamp_value(stg2[40] - stg2[41], range), clamp_value(-stg2[42] + stg2[43], range), clamp_value(stg2[42] + stg2[43], range), clamp_value(stg2[44] + stg2[45], range), clamp_value(stg2[44] - stg2[45], range), clamp_value(-stg2[46] + stg2[47], range), clamp_value(stg2[46] + stg2[47], range), clamp_value(stg2[48] + stg2[49], range), clamp_value(stg2[48] - stg2[49], range), clamp_value(-stg2[50] + stg2[51], range), clamp_value(stg2[50] + stg2[51], range), clamp_value(stg2[52] + stg2[53], range), clamp_value(stg2[52] - stg2[53], range), clamp_value(-stg2[54] + stg2[55], range), clamp_value(stg2[54] + stg2[55], range), clamp_value(stg2[56] + stg2[57], range), clamp_value(stg2[56] - stg2[57], range), clamp_value(-stg2[58] + stg2[59], range), clamp_value(stg2[58] + stg2[59], range), clamp_value(stg2[60] + stg2[61], range), clamp_value(stg2[60] - stg2[61], range), clamp_value(-stg2[62] + stg2[63], range), clamp_value(stg2[62] + stg2[63], range), ]; // stage 4 let stg4 = [ stg3[0], stg3[1], stg3[2], stg3[3], stg3[4], stg3[5], stg3[6], stg3[7], half_btf(COSPI_INV[60], stg3[8], -COSPI_INV[4], stg3[15], INV_COS_BIT), half_btf(COSPI_INV[28], stg3[9], -COSPI_INV[36], stg3[14], INV_COS_BIT), half_btf(COSPI_INV[44], stg3[10], -COSPI_INV[20], stg3[13], INV_COS_BIT), half_btf(COSPI_INV[12], stg3[11], -COSPI_INV[52], stg3[12], INV_COS_BIT), half_btf(COSPI_INV[52], stg3[11], COSPI_INV[12], stg3[12], INV_COS_BIT), half_btf(COSPI_INV[20], stg3[10], COSPI_INV[44], stg3[13], INV_COS_BIT), half_btf(COSPI_INV[36], stg3[9], COSPI_INV[28], stg3[14], INV_COS_BIT), half_btf(COSPI_INV[4], stg3[8], COSPI_INV[60], stg3[15], INV_COS_BIT), clamp_value(stg3[16] + stg3[17], range), clamp_value(stg3[16] - stg3[17], range), clamp_value(-stg3[18] + stg3[19], range), clamp_value(stg3[18] + stg3[19], range), clamp_value(stg3[20] + stg3[21], range), clamp_value(stg3[20] - stg3[21], range), clamp_value(-stg3[22] + stg3[23], range), clamp_value(stg3[22] + stg3[23], range), clamp_value(stg3[24] + stg3[25], range), clamp_value(stg3[24] - stg3[25], range), clamp_value(-stg3[26] + stg3[27], range), clamp_value(stg3[26] + stg3[27], range), clamp_value(stg3[28] + stg3[29], range), clamp_value(stg3[28] - stg3[29], range), clamp_value(-stg3[30] + stg3[31], range), clamp_value(stg3[30] + stg3[31], range), stg3[32], half_btf(-COSPI_INV[4], stg3[33], COSPI_INV[60], stg3[62], INV_COS_BIT), half_btf(-COSPI_INV[60], stg3[34], -COSPI_INV[4], stg3[61], INV_COS_BIT), stg3[35], stg3[36], half_btf(-COSPI_INV[36], stg3[37], COSPI_INV[28], stg3[58], INV_COS_BIT), half_btf(-COSPI_INV[28], stg3[38], -COSPI_INV[36], stg3[57], INV_COS_BIT), stg3[39], stg3[40], half_btf(-COSPI_INV[20], stg3[41], COSPI_INV[44], stg3[54], 
INV_COS_BIT), half_btf(-COSPI_INV[44], stg3[42], -COSPI_INV[20], stg3[53], INV_COS_BIT), stg3[43], stg3[44], half_btf(-COSPI_INV[52], stg3[45], COSPI_INV[12], stg3[50], INV_COS_BIT), half_btf(-COSPI_INV[12], stg3[46], -COSPI_INV[52], stg3[49], INV_COS_BIT), stg3[47], stg3[48], half_btf(-COSPI_INV[52], stg3[46], COSPI_INV[12], stg3[49], INV_COS_BIT), half_btf(COSPI_INV[12], stg3[45], COSPI_INV[52], stg3[50], INV_COS_BIT), stg3[51], stg3[52], half_btf(-COSPI_INV[20], stg3[42], COSPI_INV[44], stg3[53], INV_COS_BIT), half_btf(COSPI_INV[44], stg3[41], COSPI_INV[20], stg3[54], INV_COS_BIT), stg3[55], stg3[56], half_btf(-COSPI_INV[36], stg3[38], COSPI_INV[28], stg3[57], INV_COS_BIT), half_btf(COSPI_INV[28], stg3[37], COSPI_INV[36], stg3[58], INV_COS_BIT), stg3[59], stg3[60], half_btf(-COSPI_INV[4], stg3[34], COSPI_INV[60], stg3[61], INV_COS_BIT), half_btf(COSPI_INV[60], stg3[33], COSPI_INV[4], stg3[62], INV_COS_BIT), stg3[63], ]; // stage 5 let stg5 = [ stg4[0], stg4[1], stg4[2], stg4[3], half_btf(COSPI_INV[56], stg4[4], -COSPI_INV[8], stg4[7], INV_COS_BIT), half_btf(COSPI_INV[24], stg4[5], -COSPI_INV[40], stg4[6], INV_COS_BIT), half_btf(COSPI_INV[40], stg4[5], COSPI_INV[24], stg4[6], INV_COS_BIT), half_btf(COSPI_INV[8], stg4[4], COSPI_INV[56], stg4[7], INV_COS_BIT), clamp_value(stg4[8] + stg4[9], range), clamp_value(stg4[8] - stg4[9], range), clamp_value(-stg4[10] + stg4[11], range), clamp_value(stg4[10] + stg4[11], range), clamp_value(stg4[12] + stg4[13], range), clamp_value(stg4[12] - stg4[13], range), clamp_value(-stg4[14] + stg4[15], range), clamp_value(stg4[14] + stg4[15], range), stg4[16], half_btf(-COSPI_INV[8], stg4[17], COSPI_INV[56], stg4[30], INV_COS_BIT), half_btf(-COSPI_INV[56], stg4[18], -COSPI_INV[8], stg4[29], INV_COS_BIT), stg4[19], stg4[20], half_btf(-COSPI_INV[40], stg4[21], COSPI_INV[24], stg4[26], INV_COS_BIT), half_btf(-COSPI_INV[24], stg4[22], -COSPI_INV[40], stg4[25], INV_COS_BIT), stg4[23], stg4[24], half_btf(-COSPI_INV[40], stg4[22], COSPI_INV[24], stg4[25], INV_COS_BIT), half_btf(COSPI_INV[24], stg4[21], COSPI_INV[40], stg4[26], INV_COS_BIT), stg4[27], stg4[28], half_btf(-COSPI_INV[8], stg4[18], COSPI_INV[56], stg4[29], INV_COS_BIT), half_btf(COSPI_INV[56], stg4[17], COSPI_INV[8], stg4[30], INV_COS_BIT), stg4[31], clamp_value(stg4[32] + stg4[35], range), clamp_value(stg4[33] + stg4[34], range), clamp_value(stg4[33] - stg4[34], range), clamp_value(stg4[32] - stg4[35], range), clamp_value(-stg4[36] + stg4[39], range), clamp_value(-stg4[37] + stg4[38], range), clamp_value(stg4[37] + stg4[38], range), clamp_value(stg4[36] + stg4[39], range), clamp_value(stg4[40] + stg4[43], range), clamp_value(stg4[41] + stg4[42], range), clamp_value(stg4[41] - stg4[42], range), clamp_value(stg4[40] - stg4[43], range), clamp_value(-stg4[44] + stg4[47], range), clamp_value(-stg4[45] + stg4[46], range), clamp_value(stg4[45] + stg4[46], range), clamp_value(stg4[44] + stg4[47], range), clamp_value(stg4[48] + stg4[51], range), clamp_value(stg4[49] + stg4[50], range), clamp_value(stg4[49] - stg4[50], range), clamp_value(stg4[48] - stg4[51], range), clamp_value(-stg4[52] + stg4[55], range), clamp_value(-stg4[53] + stg4[54], range), clamp_value(stg4[53] + stg4[54], range), clamp_value(stg4[52] + stg4[55], range), clamp_value(stg4[56] + stg4[59], range), clamp_value(stg4[57] + stg4[58], range), clamp_value(stg4[57] - stg4[58], range), clamp_value(stg4[56] - stg4[59], range), clamp_value(-stg4[60] + stg4[63], range), clamp_value(-stg4[61] + stg4[62], range), clamp_value(stg4[61] + stg4[62], range), 
clamp_value(stg4[60] + stg4[63], range), ]; // stage 6 let stg6 = [ half_btf(COSPI_INV[32], stg5[0], COSPI_INV[32], stg5[1], INV_COS_BIT), half_btf(COSPI_INV[32], stg5[0], -COSPI_INV[32], stg5[1], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[2], -COSPI_INV[16], stg5[3], INV_COS_BIT), half_btf(COSPI_INV[16], stg5[2], COSPI_INV[48], stg5[3], INV_COS_BIT), clamp_value(stg5[4] + stg5[5], range), clamp_value(stg5[4] - stg5[5], range), clamp_value(-stg5[6] + stg5[7], range), clamp_value(stg5[6] + stg5[7], range), stg5[8], half_btf(-COSPI_INV[16], stg5[9], COSPI_INV[48], stg5[14], INV_COS_BIT), half_btf(-COSPI_INV[48], stg5[10], -COSPI_INV[16], stg5[13], INV_COS_BIT), stg5[11], stg5[12], half_btf(-COSPI_INV[16], stg5[10], COSPI_INV[48], stg5[13], INV_COS_BIT), half_btf(COSPI_INV[48], stg5[9], COSPI_INV[16], stg5[14], INV_COS_BIT), stg5[15], clamp_value(stg5[16] + stg5[19], range), clamp_value(stg5[17] + stg5[18], range), clamp_value(stg5[17] - stg5[18], range), clamp_value(stg5[16] - stg5[19], range), clamp_value(-stg5[20] + stg5[23], range), clamp_value(-stg5[21] + stg5[22], range), clamp_value(stg5[21] + stg5[22], range), clamp_value(stg5[20] + stg5[23], range), clamp_value(stg5[24] + stg5[27], range), clamp_value(stg5[25] + stg5[26], range), clamp_value(stg5[25] - stg5[26], range), clamp_value(stg5[24] - stg5[27], range), clamp_value(-stg5[28] + stg5[31], range), clamp_value(-stg5[29] + stg5[30], range), clamp_value(stg5[29] + stg5[30], range), clamp_value(stg5[28] + stg5[31], range), stg5[32], stg5[33], half_btf(-COSPI_INV[8], stg5[34], COSPI_INV[56], stg5[61], INV_COS_BIT), half_btf(-COSPI_INV[8], stg5[35], COSPI_INV[56], stg5[60], INV_COS_BIT), half_btf(-COSPI_INV[56], stg5[36], -COSPI_INV[8], stg5[59], INV_COS_BIT), half_btf(-COSPI_INV[56], stg5[37], -COSPI_INV[8], stg5[58], INV_COS_BIT), stg5[38], stg5[39], stg5[40], stg5[41], half_btf(-COSPI_INV[40], stg5[42], COSPI_INV[24], stg5[53], INV_COS_BIT), half_btf(-COSPI_INV[40], stg5[43], COSPI_INV[24], stg5[52], INV_COS_BIT), half_btf(-COSPI_INV[24], stg5[44], -COSPI_INV[40], stg5[51], INV_COS_BIT), half_btf(-COSPI_INV[24], stg5[45], -COSPI_INV[40], stg5[50], INV_COS_BIT), stg5[46], stg5[47], stg5[48], stg5[49], half_btf(-COSPI_INV[40], stg5[45], COSPI_INV[24], stg5[50], INV_COS_BIT), half_btf(-COSPI_INV[40], stg5[44], COSPI_INV[24], stg5[51], INV_COS_BIT), half_btf(COSPI_INV[24], stg5[43], COSPI_INV[40], stg5[52], INV_COS_BIT), half_btf(COSPI_INV[24], stg5[42], COSPI_INV[40], stg5[53], INV_COS_BIT), stg5[54], stg5[55], stg5[56], stg5[57], half_btf(-COSPI_INV[8], stg5[37], COSPI_INV[56], stg5[58], INV_COS_BIT), half_btf(-COSPI_INV[8], stg5[36], COSPI_INV[56], stg5[59], INV_COS_BIT), half_btf(COSPI_INV[56], stg5[35], COSPI_INV[8], stg5[60], INV_COS_BIT), half_btf(COSPI_INV[56], stg5[34], COSPI_INV[8], stg5[61], INV_COS_BIT), stg5[62], stg5[63], ]; // stage 7 let stg7 = [ clamp_value(stg6[0] + stg6[3], range), clamp_value(stg6[1] + stg6[2], range), clamp_value(stg6[1] - stg6[2], range), clamp_value(stg6[0] - stg6[3], range), stg6[4], half_btf(-COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT), half_btf(COSPI_INV[32], stg6[5], COSPI_INV[32], stg6[6], INV_COS_BIT), stg6[7], clamp_value(stg6[8] + stg6[11], range), clamp_value(stg6[9] + stg6[10], range), clamp_value(stg6[9] - stg6[10], range), clamp_value(stg6[8] - stg6[11], range), clamp_value(-stg6[12] + stg6[15], range), clamp_value(-stg6[13] + stg6[14], range), clamp_value(stg6[13] + stg6[14], range), clamp_value(stg6[12] + stg6[15], range), stg6[16], stg6[17], half_btf(-COSPI_INV[16], 
stg6[18], COSPI_INV[48], stg6[29], INV_COS_BIT), half_btf(-COSPI_INV[16], stg6[19], COSPI_INV[48], stg6[28], INV_COS_BIT), half_btf(-COSPI_INV[48], stg6[20], -COSPI_INV[16], stg6[27], INV_COS_BIT), half_btf(-COSPI_INV[48], stg6[21], -COSPI_INV[16], stg6[26], INV_COS_BIT), stg6[22], stg6[23], stg6[24], stg6[25], half_btf(-COSPI_INV[16], stg6[21], COSPI_INV[48], stg6[26], INV_COS_BIT), half_btf(-COSPI_INV[16], stg6[20], COSPI_INV[48], stg6[27], INV_COS_BIT), half_btf(COSPI_INV[48], stg6[19], COSPI_INV[16], stg6[28], INV_COS_BIT), half_btf(COSPI_INV[48], stg6[18], COSPI_INV[16], stg6[29], INV_COS_BIT), stg6[30], stg6[31], clamp_value(stg6[32] + stg6[39], range), clamp_value(stg6[33] + stg6[38], range), clamp_value(stg6[34] + stg6[37], range), clamp_value(stg6[35] + stg6[36], range), clamp_value(stg6[35] - stg6[36], range), clamp_value(stg6[34] - stg6[37], range), clamp_value(stg6[33] - stg6[38], range), clamp_value(stg6[32] - stg6[39], range), clamp_value(-stg6[40] + stg6[47], range), clamp_value(-stg6[41] + stg6[46], range), clamp_value(-stg6[42] + stg6[45], range), clamp_value(-stg6[43] + stg6[44], range), clamp_value(stg6[43] + stg6[44], range), clamp_value(stg6[42] + stg6[45], range), clamp_value(stg6[41] + stg6[46], range), clamp_value(stg6[40] + stg6[47], range), clamp_value(stg6[48] + stg6[55], range), clamp_value(stg6[49] + stg6[54], range), clamp_value(stg6[50] + stg6[53], range), clamp_value(stg6[51] + stg6[52], range), clamp_value(stg6[51] - stg6[52], range), clamp_value(stg6[50] - stg6[53], range), clamp_value(stg6[49] - stg6[54], range), clamp_value(stg6[48] - stg6[55], range), clamp_value(-stg6[56] + stg6[63], range), clamp_value(-stg6[57] + stg6[62], range), clamp_value(-stg6[58] + stg6[61], range), clamp_value(-stg6[59] + stg6[60], range), clamp_value(stg6[59] + stg6[60], range), clamp_value(stg6[58] + stg6[61], range), clamp_value(stg6[57] + stg6[62], range), clamp_value(stg6[56] + stg6[63], range), ]; // stage 8 let stg8 = [ clamp_value(stg7[0] + stg7[7], range), clamp_value(stg7[1] + stg7[6], range), clamp_value(stg7[2] + stg7[5], range), clamp_value(stg7[3] + stg7[4], range), clamp_value(stg7[3] - stg7[4], range), clamp_value(stg7[2] - stg7[5], range), clamp_value(stg7[1] - stg7[6], range), clamp_value(stg7[0] - stg7[7], range), stg7[8], stg7[9], half_btf(-COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT), half_btf(-COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[11], COSPI_INV[32], stg7[12], INV_COS_BIT), half_btf(COSPI_INV[32], stg7[10], COSPI_INV[32], stg7[13], INV_COS_BIT), stg7[14], stg7[15], clamp_value(stg7[16] + stg7[23], range), clamp_value(stg7[17] + stg7[22], range), clamp_value(stg7[18] + stg7[21], range), clamp_value(stg7[19] + stg7[20], range), clamp_value(stg7[19] - stg7[20], range), clamp_value(stg7[18] - stg7[21], range), clamp_value(stg7[17] - stg7[22], range), clamp_value(stg7[16] - stg7[23], range), clamp_value(-stg7[24] + stg7[31], range), clamp_value(-stg7[25] + stg7[30], range), clamp_value(-stg7[26] + stg7[29], range), clamp_value(-stg7[27] + stg7[28], range), clamp_value(stg7[27] + stg7[28], range), clamp_value(stg7[26] + stg7[29], range), clamp_value(stg7[25] + stg7[30], range), clamp_value(stg7[24] + stg7[31], range), stg7[32], stg7[33], stg7[34], stg7[35], half_btf(-COSPI_INV[16], stg7[36], COSPI_INV[48], stg7[59], INV_COS_BIT), half_btf(-COSPI_INV[16], stg7[37], COSPI_INV[48], stg7[58], INV_COS_BIT), half_btf(-COSPI_INV[16], stg7[38], COSPI_INV[48], stg7[57], INV_COS_BIT), 
half_btf(-COSPI_INV[16], stg7[39], COSPI_INV[48], stg7[56], INV_COS_BIT), half_btf(-COSPI_INV[48], stg7[40], -COSPI_INV[16], stg7[55], INV_COS_BIT), half_btf(-COSPI_INV[48], stg7[41], -COSPI_INV[16], stg7[54], INV_COS_BIT), half_btf(-COSPI_INV[48], stg7[42], -COSPI_INV[16], stg7[53], INV_COS_BIT), half_btf(-COSPI_INV[48], stg7[43], -COSPI_INV[16], stg7[52], INV_COS_BIT), stg7[44], stg7[45], stg7[46], stg7[47], stg7[48], stg7[49], stg7[50], stg7[51], half_btf(-COSPI_INV[16], stg7[43], COSPI_INV[48], stg7[52], INV_COS_BIT), half_btf(-COSPI_INV[16], stg7[42], COSPI_INV[48], stg7[53], INV_COS_BIT), half_btf(-COSPI_INV[16], stg7[41], COSPI_INV[48], stg7[54], INV_COS_BIT), half_btf(-COSPI_INV[16], stg7[40], COSPI_INV[48], stg7[55], INV_COS_BIT), half_btf(COSPI_INV[48], stg7[39], COSPI_INV[16], stg7[56], INV_COS_BIT), half_btf(COSPI_INV[48], stg7[38], COSPI_INV[16], stg7[57], INV_COS_BIT), half_btf(COSPI_INV[48], stg7[37], COSPI_INV[16], stg7[58], INV_COS_BIT), half_btf(COSPI_INV[48], stg7[36], COSPI_INV[16], stg7[59], INV_COS_BIT), stg7[60], stg7[61], stg7[62], stg7[63], ]; // stage 9 let stg9 = [ clamp_value(stg8[0] + stg8[15], range), clamp_value(stg8[1] + stg8[14], range), clamp_value(stg8[2] + stg8[13], range), clamp_value(stg8[3] + stg8[12], range), clamp_value(stg8[4] + stg8[11], range), clamp_value(stg8[5] + stg8[10], range), clamp_value(stg8[6] + stg8[9], range), clamp_value(stg8[7] + stg8[8], range), clamp_value(stg8[7] - stg8[8], range), clamp_value(stg8[6] - stg8[9], range), clamp_value(stg8[5] - stg8[10], range), clamp_value(stg8[4] - stg8[11], range), clamp_value(stg8[3] - stg8[12], range), clamp_value(stg8[2] - stg8[13], range), clamp_value(stg8[1] - stg8[14], range), clamp_value(stg8[0] - stg8[15], range), stg8[16], stg8[17], stg8[18], stg8[19], half_btf(-COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT), half_btf(-COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT), half_btf(-COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT), half_btf(-COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT), half_btf(COSPI_INV[32], stg8[23], COSPI_INV[32], stg8[24], INV_COS_BIT), half_btf(COSPI_INV[32], stg8[22], COSPI_INV[32], stg8[25], INV_COS_BIT), half_btf(COSPI_INV[32], stg8[21], COSPI_INV[32], stg8[26], INV_COS_BIT), half_btf(COSPI_INV[32], stg8[20], COSPI_INV[32], stg8[27], INV_COS_BIT), stg8[28], stg8[29], stg8[30], stg8[31], clamp_value(stg8[32] + stg8[47], range), clamp_value(stg8[33] + stg8[46], range), clamp_value(stg8[34] + stg8[45], range), clamp_value(stg8[35] + stg8[44], range), clamp_value(stg8[36] + stg8[43], range), clamp_value(stg8[37] + stg8[42], range), clamp_value(stg8[38] + stg8[41], range), clamp_value(stg8[39] + stg8[40], range), clamp_value(stg8[39] - stg8[40], range), clamp_value(stg8[38] - stg8[41], range), clamp_value(stg8[37] - stg8[42], range), clamp_value(stg8[36] - stg8[43], range), clamp_value(stg8[35] - stg8[44], range), clamp_value(stg8[34] - stg8[45], range), clamp_value(stg8[33] - stg8[46], range), clamp_value(stg8[32] - stg8[47], range), clamp_value(-stg8[48] + stg8[63], range), clamp_value(-stg8[49] + stg8[62], range), clamp_value(-stg8[50] + stg8[61], range), clamp_value(-stg8[51] + stg8[60], range), clamp_value(-stg8[52] + stg8[59], range), clamp_value(-stg8[53] + stg8[58], range), clamp_value(-stg8[54] + stg8[57], range), clamp_value(-stg8[55] + stg8[56], range), clamp_value(stg8[55] + stg8[56], range), clamp_value(stg8[54] + stg8[57], range), clamp_value(stg8[53] + stg8[58], range), clamp_value(stg8[52] + 
stg8[59], range), clamp_value(stg8[51] + stg8[60], range), clamp_value(stg8[50] + stg8[61], range), clamp_value(stg8[49] + stg8[62], range), clamp_value(stg8[48] + stg8[63], range), ]; // stage 10 let stg10 = [ clamp_value(stg9[0] + stg9[31], range), clamp_value(stg9[1] + stg9[30], range), clamp_value(stg9[2] + stg9[29], range), clamp_value(stg9[3] + stg9[28], range), clamp_value(stg9[4] + stg9[27], range), clamp_value(stg9[5] + stg9[26], range), clamp_value(stg9[6] + stg9[25], range), clamp_value(stg9[7] + stg9[24], range), clamp_value(stg9[8] + stg9[23], range), clamp_value(stg9[9] + stg9[22], range), clamp_value(stg9[10] + stg9[21], range), clamp_value(stg9[11] + stg9[20], range), clamp_value(stg9[12] + stg9[19], range), clamp_value(stg9[13] + stg9[18], range), clamp_value(stg9[14] + stg9[17], range), clamp_value(stg9[15] + stg9[16], range), clamp_value(stg9[15] - stg9[16], range), clamp_value(stg9[14] - stg9[17], range), clamp_value(stg9[13] - stg9[18], range), clamp_value(stg9[12] - stg9[19], range), clamp_value(stg9[11] - stg9[20], range), clamp_value(stg9[10] - stg9[21], range), clamp_value(stg9[9] - stg9[22], range), clamp_value(stg9[8] - stg9[23], range), clamp_value(stg9[7] - stg9[24], range), clamp_value(stg9[6] - stg9[25], range), clamp_value(stg9[5] - stg9[26], range), clamp_value(stg9[4] - stg9[27], range), clamp_value(stg9[3] - stg9[28], range), clamp_value(stg9[2] - stg9[29], range), clamp_value(stg9[1] - stg9[30], range), clamp_value(stg9[0] - stg9[31], range), stg9[32], stg9[33], stg9[34], stg9[35], stg9[36], stg9[37], stg9[38], stg9[39], half_btf(-COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT), half_btf(-COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[47], COSPI_INV[32], stg9[48], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[46], COSPI_INV[32], stg9[49], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[45], COSPI_INV[32], stg9[50], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[44], COSPI_INV[32], stg9[51], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[43], COSPI_INV[32], stg9[52], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[42], COSPI_INV[32], stg9[53], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[41], COSPI_INV[32], stg9[54], INV_COS_BIT), half_btf(COSPI_INV[32], stg9[40], COSPI_INV[32], stg9[55], INV_COS_BIT), stg9[56], stg9[57], stg9[58], stg9[59], stg9[60], stg9[61], stg9[62], stg9[63], ]; // stage 11 output[0] = clamp_value(stg10[0] + stg10[63], range); output[1] = clamp_value(stg10[1] + stg10[62], range); output[2] = clamp_value(stg10[2] + stg10[61], range); output[3] = clamp_value(stg10[3] + stg10[60], range); output[4] = clamp_value(stg10[4] + stg10[59], range); output[5] = clamp_value(stg10[5] + stg10[58], range); output[6] = clamp_value(stg10[6] + stg10[57], range); output[7] = clamp_value(stg10[7] + stg10[56], range); output[8] = clamp_value(stg10[8] + stg10[55], range); output[9] = clamp_value(stg10[9] + stg10[54], range); output[10] = clamp_value(stg10[10] + stg10[53], range); output[11] = clamp_value(stg10[11] + stg10[52], range); output[12] = clamp_value(stg10[12] + stg10[51], 
range); output[13] = clamp_value(stg10[13] + stg10[50], range); output[14] = clamp_value(stg10[14] + stg10[49], range); output[15] = clamp_value(stg10[15] + stg10[48], range); output[16] = clamp_value(stg10[16] + stg10[47], range); output[17] = clamp_value(stg10[17] + stg10[46], range); output[18] = clamp_value(stg10[18] + stg10[45], range); output[19] = clamp_value(stg10[19] + stg10[44], range); output[20] = clamp_value(stg10[20] + stg10[43], range); output[21] = clamp_value(stg10[21] + stg10[42], range); output[22] = clamp_value(stg10[22] + stg10[41], range); output[23] = clamp_value(stg10[23] + stg10[40], range); output[24] = clamp_value(stg10[24] + stg10[39], range); output[25] = clamp_value(stg10[25] + stg10[38], range); output[26] = clamp_value(stg10[26] + stg10[37], range); output[27] = clamp_value(stg10[27] + stg10[36], range); output[28] = clamp_value(stg10[28] + stg10[35], range); output[29] = clamp_value(stg10[29] + stg10[34], range); output[30] = clamp_value(stg10[30] + stg10[33], range); output[31] = clamp_value(stg10[31] + stg10[32], range); output[32] = clamp_value(stg10[31] - stg10[32], range); output[33] = clamp_value(stg10[30] - stg10[33], range); output[34] = clamp_value(stg10[29] - stg10[34], range); output[35] = clamp_value(stg10[28] - stg10[35], range); output[36] = clamp_value(stg10[27] - stg10[36], range); output[37] = clamp_value(stg10[26] - stg10[37], range); output[38] = clamp_value(stg10[25] - stg10[38], range); output[39] = clamp_value(stg10[24] - stg10[39], range); output[40] = clamp_value(stg10[23] - stg10[40], range); output[41] = clamp_value(stg10[22] - stg10[41], range); output[42] = clamp_value(stg10[21] - stg10[42], range); output[43] = clamp_value(stg10[20] - stg10[43], range); output[44] = clamp_value(stg10[19] - stg10[44], range); output[45] = clamp_value(stg10[18] - stg10[45], range); output[46] = clamp_value(stg10[17] - stg10[46], range); output[47] = clamp_value(stg10[16] - stg10[47], range); output[48] = clamp_value(stg10[15] - stg10[48], range); output[49] = clamp_value(stg10[14] - stg10[49], range); output[50] = clamp_value(stg10[13] - stg10[50], range); output[51] = clamp_value(stg10[12] - stg10[51], range); output[52] = clamp_value(stg10[11] - stg10[52], range); output[53] = clamp_value(stg10[10] - stg10[53], range); output[54] = clamp_value(stg10[9] - stg10[54], range); output[55] = clamp_value(stg10[8] - stg10[55], range); output[56] = clamp_value(stg10[7] - stg10[56], range); output[57] = clamp_value(stg10[6] - stg10[57], range); output[58] = clamp_value(stg10[5] - stg10[58], range); output[59] = clamp_value(stg10[4] - stg10[59], range); output[60] = clamp_value(stg10[3] - stg10[60], range); output[61] = clamp_value(stg10[2] - stg10[61], range); output[62] = clamp_value(stg10[1] - stg10[62], range); output[63] = clamp_value(stg10[0] - stg10[63], range); } type InvTxfmFn = fn(input: &[i32], output: &mut [i32], range: usize); static INV_TXFM_FNS: [[InvTxfmFn; 5]; 5] = [ [av1_idct4, av1_idct8, av1_idct16, av1_idct32, av1_idct64], [ av1_iadst4, av1_iadst8, av1_iadst16, |_, _, _| unimplemented!(), |_, _, _| unimplemented!(), ], [ av1_iflipadst4, av1_iflipadst8, av1_iflipadst16, |_, _, _| unimplemented!(), |_, _, _| unimplemented!(), ], [ av1_iidentity4, av1_iidentity8, av1_iidentity16, av1_iidentity32, |_, _, _| unimplemented!(), ], [ av1_iwht4, |_, _, _| unimplemented!(), |_, _, _| unimplemented!(), |_, _, _| unimplemented!(), |_, _, _| unimplemented!(), ], ]; pub(crate) mod rust { use super::*; use crate::cpu_features::CpuFeatureLevel; use 
crate::util::clamp; use simd_helpers::cold_for_target_arch; use std::cmp; #[cold_for_target_arch("x86_64", "aarch64")] pub fn inverse_transform_add( input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, _eob: u16, tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, ) { let width: usize = tx_size.width(); let height: usize = tx_size.height(); // Only use at most 32 columns and 32 rows of input coefficients. let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; // For 64 point transforms, rely on the last 32 columns being initialized // to zero for filling out missing input coeffs. let mut buffer = vec![0i32; width * height].into_boxed_slice(); let rect_type = get_rect_tx_log_ratio(width, height); let tx_types_1d = get_1d_tx_types(tx_type); let lossless = tx_type == TxType::WHT_WHT; // perform inv txfm on every row let range = bd + 8; let txfm_fn = INV_TXFM_FNS[tx_types_1d.1 as usize][ILog::ilog(width) - 3]; // 64 point transforms only signal 32 coeffs. We only take chunks of 32 // and skip over the last 32 transforms here. for (r, buffer_slice) in (0..height.min(32)).zip(buffer.chunks_mut(width)) { // For 64 point transforms, rely on the last 32 elements being // initialized to zero for filling out the missing coeffs. let mut temp_in: [i32; 64] = [0; 64]; for (raw, clamped) in input[r..] .iter() .map(|a| i32::cast_from(*a)) .step_by(height.min(32)) .zip(temp_in.iter_mut()) { let val = if rect_type.abs() == 1 { round_shift(raw * INV_SQRT2, SQRT2_BITS) } else if lossless { raw >> 2 } else { raw }; *clamped = clamp_value(val, range); } txfm_fn(&temp_in, buffer_slice, range); } // perform inv txfm on every col let range = cmp::max(bd + 6, 16); let txfm_fn = INV_TXFM_FNS[tx_types_1d.0 as usize][ILog::ilog(height) - 3]; for c in 0..width { let mut temp_in: [i32; 64] = [0; 64]; let mut temp_out: [i32; 64] = [0; 64]; for (raw, clamped) in buffer[c..].iter().step_by(width).zip(temp_in.iter_mut()) { *clamped = clamp_value( round_shift(*raw, INV_INTERMEDIATE_SHIFTS[tx_size as usize]), range, ); } txfm_fn(&temp_in, &mut temp_out, range); for (temp, out) in temp_out .iter() .zip(output.rows_iter_mut().map(|row| &mut row[c]).take(height)) { let v: i32 = (*out).as_(); let r = if lossless { *temp } else { round_shift(*temp, 4) }; let v = clamp(v + r, 0, (1 << bd) - 1); *out = T::cast_from(v); } } } /* From AV1 Spec. https://aomediacodec.github.io/av1-spec/#2d-inverse-transform-process */ const INV_INTERMEDIATE_SHIFTS: [usize; TxSize::TX_SIZES_ALL] = [0, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2]; } rav1e-0.7.1/src/transform/mod.rs000064400000000000000000000370671046102023000146130ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
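// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of the upstream rav1e
// sources): the `inverse_transform_add` fallback above applies the selected
// 1D transforms separably -- one pass over the rows of the coefficient block
// and one pass over the columns, with clamping and a rounding shift in
// between.  The self-contained sketch below reproduces only that row/column
// driver structure with a toy identity 1D stage, so the data flow of the
// real code is easier to follow; `toy_1d`, `inv_2d_sketch` and the test name
// are invented for this example and do not exist in the crate.
fn toy_1d(input: &[i32], output: &mut [i32]) {
  // stand-in for av1_idct4 and friends: pass the four values through
  output[..4].copy_from_slice(&input[..4]);
}

fn inv_2d_sketch(coeffs: &[i32; 16]) -> [i32; 16] {
  let (w, h) = (4usize, 4usize);
  let mut buffer = [0i32; 16];
  // pass 1: one 1D transform per row, gathering every `h`-th coefficient,
  // mirroring the `step_by(height.min(32))` loop in the real code
  for r in 0..h {
    let mut temp_in = [0i32; 4];
    for (dst, src) in temp_in.iter_mut().zip(coeffs[r..].iter().step_by(h)) {
      *dst = *src;
    }
    toy_1d(&temp_in, &mut buffer[r * w..(r + 1) * w]);
  }
  // pass 2: one 1D transform per column of the row-major intermediate buffer
  let mut out = [0i32; 16];
  for c in 0..w {
    let mut temp_in = [0i32; 4];
    let mut temp_out = [0i32; 4];
    for (dst, src) in temp_in.iter_mut().zip(buffer[c..].iter().step_by(w)) {
      *dst = *src;
    }
    toy_1d(&temp_in, &mut temp_out);
    for (row, v) in temp_out.iter().enumerate() {
      out[row * w + c] = *v;
    }
  }
  out
}

#[test]
fn inv_2d_sketch_driver_structure() {
  // with an identity 1D stage the two passes simply re-lay-out the block,
  // which makes the gather/scatter pattern of the driver easy to check
  let coeffs: [i32; 16] = core::array::from_fn(|i| i as i32);
  let out = inv_2d_sketch(&coeffs);
  assert_eq!(out[1], coeffs[4]);
}
// ---------------------------------------------------------------------------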
#![allow(non_camel_case_types)] #![allow(dead_code)] #[macro_use] pub mod forward_shared; pub use self::forward::forward_transform; pub use self::inverse::inverse_transform_add; use crate::context::MI_SIZE_LOG2; use crate::partition::{BlockSize, BlockSize::*}; use crate::util::*; use TxSize::*; pub mod forward; pub mod inverse; pub static RAV1E_TX_TYPES: &[TxType] = &[ TxType::DCT_DCT, TxType::ADST_DCT, TxType::DCT_ADST, TxType::ADST_ADST, // TODO: Add a speed setting for FLIPADST // TxType::FLIPADST_DCT, // TxType::DCT_FLIPADST, // TxType::FLIPADST_FLIPADST, // TxType::ADST_FLIPADST, // TxType::FLIPADST_ADST, TxType::IDTX, TxType::V_DCT, TxType::H_DCT, //TxType::V_FLIPADST, //TxType::H_FLIPADST, ]; pub mod consts { pub static SQRT2_BITS: usize = 12; pub static SQRT2: i32 = 5793; // 2^12 * sqrt(2) pub static INV_SQRT2: i32 = 2896; // 2^12 / sqrt(2) } pub const TX_TYPES: usize = 16; pub const TX_TYPES_PLUS_LL: usize = 17; #[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord)] pub enum TxType { DCT_DCT = 0, // DCT in both horizontal and vertical ADST_DCT = 1, // ADST in vertical, DCT in horizontal DCT_ADST = 2, // DCT in vertical, ADST in horizontal ADST_ADST = 3, // ADST in both directions FLIPADST_DCT = 4, DCT_FLIPADST = 5, FLIPADST_FLIPADST = 6, ADST_FLIPADST = 7, FLIPADST_ADST = 8, IDTX = 9, V_DCT = 10, H_DCT = 11, V_ADST = 12, H_ADST = 13, V_FLIPADST = 14, H_FLIPADST = 15, WHT_WHT = 16, } impl TxType { /// Compute transform type for inter chroma. /// /// #[inline] pub fn uv_inter(self, uv_tx_size: TxSize) -> Self { use TxType::*; if uv_tx_size.sqr_up() == TX_32X32 { match self { IDTX => IDTX, _ => DCT_DCT, } } else if uv_tx_size.sqr() == TX_16X16 { match self { V_ADST | H_ADST | V_FLIPADST | H_FLIPADST => DCT_DCT, _ => self, } } else { self } } } /// Transform Size #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord)] pub enum TxSize { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, } impl TxSize { /// Number of square transform sizes pub const TX_SIZES: usize = 5; /// Number of transform sizes (including non-square sizes) pub const TX_SIZES_ALL: usize = 14 + 5; #[inline] pub const fn width(self) -> usize { 1 << self.width_log2() } #[inline] pub const fn width_log2(self) -> usize { match self { TX_4X4 | TX_4X8 | TX_4X16 => 2, TX_8X8 | TX_8X4 | TX_8X16 | TX_8X32 => 3, TX_16X16 | TX_16X8 | TX_16X32 | TX_16X4 | TX_16X64 => 4, TX_32X32 | TX_32X16 | TX_32X64 | TX_32X8 => 5, TX_64X64 | TX_64X32 | TX_64X16 => 6, } } #[inline] pub const fn width_index(self) -> usize { self.width_log2() - TX_4X4.width_log2() } #[inline] pub const fn height(self) -> usize { 1 << self.height_log2() } #[inline] pub const fn height_log2(self) -> usize { match self { TX_4X4 | TX_8X4 | TX_16X4 => 2, TX_8X8 | TX_4X8 | TX_16X8 | TX_32X8 => 3, TX_16X16 | TX_8X16 | TX_32X16 | TX_4X16 | TX_64X16 => 4, TX_32X32 | TX_16X32 | TX_64X32 | TX_8X32 => 5, TX_64X64 | TX_32X64 | TX_16X64 => 6, } } #[inline] pub const fn height_index(self) -> usize { self.height_log2() - TX_4X4.height_log2() } #[inline] pub const fn width_mi(self) -> usize { self.width() >> MI_SIZE_LOG2 } #[inline] pub const fn area(self) -> usize { 1 << self.area_log2() } #[inline] pub const fn area_log2(self) -> usize { self.width_log2() + self.height_log2() } #[inline] pub const fn height_mi(self) -> usize { self.height() >> MI_SIZE_LOG2 } #[inline] pub const fn block_size(self) -> BlockSize { match self { TX_4X4 => 
BLOCK_4X4, TX_8X8 => BLOCK_8X8, TX_16X16 => BLOCK_16X16, TX_32X32 => BLOCK_32X32, TX_64X64 => BLOCK_64X64, TX_4X8 => BLOCK_4X8, TX_8X4 => BLOCK_8X4, TX_8X16 => BLOCK_8X16, TX_16X8 => BLOCK_16X8, TX_16X32 => BLOCK_16X32, TX_32X16 => BLOCK_32X16, TX_32X64 => BLOCK_32X64, TX_64X32 => BLOCK_64X32, TX_4X16 => BLOCK_4X16, TX_16X4 => BLOCK_16X4, TX_8X32 => BLOCK_8X32, TX_32X8 => BLOCK_32X8, TX_16X64 => BLOCK_16X64, TX_64X16 => BLOCK_64X16, } } #[inline] pub const fn sqr(self) -> TxSize { match self { TX_4X4 | TX_4X8 | TX_8X4 | TX_4X16 | TX_16X4 => TX_4X4, TX_8X8 | TX_8X16 | TX_16X8 | TX_8X32 | TX_32X8 => TX_8X8, TX_16X16 | TX_16X32 | TX_32X16 | TX_16X64 | TX_64X16 => TX_16X16, TX_32X32 | TX_32X64 | TX_64X32 => TX_32X32, TX_64X64 => TX_64X64, } } #[inline] pub const fn sqr_up(self) -> TxSize { match self { TX_4X4 => TX_4X4, TX_8X8 | TX_4X8 | TX_8X4 => TX_8X8, TX_16X16 | TX_8X16 | TX_16X8 | TX_4X16 | TX_16X4 => TX_16X16, TX_32X32 | TX_16X32 | TX_32X16 | TX_8X32 | TX_32X8 => TX_32X32, TX_64X64 | TX_32X64 | TX_64X32 | TX_16X64 | TX_64X16 => TX_64X64, } } #[inline] pub fn by_dims(w: usize, h: usize) -> TxSize { match (w, h) { (4, 4) => TX_4X4, (8, 8) => TX_8X8, (16, 16) => TX_16X16, (32, 32) => TX_32X32, (64, 64) => TX_64X64, (4, 8) => TX_4X8, (8, 4) => TX_8X4, (8, 16) => TX_8X16, (16, 8) => TX_16X8, (16, 32) => TX_16X32, (32, 16) => TX_32X16, (32, 64) => TX_32X64, (64, 32) => TX_64X32, (4, 16) => TX_4X16, (16, 4) => TX_16X4, (8, 32) => TX_8X32, (32, 8) => TX_32X8, (16, 64) => TX_16X64, (64, 16) => TX_64X16, _ => unreachable!(), } } #[inline] pub const fn is_rect(self) -> bool { self.width_log2() != self.height_log2() } } #[derive(Copy, Clone, PartialEq, Eq, PartialOrd)] pub enum TxSet { // DCT only TX_SET_DCTONLY, // DCT + Identity only TX_SET_INTER_3, // TX_SET_DCT_IDTX // Discrete Trig transforms w/o flip (4) + Identity (1) TX_SET_INTRA_2, // TX_SET_DTT4_IDTX // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) TX_SET_INTRA_1, // TX_SET_DTT4_IDTX_1DDCT // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2) TX_SET_INTER_2, // TX_SET_DTT9_IDTX_1DDCT // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) TX_SET_INTER_1, // TX_SET_ALL16 } /// Utility function that returns the log of the ratio of the col and row sizes. #[inline] pub fn get_rect_tx_log_ratio(col: usize, row: usize) -> i8 { debug_assert!(col > 0 && row > 0); ILog::ilog(col) as i8 - ILog::ilog(row) as i8 } // performs half a butterfly #[inline] const fn half_btf(w0: i32, in0: i32, w1: i32, in1: i32, bit: usize) -> i32 { // Ensure defined behaviour for when w0*in0 + w1*in1 is negative and // overflows, but w0*in0 + w1*in1 + rounding isn't. 
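// (Editorial note, added for clarity: `wrapping_add` below makes any
//  intermediate two's-complement wrap-around well defined -- a plain `+`
//  would panic on overflow in debug builds -- while the result is only
//  required to be correct when the full expression w0*in0 + w1*in1 + rounding
//  fits in an i32, as the comment above describes.)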
let result = (w0 * in0).wrapping_add(w1 * in1); // Implement a version of round_shift with wrapping if bit == 0 { result } else { result.wrapping_add(1 << (bit - 1)) >> bit } } // clamps value to a signed integer type of bit bits #[inline] fn clamp_value(value: i32, bit: usize) -> i32 { let max_value: i32 = ((1i64 << (bit - 1)) - 1) as i32; let min_value: i32 = (-(1i64 << (bit - 1))) as i32; clamp(value, min_value, max_value) } pub fn av1_round_shift_array(arr: &mut [i32], size: usize, bit: i8) { if bit == 0 { return; } if bit > 0 { let bit = bit as usize; arr.iter_mut().take(size).for_each(|i| { *i = round_shift(*i, bit); }) } else { arr.iter_mut().take(size).for_each(|i| { *i <<= -bit; }) } } #[derive(Debug, Clone, Copy)] enum TxType1D { DCT, ADST, FLIPADST, IDTX, WHT, } const fn get_1d_tx_types(tx_type: TxType) -> (TxType1D, TxType1D) { match tx_type { TxType::DCT_DCT => (TxType1D::DCT, TxType1D::DCT), TxType::ADST_DCT => (TxType1D::ADST, TxType1D::DCT), TxType::DCT_ADST => (TxType1D::DCT, TxType1D::ADST), TxType::ADST_ADST => (TxType1D::ADST, TxType1D::ADST), TxType::FLIPADST_DCT => (TxType1D::FLIPADST, TxType1D::DCT), TxType::DCT_FLIPADST => (TxType1D::DCT, TxType1D::FLIPADST), TxType::FLIPADST_FLIPADST => (TxType1D::FLIPADST, TxType1D::FLIPADST), TxType::ADST_FLIPADST => (TxType1D::ADST, TxType1D::FLIPADST), TxType::FLIPADST_ADST => (TxType1D::FLIPADST, TxType1D::ADST), TxType::IDTX => (TxType1D::IDTX, TxType1D::IDTX), TxType::V_DCT => (TxType1D::DCT, TxType1D::IDTX), TxType::H_DCT => (TxType1D::IDTX, TxType1D::DCT), TxType::V_ADST => (TxType1D::ADST, TxType1D::IDTX), TxType::H_ADST => (TxType1D::IDTX, TxType1D::ADST), TxType::V_FLIPADST => (TxType1D::FLIPADST, TxType1D::IDTX), TxType::H_FLIPADST => (TxType1D::IDTX, TxType1D::FLIPADST), TxType::WHT_WHT => (TxType1D::WHT, TxType1D::WHT), } } const VTX_TAB: [TxType1D; TX_TYPES_PLUS_LL] = [ TxType1D::DCT, TxType1D::ADST, TxType1D::DCT, TxType1D::ADST, TxType1D::FLIPADST, TxType1D::DCT, TxType1D::FLIPADST, TxType1D::ADST, TxType1D::FLIPADST, TxType1D::IDTX, TxType1D::DCT, TxType1D::IDTX, TxType1D::ADST, TxType1D::IDTX, TxType1D::FLIPADST, TxType1D::IDTX, TxType1D::WHT, ]; const HTX_TAB: [TxType1D; TX_TYPES_PLUS_LL] = [ TxType1D::DCT, TxType1D::DCT, TxType1D::ADST, TxType1D::ADST, TxType1D::DCT, TxType1D::FLIPADST, TxType1D::FLIPADST, TxType1D::FLIPADST, TxType1D::ADST, TxType1D::IDTX, TxType1D::IDTX, TxType1D::DCT, TxType1D::IDTX, TxType1D::ADST, TxType1D::IDTX, TxType1D::FLIPADST, TxType1D::WHT, ]; #[inline] pub const fn valid_av1_transform(tx_size: TxSize, tx_type: TxType) -> bool { let size_sq = tx_size.sqr_up(); use TxSize::*; use TxType::*; match (size_sq, tx_type) { (TX_64X64, DCT_DCT) => true, (TX_64X64, _) => false, (TX_32X32, DCT_DCT) => true, (TX_32X32, IDTX) => true, (TX_32X32, _) => false, (_, _) => true, } } #[cfg(any(test, feature = "bench"))] pub fn get_valid_txfm_types(tx_size: TxSize) -> &'static [TxType] { let size_sq = tx_size.sqr_up(); use TxType::*; if size_sq == TxSize::TX_64X64 { &[DCT_DCT] } else if size_sq == TxSize::TX_32X32 { &[DCT_DCT, IDTX] } else if size_sq == TxSize::TX_4X4 { &[ DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST, FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST, WHT_WHT, ] } else { &[ DCT_DCT, ADST_DCT, DCT_ADST, ADST_ADST, FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST, ] } } #[cfg(test)] mod test { use super::TxType::*; use 
super::*; use crate::context::av1_get_coded_tx_size; use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use rand::random; use std::mem::MaybeUninit; fn test_roundtrip( tx_size: TxSize, tx_type: TxType, tolerance: i16, ) { let cpu = CpuFeatureLevel::default(); let coeff_area: usize = av1_get_coded_tx_size(tx_size).area(); let mut src_storage = [T::cast_from(0); 64 * 64]; let src = &mut src_storage[..tx_size.area()]; let mut dst = Plane::from_slice( &[T::zero(); 64 * 64][..tx_size.area()], tx_size.width(), ); let mut res_storage = [0i16; 64 * 64]; let res = &mut res_storage[..tx_size.area()]; let mut freq_storage = [MaybeUninit::uninit(); 64 * 64]; let freq = &mut freq_storage[..tx_size.area()]; for ((r, s), d) in res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut()) { *s = T::cast_from(random::()); *d = T::cast_from(random::()); *r = i16::cast_from(*s) - i16::cast_from(*d); } forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu); // SAFETY: forward_transform initialized freq let freq = unsafe { slice_assume_init_mut(freq) }; inverse_transform_add( freq, &mut dst.as_region_mut(), coeff_area.try_into().unwrap(), tx_size, tx_type, 8, cpu, ); for (s, d) in src.iter().zip(dst.data.iter()) { assert!(i16::abs(i16::cast_from(*s) - i16::cast_from(*d)) <= tolerance); } } #[test] fn log_tx_ratios() { let combinations = [ (TxSize::TX_4X4, 0), (TxSize::TX_8X8, 0), (TxSize::TX_16X16, 0), (TxSize::TX_32X32, 0), (TxSize::TX_64X64, 0), (TxSize::TX_4X8, -1), (TxSize::TX_8X4, 1), (TxSize::TX_8X16, -1), (TxSize::TX_16X8, 1), (TxSize::TX_16X32, -1), (TxSize::TX_32X16, 1), (TxSize::TX_32X64, -1), (TxSize::TX_64X32, 1), (TxSize::TX_4X16, -2), (TxSize::TX_16X4, 2), (TxSize::TX_8X32, -2), (TxSize::TX_32X8, 2), (TxSize::TX_16X64, -2), (TxSize::TX_64X16, 2), ]; for &(tx_size, expected) in combinations.iter() { println!( "Testing combination {:?}, {:?}", tx_size.width(), tx_size.height() ); assert!( get_rect_tx_log_ratio(tx_size.width(), tx_size.height()) == expected ); } } fn roundtrips() { let combinations = [ (TX_4X4, WHT_WHT, 0), (TX_4X4, DCT_DCT, 0), (TX_4X4, ADST_DCT, 0), (TX_4X4, DCT_ADST, 0), (TX_4X4, ADST_ADST, 0), (TX_4X4, FLIPADST_DCT, 0), (TX_4X4, DCT_FLIPADST, 0), (TX_4X4, IDTX, 0), (TX_4X4, V_DCT, 0), (TX_4X4, H_DCT, 0), (TX_4X4, V_ADST, 0), (TX_4X4, H_ADST, 0), (TX_8X8, DCT_DCT, 1), (TX_8X8, ADST_DCT, 1), (TX_8X8, DCT_ADST, 1), (TX_8X8, ADST_ADST, 1), (TX_8X8, FLIPADST_DCT, 1), (TX_8X8, DCT_FLIPADST, 1), (TX_8X8, IDTX, 0), (TX_8X8, V_DCT, 0), (TX_8X8, H_DCT, 0), (TX_8X8, V_ADST, 0), (TX_8X8, H_ADST, 1), (TX_16X16, DCT_DCT, 1), (TX_16X16, ADST_DCT, 1), (TX_16X16, DCT_ADST, 1), (TX_16X16, ADST_ADST, 1), (TX_16X16, FLIPADST_DCT, 1), (TX_16X16, DCT_FLIPADST, 1), (TX_16X16, IDTX, 0), (TX_16X16, V_DCT, 1), (TX_16X16, H_DCT, 1), // 32x transforms only use DCT_DCT and IDTX (TX_32X32, DCT_DCT, 2), (TX_32X32, IDTX, 0), // 64x transforms only use DCT_DCT and IDTX //(TX_64X64, DCT_DCT, 0), (TX_4X8, DCT_DCT, 1), (TX_8X4, DCT_DCT, 1), (TX_4X16, DCT_DCT, 1), (TX_16X4, DCT_DCT, 1), (TX_8X16, DCT_DCT, 1), (TX_16X8, DCT_DCT, 1), (TX_8X32, DCT_DCT, 2), (TX_32X8, DCT_DCT, 2), (TX_16X32, DCT_DCT, 2), (TX_32X16, DCT_DCT, 2), ]; for &(tx_size, tx_type, tolerance) in combinations.iter() { println!("Testing combination {:?}, {:?}", tx_size, tx_type); test_roundtrip::(tx_size, tx_type, tolerance); } } #[test] fn roundtrips_u8() { roundtrips::(); } #[test] fn roundtrips_u16() { roundtrips::(); } } rav1e-0.7.1/src/util/align.rs000064400000000000000000000113011046102023000140470ustar 
00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use std::alloc::{alloc, dealloc, Layout}; use std::mem::MaybeUninit; use std::ptr; use std::{fmt, mem}; #[repr(align(64))] pub struct Align64; // A 64 byte aligned piece of data. // # Examples // ``` // let mut x: Aligned<[i16; 64 * 64]> = Aligned::new([0; 64 * 64]); // assert!(x.data.as_ptr() as usize % 16 == 0); // // let mut x: Aligned<[i16; 64 * 64]> = Aligned::uninitialized(); // assert!(x.data.as_ptr() as usize % 16 == 0); // ``` pub struct Aligned { _alignment: [Align64; 0], pub data: T, } #[cfg(any(test, feature = "bench"))] impl Aligned<[T; N]> { #[inline(always)] pub fn from_fn(cb: F) -> Self where F: FnMut(usize) -> T, { Aligned { _alignment: [], data: std::array::from_fn(cb) } } } impl Aligned<[MaybeUninit; N]> { #[inline(always)] pub const fn uninit_array() -> Self { Aligned { _alignment: [], // SAFETY: Uninitialized [MaybeUninit; N] is valid. data: unsafe { MaybeUninit::uninit().assume_init() }, } } } impl Aligned { pub const fn new(data: T) -> Self { Aligned { _alignment: [], data } } #[allow(clippy::uninit_assumed_init)] /// # Safety /// /// The resulting `Aligned` *must* be written to before it is read from. pub const unsafe fn uninitialized() -> Self { Self::new(MaybeUninit::uninit().assume_init()) } } /// An analog to a Box<[T]> where the underlying slice is aligned. /// Alignment is according to the architecture-specific SIMD constraints. pub struct AlignedBoxedSlice { ptr: std::ptr::NonNull, len: usize, } impl AlignedBoxedSlice { // Data alignment in bytes. cfg_if::cfg_if! { if #[cfg(target_arch = "wasm32")] { // FIXME: wasm32 allocator fails for alignment larger than 3 const DATA_ALIGNMENT_LOG2: usize = 3; } else { const DATA_ALIGNMENT_LOG2: usize = 6; } } const fn layout(len: usize) -> Layout { // SAFETY: We are ensuring that `align` is non-zero and is a multiple of 2. unsafe { Layout::from_size_align_unchecked( len * mem::size_of::(), 1 << Self::DATA_ALIGNMENT_LOG2, ) } } fn alloc(len: usize) -> std::ptr::NonNull { // SAFETY: We are not calling this with a null pointer, so it's safe. unsafe { ptr::NonNull::new_unchecked(alloc(Self::layout(len)) as *mut T) } } /// Creates a [`AlignedBoxedSlice`] with a slice of length [`len`] filled with /// [`val`]. pub fn new(len: usize, val: T) -> Self where T: Clone, { let mut output = Self { ptr: Self::alloc(len), len }; for a in output.iter_mut() { *a = val.clone(); } output } } impl fmt::Debug for AlignedBoxedSlice { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Debug::fmt(&**self, f) } } impl std::ops::Deref for AlignedBoxedSlice { type Target = [T]; fn deref(&self) -> &[T] { // SAFETY: We know that `self.ptr` is not null, and we know its length. unsafe { let p = self.ptr.as_ptr(); std::slice::from_raw_parts(p, self.len) } } } impl std::ops::DerefMut for AlignedBoxedSlice { fn deref_mut(&mut self) -> &mut [T] { // SAFETY: We know that `self.ptr` is not null, and we know its length. 
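// (Editorial note, added for clarity: as in the `Deref` impl above, the
//  pointer came from `Self::alloc(len)` with the same layout used in `drop`,
//  so a slice of exactly `self.len` elements is in bounds here.)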
unsafe { let p = self.ptr.as_ptr(); std::slice::from_raw_parts_mut(p, self.len) } } } impl std::ops::Drop for AlignedBoxedSlice { fn drop(&mut self) { // SAFETY: We know that the contents of this struct are aligned and valid to drop. unsafe { for a in self.iter_mut() { ptr::drop_in_place(a) } dealloc(self.ptr.as_ptr() as *mut u8, Self::layout(self.len)); } } } unsafe impl Send for AlignedBoxedSlice where T: Send {} unsafe impl Sync for AlignedBoxedSlice where T: Sync {} #[cfg(test)] mod test { use super::*; fn is_aligned(ptr: *const T, n: usize) -> bool { ((ptr as usize) & ((1 << n) - 1)) == 0 } #[test] fn sanity_stack() { let a: Aligned<_> = Aligned::new([0u8; 3]); assert!(is_aligned(a.data.as_ptr(), 4)); } #[test] fn sanity_heap() { let a: AlignedBoxedSlice<_> = AlignedBoxedSlice::new(3, 0u8); assert!(is_aligned(a.as_ptr(), 4)); } } rav1e-0.7.1/src/util/cdf.rs000064400000000000000000000056751046102023000135320ustar 00000000000000// Copyright (c) 2017-2021, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub const fn cdf( vars: [u16; VARS], ) -> [u16; CDF_LEN] { // Ensure that at least one zero is kept at the end assert!(CDF_LEN > VARS); let mut out = [0; CDF_LEN]; let mut i = 0; while i < vars.len() { assert!(vars[i] <= 32768); out[i] = 32768 - vars[i]; i += 1; } out } pub const fn cdf_2d< const VARS: usize, const CDF_LEN: usize, const N_2D: usize, >( vars: [[u16; VARS]; N_2D], ) -> [[u16; CDF_LEN]; N_2D] { let mut out = [[0u16; CDF_LEN]; N_2D]; let mut c = 0; while c < vars.len() { out[c] = cdf(vars[c]); c += 1; } out } pub const fn cdf_3d< const VARS: usize, const CDF_LEN: usize, const N_2D: usize, const N_3D: usize, >( vars: [[[u16; VARS]; N_2D]; N_3D], ) -> [[[u16; CDF_LEN]; N_2D]; N_3D] { let mut out = [[[0u16; CDF_LEN]; N_2D]; N_3D]; let mut c = 0; while c < vars.len() { out[c] = cdf_2d(vars[c]); c += 1; } out } pub const fn cdf_4d< const VARS: usize, const CDF_LEN: usize, const N_2D: usize, const N_3D: usize, const N_4D: usize, >( vars: [[[[u16; VARS]; N_2D]; N_3D]; N_4D], ) -> [[[[u16; CDF_LEN]; N_2D]; N_3D]; N_4D] { let mut out = [[[[0u16; CDF_LEN]; N_2D]; N_3D]; N_4D]; let mut c = 0; while c < vars.len() { out[c] = cdf_3d(vars[c]); c += 1; } out } pub const fn cdf_5d< const VARS: usize, const CDF_LEN: usize, const N_2D: usize, const N_3D: usize, const N_4D: usize, const N_5D: usize, >( vars: [[[[[u16; VARS]; N_2D]; N_3D]; N_4D]; N_5D], ) -> [[[[[u16; CDF_LEN]; N_2D]; N_3D]; N_4D]; N_5D] { let mut out = [[[[[0u16; CDF_LEN]; N_2D]; N_3D]; N_4D]; N_5D]; let mut c = 0; while c < vars.len() { out[c] = cdf_4d(vars[c]); c += 1; } out } #[cfg(test)] mod test { use super::*; #[test] fn cdf_len_ok() { let _: [u16; 5] = cdf([]); let _: [u16; 5] = cdf([1]); let _: [u16; 5] = cdf([1, 2, 3, 4]); } #[test] #[should_panic] fn cdf_len_panics() { let _: [u16; 5] = cdf([1, 2, 3, 4, 5]); } #[test] #[should_panic] fn cdf_val_panics() { let _: [u16; 5] = cdf([40000]); } #[test] fn cdf_vals_ok() { let cdf: [u16; 5] = cdf([2000, 10000, 32768, 0]); assert_eq!(cdf, [30768, 22768, 0, 32768, 0]); } #[test] fn cdf_5d_ok() { let cdf: [[[[[u16; 4]; 2]; 1]; 1]; 1] = 
cdf_5d([[[[[1000, 2000], [3000, 4000]]]]]); assert_eq!(cdf, [[[[[31768, 30768, 0, 0], [29768, 28768, 0, 0],]]]]) } } rav1e-0.7.1/src/util/kmeans.rs000064400000000000000000000063771046102023000142540ustar 00000000000000// Copyright (c) 2022-2023, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. /// Find k-means for a sorted slice of integers that can be summed in `i64`. pub fn kmeans(data: &[T]) -> [T; K] where T: Copy, T: Into, T: PartialEq, T: PartialOrd, i64: TryInto, >::Error: std::fmt::Debug, { let mut low = [0; K]; for (i, val) in low.iter_mut().enumerate() { *val = (i * (data.len() - 1)) / (K - 1); } let mut means = low.map(|i| unsafe { *data.get_unchecked(i) }); let mut high = low; let mut sum = [0i64; K]; high[K - 1] = data.len(); sum[K - 1] = means[K - 1].into(); // Constrain complexity to O(n log n) let limit = 2 * (usize::BITS - data.len().leading_zeros()); for _ in 0..limit { for (i, (threshold, (low, high))) in (means.iter().skip(1).zip(&means)) .map(|(&c1, &c2)| unsafe { ((c1.into() + c2.into() + 1) >> 1).try_into().unwrap_unchecked() }) .zip(low.iter_mut().skip(1).zip(&mut high)) .enumerate() { unsafe { scan(high, low, sum.get_unchecked_mut(i..=i + 1), data, threshold); } } let mut changed = false; for (((m, sum), high), low) in means.iter_mut().zip(&sum).zip(&high).zip(&low) { let count = (high - low) as i64; if count == 0 { continue; } let new_mean = unsafe { ((sum + (count >> 1)).saturating_div(count)) .try_into() .unwrap_unchecked() }; changed |= *m != new_mean; *m = new_mean; } if !changed { break; } } means } #[inline(never)] unsafe fn scan( high: &mut usize, low: &mut usize, sum: &mut [i64], data: &[T], t: T, ) where T: Copy, T: Into, T: PartialEq, T: PartialOrd, { let mut n = *high; let mut s = *sum.get_unchecked(0); for &d in data.get_unchecked(..n).iter().rev().take_while(|&d| *d > t) { s -= d.into(); n -= 1; } for &d in data.get_unchecked(n..).iter().take_while(|&d| *d <= t) { s += d.into(); n += 1; } *high = n; *sum.get_unchecked_mut(0) = s; let mut n = *low; let mut s = *sum.get_unchecked(1); for &d in data.get_unchecked(n..).iter().take_while(|&d| *d < t) { s -= d.into(); n += 1; } for &d in data.get_unchecked(..n).iter().rev().take_while(|&d| *d >= t) { s += d.into(); n -= 1; } *low = n; *sum.get_unchecked_mut(1) = s; } #[cfg(test)] mod test { use super::*; #[test] fn three_means() { let mut data = [1, 2, 3, 10, 11, 12, 20, 21, 22]; data.sort_unstable(); let centroids = kmeans(&data); assert_eq!(centroids, [2, 11, 21]); } #[test] fn four_means() { let mut data = [30, 31, 32, 1, 2, 3, 10, 11, 12, 20, 21, 22]; data.sort_unstable(); let centroids = kmeans(&data); assert_eq!(centroids, [2, 11, 21, 31]); } } rav1e-0.7.1/src/util/logexp.rs000064400000000000000000000253131046102023000142630ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. /// Convert an integer into a Q57 fixed-point fraction. pub const fn q57(v: i32) -> i64 { debug_assert!(v >= -64 && v <= 63); (v as i64) << 57 } #[rustfmt::skip] const ATANH_LOG2: &[i64; 32] = &[ 0x32B803473F7AD0F4, 0x2F2A71BD4E25E916, 0x2E68B244BB93BA06, 0x2E39FB9198CE62E4, 0x2E2E683F68565C8F, 0x2E2B850BE2077FC1, 0x2E2ACC58FE7B78DB, 0x2E2A9E2DE52FD5F2, 0x2E2A92A338D53EEC, 0x2E2A8FC08F5E19B6, 0x2E2A8F07E51A485E, 0x2E2A8ED9BA8AF388, 0x2E2A8ECE2FE7384A, 0x2E2A8ECB4D3E4B1A, 0x2E2A8ECA94940FE8, 0x2E2A8ECA6669811D, 0x2E2A8ECA5ADEDD6A, 0x2E2A8ECA57FC347E, 0x2E2A8ECA57438A43, 0x2E2A8ECA57155FB4, 0x2E2A8ECA5709D510, 0x2E2A8ECA5706F267, 0x2E2A8ECA570639BD, 0x2E2A8ECA57060B92, 0x2E2A8ECA57060008, 0x2E2A8ECA5705FD25, 0x2E2A8ECA5705FC6C, 0x2E2A8ECA5705FC3E, 0x2E2A8ECA5705FC33, 0x2E2A8ECA5705FC30, 0x2E2A8ECA5705FC2F, 0x2E2A8ECA5705FC2F ]; /// Computes the binary exponential of `logq57`. /// `logq57`: a log base 2 in Q57 format. /// Returns a 64 bit integer in Q0 (no fraction). pub const fn bexp64(logq57: i64) -> i64 { let ipart = (logq57 >> 57) as i32; if ipart < 0 { return 0; } if ipart >= 63 { return 0x7FFFFFFFFFFFFFFF; } // z is the fractional part of the log in Q62 format. // We need 1 bit of headroom since the magnitude can get larger than 1 // during the iteration, and a sign bit. let mut z = logq57 - q57(ipart); let mut w: i64; if z != 0 { // Rust has 128 bit multiplies, so it should be possible to do this // faster without losing accuracy. z <<= 5; // w is the exponential in Q61 format (since it also needs headroom and can // get as large as 2.0); we could get another bit if we dropped the sign, // but we'll recover that bit later anyway. // Ideally this should start out as // \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}} // but in order to guarantee convergence we have to repeat iterations 4, // 13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger. w = 0x26A3D0E401DD846D; let mut i: i64 = 0; loop { let mask = -((z < 0) as i64); w += ((w >> (i + 1)) + mask) ^ mask; z -= (ATANH_LOG2[i as usize] + mask) ^ mask; // Repeat iteration 4. if i >= 3 { break; } z *= 2; i += 1; } loop { let mask = -((z < 0) as i64); w += ((w >> (i + 1)) + mask) ^ mask; z -= (ATANH_LOG2[i as usize] + mask) ^ mask; // Repeat iteration 13. if i >= 12 { break; } z *= 2; i += 1; } while i < 32 { let mask = -((z < 0) as i64); w += ((w >> (i + 1)) + mask) ^ mask; z = (z - ((ATANH_LOG2[i as usize] + mask) ^ mask)) * 2; i += 1; } // Skip the remaining iterations unless we really require that much // precision. // We could have bailed out earlier for smaller iparts, but that would // require initializing w from a table, as the limit doesn't converge to // 61-bit precision until n=30. let mut wlo: i32 = 0; if ipart > 30 { // For these iterations, we just update the low bits, as the high bits // can't possibly be affected. // OD_ATANH_LOG2 has also converged (it actually did so one iteration // earlier, but that's no reason for an extra special case). loop { let mask = -((z < 0) as i64); wlo += (((w >> i) + mask) ^ mask) as i32; z -= (ATANH_LOG2[31] + mask) ^ mask; // Repeat iteration 40. 
if i >= 39 { break; } z *= 2; i += 1; } while i < 61 { let mask = -((z < 0) as i64); wlo += (((w >> i) + mask) ^ mask) as i32; z = (z - ((ATANH_LOG2[31] + mask) ^ mask)) * 2; i += 1; } } w = (w << 1) + (wlo as i64); } else { w = 1i64 << 62; } if ipart < 62 { w = ((w >> (61 - ipart)) + 1) >> 1; } w } /// Computes the binary log of `n`. /// `n`: a 64-bit integer in Q0 (no fraction). /// Returns a 64-bit log in Q57. pub const fn blog64(n: i64) -> i64 { if n <= 0 { return -1; } let ipart = 63 - n.leading_zeros() as i32; let w = if ipart > 61 { n >> (ipart - 61) } else { n << (61 - ipart) }; if (w & (w - 1)) == 0 { return q57(ipart); } // z is the fractional part of the log in Q61 format. let mut z: i64 = 0; // Rust has 128 bit multiplies, so it should be possible to do this // faster without losing accuracy. // x and y are the cosh() and sinh(), respectively, in Q61 format. // We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)). let mut x = w + (1i64 << 61); let mut y = w - (1i64 << 61); // Repeat iteration 4. // Repeat iteration 13. // Repeat iteration 40. let bounds = [3, 12, 39, 61]; let mut i = 0; let mut j = 0; loop { let end = bounds[j]; loop { let mask = -((y < 0) as i64); // ATANH_LOG2 has converged at iteration 32. z += ((ATANH_LOG2[if i < 31 { i } else { 31 }] >> i) + mask) ^ mask; let u = x >> (i + 1); x -= ((y >> (i + 1)) + mask) ^ mask; y -= (u + mask) ^ mask; if i == end { break; } i += 1; } j += 1; if j == bounds.len() { break; } } z = (z + 8) >> 4; q57(ipart) + z } /// Computes the binary log of `n`. /// `n`: an unsigned 32-bit integer in Q0 (no fraction). /// Returns a signed 32-bit log in Q24. #[allow(unused)] pub const fn blog32(n: u32) -> i32 { if n == 0 { return -1; } let ipart = 31 - n.leading_zeros() as i32; let n = n as i64; let w = if ipart > 61 { n >> (ipart - 61) } else { n << (61 - ipart) }; if (w & (w - 1)) == 0 { return ipart << 24; } // z is the fractional part of the log in Q61 format. let mut z: i64 = 0; // Rust has 128 bit multiplies, so it should be possible to do this // faster without losing accuracy. // x and y are the cosh() and sinh(), respectively, in Q61 format. // We are computing z = 2*atanh(y/x) = 2*atanh((w - 1)/(w + 1)). let mut x = w + (1i64 << 61); let mut y = w - (1i64 << 61); // Repeat iteration 4. // Repeat iteration 13. let bounds = [3, 12, 29]; let mut i = 0; let mut j = 0; loop { let end = bounds[j]; loop { let mask = -((y < 0) as i64); z += ((ATANH_LOG2[i] >> i) + mask) ^ mask; let u = x >> (i + 1); x -= ((y >> (i + 1)) + mask) ^ mask; y -= (u + mask) ^ mask; if i == end { break; } i += 1; } j += 1; if j == bounds.len() { break; } } const SHIFT: usize = 61 - 24; z = (z + (1 << SHIFT >> 1)) >> SHIFT; (ipart << 24) + z as i32 } /// Converts a Q57 fixed-point fraction to Q24 by rounding. pub const fn q57_to_q24(v: i64) -> i32 { (((v >> 32) + 1) >> 1) as i32 } /// Converts a Q24 fixed-point fraction to Q57. pub const fn q24_to_q57(v: i32) -> i64 { (v as i64) << 33 } /// Binary exponentiation of a `log_scale` with 24-bit fractional precision and /// saturation. /// `log_scale`: A binary logarithm in Q24 format. /// Returns the binary exponential in Q24 format, saturated to 2**47 - 1 if /// `log_scale` was too large. pub const fn bexp_q24(log_scale: i32) -> i64 { if log_scale < 23 << 24 { let ret = bexp64(((log_scale as i64) << 33) + q57(24)); if ret < (1i64 << 47) - 1 { return ret; } } (1i64 << 47) - 1 } /// Polynomial approximation of a binary exponential. /// Q10 input, Q0 output. 
#[allow(unused)] pub const fn bexp32_q10(z: i32) -> u32 { let ipart = z >> 10; let mut n = ((z & ((1 << 10) - 1)) << 4) as u32; n = ({ n * (((n * (((n * (((n * 3548) >> 15) + 6817)) >> 15) + 15823)) >> 15) + 22708) } >> 15) + 16384; if 14 - ipart > 0 { (n + (1 << (13 - ipart))) >> (14 - ipart) } else { n << (ipart - 14) } } /// Polynomial approximation of a binary logarithm. /// Q0 input, Q11 output. pub const fn blog32_q11(w: u32) -> i32 { if w == 0 { return -1; } let ipart = 32 - w.leading_zeros() as i32; let n = if ipart - 16 > 0 { w >> (ipart - 16) } else { w << (16 - ipart) } as i32 - 32768 - 16384; let fpart = ({ n * (((n * (((n * (((n * -1402) >> 15) + 2546)) >> 15) - 5216)) >> 15) + 15745) } >> 15) - 6797; (ipart << 11) + (fpart >> 3) } #[cfg(test)] mod test { use super::*; #[test] fn blog64_vectors() { assert!(blog64(1793) == 0x159dc71e24d32daf); assert!(blog64(0x678dde6e5fd29f05) == 0x7d6373ad151ca685); } #[test] fn bexp64_vectors() { assert!(bexp64(0x159dc71e24d32daf) == 1793); assert!((bexp64(0x7d6373ad151ca685) - 0x678dde6e5fd29f05).abs() < 29); } #[test] fn blog64_bexp64_round_trip() { for a in 1..=std::u16::MAX as i64 { let b = std::i64::MAX / a; let (log_a, log_b, log_ab) = (blog64(a), blog64(b), blog64(a * b)); assert!((log_a + log_b - log_ab).abs() < 4); assert!(bexp64(log_a) == a); assert!((bexp64(log_b) - b).abs() < 128); assert!((bexp64(log_ab) - a * b).abs() < 128); } } #[test] fn blog32_vectors() { assert_eq!(blog32(0), -1); assert_eq!(blog32(1793), q57_to_q24(0x159dc71e24d32daf)); } #[test] fn bexp_q24_vectors() { assert_eq!(bexp_q24(i32::MAX), (1i64 << 47) - 1); assert_eq!( (bexp_q24(q57_to_q24(0x159dc71e24d32daf)) + (1 << 24 >> 1)) >> 24, 1793 ); } #[test] fn blog32_bexp_q24_round_trip() { for a in 1..=std::u16::MAX as u32 { let b = (std::u32::MAX >> 9) / a; let (log_a, log_b, log_ab) = (blog32(a), blog32(b), blog32(a * b)); assert!((log_a + log_b - log_ab).abs() < 4); assert!((bexp_q24(log_a) - (i64::from(a) << 24)).abs() < (1 << 24 >> 1)); assert!(((bexp_q24(log_b) >> 24) - i64::from(b)).abs() < 128); assert!( ((bexp_q24(log_ab) >> 24) - i64::from(a) * i64::from(b)).abs() < 128 ); } } #[test] fn blog32_q11_bexp32_q10_round_trip() { for a in 1..=std::i16::MAX as i32 { let b = std::i16::MAX as i32 / a; let (log_a, log_b, log_ab) = ( blog32_q11(a as u32), blog32_q11(b as u32), blog32_q11(a as u32 * b as u32), ); assert!((log_a + log_b - log_ab).abs() < 4); assert!((bexp32_q10((log_a + 1) >> 1) as i32 - a).abs() < 18); assert!((bexp32_q10((log_b + 1) >> 1) as i32 - b).abs() < 2); assert!((bexp32_q10((log_ab + 1) >> 1) as i32 - a * b).abs() < 18); } } } rav1e-0.7.1/src/util/mod.rs000064400000000000000000000013411046102023000135370ustar 00000000000000// Copyright (c) 2017-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
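// A minimal usage sketch of the Q57 fixed-point helpers from `logexp`
// (the module and test names below are illustrative assumptions, not part of
// the upstream API); it only exercises the same vectors already checked by
// the unit tests in logexp.rs: blog64 returns log2(n) in Q57, bexp64 inverts
// it, and q57_to_q24/bexp_q24 move the value into the Q24 domain.
#[cfg(test)]
mod q57_usage_sketch {
  use super::*;

  #[test]
  fn q57_log_exp_round_trip_sketch() {
    // Q57 binary log of 1793, inverted exactly by bexp64 for this input.
    let log_q57 = blog64(1793);
    assert_eq!(bexp64(log_q57), 1793);
    // Convert to Q24 and exponentiate; rounding the Q24 result back to an
    // integer recovers the original value.
    let log_q24 = q57_to_q24(log_q57);
    assert_eq!((bexp_q24(log_q24) + (1 << 23)) >> 24, 1793);
  }
}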
mod align; mod cdf; mod kmeans; mod logexp; mod uninit; pub use v_frame::math::*; pub use v_frame::pixel::*; pub use align::*; pub use cdf::*; pub use uninit::*; pub use kmeans::*; pub(crate) use logexp::*; rav1e-0.7.1/src/util/pixel.rs000064400000000000000000000010521046102023000141000ustar 00000000000000// Copyright (c) 2017-2020, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. pub use v_frame::pixel::*; rav1e-0.7.1/src/util/uninit.rs000064400000000000000000000021001046102023000142600ustar 00000000000000// Copyright (c) 2019-2022, The rav1e contributors. All rights reserved // // This source code is subject to the terms of the BSD 2 Clause License and // the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License // was not distributed with this source code in the LICENSE file, you can // obtain it at www.aomedia.org/license/software. If the Alliance for Open // Media Patent License 1.0 was not distributed with this source code in the // PATENTS file, you can obtain it at www.aomedia.org/license/patent. use std::mem::MaybeUninit; pub fn init_slice_repeat_mut( slice: &'_ mut [MaybeUninit], value: T, ) -> &'_ mut [T] { // Fill all of slice for a in slice.iter_mut() { *a = MaybeUninit::new(value); } // SAFETY: Defined behavior, since all elements of slice are initialized unsafe { slice_assume_init_mut(slice) } } /// Assume all the elements are initialized. #[inline(always)] pub unsafe fn slice_assume_init_mut( slice: &'_ mut [MaybeUninit], ) -> &'_ mut [T] { &mut *(slice as *mut [MaybeUninit] as *mut [T]) } rav1e-0.7.1/src/x86/cdef16_avx2.asm000064400000000000000000000717471046102023000146150ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
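; Descriptive note on the kernels below: the CDEF_FILTER paths repeatedly
; inline the CDEF constrain() primitive, visible as the
; pabsw / psrlw / psubusw / pminsw / psignw sequences. In scalar terms it is
; roughly:
;
;   constrain(diff, strength, shift) =
;       sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
;
; Each output pixel adds a tap-weighted sum of constrained neighbour
; differences (primary and secondary taps) back onto the centre pixel; the
; psraw-by-15 + paddw pair subtracts 1 from negative sums and the pmulhrsw
; against pw_2048 then performs the final divide-by-16 with rounding.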
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA %macro DIR_TABLE 1 ; stride db 1 * %1 + 0, 2 * %1 + 0 db 1 * %1 + 0, 2 * %1 - 2 db -1 * %1 + 2, -2 * %1 + 4 db 0 * %1 + 2, -1 * %1 + 4 db 0 * %1 + 2, 0 * %1 + 4 db 0 * %1 + 2, 1 * %1 + 4 db 1 * %1 + 2, 2 * %1 + 4 db 1 * %1 + 0, 2 * %1 + 2 db 1 * %1 + 0, 2 * %1 + 0 db 1 * %1 + 0, 2 * %1 - 2 db -1 * %1 + 2, -2 * %1 + 4 db 0 * %1 + 2, -1 * %1 + 4 %endmacro dir_table4: DIR_TABLE 16 dir_table8: DIR_TABLE 32 pri_taps: dw 4, 4, 3, 3, 2, 2, 3, 3 dir_shift: times 2 dw 0x4000 times 2 dw 0x1000 pw_2048: times 2 dw 2048 pw_m16384: times 2 dw -16384 cextern cdef_dir_8bpc_avx2.main SECTION .text %macro CDEF_FILTER 2 ; w, h DEFINE_ARGS dst, stride, _, dir, pridmp, pri, sec, tmp movifnidn prid, r5m movifnidn secd, r6m mov dird, r7m vpbroadcastd m8, [base+pw_2048] lea dirq, [base+dir_table%1+dirq*2] test prid, prid jz .sec_only %if WIN64 vpbroadcastw m6, prim movaps [rsp+16*0], xmm9 movaps [rsp+16*1], xmm10 %else movd xm6, prid vpbroadcastw m6, xm6 %endif lzcnt pridmpd, prid rorx tmpd, prid, 2 cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, tmpd ; pri >>= 2 mov tmpd, r8m ; damping and prid, 4 sub tmpd, 31 vpbroadcastd m9, [base+pri_taps+priq+8*0] vpbroadcastd m10, [base+pri_taps+priq+8*1] test secd, secd jz .pri_only %if WIN64 movaps r8m, xmm13 vpbroadcastw m13, secm movaps r4m, xmm11 movaps r6m, xmm12 %else movd xm0, secd vpbroadcastw m13, xm0 %endif lzcnt secd, secd xor prid, prid add pridmpd, tmpd cmovs pridmpd, prid add secd, tmpd lea tmpq, [px] mov [pri_shift], pridmpq mov [sec_shift], secq %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec %endrep %if WIN64 movaps xmm11, r4m movaps xmm12, r6m movaps xmm13, r8m %endif jmp .pri_end .pri_only: add pridmpd, tmpd cmovs pridmpd, secd lea tmpq, [px] mov [pri_shift], pridmpq %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri %endrep .pri_end: %if WIN64 movaps xmm9, [rsp+16*0] movaps xmm10, [rsp+16*1] %endif .end: RET .sec_only: mov tmpd, r8m ; damping %if WIN64 vpbroadcastw m6, secm %else movd xm6, secd vpbroadcastw m6, xm6 %endif tzcnt secd, secd sub tmpd, secd mov [sec_shift], tmpq lea tmpq, [px] %rep %1*%2/16 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec %endrep jmp .end %if %1 == %2 ALIGN function_align .pri: movsx offq, byte [dirq+4] ; off_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] ; 0 2 1 3 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0p1 %else mova xm1, [tmpq+32*0] vinserti128 m1, [tmpq+32*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+5] ; off_k1 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m0, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m0, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 ; adiff_k1p0 paddw m0, m2 ; 
constrain(diff_k0) psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m0, m9 ; pri_tap_k0 pmullw m7, m10 ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 paddw m0, m1 %if %1 == 4 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret ALIGN function_align .sec: movsx offq, byte [dirq+8] ; off1_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 %else mova xm1, [tmpq+32*0] vinserti128 m1, [tmpq+32*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+0] ; off2_k0 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+9] ; off1_k1 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 psignw m2, m3 ; constrain(diff_k0s1) pabsw m3, m4 ; adiff_k0s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+1] ; off2_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif paddw m0, m7 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 psignw m4, m5 ; constrain(diff_k1s3) paddw m0, 
m4 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 paddw m0, m1 %if %1 == 4 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret ALIGN function_align .pri_sec: movsx offq, byte [dirq+8] ; off2_k0 %if %1 == 4 mova m1, [tmpq+32*0] punpcklqdq m1, [tmpq+32*1] movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k0s1 %else mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+0] ; off3_k0 pmaxsw m11, m2, m3 pminuw m12, m2, m3 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m13, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m13, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k0s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k0s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+9] ; off2_k1 psignw m2, m3 ; constrain(diff_k0s1) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 paddw m0, m2 pabsw m3, m4 ; adiff_k0s2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m13, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m13, m3 pminsw m4, m2 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k1s0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq m3, [tmpq+offq+32*1] ; k1s1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+1] ; off3_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) pmaxsw m11, m2 pminuw m12, m2 pmaxsw m11, m3 pminuw m12, m3 paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m13, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m13, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1s2 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1s3 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+4] ; off1_k0 paddw m0, m7 psignw m2, m3 ; constrain(diff_k1s1) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m13, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m13, m3 pminsw m4, m2 paddw m0, m7 %if %1 == 4 movu m2, [tmpq+offq+32*0] punpcklqdq m2, [tmpq+offq+32*1] ; k0p0 neg offq movu m3, [tmpq+offq+32*0] punpcklqdq 
m3, [tmpq+offq+32*1] ; k0p1 %else movu xm2, [tmpq+offq+32*0] vinserti128 m2, [tmpq+offq+32*1], 1 neg offq movu xm3, [tmpq+offq+32*0] vinserti128 m3, [tmpq+offq+32*1], 1 %endif movsx offq, byte [dirq+5] ; off1_k1 psignw m4, m5 ; constrain(diff_k1s3) pmaxsw m11, m2 pminuw m12, m2 pmaxsw m11, m3 pminuw m12, m3 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 paddw m0, m4 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m7, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m7, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movu m4, [tmpq+offq+32*0] punpcklqdq m4, [tmpq+offq+32*1] ; k1p0 neg offq movu m5, [tmpq+offq+32*0] punpcklqdq m5, [tmpq+offq+32*1] ; k1p1 %else movu xm4, [tmpq+offq+32*0] vinserti128 m4, [tmpq+offq+32*1], 1 neg offq movu xm5, [tmpq+offq+32*0] vinserti128 m5, [tmpq+offq+32*1], 1 %endif psignw m2, m3 ; constrain(diff_k0p1) paddw m7, m2 ; constrain(diff_k0) pmaxsw m11, m4 pminuw m12, m4 pmaxsw m11, m5 pminuw m12, m5 psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 pabsw m3, m4 ; adiff_k1p0 pmullw m7, m9 ; pri_tap_k0 paddw m0, m7 psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m7, m10 ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 add tmpq, 32*2 pmaxsw m11, m1 pminuw m12, m1 paddw m0, m1 pminsw m0, m11 pmaxsw m0, m12 %if %1 == 4 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] %else mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] %endif ret %endif %endmacro INIT_YMM avx2 cglobal cdef_filter_4x4_16bpc, 5, 10, 9, 16*10, dst, stride, left, top, bot, \ pri, sec, edge %if WIN64 %define px rsp+16*6 %define offq r8 %define pri_shift rsp+16*2 %define sec_shift rsp+16*3 %else %define px rsp+16*4 %define offq r4 %define pri_shift rsp+16*0 %define sec_shift rsp+16*1 %endif %define base r8-dir_table4 mov edged, r9m lea r8, [dir_table4] movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] lea r9, [strideq*3] movu xm2, [dstq+strideq*2] movu xm3, [dstq+r9 ] vpbroadcastd m7, [base+pw_m16384] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 mova [px+16*2+0], xm2 mova [px+16*3+0], xm3 test edgeb, 4 ; HAVE_TOP jz .no_top movu xm0, [topq+strideq*0] movu xm1, [topq+strideq*1] mova [px-16*2+0], xm0 mova [px-16*1+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-16*2-4], xm0 movd [px-16*1-4], xm1 jmp .top_done .no_top: mova [px-16*2+0], m7 .top_no_left: movd [px-16*2-4], xm7 movd [px-16*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu xm0, [botq+strideq*0] movu xm1, [botq+strideq*1] mova [px+16*4+0], xm0 mova [px+16*5+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+16*4-4], xm0 movd [px+16*5-4], xm1 jmp .bottom_done .no_bottom: mova [px+16*4+0], m7 .bottom_no_left: movd [px+16*4-4], xm7 movd [px+16*5-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movd xm0, [leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+16*0-4], xm0 movd [px+16*1-4], xm1 movd [px+16*2-4], xm2 movd [px+16*3-4], xm3 jmp .left_done .no_left: REPX {movd 
[px+16*x-4], xm7}, 0, 1, 2, 3 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5 .padding_done: CDEF_FILTER 4, 4 cglobal cdef_filter_4x8_16bpc, 5, 10, 9, 16*14, dst, stride, left, top, bot, \ pri, sec, edge mov edged, r9m movu xm0, [dstq+strideq*0] movu xm1, [dstq+strideq*1] lea r9, [strideq*3] movu xm2, [dstq+strideq*2] movu xm3, [dstq+r9 ] lea r6, [dstq+strideq*4] movu xm4, [r6 +strideq*0] movu xm5, [r6 +strideq*1] movu xm6, [r6 +strideq*2] movu xm7, [r6 +r9 ] lea r8, [dir_table4] mova [px+16*0+0], xm0 mova [px+16*1+0], xm1 mova [px+16*2+0], xm2 mova [px+16*3+0], xm3 mova [px+16*4+0], xm4 mova [px+16*5+0], xm5 mova [px+16*6+0], xm6 mova [px+16*7+0], xm7 vpbroadcastd m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movu xm0, [topq+strideq*0] movu xm1, [topq+strideq*1] mova [px-16*2+0], xm0 mova [px-16*1+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-16*2-4], xm0 movd [px-16*1-4], xm1 jmp .top_done .no_top: mova [px-16*2+0], m7 .top_no_left: movd [px-16*2-4], xm7 movd [px-16*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu xm0, [botq+strideq*0] movu xm1, [botq+strideq*1] mova [px+16*8+0], xm0 mova [px+16*9+0], xm1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+16*8-4], xm0 movd [px+16*9-4], xm1 jmp .bottom_done .no_bottom: mova [px+16*8+0], m7 .bottom_no_left: movd [px+16*8-4], xm7 movd [px+16*9-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movd xm0, [leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+16*0-4], xm0 movd [px+16*1-4], xm1 movd [px+16*2-4], xm2 movd [px+16*3-4], xm3 movd xm0, [leftq+4*4] movd xm1, [leftq+4*5] movd xm2, [leftq+4*6] movd xm3, [leftq+4*7] movd [px+16*4-4], xm0 movd [px+16*5-4], xm1 movd [px+16*6-4], xm2 movd [px+16*7-4], xm3 jmp .left_done .no_left: REPX {movd [px+16*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+16*x+8], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 4, 8 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*13, dst, stride, left, top, bot, \ pri, sec, edge %if WIN64 %define px rsp+32*4 %else %define px rsp+32*3 %endif %define base r8-dir_table8 mov edged, r9m movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea r6, [dstq+strideq*2] movu m2, [r6 +strideq*0] movu m3, [r6 +strideq*1] lea r6, [r6 +strideq*2] movu m4, [r6 +strideq*0] movu m5, [r6 +strideq*1] lea r6, [r6 +strideq*2] movu m6, [r6 +strideq*0] movu m7, [r6 +strideq*1] lea r8, [dir_table8] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 mova [px+32*4+0], m4 mova [px+32*5+0], m5 mova [px+32*6+0], m6 mova [px+32*7+0], m7 vpbroadcastd m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd xm0, [topq+strideq*0-4] movd xm1, [topq+strideq*1-4] movd [px-32*2-4], xm0 movd [px-32*1-4], xm1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], xm7 movd [px-32*1-4], xm7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd xm0, [botq+strideq*0-4] movd xm1, [botq+strideq*1-4] movd [px+32*8-4], xm0 
movd [px+32*9-4], xm1 jmp .bottom_done .no_bottom: mova [px+32*8+0], m7 mova [px+32*9+0], m7 .bottom_no_left: movd [px+32*8-4], xm7 movd [px+32*9-4], xm7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movd xm0, [leftq+4*0] movd xm1, [leftq+4*1] movd xm2, [leftq+4*2] movd xm3, [leftq+4*3] movd [px+32*0-4], xm0 movd [px+32*1-4], xm1 movd [px+32*2-4], xm2 movd [px+32*3-4], xm3 movd xm0, [leftq+4*4] movd xm1, [leftq+4*5] movd xm2, [leftq+4*6] movd xm3, [leftq+4*7] movd [px+32*4-4], xm0 movd [px+32*5-4], xm1 movd [px+32*6-4], xm2 movd [px+32*7-4], xm3 jmp .left_done .no_left: REPX {movd [px+32*x-4], xm7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+16], xm7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 8, 8 cglobal cdef_dir_16bpc, 4, 7, 6, src, stride, var, bdmax lea r6, [dir_shift] shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc vpbroadcastd m4, [r6+bdmaxq*4] lea r6, [strideq*3] mova xm0, [srcq+strideq*0] mova xm1, [srcq+strideq*1] mova xm2, [srcq+strideq*2] mova xm3, [srcq+r6 ] lea srcq, [srcq+strideq*4] vinserti128 m0, [srcq+r6 ], 1 vinserti128 m1, [srcq+strideq*2], 1 vinserti128 m2, [srcq+strideq*1], 1 vinserti128 m3, [srcq+strideq*0], 1 REPX {pmulhuw x, m4}, m0, m1, m2, m3 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef16_avx512.asm000064400000000000000000000522461046102023000147540ustar 00000000000000; Copyright © 2022, VideoLAN and dav1d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
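; Descriptive note on the AVX-512 kernels below: they compute the same
; constrain() expression as the AVX2 file, but express the sign handling with
; mask registers. In the CONSTRAIN macro, vpcmpgtw sets k1 for lanes where the
; neighbour lies below the centre pixel (negative difference), and the final
; vpsubw with {k1} negates exactly those lanes after the threshold clamp.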
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 cdef_perm: db 2, 18, 16, 18, 24, 19, 0, 19, 25, 20, 1, 20, 26, 21, 2, 21 db 3, 26, 3, 26, 28, 27, 4, 27, 29, 28, -1, 28, 30, 29, -1, 29 db 0, 34, 17, 34, 16, 35, 8, 35, 17, 36, 9, 36, 18, 37, 10, 37 db 1, 42, 11, 42, 20, 43, 12, 43, 21, 44, -1, 44, 22, 45, -1, 45 end_perm4: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 edge_mask4: dw 0xff99, 0xff88, 0xff11, 0xff00 ; 0100, 0101, 0110, 0111 dw 0x99ff, 0x88ff, 0x11ff, 0x00ff ; 1000, 1001, 1010, 1011 dw 0x9999, 0x8888, 0x1111, 0x0000 ; 1100, 1101, 1110, 1111 pri_taps4: dw 64, 32, 48, 48 ; left-shifted by 4 cdef_dirs4: dw 8, 16, 8, 15, -7,-14, 1, -6 dw 1, 2, 1, 10, 9, 18, 8, 17 dw 8, 16, 8, 15, -7,-14, 1, -6 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 cdef_dirs8: db 32, 64, 32, 62,-30,-60, 2,-28 db 2, 4, 2, 36, 34, 68, 32, 66 db 32, 64, 32, 62,-30,-60, 2,-28 pri_taps8: dw 4, 4, 2, 2, 3, 3, 3, 3 sec_taps4: dw 32, 16 pw_m16384: times 2 dw -16384 pw_2048: times 2 dw 2048 pd_268435568: dd 268435568 ; (1 << 28) + (7 << 4) edge_mask8: dw 0x2121, 0x2020, 0x0101 SECTION .text %macro CONSTRAIN 7 ; dst, p, px, zero, tresh, shift, tmp psubw %1, %2, %3 pabsw %1, %1 vpcmpgtw k1, %3, %2 vpsrlvw %7, %1, %6 psubusw %7, %5, %7 pminsw %1, %7 vpsubw %1{k1}, %4, %1 %endmacro ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 ; L0 L1 00 01 02 03 04 05 b0 b1 b2 b3 b4 b5 b6 b7 ; L2 L3 10 11 12 13 14 15 B0 B1 B2 B3 B4 B5 B6 B7 INIT_ZMM avx512icl cglobal cdef_filter_4x4_16bpc, 5, 7, 16, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r6-cdef_dirs4 lea r6, [cdef_dirs4] movu xm3, [dstq+strideq*0] vinserti32x4 ym3, [dstq+strideq*1], 1 mova xm2, [leftq] lea r2, [dstq+strideq*2] vinserti32x4 m3, [r2+strideq*0], 2 mova m5, [base+cdef_perm] vinserti32x4 m3, [r2+strideq*1], 3 vpermt2d m2, m5, m3 vinserti32x4 m1, m2, [topq+strideq*0-4], 0 vinserti32x4 m1, [topq+strideq*1-4], 1 mov r3d, edgem movifnidn prid, prim punpcklwd m3, m3 ; px psrlw m5, 8 vpbroadcastd m0, [base+pd_268435568] pxor m12, m12 cmp r3d, 0x0f jne .mask_edges vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 .main: test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 vpbroadcastd m15, [base+pri_taps4+priq] xor prid, prid add r4d, r3d cmovns prid, r4d ; pri_shift mov r4d, dirm vpbroadcastw m14, prid mov r5d, secm vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] call .constrain test r5d, r5d jz .end_no_clip lzcnt r5d, r5d vpbroadcastw m13, secm add r3d, r5d pminuw m6, m3, m8 pmaxsw m7, m3, m8 pminuw m6, m9 pmaxsw m7, m9 call .constrain_sec pminuw m6, m8 pmaxsw m7, m8 pminuw m6, m9 pmaxsw m7, m9 vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain pminuw m6, m8 pmaxsw m7, m8 pminuw m6, m9 pmaxsw m7, m9 psrldq m8, m6, 2 vpshldd m3, m0, 8 psrldq m9, m7, 2 paddd m0, m3 pminuw m6, m8 psrldq m0, 1 pmaxsw m7, m9 pmaxsw m0, m6 pminsw m0, m7 vpmovdw ym0, m0 jmp .end .sec_only: tzcnt r5d, secm mov r3d, dampingm vpbroadcastw m13, secm mov r4d, dirm sub r3d, r5d ; sec_shift call .constrain_sec vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain .end_no_clip: mova ym1, [base+end_perm4] vpshldd m3, m0, 8 ; (px << 8) + ((sum > -8) << 4) paddd m0, m3 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) 
vpermb m0, m1, m0 .end: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm0, ym0, 1 movq [r2+strideq*0], xm0 movhps [r2+strideq*1], xm0 RET .mask_edges: vpbroadcastd m6, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer overread vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 kmovw k1, [base+edge_mask4-8+r3*2] jmp .mask_edges_main .mask_edges_no_bottom: kmovw k1, [base+edge_mask4+8+r3*2] .mask_edges_main: or r3d, 0x04 vmovdqa32 m1{k1}, m6 ; edge pixels = -16384 kmovw k1, [base+edge_mask4-8+r3*2] vmovdqa32 m2{k1}, m6 jmp .main .constrain_sec: vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] vpbroadcastw m14, r3d vpbroadcastd m15, [base+sec_taps4] .constrain: paddw m8, m5, m9 vpermi2w m8, m1, m2 ; k0p0 k1p0 psubw m9, m5, m9 vpermi2w m9, m1, m2 ; k0p1 k1p1 CONSTRAIN m10, m8, m3, m12, m13, m14, m11 vpdpwssd m0, m10, m15 CONSTRAIN m10, m9, m3, m12, m13, m14, m11 vpdpwssd m0, m10, m15 ret ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 ; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 ; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 cglobal cdef_filter_4x8_16bpc, 5, 7, 22, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge lea r6, [cdef_dirs4] movu xm18, [dstq+strideq*0] vinserti128 ym18, [dstq+strideq*1], 1 mova xm1, [leftq+16*0] mova xm2, [leftq+16*1] lea r2, [strideq*3] vinserti32x4 m18, [dstq+strideq*2], 2 mova m5, [base+cdef_perm] vinserti32x4 m18, [dstq+r2 ], 3 vpermt2d m1, m5, m18 vinserti32x4 m0, m1, [topq+strideq*0-4], 0 vinserti32x4 m0, [topq+strideq*1-4], 1 lea r3, [dstq+strideq*4] movu xm19, [r3+strideq*0] vinserti128 ym19, [r3+strideq*1], 1 vinserti32x4 m19, [r3+strideq*2], 2 vinserti32x4 m19, [r3+r2 ], 3 mov r3d, edgem movifnidn prid, prim vpermt2d m2, m5, m19 vpbroadcastd m16, [base+pd_268435568] pxor m12, m12 punpcklwd m18, m18 ; px (top) psrlw m5, 8 punpcklwd m19, m19 ; px (bottom) mova m17, m16 vshufi32x4 m1, m2, q3210 cmp r3d, 0x0f jne .mask_edges vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 .main: test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 vpbroadcastd m15, [base+pri_taps4+priq] xor prid, prid add r4d, r3d cmovns prid, r4d ; pri_shift mov r4d, dirm vpbroadcastw m14, prid mov r5d, secm vpbroadcastd m9, [base+cdef_dirs4+(r4+2)*4] call .constrain test r5d, r5d jz .end_no_clip lzcnt r5d, r5d vpbroadcastw m13, secm add r3d, r5d pminuw m3, m18, m6 pmaxsw m4, m18, m6 pminuw m20, m19, m7 pmaxsw m21, m19, m7 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 call .constrain_sec pminuw m3, m6 pmaxsw m4, m6 pminuw m20, m7 pmaxsw m21, m7 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain pminuw m3, m6 pmaxsw m4, m6 mov r3, 0xcccccccccccccccc pminuw m20, m7 pmaxsw m21, m7 kmovq k1, r3 pminuw m3, m8 pmaxsw m4, m8 pminuw m20, m9 pmaxsw m21, m9 vbroadcasti32x4 m0, [base+deint_shuf] vpshldd m6, m20, m3, 16 vmovdqu8 m3{k1}, m20 vpshldd m18, m16, 8 vpshldd m7, m21, m4, 16 vmovdqu8 m4{k1}, m21 vpshldd m19, m17, 8 pminuw m3, m6 paddd m16, m18 pmaxsw m4, m7 paddd m17, m19 psrldq m16, 1 palignr m16{k1}, m17, m17, 15 lea r6, [dstq+strideq*4] pmaxsw m16, m3 pminsw m16, m4 pshufb m16, m0 movq [dstq+strideq*0], xm16 movhps 
[r6 +strideq*0], xm16 vextracti128 xm17, ym16, 1 movq [dstq+strideq*1], xm17 movhps [r6 +strideq*1], xm17 vextracti32x4 xm17, m16, 2 movq [dstq+strideq*2], xm17 movhps [r6 +strideq*2], xm17 vextracti32x4 xm16, m16, 3 movq [dstq+r2 ], xm16 movhps [r6 +r2 ], xm16 RET .sec_only: mov r4d, dirm tzcnt r5d, secm mov r3d, dampingm vpbroadcastw m13, secm sub r3d, r5d ; sec_shift call .constrain_sec vpbroadcastd m9, [base+cdef_dirs4+(r4+0)*4] call .constrain .end_no_clip: mova ym20, [base+end_perm4] vpshldd m18, m16, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m19, m17, 8 paddd m16, m18 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddd m17, m19 vpermb m16, m20, m16 vpermb m17, m20, m17 movq [dstq+strideq*0], xm16 movhps [dstq+strideq*1], xm16 vextracti128 xm16, ym16, 1 movq [dstq+strideq*2], xm16 movhps [dstq+r2 ], xm16 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm17 movhps [dstq+strideq*1], xm17 vextracti128 xm17, ym17, 1 movq [dstq+strideq*2], xm17 movhps [dstq+r2 ], xm17 RET .mask_edges: vpbroadcastd m6, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer overread vinserti32x4 m2, [botq+strideq*0-4], 2 vinserti32x4 m2, [botq+strideq*1-4], 3 kmovw k1, [base+edge_mask4-8+r3*2] jmp .mask_edges_main .mask_edges_no_bottom: kmovw k1, [base+edge_mask4+8+r3*2] .mask_edges_main: mov r4d, r3d or r3d, 0x0c vmovdqa32 m0{k1}, m6 ; edge pixels = -16384 kmovw k1, [base+edge_mask4-8+r3*2] or r4d, 0x04 vmovdqa32 m1{k1}, m6 kmovw k1, [base+edge_mask4-8+r4*2] vmovdqa32 m2{k1}, m6 jmp .main .constrain_sec: vpbroadcastd m9, [base+cdef_dirs4+(r4+4)*4] vpbroadcastw m14, r3d vpbroadcastd m15, [base+sec_taps4] .constrain: paddw m7, m5, m9 mova m6, m0 vpermt2w m6, m7, m1 ; k0p0 k1p0 (top) psubw m9, m5, m9 mova m8, m0 vpermi2w m7, m1, m2 ; k0p0 k1p0 (bottom) CONSTRAIN m10, m6, m18, m12, m13, m14, m11 vpermt2w m8, m9, m1 ; k0p1 k1p1 (top) vpdpwssd m16, m10, m15 CONSTRAIN m10, m7, m19, m12, m13, m14, m11 vpermi2w m9, m1, m2 ; k0p1 k1p1 (bottom) vpdpwssd m17, m10, m15 CONSTRAIN m10, m8, m18, m12, m13, m14, m11 vpdpwssd m16, m10, m15 CONSTRAIN m10, m9, m19, m12, m13, m14, m11 vpdpwssd m17, m10, m15 ret cglobal cdef_filter_8x8_16bpc, 5, 7, 22, 64*6, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r6-cdef_dirs8 lea r6, [cdef_dirs8] movu ym17, [dstq+strideq*0] vinserti32x8 m17, [dstq+strideq*1], 1 movq xm4, [leftq+8*0] movq xm5, [leftq+8*1] psrld m2, [base+cdef_perm], 16 movq xm6, [leftq+8*2] movq xm7, [leftq+8*3] lea r2, [strideq*3] movu ym16, [topq+strideq*0-4] vinserti32x8 m16, [topq+strideq*1-4], 1 lea r3, [dstq+strideq*4] movu ym18, [dstq+strideq*2] vinserti32x8 m18, [dstq+r2 ], 1 movu ym19, [r3+strideq*0] vinserti32x8 m19, [r3+strideq*1], 1 movu ym20, [r3+strideq*2] vinserti32x8 m20, [r3+r2 ], 1 vshufi32x4 m0, m17, m18, q2020 ; px (top) mov r3d, edgem vshufi32x4 m1, m19, m20, q2020 ; px (bottom) movifnidn prid, prim vpermt2d m17, m2, m4 vpermt2d m18, m2, m5 pxor m12, m12 vpermt2d m19, m2, m6 vpermt2d m20, m2, m7 cmp r3d, 0x0f jne .mask_edges movu ym21, [botq+strideq*0-4] vinserti32x8 m21, [botq+strideq*1-4], 1 .main: mova [rsp+64*0], m16 ; top mova [rsp+64*1], m17 ; 0 1 mova [rsp+64*2], m18 ; 2 3 mova [rsp+64*3], m19 ; 4 5 mova [rsp+64*4], m20 ; 6 7 mova [rsp+64*5], m21 ; bottom test prid, prid jz .sec_only lzcnt r4d, prid rorx r3d, prid, 2 vpbroadcastw m13, prim cmp dword r10m, 0xfff ; if (bpc == 12) cmove prid, r3d ; pri >>= 2 mov r3d, dampingm and prid, 4 sub r3d, 31 add r4d, r3d ; pri_shift vpbroadcastw m14, r4d mov r4d, dirm vpbroadcastd m2, [base+pri_taps8+priq*2+0] 
vpbroadcastd m3, [base+pri_taps8+priq*2+4] movsx r5, byte [base+cdef_dirs8+(r4+2)*2+0] ; k0off1 pmaxsw m14, m12 call .constrain mov r5d, secm pmullw m16, m8, m2 pmullw m17, m9, m2 test r5d, r5d jnz .pri_sec movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 call .constrain pmullw m8, m3 pmullw m9, m3 jmp .end_no_clip .pri_sec: lzcnt r5d, r5d add r3d, r5d ; sec_shift movsx r5, byte [base+cdef_dirs8+(r4+2)*2+1] ; k1off1 pminuw m18, m0, m4 pmaxsw m19, m0, m4 pminuw m20, m1, m5 pmaxsw m21, m1, m5 call .min_max_constrain2 movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] ; k0off2 pmullw m8, m3 pmullw m9, m3 vpbroadcastw m13, secm vpbroadcastw m14, r3d paddw m16, m8 paddw m17, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] ; k0off3 mova m2, m8 mova m3, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] ; k1off2 paddw m2, m8 paddw m3, m9 call .min_max_constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] ; k1off3 paddw m2, m2 paddw m3, m3 paddw m16, m8 paddw m17, m9 call .min_max_constrain vpbroadcastd m10, [base+pw_2048] paddw m16, m2 paddw m17, m3 paddw m16, m8 paddw m17, m9 psraw m8, m16, 15 psraw m9, m17, 15 paddw m16, m8 paddw m17, m9 pmulhrsw m16, m10 pmulhrsw m17, m10 pminuw m18, m4 pmaxsw m19, m4 pminuw m20, m5 pmaxsw m21, m5 pminuw m18, m6 pmaxsw m19, m6 pminuw m20, m7 pmaxsw m21, m7 paddw m16, m0 paddw m17, m1 pmaxsw m16, m18 pmaxsw m17, m20 pminsw m16, m19 pminsw m17, m21 jmp .end .sec_only: tzcnt r5d, secm mov r4d, dirm mov r3d, dampingm vpbroadcastw m13, secm sub r3d, r5d movsx r5, byte [base+cdef_dirs8+(r4+0)*2+0] vpbroadcastw m14, r3d call .constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+0] mova m16, m8 mova m17, m9 call .constrain movsx r5, byte [base+cdef_dirs8+(r4+0)*2+1] paddw m16, m8 paddw m17, m9 call .constrain movsx r5, byte [base+cdef_dirs8+(r4+4)*2+1] paddw m16, m16 paddw m17, m17 paddw m16, m8 paddw m17, m9 call .constrain .end_no_clip: vpbroadcastd m10, [base+pw_2048] paddw m16, m8 paddw m17, m9 psraw m8, m16, 15 psraw m9, m17, 15 paddw m16, m8 paddw m17, m9 pmulhrsw m16, m10 pmulhrsw m17, m10 paddw m16, m0 paddw m17, m1 .end: mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm17 vextracti128 [dstq+strideq*1], ym17, 1 vextracti32x4 [dstq+strideq*2], m17, 2 vextracti32x4 [dstq+r2 ], m17, 3 RET .mask_edges: vpbroadcastd m2, [base+pw_m16384] test r3b, 0x08 jz .mask_edges_no_bottom ; avoid buffer overread movu ym21, [botq+strideq*0-4] vinserti32x8 m21, [botq+strideq*1-4], 1 jmp .mask_edges_top .mask_edges_no_bottom: mova m21, m2 .mask_edges_top: test r3b, 0x04 jnz .mask_edges_main mova m16, m2 .mask_edges_main: and r3d, 0x03 cmp r3d, 0x03 je .main kmovw k1, [base+edge_mask8+r3*2] vmovdqa32 m16{k1}, m2 ; edge pixels = -16384 vmovdqa32 m17{k1}, m2 vmovdqa32 m18{k1}, m2 vmovdqa32 m19{k1}, m2 vmovdqa32 m20{k1}, m2 vmovdqa32 m21{k1}, m2 jmp .main ALIGN function_align .min_max_constrain: pminuw m18, m4 pmaxsw m19, m4 pminuw m20, m5 pmaxsw m21, m5 .min_max_constrain2: pminuw m18, m6 pmaxsw m19, m6 pminuw m20, m7 pmaxsw m21, m7 .constrain: %define tmp rsp+gprsize+68 movu m4, [tmp+r5+64*0] vshufi32x4 m4, [tmp+r5+64*1], q2020 ; k0p0 (top) movu m5, [tmp+r5+64*2] vshufi32x4 m5, [tmp+r5+64*3], q2020 ; k0p0 (bottom) neg r5 movu m6, [tmp+r5+64*0] vshufi32x4 m6, [tmp+r5+64*1], q2020 ; k0p1 (top) movu m7, [tmp+r5+64*2] vshufi32x4 m7, [tmp+r5+64*3], q2020 ; k0p1 (bottom) CONSTRAIN m8, m4, m0, m12, m13, m14, m15 
CONSTRAIN m9, m5, m1, m12, m13, m14, m15 CONSTRAIN m10, m6, m0, m12, m13, m14, m15 CONSTRAIN m11, m7, m1, m12, m13, m14, m15 paddw m8, m10 paddw m9, m11 ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef16_sse.asm000064400000000000000000001011331046102023000145060ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; Copyright (c) 2017-2021, The rav1e contributors ; Copyright (c) 2021, Nathan Egge ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA %macro DUP8 1-* %rep %0 times 8 dw %1 %rotate 1 %endrep %endmacro pri_taps: DUP8 4, 2, 3, 3 dir_table: db 1 * 32 + 0, 2 * 32 + 0 db 1 * 32 + 0, 2 * 32 - 2 db -1 * 32 + 2, -2 * 32 + 4 db 0 * 32 + 2, -1 * 32 + 4 db 0 * 32 + 2, 0 * 32 + 4 db 0 * 32 + 2, 1 * 32 + 4 db 1 * 32 + 2, 2 * 32 + 4 db 1 * 32 + 0, 2 * 32 + 2 db 1 * 32 + 0, 2 * 32 + 0 db 1 * 32 + 0, 2 * 32 - 2 db -1 * 32 + 2, -2 * 32 + 4 db 0 * 32 + 2, -1 * 32 + 4 dir_shift: times 4 dw 0x4000 times 4 dw 0x1000 pw_128: times 4 dw 128 pw_2048: times 8 dw 2048 pw_m16384: times 8 dw -16384 cextern cdef_dir_8bpc_ssse3.main cextern cdef_dir_8bpc_sse4.main cextern shufw_6543210x SECTION .text %if ARCH_X86_32 DECLARE_REG_TMP 5, 3 %elif WIN64 DECLARE_REG_TMP 8, 4 %else DECLARE_REG_TMP 8, 6 %endif %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, tmp, pridmp, pri, sec, dir mova m8, [base+pw_2048] %else DEFINE_ARGS dst, pridmp, tmp, sec, pri, _, dir %define m8 [base+pw_2048] %define m9 [rsp+16*1+gprsize] %define m10 [rsp+16*2+gprsize] %endif movifnidn prid, r5m movifnidn secd, r6m test prid, prid jz .sec_only movd m6, r5m %if ARCH_X86_32 mov [rsp+24], pridmpd %endif bsr pridmpd, prid lea tmpd, [priq*4] cmp dword r10m, 0x3ff ; if (bpc == 10) cmove prid, tmpd ; pri <<= 2 mov tmpd, r8m ; damping mov dird, r7m and prid, 16 pshufb m6, m7 ; splat lea dirq, [base+dir_table+dirq*2] lea priq, [base+pri_taps+priq*2] test secd, secd jz .pri_only mova [rsp], m6 movd m6, secd tzcnt secd, secd sub pridmpd, tmpd sub tmpd, secd pshufb m6, m7 xor secd, secd neg pridmpd cmovs pridmpd, secd %if ARCH_X86_32 mov [pri_shift+4], secd mov [sec_shift+4], secd %endif mov [pri_shift+0], pridmpq mov [sec_shift+0], tmpq lea tmpq, [px] %if WIN64 movaps r4m, m9 movaps r6m, m10 
%elif ARCH_X86_32 mov pridmpd, [rsp+24] %endif %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri_sec %endrep %if WIN64 movaps m9, r4m movaps m10, r6m %endif jmp .end .pri_only: sub tmpd, pridmpd cmovs tmpd, secd %if ARCH_X86_32 mov pridmpd, [rsp+24] mov [pri_shift+4], secd %endif mov [pri_shift+0], tmpq lea tmpq, [px] %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).pri %endrep .end: RET .sec_only: mov tmpd, r8m ; damping movd m6, r6m tzcnt secd, secd mov dird, r7m pshufb m6, m7 sub tmpd, secd lea dirq, [base+dir_table+dirq*2] %if ARCH_X86_32 mov [sec_shift+4], prid %endif mov [sec_shift+0], tmpq lea tmpq, [px] %rep %1*%2/8 call mangle(private_prefix %+ _cdef_filter_%1x%1_16bpc %+ SUFFIX).sec %endrep jmp .end %if %1 == %2 %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, tmp, off, pri, _, dir %else DEFINE_ARGS dst, stride, tmp, off, pri, _, dir %endif ALIGN function_align .pri: movsx offq, byte [dirq+4] ; off_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0p0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0p1 movhps m3, [tmpq+offq+32*1] %else mova m1, [dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+5] ; off_k1 psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0p1 pminsw m0, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m0, m2 ; constrain(diff_k0p0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1p0 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1p1 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 ; adiff_k1p0 paddw m0, m2 ; constrain(diff_k0) psrlw m2, m3, [pri_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, m6, m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m0, [priq+16*0] ; pri_tap_k0 pmullw m7, [priq+16*1] ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, strideq %endif ret ALIGN function_align .sec: movsx offq, byte [dirq+8] ; off1_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0s1 movhps m3, [tmpq+offq+32*1] %else mova m1, [dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+0] ; off2_k0 psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k0s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k0s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+9] ; off1_k1 psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 psignw m2, m3 ; constrain(diff_k0s1) pabsw m3, m4 ; adiff_k0s2 paddw m0, m2 psrlw m2, m3, 
[sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k1s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k1s1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+1] ; off2_k1 paddw m0, m7 psignw m4, m5 ; constrain(diff_k0s3) paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif paddw m0, m7 psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 psignw m4, m5 ; constrain(diff_k1s3) paddw m0, m4 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, strideq %endif ret ALIGN function_align .pri_sec: movsx offq, byte [dirq+8] ; off2_k0 %if %1 == 4 movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] movq m2, [tmpq+offq+32*0] ; k0s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0s1 movhps m3, [tmpq+offq+32*1] %else mova m1, [dstq] movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+0] ; off3_k0 pabsw m4, m2 %if ARCH_X86_64 pabsw m10, m3 pmaxsw m9, m2, m3 pminsw m10, m4 %else pabsw m7, m3 pmaxsw m5, m2, m3 pminsw m4, m7 mova m9, m5 mova m10, m4 %endif psubw m2, m1 ; diff_k0s0 psubw m3, m1 ; diff_k0s1 pabsw m4, m2 ; adiff_k0s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m0, m6, m5 pabsw m5, m3 ; adiff_k0s1 pminsw m0, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m0, m2 ; constrain(diff_k0s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k0s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k0s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+9] ; off2_k1 pabsw m7, m4 psignw m2, m3 pabsw m3, m5 ; constrain(diff_k0s1) %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m7 pmaxsw m9, m5 pminsw m10, m3 %else pminsw m7, m10 pminsw m7, m3 pmaxsw m3, m9, m4 pmaxsw m3, m5 mova m10, m7 mova m9, m3 %endif psubw m4, m1 ; diff_k0s2 psubw m5, m1 ; diff_k0s3 paddw m0, m2 pabsw m3, m4 ; adiff_k0s2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k0s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k0s2) psubusw m4, m6, m3 pminsw m4, m2 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k1s0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k1s1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+1] ; off3_k1 paddw m0, m7 pabsw m7, m2 psignw m4, m5 ; constrain(diff_k0s3) pabsw m5, m3 %if 
ARCH_X86_64 pmaxsw m9, m2 pminsw m10, m7 pmaxsw m9, m3 pminsw m10, m5 %else pminsw m7, m10 pminsw m7, m5 pmaxsw m5, m9, m2 pmaxsw m5, m3 mova m10, m7 mova m9, m5 %endif paddw m0, m4 ; constrain(diff_k0) psubw m2, m1 ; diff_k1s0 psubw m3, m1 ; diff_k1s1 paddw m0, m0 ; sec_tap_k0 pabsw m4, m2 ; adiff_k1s0 psrlw m5, m4, [sec_shift+gprsize] psubusw m7, m6, m5 pabsw m5, m3 ; adiff_k1s1 pminsw m7, m4 psrlw m4, m5, [sec_shift+gprsize] psignw m7, m2 ; constrain(diff_k1s0) psubusw m2, m6, m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1s2 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1s3 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif movsx offq, byte [dirq+4] ; off1_k0 paddw m0, m7 pabsw m7, m4 psignw m2, m3 ; constrain(diff_k1s1) pabsw m3, m5 %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m7 pmaxsw m9, m5 pminsw m10, m3 %else pminsw m7, m10 pminsw m7, m3 pmaxsw m3, m9, m4 pmaxsw m3, m5 mova m10, m7 mova m9, m3 %endif psubw m4, m1 ; diff_k1s2 psubw m5, m1 ; diff_k1s3 pabsw m3, m4 ; adiff_k1s2 paddw m0, m2 psrlw m2, m3, [sec_shift+gprsize] psubusw m7, m6, m2 pabsw m2, m5 ; adiff_k1s3 pminsw m7, m3 psrlw m3, m2, [sec_shift+gprsize] psignw m7, m4 ; constrain(diff_k1s2) psubusw m4, m6, m3 pminsw m4, m2 paddw m0, m7 %if %1 == 4 movq m2, [tmpq+offq+32*0] ; k0p0 movhps m2, [tmpq+offq+32*1] neg offq movq m3, [tmpq+offq+32*0] ; k0p1 movhps m3, [tmpq+offq+32*1] %else movu m2, [tmpq+offq] neg offq movu m3, [tmpq+offq] %endif movsx offq, byte [dirq+5] ; off1_k1 pabsw m7, m2 psignw m4, m5 ; constrain(diff_k1s3) pabsw m5, m3 %if ARCH_X86_64 pmaxsw m9, m2 pminsw m10, m7 pmaxsw m9, m3 pminsw m10, m5 %else pminsw m7, m10 pminsw m7, m5 pmaxsw m5, m9, m2 pmaxsw m5, m3 mova m10, m7 mova m9, m5 %endif psubw m2, m1 ; diff_k0p0 psubw m3, m1 ; diff_k0p1 paddw m0, m4 pabsw m4, m2 ; adiff_k0p0 psrlw m5, m4, [pri_shift+gprsize] psubusw m7, [rsp+gprsize], m5 pabsw m5, m3 ; adiff_k0p1 pminsw m7, m4 psrlw m4, m5, [pri_shift+gprsize] psignw m7, m2 ; constrain(diff_k0p0) psubusw m2, [rsp+gprsize], m4 pminsw m2, m5 %if %1 == 4 movq m4, [tmpq+offq+32*0] ; k1p0 movhps m4, [tmpq+offq+32*1] neg offq movq m5, [tmpq+offq+32*0] ; k1p1 movhps m5, [tmpq+offq+32*1] %else movu m4, [tmpq+offq] neg offq movu m5, [tmpq+offq] %endif psignw m2, m3 ; constrain(diff_k0p1) pabsw m3, m4 paddw m7, m2 ; constrain(diff_k0) pabsw m2, m5 %if ARCH_X86_64 pmaxsw m9, m4 pminsw m10, m3 pmaxsw m9, m5 pminsw m10, m2 %else pminsw m3, m10 pminsw m3, m2 pmaxsw m2, m9, m4 pmaxsw m2, m5 mova m10, m3 mova m9, m2 %endif psubw m4, m1 ; diff_k1p0 psubw m5, m1 ; diff_k1p1 pabsw m3, m4 ; adiff_k1p0 pmullw m7, [priq+16*0] ; pri_tap_k0 paddw m0, m7 psrlw m2, m3, [pri_shift+gprsize] psubusw m7, [rsp+16*0+gprsize], m2 pabsw m2, m5 ; adiff_k1p1 pminsw m7, m3 psrlw m3, m2, [pri_shift+gprsize] psignw m7, m4 ; constrain(diff_k1p0) psubusw m4, [rsp+16*0+gprsize], m3 pminsw m4, m2 psignw m4, m5 ; constrain(diff_k1p1) paddw m7, m4 ; constrain(diff_k1) pmullw m7, [priq+16*1] ; pri_tap_k1 paddw m0, m7 ; sum psraw m2, m0, 15 paddw m0, m2 pmulhrsw m0, m8 paddw m0, m1 %if ARCH_X86_64 pmaxsw m9, m1 pminsw m0, m9 %else pmaxsw m2, m9, m1 pminsw m0, m2 %endif pminsw m1, m10 pmaxsw m0, m1 %if %1 == 4 add tmpq, 32*2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] %else add tmpq, 32 mova [dstq], m0 add dstq, strideq %endif ret %endif %endmacro INIT_XMM ssse3 %if ARCH_X86_64 cglobal cdef_filter_4x4_16bpc, 5, 9, 9, 32*10, dst, stride, left, top, bot, \ pri, sec, edge %define px rsp+32*4 %else 
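; Staging-buffer note (a sketch inferred from the stores below, not
; normative): px holds the block in 16-bit form, one 32-byte row per line,
; with two padding rows above and below, two extra pixels before each row
; (left) and past its end (right). Neighbouring rows or columns that are
; unavailable (no top/bottom/left/right edge) are filled with the
; pw_m16384 sentinel instead.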
cglobal cdef_filter_4x4_16bpc, 2, 7, 8, -32*11, dst, stride, edge, top, left %define botq topq %define px rsp+32*5 %endif %define base t0-dir_table %define pri_shift px-16*6 %define sec_shift px-16*5 mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea t1, [dstq+strideq*2] movu m2, [t1 +strideq*0] movu m3, [t1 +strideq*1] movddup m7, [base+pw_m16384] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 test edgeb, 4 ; HAVE_TOP jz .no_top movifnidn topq, topmp movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], m7 movd [px-32*1-4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*4+0], m0 mova [px+32*5+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*4-4], m0 movd [px+32*5-4], m1 jmp .bottom_done .no_bottom: mova [px+32*4+0], m7 mova [px+32*5+0], m7 .bottom_no_left: movd [px+32*4-4], m7 movd [px+32*5-4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0-4], m0 movd [px+32*1-4], m1 movd [px+32*2-4], m2 movd [px+32*3-4], m3 jmp .left_done .no_left: REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5 .padding_done: CDEF_FILTER 4, 4 %if ARCH_X86_64 cglobal cdef_filter_4x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ pri, sec, edge %else cglobal cdef_filter_4x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif mov edged, r9m LEA t0, dir_table movu m0, [dstq+strideq*0] movu m1, [dstq+strideq*1] lea t1, [dstq+strideq*2] movu m2, [t1 +strideq*0] movu m3, [t1 +strideq*1] lea t1, [t1 +strideq*2] movu m4, [t1 +strideq*0] movu m5, [t1 +strideq*1] lea t1, [t1 +strideq*2] movu m6, [t1 +strideq*0] movu m7, [t1 +strideq*1] mova [px+32*0+0], m0 mova [px+32*1+0], m1 mova [px+32*2+0], m2 mova [px+32*3+0], m3 mova [px+32*4+0], m4 mova [px+32*5+0], m5 mova [px+32*6+0], m6 mova [px+32*7+0], m7 movddup m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movifnidn topq, topmp movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] mova [px-32*2+0], m0 mova [px-32*1+0], m1 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+0], m7 mova [px-32*1+0], m7 .top_no_left: movd [px-32*2-4], m7 movd [px-32*1-4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] mova [px+32*8+0], m0 mova [px+32*9+0], m1 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*8-4], m0 movd [px+32*9-4], m1 jmp .bottom_done .no_bottom: mova [px+32*8+0], m7 mova [px+32*9+0], m7 .bottom_no_left: movd [px+32*8-4], m7 movd [px+32*9-4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0-4], m0 movd [px+32*1-4], m1 movd [px+32*2-4], 
m2 movd [px+32*3-4], m3 movd m0, [leftq+4*4] movd m1, [leftq+4*5] movd m2, [leftq+4*6] movd m3, [leftq+4*7] movd [px+32*4-4], m0 movd [px+32*5-4], m1 movd [px+32*6-4], m2 movd [px+32*7-4], m3 jmp .left_done .no_left: REPX {movd [px+32*x-4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+8], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 4, 8 %if ARCH_X86_64 cglobal cdef_filter_8x8_16bpc, 5, 9, 9, 32*14, dst, stride, left, top, bot, \ pri, sec, edge %else cglobal cdef_filter_8x8_16bpc, 2, 7, 8, -32*15, dst, stride, edge, top, left %endif mov edged, r9m LEA t0, dir_table mova m0, [dstq+strideq*0+ 0] movd m1, [dstq+strideq*0+16] mova m2, [dstq+strideq*1+ 0] movd m3, [dstq+strideq*1+16] lea t1, [dstq+strideq*2] mova m4, [t1 +strideq*0+ 0] movd m5, [t1 +strideq*0+16] mova m6, [t1 +strideq*1+ 0] movd m7, [t1 +strideq*1+16] lea t1, [t1 +strideq*2] mova [px+32*0+ 0], m0 movd [px+32*0+16], m1 mova [px+32*1+ 0], m2 movd [px+32*1+16], m3 mova [px+32*2+ 0], m4 movd [px+32*2+16], m5 mova [px+32*3+ 0], m6 movd [px+32*3+16], m7 mova m0, [t1 +strideq*0+ 0] movd m1, [t1 +strideq*0+16] mova m2, [t1 +strideq*1+ 0] movd m3, [t1 +strideq*1+16] lea t1, [t1 +strideq*2] mova m4, [t1 +strideq*0+ 0] movd m5, [t1 +strideq*0+16] mova m6, [t1 +strideq*1+ 0] movd m7, [t1 +strideq*1+16] mova [px+32*4+ 0], m0 movd [px+32*4+16], m1 mova [px+32*5+ 0], m2 movd [px+32*5+16], m3 mova [px+32*6+ 0], m4 movd [px+32*6+16], m5 mova [px+32*7+ 0], m6 movd [px+32*7+16], m7 movddup m7, [base+pw_m16384] test edgeb, 4 ; HAVE_TOP jz .no_top movifnidn topq, topmp mova m0, [topq+strideq*0+ 0] mova m1, [topq+strideq*0+16] mova m2, [topq+strideq*1+ 0] mova m3, [topq+strideq*1+16] mova [px-32*2+ 0], m0 movd [px-32*2+16], m1 mova [px-32*1+ 0], m2 movd [px-32*1+16], m3 test edgeb, 1 ; HAVE_LEFT jz .top_no_left movd m0, [topq+strideq*0-4] movd m1, [topq+strideq*1-4] movd [px-32*2-4], m0 movd [px-32*1-4], m1 jmp .top_done .no_top: mova [px-32*2+ 0], m7 movd [px-32*2+16], m7 mova [px-32*1+ 0], m7 movd [px-32*1+16], m7 .top_no_left: movd [px-32*2- 4], m7 movd [px-32*1- 4], m7 .top_done: test edgeb, 8 ; HAVE_BOTTOM jz .no_bottom movifnidn botq, r4mp mova m0, [botq+strideq*0+ 0] movd m1, [botq+strideq*0+16] mova m2, [botq+strideq*1+ 0] movd m3, [botq+strideq*1+16] mova [px+32*8+ 0], m0 movd [px+32*8+16], m1 mova [px+32*9+ 0], m2 movd [px+32*9+16], m3 test edgeb, 1 ; HAVE_LEFT jz .bottom_no_left movd m0, [botq+strideq*0-4] movd m1, [botq+strideq*1-4] movd [px+32*8- 4], m0 movd [px+32*9- 4], m1 jmp .bottom_done .no_bottom: mova [px+32*8+ 0], m7 movd [px+32*8+16], m7 mova [px+32*9+ 0], m7 movd [px+32*9+16], m7 .bottom_no_left: movd [px+32*8- 4], m7 movd [px+32*9- 4], m7 .bottom_done: test edgeb, 1 ; HAVE_LEFT jz .no_left movifnidn leftq, r2mp movd m0, [leftq+4*0] movd m1, [leftq+4*1] movd m2, [leftq+4*2] movd m3, [leftq+4*3] movd [px+32*0- 4], m0 movd [px+32*1- 4], m1 movd [px+32*2- 4], m2 movd [px+32*3- 4], m3 movd m0, [leftq+4*4] movd m1, [leftq+4*5] movd m2, [leftq+4*6] movd m3, [leftq+4*7] movd [px+32*4- 4], m0 movd [px+32*5- 4], m1 movd [px+32*6- 4], m2 movd [px+32*7- 4], m3 jmp .left_done .no_left: REPX {movd [px+32*x- 4], m7}, 0, 1, 2, 3, 4, 5, 6, 7 .left_done: test edgeb, 2 ; HAVE_RIGHT jnz .padding_done REPX {movd [px+32*x+16], m7}, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 .padding_done: CDEF_FILTER 8, 8 %macro CDEF_DIR 0 %if ARCH_X86_64 cglobal cdef_dir_16bpc, 4, 7, 16, src, stride, var, bdmax lea r6, [dir_shift] shr bdmaxd, 11 ; 0 for 10bpc, 1 for 12bpc movddup m7, 
[r6+bdmaxq*8] lea r6, [strideq*3] mova m0, [srcq+strideq*0] mova m1, [srcq+strideq*1] mova m2, [srcq+strideq*2] mova m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] mova m4, [srcq+strideq*0] mova m5, [srcq+strideq*1] mova m6, [srcq+strideq*2] REPX {pmulhuw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhuw m7, [srcq+r6 ] pxor m8, m8 packuswb m9, m0, m1 packuswb m10, m2, m3 packuswb m11, m4, m5 packuswb m12, m6, m7 REPX {psadbw x, m8}, m9, m10, m11, m12 packssdw m9, m10 packssdw m11, m12 packssdw m9, m11 jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %else cglobal cdef_dir_16bpc, 2, 4, 8, 96, src, stride, var, bdmax mov bdmaxd, bdmaxm LEA r2, dir_shift shr bdmaxd, 11 movddup m7, [r2+bdmaxq*8] lea r3, [strideq*3] pmulhuw m3, m7, [srcq+strideq*0] pmulhuw m4, m7, [srcq+strideq*1] pmulhuw m5, m7, [srcq+strideq*2] pmulhuw m6, m7, [srcq+r3 ] movddup m1, [r2-dir_shift+pw_128] lea srcq, [srcq+strideq*4] pxor m0, m0 packuswb m2, m3, m4 psubw m3, m1 psubw m4, m1 mova [esp+0x00], m3 mova [esp+0x10], m4 packuswb m3, m5, m6 psadbw m2, m0 psadbw m3, m0 psubw m5, m1 psubw m6, m1 packssdw m2, m3 mova [esp+0x20], m5 mova [esp+0x50], m6 pmulhuw m4, m7, [srcq+strideq*0] pmulhuw m5, m7, [srcq+strideq*1] pmulhuw m6, m7, [srcq+strideq*2] pmulhuw m7, [srcq+r3 ] packuswb m3, m4, m5 packuswb m1, m6, m7 psadbw m3, m0 psadbw m1, m0 packssdw m3, m1 movddup m1, [r2-dir_shift+pw_128] LEA r2, shufw_6543210x jmp mangle(private_prefix %+ _cdef_dir_8bpc %+ SUFFIX).main %endif %endmacro INIT_XMM ssse3 CDEF_DIR INIT_XMM sse4 CDEF_DIR rav1e-0.7.1/src/x86/cdef_avx2.asm000064400000000000000000001622651046102023000144420ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
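; Note on the jump tables defined below (a reading aid, not normative): each
; CDEF_FILTER_JMP_TABLE entry is a 32-bit offset of a per-direction tap
; loader (.dXkY) relative to the table base, with the leading and trailing
; entries repeating the wrap-around directions so dir-2/dir+2 lookups need
; no masking. ACCUMULATE_TAP_BYTE turns an entry back into a pointer with:
;   movsxd dirjmpq, [dirq+kq*4+off*2*4]  ; sign-extended relative offset
;   add    dirjmpq, tableq               ; rebase to an absolute address
;   call   dirjmpq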
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 %macro JMP_TABLE 2-* %xdefine %1_jmptable %%table %xdefine %%base mangle(private_prefix %+ _%1_avx2) %%table: %rep %0 - 1 dd %%base %+ .%2 - %%table %rotate 1 %endrep %endmacro %macro CDEF_FILTER_JMP_TABLE 1 JMP_TABLE cdef_filter_%1_8bpc, \ d6k0, d6k1, d7k0, d7k1, \ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \ d0k0, d0k1, d1k0, d1k1 %endmacro SECTION_RODATA 32 pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 dd 0x80, 0x00, 0x00 blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 dd 0x00, 0x00 blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 dd 0x0000 blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 dd 0x0000, 0x0000 blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 tap_table: ; masks for 8 bit shifts db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 ; weights db 4, 2, 3, 3, 2, 1 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 db 1 * 16 + 0, 2 * 16 + 0 db 1 * 16 + 0, 2 * 16 - 1 ; the last 6 are repeats of the first 6 so we don't need to & 7 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 CDEF_FILTER_JMP_TABLE 4x4 CDEF_FILTER_JMP_TABLE 4x8 CDEF_FILTER_JMP_TABLE 8x8 SECTION .text %macro PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] mov dird, r7m lea tableq, [cdef_filter_%1x%2_8bpc_jmptable] lea dirq, [tableq+dirq*2*4] %if %1 == 4 %if %2 == 4 DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ table, dir, dirjmp, stride3, k %else DEFINE_ARGS dst, stride, left, top, bot, pri, sec, \ table, dir, dirjmp, dst4, stride3, k lea dst4q, [dstq+strideq*4] %endif %else DEFINE_ARGS dst, stride, h, top1, bot, pri, sec, \ table, dir, dirjmp, top2, stride3, k mov hq, -8 lea top1q, [top1q+strideq*0] lea top2q, [top1q+strideq*1] %endif %if %1 == 4 lea stride3q, [strideq*3] %endif %endmacro %macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max mov kd, 1 pxor m15, m15 ; sum %if %2 == 8 pxor m12, m12 %if %1 == 4 movd xm4, [dstq +strideq*0] movd xm6, [dstq +strideq*1] movd xm5, [dstq +strideq*2] movd xm7, [dstq +stride3q ] vinserti128 m4, [dst4q+strideq*0], 1 vinserti128 m6, [dst4q+strideq*1], 1 vinserti128 m5, [dst4q+strideq*2], 1 vinserti128 m7, [dst4q+stride3q ], 1 punpckldq m4, m6 punpckldq m5, m7 %else movq xm4, [dstq+strideq*0] movq xm5, [dstq+strideq*1] vinserti128 m4, [dstq+strideq*2], 1 vinserti128 m5, [dstq+stride3q ], 1 %endif punpcklqdq m4, m5 %else movd xm4, [dstq+strideq*0] movd xm5, [dstq+strideq*1] vinserti128 m4, [dstq+strideq*2], 1 vinserti128 m5, [dstq+stride3q ], 1 punpckldq m4, m5 %endif %if %3 == 1 mova m7, m4 ; min mova m8, m4 ; max %endif %endmacro %macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength ; mul_tap, w, h, clip ; load p0/p1 movsxd dirjmpq, [dirq+kq*4+%1*2*4] add dirjmpq, tableq call dirjmpq 
%if %8 == 1 pmaxub m7, m5 pminub m8, m5 pmaxub m7, m6 pminub m8, m6 %endif ; accumulate sum[m15] over p0/p1 %if %7 == 4 punpcklbw m5, m6 punpcklbw m6, m4, m4 psubusb m9, m5, m6 psubusb m5, m6, m5 por m9, m5 ; abs_diff_p01(p01 - px) pcmpeqb m5, m9 por m5, %5 psignb m6, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 psubusb m5, %4, m5 pminub m5, m9 pmaddubsw m5, m6 paddw m15, m5 %else psubusb m9, m5, m4 psubusb m5, m4, m5 psubusb m11, m6, m4 psubusb m6, m4, m6 por m9, m5 ; abs_diff_p0(p0 - px) por m11, m6 ; abs_diff_p1(p1 - px) pcmpeqb m5, m9 pcmpeqb m6, m11 punpckhbw m10, m9, m11 punpcklbw m9, m11 por m5, %5 por m11, m6, %5 punpckhbw m6, m5, m11 punpcklbw m5, m11 psignb m11, %5, m6 psrlw m6, m10, %2 ; emulate 8-bit shift pand m6, %3 psubusb m6, %4, m6 pminub m6, m10 pmaddubsw m6, m11 paddw m12, m6 psignb m11, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 psubusb m5, %4, m5 pminub m5, m9 pmaddubsw m5, m11 paddw m15, m5 %endif %endmacro %macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip %if %2 == 4 %if %5 == 1 punpcklbw m4, %3 %endif pcmpgtw %3, m15 paddw m15, %3 pmulhrsw m15, %4 %if %5 == 0 packsswb m15, m15 paddb m4, m15 %else paddw m4, m15 packuswb m4, m4 ; clip px in [0x0,0xff] pminub m4, m7 pmaxub m4, m8 %endif vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 movd [dstq+strideq*2], xm5 pextrd [dstq+strideq*1], xm4, 1 pextrd [dstq+stride3q ], xm5, 1 %else pcmpgtw m6, %3, m12 pcmpgtw m5, %3, m15 paddw m12, m6 paddw m15, m5 %if %5 == 1 punpckhbw m5, m4, %3 punpcklbw m4, %3 %endif pmulhrsw m12, %4 pmulhrsw m15, %4 %if %5 == 0 packsswb m15, m12 paddb m4, m15 %else paddw m5, m12 paddw m4, m15 packuswb m4, m5 ; clip px in [0x0,0xff] pminub m4, m7 pmaxub m4, m8 %endif vextracti128 xm5, m4, 1 %if %1 == 4 movd [dstq +strideq*0], xm4 movd [dst4q+strideq*0], xm5 pextrd [dstq +strideq*1], xm4, 1 pextrd [dst4q+strideq*1], xm5, 1 pextrd [dstq +strideq*2], xm4, 2 pextrd [dst4q+strideq*2], xm5, 2 pextrd [dstq +stride3q ], xm4, 3 pextrd [dst4q+stride3q ], xm5, 3 %else movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm5 movhps [dstq+strideq*1], xm4 movhps [dstq+stride3q ], xm5 %endif %endif %endmacro %macro BORDER_PREP_REGS 2 ; w, h ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k] mov dird, r7m lea dirq, [tableq+dirq*2+14] %if %1*%2*2/mmsize > 1 %if %1 == 4 DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, h, off %else DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, h, off %endif mov hd, %1*%2*2/mmsize %else DEFINE_ARGS dst, stride, k, dir, stk, pri, sec, stride3, off %endif lea stkq, [px] pxor m11, m11 %endmacro %macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max mov kd, 1 %if %1 == 4 movq xm4, [stkq+32*0] movhps xm4, [stkq+32*1] movq xm5, [stkq+32*2] movhps xm5, [stkq+32*3] vinserti128 m4, xm5, 1 %else mova xm4, [stkq+32*0] ; px vinserti128 m4, [stkq+32*1], 1 %endif pxor m15, m15 ; sum %if %3 == 1 mova m7, m4 ; max mova m8, m4 ; min %endif %endmacro %macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength ; mul_tap, w, clip ; load p0/p1 movsx offq, byte [dirq+kq+%1] ; off1 %if %6 == 4 movq xm5, [stkq+offq*2+32*0] ; p0 movq xm6, [stkq+offq*2+32*2] movhps xm5, [stkq+offq*2+32*1] movhps xm6, [stkq+offq*2+32*3] vinserti128 m5, xm6, 1 %else movu xm5, [stkq+offq*2+32*0] ; p0 vinserti128 m5, [stkq+offq*2+32*1], 1 %endif neg offq ; -off1 %if %6 == 4 movq xm6, [stkq+offq*2+32*0] ; p1 movq xm9, [stkq+offq*2+32*2] movhps xm6, [stkq+offq*2+32*1] movhps xm9, [stkq+offq*2+32*3] vinserti128 m6, xm9, 1 %else movu xm6, [stkq+offq*2+32*0] ; p1 vinserti128 m6, 
[stkq+offq*2+32*1], 1 %endif %if %7 == 1 ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. ; use signed max and unsigned min to remove them pmaxsw m7, m5 ; max after p0 pminuw m8, m5 ; min after p0 pmaxsw m7, m6 ; max after p1 pminuw m8, m6 ; min after p1 %endif ; accumulate sum[m15] over p0/p1 ; calculate difference before converting psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) ; convert to 8-bits with signed saturation ; saturating to large diffs has no impact on the results packsswb m5, m6 ; group into pairs so we can accumulate using maddubsw pshufb m5, m12 pabsb m9, m5 psignb m10, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 psubusb m5, %4, m5 ; use unsigned min since abs diff can equal 0x80 pminub m5, m9 pmaddubsw m5, m10 paddw m15, m5 %endmacro %macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip pcmpgtw m9, m11, m15 paddw m15, m9 pmulhrsw m15, %2 paddw m4, m15 %if %3 == 1 pminsw m4, m7 pmaxsw m4, m8 %endif packuswb m4, m4 vextracti128 xm5, m4, 1 %if %1 == 4 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 movd [dstq+strideq*2], xm5 pextrd [dstq+stride3q ], xm5, 1 %else movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 %endif %endmacro %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 cglobal cdef_filter_%1x%2_8bpc, 5, 11, 0, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %assign stack_offset_entry stack_offset mov edged, edgem cmp edged, 0xf jne .border_block PUSH r11 PUSH r12 %if %2 == 4 %assign regs_used 13 ALLOC_STACK 0x60, 16 pmovzxbw xm0, [leftq+1] vpermq m0, m0, q0110 psrldq m1, m0, 4 vpalignr m2, m0, m0, 12 movu [rsp+0x10], m0 movu [rsp+0x28], m1 movu [rsp+0x40], m2 %elif %1 == 4 %assign regs_used 14 PUSH r13 ALLOC_STACK 8*2+%1*%2*1, 16 pmovzxwd m0, [leftq] mova [rsp+0x10], m0 %else %assign regs_used 15 PUSH r13 PUSH r14 ALLOC_STACK 8*4+%1*%2*2+32, 16 lea r11, [strideq*3] movu xm4, [dstq+strideq*2] pmovzxwq m0, [leftq+0] pmovzxwq m1, [leftq+8] vinserti128 m4, [dstq+r11], 1 pmovzxbd m2, [leftq+1] pmovzxbd m3, [leftq+9] mov [rsp+16], botq mova [rsp+0x20], m0 mova [rsp+0x40], m1 mova [rsp+0x60], m2 mova [rsp+0x80], m3 mova [rsp+0xa0], m4 lea botq, [dstq+strideq*4] %endif DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, zero, pridmp, damping mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm test prid, prid jz .sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift test secdmpd, secdmpd jz .pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, left, top, bot, pri, secdmp, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, pri, sec, table, dir vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps PREP_REGS %1, %2 %if %1*%2 > mmsize .v_loop: %endif LOAD_BLOCK %1, %2, 1 .k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0 ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2 ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2 dec kq jge .k_loop vpbroadcastd m10, [pw_2048] pxor m9, m9 ADJUST_PIXEL %1, %2, 
m9, m10, 1 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .v_loop %endif RET .pri_only: DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, pridmp lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, pri, _, table, dir vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps PREP_REGS %1, %2 vpbroadcastd m3, [pw_2048] pxor m1, m1 %if %1*%2 > mmsize .pri_v_loop: %endif LOAD_BLOCK %1, %2 .pri_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0 dec kq jge .pri_k_loop ADJUST_PIXEL %1, %2, m1, m3 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .pri_v_loop %endif RET .sec_only: DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, zero, _, damping movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, left, top, bot, _, secdmp, table lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, left, top, bot, _, sec, table, dir vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps PREP_REGS %1, %2 vpbroadcastd m2, [pw_2048] pxor m0, m0 %if %1*%2 > mmsize .sec_v_loop: %endif LOAD_BLOCK %1, %2 .sec_k_loop: vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2 ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2 dec kq jge .sec_k_loop ADJUST_PIXEL %1, %2, m0, m2 %if %1*%2 > mmsize lea dstq, [dstq+strideq*4] lea top1q, [rsp+0xa0] lea top2q, [rsp+0xb0] mov botq, [rsp+16] add hq, 4 jl .sec_v_loop %endif RET .d0k0: %if %1 == 4 %if %2 == 4 vpbroadcastq m6, [dstq+strideq*1-1] vpbroadcastq m10, [dstq+strideq*2-1] movd xm5, [topq+strideq*1+1] movd xm9, [dstq+strideq*0+1] psrldq m11, m6, 2 psrldq m12, m10, 2 vinserti128 m6, [dstq+stride3q -1], 1 vinserti128 m10, [botq -1], 1 vpblendd m5, m11, 0x10 vpblendd m9, m12, 0x10 movu m11, [blend_4x4+16] punpckldq m6, m10 punpckldq m5, m9 vpblendvb m6, [rsp+gprsize+0x28], m11 %else movd xm5, [topq +strideq*1+1] movq xm6, [dstq +strideq*1-1] movq xm10, [dstq +stride3q -1] movq xm11, [dst4q+strideq*1-1] pinsrd xm5, [dstq +strideq*0+1], 1 movhps xm6, [dstq +strideq*2-1] movhps xm10, [dst4q+strideq*0-1] movhps xm11, [dst4q+strideq*2-1] psrldq xm9, xm6, 2 shufps xm5, xm9, q2010 ; -1 +0 +1 +2 shufps xm6, xm10, q2020 ; +1 +2 +3 +4 psrldq xm9, xm11, 2 psrldq xm10, 2 shufps xm10, xm9, q2020 ; +3 +4 +5 +6 movd xm9, [dst4q+stride3q -1] pinsrd xm9, [botq -1], 1 shufps xm11, xm9, q1020 ; +5 +6 +7 +8 pmovzxbw m9, [leftq+3] vinserti128 m6, xm11, 1 movu m11, [blend_4x8_0+4] vinserti128 m5, xm10, 1 vpblendvb m6, m9, m11 %endif %else lea r13, [blend_8x8_0+16] movq xm5, [top2q +1] vbroadcasti128 m10, [dstq+strideq*1-1] vbroadcasti128 m11, [dstq+strideq*2-1] movhps xm5, [dstq+strideq*0+1] vinserti128 m6, m10, [dstq+stride3q-1], 1 vinserti128 m9, m11, [botq -1], 1 psrldq m10, 2 psrldq m11, 2 punpcklqdq m6, m9 movu m9, [r13+hq*2*1+16*1] punpcklqdq m10, m11 vpblendd m5, m10, 0xF0 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64+8*1], m9 %endif ret .d1k0: .d2k0: .d3k0: %if %1 == 4 %if %2 == 4 movq xm6, [dstq+strideq*0-1] movq xm9, [dstq+strideq*1-1] vinserti128 m6, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 movu m11, [rsp+gprsize+0x10] pcmpeqd m12, m12 psrldq 
m5, m6, 2 psrldq m10, m9, 2 psrld m12, 24 punpckldq m6, m9 punpckldq m5, m10 vpblendvb m6, m11, m12 %else movq xm6, [dstq +strideq*0-1] movq xm9, [dstq +strideq*2-1] movhps xm6, [dstq +strideq*1-1] movhps xm9, [dstq +stride3q -1] movq xm10, [dst4q+strideq*0-1] movhps xm10, [dst4q+strideq*1-1] psrldq xm5, xm6, 2 psrldq xm11, xm9, 2 shufps xm5, xm11, q2020 movq xm11, [dst4q+strideq*2-1] movhps xm11, [dst4q+stride3q -1] shufps xm6, xm9, q2020 shufps xm9, xm10, xm11, q2020 vinserti128 m6, xm9, 1 pmovzxbw m9, [leftq+1] psrldq xm10, 2 psrldq xm11, 2 shufps xm10, xm11, q2020 vpbroadcastd m11, [blend_4x8_0+4] vinserti128 m5, xm10, 1 vpblendvb m6, m9, m11 %endif %else movu xm5, [dstq+strideq*0-1] movu xm9, [dstq+strideq*1-1] vinserti128 m5, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 movu m10, [blend_8x8_0+16] punpcklqdq m6, m5, m9 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64], m10 psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 %endif ret .d4k0: %if %1 == 4 %if %2 == 4 vpbroadcastq m10, [dstq+strideq*1-1] vpbroadcastq m11, [dstq+strideq*2-1] movd xm6, [topq+strideq*1-1] movd xm9, [dstq+strideq*0-1] psrldq m5, m10, 2 psrldq m12, m11, 2 vpblendd m6, m10, 0x10 vpblendd m9, m11, 0x10 movu m10, [blend_4x4] vinserti128 m5, [dstq+stride3q +1], 1 vinserti128 m12, [botq +1], 1 punpckldq m6, m9 punpckldq m5, m12 vpblendvb m6, [rsp+gprsize+0x40], m10 %else movd xm6, [topq +strideq*1-1] movq xm9, [dstq +strideq*1-1] movq xm10, [dstq +stride3q -1] movq xm11, [dst4q+strideq*1-1] pinsrd xm6, [dstq +strideq*0-1], 1 movhps xm9, [dstq +strideq*2-1] movhps xm10, [dst4q+strideq*0-1] movhps xm11, [dst4q+strideq*2-1] psrldq xm5, xm9, 2 shufps xm6, xm9, q2010 psrldq xm9, xm10, 2 shufps xm5, xm9, q2020 shufps xm10, xm11, q2020 movd xm9, [dst4q+stride3q +1] vinserti128 m6, xm10, 1 pinsrd xm9, [botq +1], 1 psrldq xm11, 2 pmovzxbw m10, [leftq-1] shufps xm11, xm9, q1020 movu m9, [blend_4x8_0] vinserti128 m5, xm11, 1 vpblendvb m6, m10, m9 %endif %else lea r13, [blend_8x8_0+8] movq xm6, [top2q -1] vbroadcasti128 m5, [dstq+strideq*1-1] vbroadcasti128 m9, [dstq+strideq*2-1] movhps xm6, [dstq+strideq*0-1] movu m11, [r13+hq*2*1+16*1] punpcklqdq m10, m5, m9 vinserti128 m5, [dstq+stride3q -1], 1 vinserti128 m9, [botq -1], 1 vpblendd m6, m10, 0xF0 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*1], m11 psrldq m5, 2 psrldq m9, 2 punpcklqdq m5, m9 %endif ret .d5k0: .d6k0: .d7k0: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*1 ] vpbroadcastd m5, [dstq+strideq*1 ] vpbroadcastd m9, [dstq+strideq*2 ] vpblendd xm6, [dstq+strideq*0-4], 0x2 vpblendd m5, m9, 0x22 vpblendd m6, m5, 0x30 vinserti128 m5, [dstq+stride3q ], 1 vpblendd m5, [botq -20], 0x20 %else movd xm6, [topq +strideq*1] movd xm5, [dstq +strideq*1] movd xm9, [dstq +stride3q ] movd xm10, [dst4q+strideq*1] movd xm11, [dst4q+stride3q ] pinsrd xm6, [dstq +strideq*0], 1 pinsrd xm5, [dstq +strideq*2], 1 pinsrd xm9, [dst4q+strideq*0], 1 pinsrd xm10, [dst4q+strideq*2], 1 pinsrd xm11, [botq ], 1 punpcklqdq xm6, xm5 punpcklqdq xm5, xm9 punpcklqdq xm9, xm10 punpcklqdq xm10, xm11 vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 %endif %else movq xm6, [top2q ] movq xm5, [dstq+strideq*1] movq xm9, [dstq+stride3q ] movhps xm6, [dstq+strideq*0] movhps xm5, [dstq+strideq*2] movhps xm9, [botq ] vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif ret .d0k1: %if %1 == 4 %if %2 == 4 movd xm6, [dstq+strideq*2-2] movd xm9, [dstq+stride3q -2] movd xm5, [topq+strideq*0+2] movd xm10, [topq+strideq*1+2] pinsrw xm6, [leftq+4], 0 pinsrw xm9, [leftq+6], 0 vinserti128 m5, [dstq+strideq*0+2], 1 vinserti128 m10, 
[dstq+strideq*1+2], 1 vinserti128 m6, [botq+strideq*0-2], 1 vinserti128 m9, [botq+strideq*1-2], 1 punpckldq m5, m10 punpckldq m6, m9 %else movq xm6, [dstq +strideq*2-2] movd xm10, [dst4q+strideq*2-2] movd xm5, [topq +strideq*0+2] movq xm9, [dst4q+strideq*0-2] movhps xm6, [dstq +stride3q -2] pinsrw xm10, [dst4q+stride3q ], 3 pinsrd xm5, [topq +strideq*1+2], 1 movhps xm9, [dst4q+strideq*1-2] pinsrd xm10, [botq +strideq*0-2], 2 pinsrd xm5, [dstq +strideq*0+2], 2 pinsrd xm10, [botq +strideq*1-2], 3 pinsrd xm5, [dstq +strideq*1+2], 3 shufps xm11, xm6, xm9, q3131 shufps xm6, xm9, q2020 movu m9, [blend_4x8_3+8] vinserti128 m6, xm10, 1 vinserti128 m5, xm11, 1 vpblendvb m6, [rsp+gprsize+0x10+8], m9 %endif %else lea r13, [blend_8x8_1+16] movq xm6, [dstq+strideq*2-2] movq xm9, [dstq+stride3q -2] movq xm5, [top1q +2] movq xm10, [top2q +2] movu m11, [r13+hq*2*2+16*2] vinserti128 m6, [botq+strideq*0-2], 1 vinserti128 m9, [botq+strideq*1-2], 1 vinserti128 m5, [dstq+strideq*0+2], 1 vinserti128 m10, [dstq+strideq*1+2], 1 punpcklqdq m6, m9 punpcklqdq m5, m10 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*2], m11 %endif ret .d1k1: %if %1 == 4 %if %2 == 4 vpbroadcastq m6, [dstq+strideq*1-2] vpbroadcastq m9, [dstq+strideq*2-2] movd xm5, [topq+strideq*1+2] movd xm10, [dstq+strideq*0+2] psrldq m11, m6, 4 psrldq m12, m9, 4 vpblendd m5, m11, 0x10 movq xm11, [leftq+2] vinserti128 m6, [dstq+stride3q-2], 1 punpckldq xm11, xm11 vpblendd m10, m12, 0x10 pcmpeqd m12, m12 pmovzxwd m11, xm11 psrld m12, 16 punpckldq m6, m9 vpbroadcastd m9, [botq-2] vpblendvb m6, m11, m12 punpckldq m5, m10 vpblendd m6, m9, 0x20 %else movd xm5, [topq +strideq*1+2] movq xm6, [dstq +strideq*1-2] movq xm9, [dstq +stride3q -2] movq xm10, [dst4q+strideq*1-2] movd xm11, [dst4q+stride3q -2] pinsrd xm5, [dstq +strideq*0+2], 1 movhps xm6, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] pinsrd xm11, [botq -2], 1 shufps xm5, xm6, q3110 shufps xm6, xm9, q2020 shufps xm9, xm10, q3131 shufps xm10, xm11, q1020 movu m11, [blend_4x8_2+4] vinserti128 m6, xm10, 1 vinserti128 m5, xm9, 1 vpblendvb m6, [rsp+gprsize+0x10+4], m11 %endif %else lea r13, [blend_8x8_1+16] movq xm5, [top2q +2] vbroadcasti128 m6, [dstq+strideq*1-2] vbroadcasti128 m9, [dstq+strideq*2-2] movhps xm5, [dstq+strideq*0+2] shufps m10, m6, m9, q2121 vinserti128 m6, [dstq+stride3q -2], 1 vinserti128 m9, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m5, m10, 0xF0 punpcklqdq m6, m9 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64+8*1], m11 %endif ret .d2k1: %if %1 == 4 %if %2 == 4 movq xm11, [leftq] movq xm6, [dstq+strideq*0-2] movq xm9, [dstq+strideq*1-2] vinserti128 m6, [dstq+strideq*2-2], 1 vinserti128 m9, [dstq+stride3q -2], 1 punpckldq xm11, xm11 psrldq m5, m6, 4 psrldq m10, m9, 4 pmovzxwd m11, xm11 punpckldq m6, m9 punpckldq m5, m10 pblendw m6, m11, 0x05 %else movq xm5, [dstq +strideq*0-2] movq xm9, [dstq +strideq*2-2] movq xm10, [dst4q+strideq*0-2] movq xm11, [dst4q+strideq*2-2] movhps xm5, [dstq +strideq*1-2] movhps xm9, [dstq +stride3q -2] movhps xm10, [dst4q+strideq*1-2] movhps xm11, [dst4q+stride3q -2] shufps xm6, xm5, xm9, q2020 shufps xm5, xm9, q3131 shufps xm9, xm10, xm11, q2020 shufps xm10, xm11, q3131 pmovzxwd m11, [leftq] vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 pblendw m6, m11, 0x55 %endif %else mova m11, [rsp+gprsize+0x20+hq*8+64] movu xm5, [dstq+strideq*0-2] movu xm9, [dstq+strideq*1-2] vinserti128 m5, [dstq+strideq*2-2], 1 vinserti128 m9, [dstq+stride3q -2], 1 shufps m6, m5, m9, q1010 shufps m5, m9, q2121 pblendw m6, m11, 0x11 %endif ret .d3k1: 
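; (reference note) like the other .dXkY entries, this loader gathers the
; p0/p1 pixel pairs for its direction/tap into m5/m6, pulling out-of-block
; columns from the left edge (leftq or its stack spill) and from the
; top/bot rows, before returning to the accumulate macro.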
%if %1 == 4 %if %2 == 4 vpbroadcastq m11, [dstq+strideq*1-2] vpbroadcastq m12, [dstq+strideq*2-2] movd xm6, [topq+strideq*1-2] movd xm9, [dstq+strideq*0-2] pblendw m11, [leftq-16+2], 0x01 pblendw m12, [leftq-16+4], 0x01 pinsrw xm9, [leftq- 0+0], 0 psrldq m5, m11, 4 psrldq m10, m12, 4 vinserti128 m5, [dstq+stride3q +2], 1 vinserti128 m10, [botq +2], 1 vpblendd m6, m11, 0x10 vpblendd m9, m12, 0x10 punpckldq m6, m9 punpckldq m5, m10 %else movd xm6, [topq +strideq*1-2] movq xm5, [dstq +strideq*1-2] movq xm9, [dstq +stride3q -2] movq xm10, [dst4q+strideq*1-2] movd xm11, [dst4q+stride3q +2] pinsrw xm6, [dstq +strideq*0 ], 3 movhps xm5, [dstq +strideq*2-2] movhps xm9, [dst4q+strideq*0-2] movhps xm10, [dst4q+strideq*2-2] pinsrd xm11, [botq +2], 1 shufps xm6, xm5, q2010 shufps xm5, xm9, q3131 shufps xm9, xm10, q2020 shufps xm10, xm11, q1031 movu m11, [blend_4x8_2] vinserti128 m6, xm9, 1 vinserti128 m5, xm10, 1 vpblendvb m6, [rsp+gprsize+0x10-4], m11 %endif %else lea r13, [blend_8x8_1+8] movq xm6, [top2q -2] vbroadcasti128 m5, [dstq+strideq*1-2] vbroadcasti128 m10, [dstq+strideq*2-2] movhps xm6, [dstq+strideq*0-2] punpcklqdq m9, m5, m10 vinserti128 m5, [dstq+stride3q -2], 1 vinserti128 m10, [botq -2], 1 movu m11, [r13+hq*2*1+16*1] vpblendd m6, m9, 0xF0 shufps m5, m10, q2121 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*1], m11 %endif ret .d4k1: %if %1 == 4 %if %2 == 4 vinserti128 m6, [dstq+strideq*0-2], 1 vinserti128 m9, [dstq+strideq*1-2], 1 movd xm5, [dstq+strideq*2+2] movd xm10, [dstq+stride3q +2] pblendw m6, [leftq-16+0], 0x01 pblendw m9, [leftq-16+2], 0x01 vinserti128 m5, [botq+strideq*0+2], 1 vinserti128 m10, [botq+strideq*1+2], 1 vpblendd m6, [topq+strideq*0-2], 0x01 vpblendd m9, [topq+strideq*1-2], 0x01 punpckldq m5, m10 punpckldq m6, m9 %else movd xm6, [topq +strideq*0-2] movq xm5, [dstq +strideq*2-2] movq xm9, [dst4q+strideq*0-2] movd xm10, [dst4q+strideq*2+2] pinsrd xm6, [topq +strideq*1-2], 1 movhps xm5, [dstq +stride3q -2] movhps xm9, [dst4q+strideq*1-2] pinsrd xm10, [dst4q+stride3q +2], 1 pinsrd xm6, [dstq +strideq*0-2], 2 pinsrd xm10, [botq +strideq*0+2], 2 pinsrd xm6, [dstq +strideq*1-2], 3 pinsrd xm10, [botq +strideq*1+2], 3 shufps xm11, xm5, xm9, q2020 shufps xm5, xm9, q3131 movu m9, [blend_4x8_3] vinserti128 m6, xm11, 1 vinserti128 m5, xm10, 1 vpblendvb m6, [rsp+gprsize+0x10-8], m9 %endif %else lea r13, [blend_8x8_1] movu m11, [r13+hq*2*2+16*2] movq xm6, [top1q -2] movq xm9, [top2q -2] movq xm5, [dstq+strideq*2+2] movq xm10, [dstq+stride3q +2] vinserti128 m6, [dstq+strideq*0-2], 1 vinserti128 m9, [dstq+strideq*1-2], 1 vinserti128 m5, [botq+strideq*0+2], 1 vinserti128 m10, [botq+strideq*1+2], 1 punpcklqdq m6, m9 vpblendvb m6, [rsp+gprsize+0x20+hq*8+64-8*2], m11 punpcklqdq m5, m10 %endif ret .d5k1: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*0-1] movd xm9, [topq+strideq*1-1] movd xm5, [dstq+strideq*2+1] movd xm10, [dstq+stride3q +1] pcmpeqd m12, m12 pmovzxbw m11, [leftq-8+1] psrld m12, 24 vinserti128 m6, [dstq+strideq*0-1], 1 vinserti128 m9, [dstq+strideq*1-1], 1 vinserti128 m5, [botq+strideq*0+1], 1 vinserti128 m10, [botq+strideq*1+1], 1 punpckldq m6, m9 pxor m9, m9 vpblendd m12, m9, 0x0F punpckldq m5, m10 vpblendvb m6, m11, m12 %else movd xm6, [topq +strideq*0-1] movq xm5, [dstq +strideq*2-1] movq xm9, [dst4q+strideq*0-1] movd xm10, [dst4q+strideq*2+1] pinsrd xm6, [topq +strideq*1-1], 1 movhps xm5, [dstq +stride3q -1] movhps xm9, [dst4q+strideq*1-1] pinsrd xm10, [dst4q+stride3q +1], 1 pinsrd xm6, [dstq +strideq*0-1], 2 pinsrd xm10, [botq +strideq*0+1], 2 pinsrd xm6, [dstq 
+strideq*1-1], 3 pinsrd xm10, [botq +strideq*1+1], 3 shufps xm11, xm5, xm9, q2020 vinserti128 m6, xm11, 1 pmovzxbw m11, [leftq-3] psrldq xm5, 2 psrldq xm9, 2 shufps xm5, xm9, q2020 movu m9, [blend_4x8_1] vinserti128 m5, xm10, 1 vpblendvb m6, m11, m9 %endif %else lea r13, [blend_8x8_0] movu m11, [r13+hq*2*2+16*2] movq xm6, [top1q -1] movq xm9, [top2q -1] movq xm5, [dstq+strideq*2+1] movq xm10, [dstq+stride3q +1] vinserti128 m6, [dstq+strideq*0-1], 1 vinserti128 m9, [dstq+strideq*1-1], 1 vinserti128 m5, [botq+strideq*0+1], 1 vinserti128 m10, [botq+strideq*1+1], 1 punpcklqdq m6, m9 punpcklqdq m5, m10 vpblendvb m6, [rsp+gprsize+0x60+hq*8+64-8*2], m11 %endif ret .d6k1: %if %1 == 4 %if %2 == 4 movd xm6, [topq+strideq*0] movd xm9, [topq+strideq*1] movd xm5, [dstq+strideq*2] movd xm10, [dstq+stride3q ] vinserti128 m6, [dstq+strideq*0], 1 vinserti128 m9, [dstq+strideq*1], 1 vinserti128 m5, [botq+strideq*0], 1 vinserti128 m10, [botq+strideq*1], 1 punpckldq m6, m9 punpckldq m5, m10 %else movd xm5, [dstq +strideq*2] movd xm6, [topq +strideq*0] movd xm9, [dst4q+strideq*2] pinsrd xm5, [dstq +stride3q ], 1 pinsrd xm6, [topq +strideq*1], 1 pinsrd xm9, [dst4q+stride3q ], 1 pinsrd xm5, [dst4q+strideq*0], 2 pinsrd xm6, [dstq +strideq*0], 2 pinsrd xm9, [botq +strideq*0], 2 pinsrd xm5, [dst4q+strideq*1], 3 pinsrd xm6, [dstq +strideq*1], 3 pinsrd xm9, [botq +strideq*1], 3 vinserti128 m6, xm5, 1 vinserti128 m5, xm9, 1 %endif %else movq xm5, [dstq+strideq*2] movq xm9, [botq+strideq*0] movq xm6, [top1q ] movq xm10, [dstq+strideq*0] movhps xm5, [dstq+stride3q ] movhps xm9, [botq+strideq*1] movhps xm6, [top2q ] movhps xm10, [dstq+strideq*1] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 %endif ret .d7k1: %if %1 == 4 %if %2 == 4 movd xm5, [dstq+strideq*2-1] movd xm9, [dstq+stride3q -1] movd xm6, [topq+strideq*0+1] movd xm10, [topq+strideq*1+1] pinsrb xm5, [leftq+ 5], 0 pinsrb xm9, [leftq+ 7], 0 vinserti128 m6, [dstq+strideq*0+1], 1 vinserti128 m10, [dstq+strideq*1+1], 1 vinserti128 m5, [botq+strideq*0-1], 1 vinserti128 m9, [botq+strideq*1-1], 1 punpckldq m6, m10 punpckldq m5, m9 %else movd xm6, [topq +strideq*0+1] movq xm9, [dstq +strideq*2-1] movq xm10, [dst4q+strideq*0-1] movd xm11, [dst4q+strideq*2-1] pinsrd xm6, [topq +strideq*1+1], 1 movhps xm9, [dstq +stride3q -1] movhps xm10, [dst4q+strideq*1-1] pinsrd xm11, [dst4q+stride3q -1], 1 pinsrd xm6, [dstq +strideq*0+1], 2 pinsrd xm11, [botq +strideq*0-1], 2 pinsrd xm6, [dstq +strideq*1+1], 3 pinsrd xm11, [botq +strideq*1-1], 3 shufps xm5, xm9, xm10, q2020 vinserti128 m5, xm11, 1 pmovzxbw m11, [leftq+5] psrldq xm9, 2 psrldq xm10, 2 shufps xm9, xm10, q2020 movu m10, [blend_4x8_1+8] vinserti128 m6, xm9, 1 vpblendvb m5, m11, m10 %endif %else lea r13, [blend_8x8_0+16] movq xm5, [dstq+strideq*2-1] movq xm9, [botq+strideq*0-1] movq xm6, [top1q +1] movq xm10, [dstq+strideq*0+1] movhps xm5, [dstq+stride3q -1] movhps xm9, [botq+strideq*1-1] movhps xm6, [top2q +1] movhps xm10, [dstq+strideq*1+1] movu m11, [r13+hq*2*2+16*2] vinserti128 m5, xm9, 1 vinserti128 m6, xm10, 1 vpblendvb m5, [rsp+gprsize+0x60+hq*8+64+8*2], m11 %endif ret .border_block: DEFINE_ARGS dst, stride, left, top, bot, pri, sec, stride3, dst4, edge %define rstk rsp %assign stack_offset stack_offset_entry %assign regs_used 11 ALLOC_STACK 2*16+(%2+4)*32, 16 %define px rsp+2*16+2*32 pcmpeqw m14, m14 psllw m14, 15 ; 0x8000 ; prepare pixel buffers - body/right %if %1 == 4 INIT_XMM avx2 %endif %if %2 == 8 lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] test edgeb, 2 ; have_right jz .no_right pmovzxbw m1, 
[dstq+strideq*0] pmovzxbw m2, [dstq+strideq*1] pmovzxbw m3, [dstq+strideq*2] pmovzxbw m4, [dstq+stride3q] mova [px+0*32], m1 mova [px+1*32], m2 mova [px+2*32], m3 mova [px+3*32], m4 %if %2 == 8 pmovzxbw m1, [dst4q+strideq*0] pmovzxbw m2, [dst4q+strideq*1] pmovzxbw m3, [dst4q+strideq*2] pmovzxbw m4, [dst4q+stride3q] mova [px+4*32], m1 mova [px+5*32], m2 mova [px+6*32], m3 mova [px+7*32], m4 %endif jmp .body_done .no_right: %if %1 == 4 movd xm1, [dstq+strideq*0] movd xm2, [dstq+strideq*1] movd xm3, [dstq+strideq*2] movd xm4, [dstq+stride3q] pmovzxbw xm1, xm1 pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 movq [px+0*32], xm1 movq [px+1*32], xm2 movq [px+2*32], xm3 movq [px+3*32], xm4 %else pmovzxbw xm1, [dstq+strideq*0] pmovzxbw xm2, [dstq+strideq*1] pmovzxbw xm3, [dstq+strideq*2] pmovzxbw xm4, [dstq+stride3q] mova [px+0*32], xm1 mova [px+1*32], xm2 mova [px+2*32], xm3 mova [px+3*32], xm4 %endif movd [px+0*32+%1*2], xm14 movd [px+1*32+%1*2], xm14 movd [px+2*32+%1*2], xm14 movd [px+3*32+%1*2], xm14 %if %2 == 8 %if %1 == 4 movd xm1, [dst4q+strideq*0] movd xm2, [dst4q+strideq*1] movd xm3, [dst4q+strideq*2] movd xm4, [dst4q+stride3q] pmovzxbw xm1, xm1 pmovzxbw xm2, xm2 pmovzxbw xm3, xm3 pmovzxbw xm4, xm4 movq [px+4*32], xm1 movq [px+5*32], xm2 movq [px+6*32], xm3 movq [px+7*32], xm4 %else pmovzxbw xm1, [dst4q+strideq*0] pmovzxbw xm2, [dst4q+strideq*1] pmovzxbw xm3, [dst4q+strideq*2] pmovzxbw xm4, [dst4q+stride3q] mova [px+4*32], xm1 mova [px+5*32], xm2 mova [px+6*32], xm3 mova [px+7*32], xm4 %endif movd [px+4*32+%1*2], xm14 movd [px+5*32+%1*2], xm14 movd [px+6*32+%1*2], xm14 movd [px+7*32+%1*2], xm14 %endif .body_done: ; top test edgeb, 4 ; have_top jz .no_top test edgeb, 1 ; have_left jz .top_no_left test edgeb, 2 ; have_right jz .top_no_right pmovzxbw m1, [topq+strideq*0-(%1/2)] pmovzxbw m2, [topq+strideq*1-(%1/2)] movu [px-2*32-%1], m1 movu [px-1*32-%1], m2 jmp .top_done .top_no_right: pmovzxbw m1, [topq+strideq*0-%1] pmovzxbw m2, [topq+strideq*1-%1] movu [px-2*32-%1*2], m1 movu [px-1*32-%1*2], m2 movd [px-2*32+%1*2], xm14 movd [px-1*32+%1*2], xm14 jmp .top_done .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right pmovzxbw m1, [topq+strideq*0] pmovzxbw m2, [topq+strideq*1] mova [px-2*32+0], m1 mova [px-1*32+0], m2 movd [px-2*32-4], xm14 movd [px-1*32-4], xm14 jmp .top_done .top_no_left_right: %if %1 == 4 movd xm1, [topq+strideq*0] pinsrd xm1, [topq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px-2*32+0], xm1 movhps [px-1*32+0], xm1 %else pmovzxbw xm1, [topq+strideq*0] pmovzxbw xm2, [topq+strideq*1] mova [px-2*32+0], xm1 mova [px-1*32+0], xm2 %endif movd [px-2*32-4], xm14 movd [px-1*32-4], xm14 movd [px-2*32+%1*2], xm14 movd [px-1*32+%1*2], xm14 jmp .top_done .no_top: movu [px-2*32-%1], m14 movu [px-1*32-%1], m14 .top_done: ; left test edgeb, 1 ; have_left jz .no_left pmovzxbw xm1, [leftq+ 0] %if %2 == 8 pmovzxbw xm2, [leftq+ 8] %endif movd [px+0*32-4], xm1 pextrd [px+1*32-4], xm1, 1 pextrd [px+2*32-4], xm1, 2 pextrd [px+3*32-4], xm1, 3 %if %2 == 8 movd [px+4*32-4], xm2 pextrd [px+5*32-4], xm2, 1 pextrd [px+6*32-4], xm2, 2 pextrd [px+7*32-4], xm2, 3 %endif jmp .left_done .no_left: movd [px+0*32-4], xm14 movd [px+1*32-4], xm14 movd [px+2*32-4], xm14 movd [px+3*32-4], xm14 %if %2 == 8 movd [px+4*32-4], xm14 movd [px+5*32-4], xm14 movd [px+6*32-4], xm14 movd [px+7*32-4], xm14 %endif .left_done: ; bottom DEFINE_ARGS dst, stride, _, _, bot, pri, sec, stride3, _, edge test edgeb, 8 ; have_bottom jz .no_bottom test edgeb, 1 ; have_left jz .bottom_no_left test edgeb, 2 ; have_right jz 
.bottom_no_right pmovzxbw m1, [botq+strideq*0-(%1/2)] pmovzxbw m2, [botq+strideq*1-(%1/2)] movu [px+(%2+0)*32-%1], m1 movu [px+(%2+1)*32-%1], m2 jmp .bottom_done .bottom_no_right: pmovzxbw m1, [botq+strideq*0-%1] pmovzxbw m2, [botq+strideq*1-%1] movu [px+(%2+0)*32-%1*2], m1 movu [px+(%2+1)*32-%1*2], m2 %if %1 == 8 movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu %endif movd [px+(%2+0)*32+%1*2], xm14 movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .bottom_no_left: test edgeb, 2 ; have_right jz .bottom_no_left_right pmovzxbw m1, [botq+strideq*0] pmovzxbw m2, [botq+strideq*1] mova [px+(%2+0)*32+0], m1 mova [px+(%2+1)*32+0], m2 movd [px+(%2+0)*32-4], xm14 movd [px+(%2+1)*32-4], xm14 jmp .bottom_done .bottom_no_left_right: %if %1 == 4 movd xm1, [botq+strideq*0] pinsrd xm1, [botq+strideq*1], 1 pmovzxbw xm1, xm1 movq [px+(%2+0)*32+0], xm1 movhps [px+(%2+1)*32+0], xm1 %else pmovzxbw xm1, [botq+strideq*0] pmovzxbw xm2, [botq+strideq*1] mova [px+(%2+0)*32+0], xm1 mova [px+(%2+1)*32+0], xm2 %endif movd [px+(%2+0)*32-4], xm14 movd [px+(%2+1)*32-4], xm14 movd [px+(%2+0)*32+%1*2], xm14 movd [px+(%2+1)*32+%1*2], xm14 jmp .bottom_done .no_bottom: movu [px+(%2+0)*32-%1], m14 movu [px+(%2+1)*32-%1], m14 .bottom_done: ; actual filter INIT_YMM avx2 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, secdmp, stride3, zero %undef edged ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] mov dampingd, r8m xor zerod, zerod movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm test prid, prid jz .border_sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift test secdmpd, secdmpd jz .border_pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, _, pridmp, table, pri, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, stride, _, dir, table, pri, sec, stride3 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps BORDER_PREP_REGS %1, %2 %if %1*%2*2/mmsize > 1 .border_v_loop: %endif BORDER_LOAD_BLOCK %1, %2, 1 .border_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1 ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1 ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1 dec kq jge .border_k_loop vpbroadcastd m10, [pw_2048] BORDER_ADJUST_PIXEL %1, m10, 1 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd jg .border_v_loop %endif RET .border_pri_only: DEFINE_ARGS dst, stride, _, pridmp, table, pri, _, stride3 lea tableq, [tap_table] vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask DEFINE_ARGS dst, stride, _, dir, table, pri, _, stride3 vpbroadcastb m0, xm0 ; pri_strength and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps BORDER_PREP_REGS %1, %2 vpbroadcastd m1, [pw_2048] %if %1*%2*2/mmsize > 1 .border_pri_v_loop: %endif BORDER_LOAD_BLOCK %1, %2 .border_pri_k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1 dec kq jge .border_pri_k_loop BORDER_ADJUST_PIXEL %1, m1 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd 
jg .border_pri_v_loop %endif RET .border_sec_only: DEFINE_ARGS dst, stride, _, _, damping, _, secdmp, stride3 movd xm1, secdmpd lzcnt secdmpd, secdmpd add secdmpd, dampingd mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, stride, _, _, table, _, secdmp, stride3 lea tableq, [tap_table] vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask DEFINE_ARGS dst, stride, _, dir, table, _, sec, stride3 vpbroadcastb m1, xm1 ; sec_strength lea secq, [tableq+12] ; sec_taps BORDER_PREP_REGS %1, %2 vpbroadcastd m0, [pw_2048] %if %1*%2*2/mmsize > 1 .border_sec_v_loop: %endif BORDER_LOAD_BLOCK %1, %2 .border_sec_k_loop: vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1 ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1 dec kq jge .border_sec_k_loop BORDER_ADJUST_PIXEL %1, m0 %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+strideq*vloop_lines] add stkq, 32*vloop_lines dec hd jg .border_sec_v_loop %endif RET %endmacro CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 INIT_YMM avx2 cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3 lea stride3q, [strideq*3] movq xm0, [srcq+strideq*0] movq xm1, [srcq+strideq*1] movq xm2, [srcq+strideq*2] movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m4, [srcq+stride3q ] vpbroadcastq m5, [srcq+strideq*2] vpblendd m0, m4, 0xf0 vpblendd m1, m5, 0xf0 vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m5, [srcq+strideq*0] vpblendd m2, m4, 0xf0 vpblendd m3, m5, 0xf0 pxor m4, m4 punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 punpcklbw m3, m4 cglobal_label .main vpbroadcastd m4, [pw_128] PROLOGUE 3, 4, 15 psubw m0, m4 psubw m1, m4 psubw m2, m4 psubw m3, m4 ; shuffle registers to generate partial_sum_diag[0-1] together vperm2i128 m7, m0, m0, 0x01 vperm2i128 m6, m1, m1, 0x01 vperm2i128 m5, m2, m2, 0x01 vperm2i128 m4, m3, m3, 0x01 ; start with partial_sum_hv[0-1] paddw m8, m0, m1 paddw m9, m2, m3 phaddw m10, m0, m1 phaddw m11, m2, m3 paddw m8, m9 phaddw m10, m11 vextracti128 xm9, m8, 1 vextracti128 xm11, m10, 1 paddw xm8, xm9 ; partial_sum_hv[1] phaddw xm10, xm11 ; partial_sum_hv[0] vinserti128 m8, xm10, 1 vpbroadcastd m9, [div_table+44] pmaddwd m8, m8 pmulld m8, m9 ; cost6[2a-d] | cost2[a-d] ; create aggregates [lower half]: ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0 ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x ; and [upper half]: ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567 ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd pslldq m9, m1, 2 psrldq m10, m1, 14 pslldq m11, m2, 4 psrldq m12, m2, 12 pslldq m13, m3, 6 psrldq m14, m3, 10 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m4, 8 psrldq m12, m4, 8 pslldq m13, m5, 10 psrldq m14, m5, 6 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m6, 12 psrldq m12, m6, 4 pslldq m13, m7, 14 psrldq m14, m7, 2 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero] vbroadcasti128 m14, [shufw_6543210x] vbroadcasti128 m13, [div_table+16] vbroadcasti128 m12, [div_table+0] paddw m9, m0 ; partial_sum_diag[0/1][0-7] pshufb m10, m14 punpckhwd m11, m9, m10 punpcklwd m9, m10 pmaddwd m11, m11 pmaddwd m9, m9 pmulld m11, m13 pmulld m9, m12 paddd m9, m11 ; cost0[a-d] | cost4[a-d] ; merge horizontally and 
vertically for partial_sum_alt[0-3] paddw m10, m0, m1 paddw m11, m2, m3 paddw m12, m4, m5 paddw m13, m6, m7 phaddw m0, m4 phaddw m1, m5 phaddw m2, m6 phaddw m3, m7 ; create aggregates [lower half]: ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234 ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx ; and [upper half]: ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567 ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m4, m11, 2 psrldq m11, 14 pslldq m5, m12, 4 psrldq m12, 12 pslldq m6, m13, 6 psrldq m13, 10 paddw m4, m10 paddw m11, m12 vpbroadcastd m12, [div_table+44] paddw m5, m6 paddw m11, m13 ; partial_sum_alt[3/2] right vbroadcasti128 m13, [div_table+32] paddw m4, m5 ; partial_sum_alt[3/2] left pshuflw m5, m11, q3012 punpckhwd m6, m11, m4 punpcklwd m4, m5 pmaddwd m6, m6 pmaddwd m4, m4 pmulld m6, m12 pmulld m4, m13 paddd m4, m6 ; cost7[a-d] | cost5[a-d] ; create aggregates [lower half]: ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234 ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx ; and [upper half]: ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567 ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd pslldq m5, m1, 2 psrldq m1, 14 pslldq m6, m2, 4 psrldq m2, 12 pslldq m7, m3, 6 psrldq m3, 10 paddw m5, m0 paddw m1, m2 paddw m6, m7 paddw m1, m3 ; partial_sum_alt[0/1] right paddw m5, m6 ; partial_sum_alt[0/1] left pshuflw m0, m1, q3012 punpckhwd m1, m5 punpcklwd m5, m0 pmaddwd m1, m1 pmaddwd m5, m5 pmulld m1, m12 pmulld m5, m13 paddd m5, m1 ; cost1[a-d] | cost3[a-d] mova xm0, [pd_47130256+ 16] mova m1, [pd_47130256] phaddd m9, m8 phaddd m5, m4 phaddd m9, m5 vpermd m0, m9 ; cost[0-3] vpermd m1, m9 ; cost[4-7] | cost[0-3] ; now find the best cost pmaxsd xm2, xm0, xm1 pshufd xm3, xm2, q1032 pmaxsd xm2, xm3 pshufd xm3, xm2, q2301 pmaxsd xm2, xm3 ; best cost ; find the idx using minpos ; make everything other than the best cost negative via subtraction ; find the min of unsigned 16-bit ints to sort out the negative values psubd xm4, xm1, xm2 psubd xm3, xm0, xm2 packssdw xm3, xm4 phminposuw xm3, xm3 ; convert idx to 32-bits psrld xm3, 16 movd eax, xm3 ; get idx^4 complement vpermd m3, m1 psubd xm2, xm3 psrld xm2, 10 movd [varq], xm2 RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef_avx512.asm000064400000000000000000000755561046102023000146160ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 %macro DUP4 1-* %rep %0 times 4 db %1 %rotate 1 %endrep %endmacro %macro DIRS 16 ; cdef_directions[] %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 ; masking away unused bits allows us to use a single vpaddd {1to16} ; instruction instead of having to do vpbroadcastd + paddb db %13 & 0x3f, -%13 & 0x3f %rotate 1 %endrep %endmacro SECTION_RODATA 64 lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 lut_perm_8x8a: db 32, 33, 34, 35, 36, 37, 38, 39, 48, 49, 50, 51, 52, 53, 54, 55 db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 lut_perm_8x8b: db 12, 13, 0, 1, 2, 3, 4, 5, 14, 15, 16, 17, 18, 19, 20, 21 db 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 20, 21, 22, 23, 24, 25 db 28, 29, 32, 33, 34, 35, 36, 37, 30, 31, 48, 49, 50, 51, 52, 53 db 34, 35, 36, 37, 38, 39, 40, 41, 50, 51, 52, 53, 54, 55, 56, 57 end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 end_perm_clip: db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 sec_tap: db 32, 32, 16, 16 pd_268435568: dd 268435568 SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif ; lut: ; t0 t1 t2 t3 t4 t5 t6 t7 ; T0 T1 T2 T3 T4 T5 
T6 T7 ; L0 L1 00 01 02 03 04 05 ; L2 L3 10 11 12 13 14 15 ; L4 L5 20 21 22 23 24 25 ; L6 L7 30 31 32 33 34 35 ; b0 b1 b2 b3 b4 b5 b6 b7 ; B0 B1 B2 B3 B4 B5 B6 B7 INIT_ZMM avx512icl cglobal cdef_filter_4x4_8bpc, 5, 8, 13, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r7-edge_mask movq xmm0, [dstq+strideq*0] movhps xmm0, [dstq+strideq*1] lea r7, [edge_mask] movq xmm1, [topq+strideq*0-2] movhps xmm1, [topq+strideq*1-2] mov r6d, edgem vinserti32x4 ym0, ymm0, [leftq], 1 lea r2, [strideq*3] vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 mova m5, [base+lut_perm_4x4] vinserti32x4 m0, [dstq+r2], 2 test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 m1, [botq+strideq*0-4], 2 vinserti32x4 m0, [botq+strideq*1-4], 3 .main: movifnidn prid, prim mov t0d, dirm mova m3, [base+px_idx] mov r3d, dampingm vpermi2b m5, m0, m1 ; lut vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m7, m7 lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m6, m3, m5 ; px cmp r6d, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 %macro CDEF_FILTER_4x4_PRI 0 vpcmpub k1, m6, m1, 6 ; px > pN psubb m2, m1, m6 lzcnt r6d, prid vpsubb m2{k1}, m6, m1 ; abs(diff) vpbroadcastb m4, prid and prid, 1 vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift movifnidn secd, secm vpbroadcastd m10, [base+pri_tap+priq*4] vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) pminub m2, m4 vpdpbusd m0, m2, m10 ; sum %endmacro CDEF_FILTER_4x4_PRI test secd, secd jz .end_no_clip call .sec .end_clip: pminub m4, m6, m1 pmaxub m1, m6 pminub m5, m2, m3 pmaxub m2, m3 pminub m4, m5 pmaxub m2, m1 psrldq m1, m4, 2 psrldq m3, m2, 2 pminub m1, m4 vpcmpw k1, m0, m7, 1 vpshldd m6, m0, 8 pmaxub m2, m3 pslldq m3, m1, 1 psubw m7, m0 paddusw m0, m6 ; clip >0xff vpsubusw m0{k1}, m6, m7 ; clip <0x00 pslldq m4, m2, 1 pminub m1, m3 pmaxub m2, m4 pmaxub m0, m1 pminub m0, m2 jmp .end .sec_only: movifnidn secd, secm call .sec .end_no_clip: vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) .end: mova xm1, [base+end_perm] vpermb m0, m1, m0 ; output in bits 8-15 of each dword movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 RET .mask_edges_sec_only: movifnidn secd, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: vpbroadcastq m8, [base+edge_mask+r6*8] test prid, prid jz .mask_edges_sec_only vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m8, m2 ; index in-range mova m1, m6 vpermb m1{k1}, m2, m5 CDEF_FILTER_4x4_PRI test secd, secd jz .end_no_clip call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} vpshufbitqmb k1, m8, m4 mova m2, m6 vpermb m2{k1}, m4, m5 vpshufbitqmb k1, m8, m9 mova m3, m6 vpermb m3{k1}, m9, m5 jmp .sec_main ALIGN function_align .sec: vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 .sec_main: vpbroadcastd m8, [base+sec_tap] vpcmpub k1, m6, m2, 6 psubb m4, m2, m6 vpbroadcastb m12, secd lzcnt secd, secd vpsubb m4{k1}, m6, m2 vpcmpub k2, m6, m3, 6 vpbroadcastq m11, [r3+secq*8] gf2p8affineqb m10, m4, m11, 0 psubb m5, m3, 
m6 mova m9, m8 vpsubb m8{k1}, m7, m8 psubusb m10, m12, m10 vpsubb m5{k2}, m6, m3 pminub m4, m10 vpdpbusd m0, m4, m8 gf2p8affineqb m11, m5, m11, 0 vpsubb m9{k2}, m7, m9 psubusb m12, m11 pminub m5, m12 vpdpbusd m0, m5, m9 ret DECLARE_REG_TMP 2, 7 ; lut top lut bottom ; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 ; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 ; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 ; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 ; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 ; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 ; L8 L9 40 41 42 43 44 45 b0 b1 b2 b3 b4 b5 b6 b7 ; La Lb 50 51 52 53 54 55 B0 B1 B2 B3 B4 B5 B6 B7 cglobal cdef_filter_4x8_8bpc, 5, 9, 22, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r8-edge_mask vpbroadcastd ym21, strided mov r6d, edgem lea r8, [edge_mask] movq xm1, [topq+strideq*0-2] pmulld ym21, [base+pd_01234567] kxnorb k1, k1, k1 movq xm2, [topq+strideq*1-2] vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 mova m14, [base+lut_perm_4x8a] movu m15, [base+lut_perm_4x8b] test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 ym1, [botq+strideq*0-2], 1 vinserti32x4 ym2, [botq+strideq*1-2], 1 .main: punpcklqdq ym1, ym2 vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ movifnidn prid, prim mov t0d, dirm mova m16, [base+px_idx] mov r3d, dampingm vpermi2b m14, m0, m1 ; lut top vpermi2b m15, m0, m1 ; lut bottom vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m20, m20 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m2, m16, m14 ; pxt vpermb m3, m16, m15 ; pxb mova m1, m0 cmp r6b, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 vpermb m5, m6, m15 ; pNb %macro CDEF_FILTER_4x8_PRI 0 vpcmpub k1, m2, m4, 6 ; pxt > pNt vpcmpub k2, m3, m5, 6 ; pxb > pNb psubb m6, m4, m2 psubb m7, m5, m3 lzcnt r6d, prid vpsubb m6{k1}, m2, m4 ; abs(diff_top) vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) vpbroadcastb m13, prid vpbroadcastq m9, [r3+r6*8] and prid, 1 vpbroadcastd m11, [base+pri_tap+priq*4] vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift mova m10, m11 movifnidn t1d, secm vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) pminub m6, m12 pminub m7, m13 vpdpbusd m0, m6, m10 ; sum top vpdpbusd m1, m7, m11 ; sum bottom %endmacro CDEF_FILTER_4x8_PRI test t1d, t1d ; sec jz .end_no_clip call .sec .end_clip: pminub m10, m4, m2 pminub m12, m6, m8 pminub m11, m5, m3 pminub m13, m7, m9 pmaxub m4, m2 pmaxub m6, m8 pmaxub m5, m3 pmaxub m7, m9 pminub m10, m12 pminub m11, m13 pmaxub m4, m6 pmaxub m5, m7 mov r2d, 0xAAAAAAAA kmovd k1, r2d kxnorb k2, k2, k2 ; hw lw vpshrdd m12, m0, m1, 16 ; m1lw m0hw vpshrdd m6, m10, m11, 16 ; m11lw m10hw vpshrdd m8, m4, m5, 16 ; m5lw m4hw vpblendmw m7{k1}, m10, m11 ; m11hw m10lw vpblendmw m9{k1}, m4, m5 ; m5hw m4lw vpblendmw m4{k1}, m0, m12 ; m1lw m0lw vpblendmw m5{k1}, m12, m1 ; m1hw m0hw vpshrdd m2, m3, 16 pminub m6, m7 pmaxub m8, m9 mova ym14, [base+end_perm] vpcmpw k1, m4, m20, 1 vpshldw m2, m5, 8 pslldq m7, m6, 1 pslldq m9, m8, 1 psubw m5, m20, m4 paddusw m0, m4, m2 ; clip >0xff pminub m6, m7 pmaxub m8, m9 psubusw m0{k1}, m2, m5 ; clip <0x00 pmaxub m0, m6 pminub m0, m8 vpermb m0, m14, m0 vpscatterdd 
[dstq+ym21]{k2}, ym0 RET .sec_only: movifnidn t1d, secm call .sec .end_no_clip: mova ym4, [base+end_perm] kxnorb k1, k1, k1 vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m3, m1, 8 paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddw m1, m3 pslld m0, 16 vpshrdd m0, m1, 16 vpermb m0, m4, m0 ; output in bits 8-15 of each word vpscatterdd [dstq+ym21]{k1}, ym0 RET .mask_edges_sec_only: movifnidn t1d, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: mov t1d, r6d or r6d, 8 ; top 4x4 has bottom or t1d, 4 ; bottom 4x4 has top vpbroadcastq m17, [base+edge_mask+r6*8] vpbroadcastq m18, [base+edge_mask+t1*8] test prid, prid jz .mask_edges_sec_only vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m17, m6 ; index in-range vpshufbitqmb k2, m18, m6 mova m4, m2 mova m5, m3 vpermb m4{k1}, m6, m14 vpermb m5{k2}, m6, m15 CDEF_FILTER_4x8_PRI test t1d, t1d jz .end_no_clip call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} vpshufbitqmb k1, m17, m10 vpshufbitqmb k2, m18, m10 vpshufbitqmb k3, m17, m11 vpshufbitqmb k4, m18, m11 mova m6, m2 mova m7, m3 mova m8, m2 mova m9, m3 vpermb m6{k1}, m10, m14 vpermb m7{k2}, m10, m15 vpermb m8{k3}, m11, m14 vpermb m9{k4}, m11, m15 jmp .sec_main ALIGN function_align .sec: vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 vpermb m7, m8, m15 ; pNb vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 vpermb m9, m9, m15 ; pNb .sec_main: vpbroadcastb m18, t1d lzcnt t1d, t1d vpcmpub k1, m2, m6, 6 vpcmpub k2, m3, m7, 6 vpcmpub k3, m2, m8, 6 vpcmpub k4, m3, m9, 6 vpbroadcastq m17, [r3+t1*8] psubb m10, m6, m2 psubb m11, m7, m3 psubb m12, m8, m2 psubb m13, m9, m3 vpsubb m10{k1}, m2, m6 ; abs(dt0) vpsubb m11{k2}, m3, m7 ; abs(db0) vpsubb m12{k3}, m2, m8 ; abs(dt1) vpsubb m13{k4}, m3, m9 ; abs(db1) vpbroadcastd m19, [base+sec_tap] gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))) psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) pminub m10, m14 pminub m11, m15 pminub m12, m16 pminub m13, m17 mova m14, m19 mova m15, m19 mova m16, m19 vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) vpdpbusd m0, m10, m14 vpdpbusd m1, m11, m15 vpdpbusd m0, m12, m16 vpdpbusd m1, m13, m19 ret ; lut tl lut tr ; t0 t1 t2 t3 t4 t5 t6 t7 t4 t5 t6 t7 t8 t9 ta tb ; T0 T1 T2 T3 T4 T5 T6 T7 T4 T5 T6 T7 T8 T9 Ta Tb ; L0 L1 00 01 02 03 04 05 02 03 04 05 06 07 08 09 ; L2 L3 10 11 12 13 14 15 12 13 14 15 16 17 18 19 ; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 ; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 ; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 ; La Lb 50 51 52 53 54 55 52 53 54 55 56 57 58 59 ; lut bl lut br ; L4 L5 20 21 22 23 24 25 22 23 24 25 26 27 28 29 ; L6 L7 30 31 32 33 34 35 32 33 34 35 36 37 38 39 ; L8 L9 40 41 42 43 44 45 42 43 44 45 46 47 48 49 ; La Lb 50 51 52 53 54 55 52 53 54 55 
56 57 58 59 ; Lc Ld 60 61 62 63 64 65 62 63 64 65 66 67 68 69 ; Le Lf 70 71 72 73 74 75 72 73 74 75 76 77 78 79 ; b0 b1 b2 b3 b4 b5 b6 b7 b4 b5 b6 b7 b8 b9 ba bb ; B0 B1 B2 B3 B4 B5 B6 B7 B4 B5 B6 B7 B8 B9 Ba Bb cglobal cdef_filter_8x8_8bpc, 5, 11, 32, 4*64, dst, stride, left, top, bot, \ pri, sec, dir, damping, edge %define base r8-edge_mask movu xm16, [dstq+strideq*0] pinsrd xm16, [leftq+4*0], 3 mov r6d, edgem vinserti128 ym16, [dstq+strideq*1], 1 lea r10, [dstq+strideq*4] movu xm17, [dstq+strideq*2] vinserti32x4 m16, [topq+strideq*0-2], 2 lea r9, [strideq*3] pinsrd xm17, [leftq+4*1], 3 vinserti32x4 m16, [topq+strideq*1-2], 3 ; 0 1 t T lea r8, [edge_mask] vinserti128 ym17, [dstq+r9 ], 1 vpbroadcastd ym18, [leftq+4*2] vpblendd ym17, ym18, 0x80 movu xm18, [r10 +strideq*2] vinserti32x4 m17, [r10 +strideq*0], 2 pinsrd xm18, [leftq+4*3], 3 vinserti32x4 m17, [r10 +strideq*1], 3 ; 2 3 4 5 vinserti128 ym18, [r10 +r9 ], 1 test r6b, 0x08 ; avoid buffer overread jz .main vinserti32x4 m18, [botq+strideq*0-2], 2 vinserti32x4 m18, [botq+strideq*1-2], 3 ; 6 7 b B .main: mova m0, [base+lut_perm_8x8a] movu m1, [base+lut_perm_8x8b] mova m30, [base+px_idx] vpermb m16, m0, m16 movifnidn prid, prim vpermb m17, m1, m17 mov t0d, dirm vpermb m18, m0, m18 mov r3d, dampingm vshufi32x4 m12, m16, m17, q2020 ; lut tl vshufi32x4 m13, m16, m17, q3131 ; lut tr vshufi32x4 m14, m17, m18, q0220 ; lut bl vshufi32x4 m15, m17, m18, q1331 ; lut br vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) pxor m31, m31 lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 vpermb m4, m30, m12 ; pxtl mova m1, m0 vpermb m5, m30, m13 ; pxtr mova m2, m0 vpermb m6, m30, m14 ; pxbl mova m3, m0 vpermb m7, m30, m15 ; pxbr cmp r6b, 0x0f jne .mask_edges ; mask edges only if required test prid, prid jz .sec_only vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 vpermb m9, m11, m13 ; pNtr vpermb m10, m11, m14 ; pNbl vpermb m11, m11, m15 ; pNbr %macro CDEF_FILTER_8x8_PRI 0 vpcmpub k1, m4, m8, 6 ; pxtl > pNtl vpcmpub k2, m5, m9, 6 ; pxtr > pNtr vpcmpub k3, m6, m10, 6 ; pxbl > pNbl vpcmpub k4, m7, m11, 6 ; pxbr > pNbr psubb m16, m8, m4 psubb m17, m9, m5 psubb m18, m10, m6 psubb m19, m11, m7 lzcnt r6d, prid vpsubb m16{k1}, m4, m8 ; abs(diff_tl) vpsubb m17{k2}, m5, m9 ; abs(diff_tr) vpsubb m18{k3}, m6, m10 ; abs(diff_bl) vpsubb m19{k4}, m7, m11 ; abs(diff_br) vpbroadcastq m28, [r3+r6*8] vpbroadcastb m29, prid and prid, 1 vpbroadcastd m27, [base+pri_tap+priq*4] vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift mova m24, m27 mova m25, m27 mova m26, m27 movifnidn t1d, secm vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl) vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr) psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) pminub m16, m20 pminub m17, m21 pminub m18, m22 pminub m19, m23 vpdpbusd m0, m16, m24 ; sum tl vpdpbusd m1, m17, m25 ; sum tr vpdpbusd m2, m18, m26 ; sum bl vpdpbusd m3, m19, m27 ; sum br %endmacro CDEF_FILTER_8x8_PRI test t1d, t1d ; sec jz .end_no_clip call .sec .end_clip: pminub m20, m8, m4 pminub m24, 
m12, m16 pminub m21, m9, m5 pminub m25, m13, m17 pminub m22, m10, m6 pminub m26, m14, m18 pminub m23, m11, m7 pminub m27, m15, m19 pmaxub m8, m4 pmaxub m12, m16 pmaxub m9, m5 pmaxub m13, m17 pmaxub m10, m6 pmaxub m14, m18 pmaxub m11, m7 pmaxub m15, m19 pminub m20, m24 pminub m21, m25 pminub m22, m26 pminub m23, m27 pmaxub m8, m12 pmaxub m9, m13 pmaxub m10, m14 pmaxub m11, m15 mov r2d, 0xAAAAAAAA kmovd k1, r2d vpshrdd m24, m0, m1, 16 vpshrdd m25, m2, m3, 16 vpshrdd m12, m20, m21, 16 vpshrdd m14, m22, m23, 16 vpshrdd m16, m8, m9, 16 vpshrdd m18, m10, m11, 16 vpblendmw m13{k1}, m20, m21 vpblendmw m15{k1}, m22, m23 vpblendmw m17{k1}, m8, m9 vpblendmw m19{k1}, m10, m11 vpblendmw m20{k1}, m0, m24 vpblendmw m21{k1}, m24, m1 vpblendmw m22{k1}, m2, m25 vpblendmw m23{k1}, m25, m3 vpshrdd m4, m5, 16 vpshrdd m6, m7, 16 pminub m12, m13 pminub m14, m15 pmaxub m16, m17 pmaxub m18, m19 mova m8, [base+end_perm_clip] vpcmpw k2, m20, m31, 1 vpcmpw k3, m22, m31, 1 vpshldw m4, m21, 8 vpshldw m6, m23, 8 kunpckdq k1, k1, k1 kxnorb k4, k4, k4 vpshrdw m11, m12, m14, 8 vpshrdw m15, m16, m18, 8 vpblendmb m13{k1}, m12, m14 vpblendmb m17{k1}, m16, m18 psubw m21, m31, m20 psubw m23, m31, m22 paddusw m0, m20, m4 ; clip >0xff paddusw m1, m22, m6 pminub m11, m13 pmaxub m15, m17 psubusw m0{k2}, m4, m21 ; clip <0x00 psubusw m1{k3}, m6, m23 psrlw m0, 8 vmovdqu8 m0{k1}, m1 pmaxub m0, m11 pminub m0, m15 vpermb m0, m8, m0 vextracti32x4 xm1, m0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*2], xm1 movq [r10 +strideq*0], xm2 movq [r10 +strideq*2], xm3 movhps [dstq+strideq*1], xm0 movhps [dstq+r9 ], xm1 movhps [r10 +strideq*1], xm2 movhps [r10 +r9 ], xm3 RET .sec_only: movifnidn t1d, secm call .sec .end_no_clip: mova xm8, [base+end_perm] kxnorb k1, k1, k1 vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) vpshldd m5, m1, 8 vpshldd m6, m2, 8 vpshldd m7, m3, 8 paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) paddw m1, m5 paddw m2, m6 paddw m3, m7 vpermb m0, m8, m0 vpermb m1, m8, m1 vpermb m2, m8, m2 vpermb m3, m8, m3 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm0 movq [r10 +strideq*0], xm5 movq [r10 +strideq*2], xm2 movhps [dstq+strideq*1], xm4 movhps [dstq+r9 ], xm0 movhps [r10 +strideq*1], xm5 movhps [r10 +r9 ], xm2 RET .mask_edges_sec_only: movifnidn t1d, secm call .mask_edges_sec jmp .end_no_clip ALIGN function_align .mask_edges: mov t0d, r6d mov t1d, r6d or t0d, 0xA ; top-left 4x4 has bottom and right or t1d, 0x9 ; top-right 4x4 has bottom and left vpbroadcastq m26, [base+edge_mask+t0*8] vpbroadcastq m27, [base+edge_mask+t1*8] mov t1d, r6d or r6d, 0x6 ; bottom-left 4x4 has top and right or t1d, 0x5 ; bottom-right 4x4 has top and left vpbroadcastq m28, [base+edge_mask+r6*8] vpbroadcastq m29, [base+edge_mask+t1*8] mov t0d, dirm test prid, prid jz .mask_edges_sec_only vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} vpshufbitqmb k1, m26, m20 ; index in-range vpshufbitqmb k2, m27, m20 vpshufbitqmb k3, m28, m20 vpshufbitqmb k4, m29, m20 mova m8, m4 mova m9, m5 mova m10, m6 mova m11, m7 vpermb m8{k1}, m20, m12 vpermb m9{k2}, m20, m13 vpermb m10{k3}, m20, m14 vpermb m11{k4}, m20, m15 mova [rsp+0x00], m26 mova [rsp+0x40], m27 mova [rsp+0x80], m28 mova [rsp+0xC0], m29 CDEF_FILTER_8x8_PRI test t1d, t1d jz .end_no_clip mova m26, [rsp+0x00] mova m27, [rsp+0x40] mova m28, [rsp+0x80] mova m29, [rsp+0xC0] call .mask_edges_sec jmp .end_clip .mask_edges_sec: vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] 
{1to16} vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} vpshufbitqmb k1, m26, m20 vpshufbitqmb k2, m27, m20 vpshufbitqmb k3, m28, m20 vpshufbitqmb k4, m29, m20 mova m16, m4 mova m17, m5 mova m18, m6 mova m19, m7 vpermb m16{k1}, m20, m12 vpermb m17{k2}, m20, m13 vpermb m18{k3}, m20, m14 vpermb m19{k4}, m20, m15 vpshufbitqmb k1, m26, m21 vpshufbitqmb k2, m27, m21 vpshufbitqmb k3, m28, m21 vpshufbitqmb k4, m29, m21 vpermb m12, m21, m12 vpermb m13, m21, m13 vpermb m14, m21, m14 vpermb m15, m21, m15 vpblendmb m12{k1}, m4, m12 vpblendmb m13{k2}, m5, m13 vpblendmb m14{k3}, m6, m14 vpblendmb m15{k4}, m7, m15 jmp .sec_main ALIGN function_align .sec: vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 vpermb m17, m20, m13 ; pNtr vpermb m18, m20, m14 ; pNbl vpermb m19, m20, m15 ; pNbr vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 vpermb m13, m21, m13 ; pNtr vpermb m14, m21, m14 ; pNbl vpermb m15, m21, m15 ; pNbr .sec_main: %macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants vpcmpub k1, m4, %1, 6 vpcmpub k2, m5, %2, 6 vpcmpub k3, m6, %3, 6 vpcmpub k4, m7, %4, 6 psubb m20, %1, m4 psubb m21, %2, m5 psubb m22, %3, m6 psubb m23, %4, m7 %if %5 vpbroadcastb m28, t1d lzcnt t1d, t1d vpbroadcastq m29, [r3+t1*8] %endif vpsubb m20{k1}, m4, %1 vpsubb m21{k2}, m5, %2 vpsubb m22{k3}, m6, %3 vpsubb m23{k4}, m7, %4 gf2p8affineqb m24, m20, m29, 0 gf2p8affineqb m25, m21, m29, 0 gf2p8affineqb m26, m22, m29, 0 gf2p8affineqb m27, m23, m29, 0 %if %5 vpbroadcastd m30, [base+sec_tap] %endif psubusb m24, m28, m24 psubusb m25, m28, m25 psubusb m26, m28, m26 psubusb m27, m28, m27 pminub m20, m24 pminub m21, m25 pminub m22, m26 pminub m23, m27 mova m24, m30 mova m25, m30 mova m26, m30 mova m27, m30 vpsubb m24{k1}, m31, m30 vpsubb m25{k2}, m31, m30 vpsubb m26{k3}, m31, m30 vpsubb m27{k4}, m31, m30 vpdpbusd m0, m20, m24 vpdpbusd m1, m21, m25 vpdpbusd m2, m22, m26 vpdpbusd m3, m23, m27 %endmacro CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 CDEF_FILTER_8x8_SEC m12, m13, m14, m15 ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef_dist.asm000064400000000000000000000173141046102023000145170ustar 00000000000000; Copyright (c) 2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
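;
; Comment-only reference sketch (not part of the build) of what the
; cdef_dist_kernel_* routines below accumulate and what CDEF_DIST_REFINE_SSE2
; turns those sums into. The identifiers here are illustrative names, not
; symbols defined in this file:
;
;   sum_s  = sum(src[i][j])        sum_s2 = sum(src[i][j]^2)
;   sum_d  = sum(dst[i][j])        sum_d2 = sum(dst[i][j]^2)
;   sum_sd = sum(src[i][j] * dst[i][j])
;
;   svar = sum_s2 - ((sum_s * sum_s + (1 << (5 - scale))) >> (6 - scale))
;   dvar = sum_d2 - ((sum_d * sum_d + (1 << (5 - scale))) >> (6 - scale))
;   sse  = sum_s2 + sum_d2 - 2 * sum_sd
;
; where scale is the log2 area difference relative to 8x8 that
; CDEF_DIST_REFINE_SSE2 takes as its parameter; the kernel stores
; [svar << scale, dvar << scale, sse] through ret_ptr.
;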
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION .text %if ARCH_X86_64 ; m0: zero register ; m1: src input ; m2: dst input ; m3 = sum(src_{i,j}) ; m4 = sum(src_{i,j}^2) ; m5 = sum(dst_{i,j}) ; m6 = sum(dst_{i,j}^2) ; m7 = sum(src_{i,j} * dst_{i,j}) ; m8: tmp register %macro CDEF_DIST_W8_SSE2 0 psadbw m8, m1, m0 ; sum pixel values paddd m3, m8 ; accumulate punpcklbw m1, m0 ; convert to 16-bits pmaddwd m8, m1, m1 ; square and horz add paddd m4, m8 ; accumulate psadbw m8, m2, m0 ; same as above, but for dst paddd m5, m8 punpcklbw m2, m0 pmaddwd m8, m2, m2 paddd m6, m8 pmaddwd m8, m1, m2 ; src_{i,j} * dst_{i,j} (and horz add) paddd m7, m8 %endmacro ; Refine sums into variances and sse ; parameter: scale log2 relative to 8x8 %macro CDEF_DIST_REFINE_SSE2 1 ; Compute [sum(src)^2, sum(dst)^2] punpckldq m3, m5 ; store sums in a single vector pcmpeqd m0, m0 ; -1 (for rounding) pmaddwd m3, m3 ; Divide by area and round pslld m0, 5 - %1 psubd m3, m0 ; + (1 << (5 - %1)) psrld m3, 6 - %1 pshufd m0, m7, q3232 ; reduce sum(src * dst) punpckldq m1, m4, m6 ; reduce [sum(src^2), sum(dst^2)] paddd m7, m0 punpckhdq m4, m6 paddd m1, m4 paddd m7, m7 ; 2 * sum(src * dst) [Partially reduced; len 2] pshufd m0, m1, q3232 ; Equivelent to: ; paddd m1, m0 ; psubd m0, m1, m7 ; sse = sum(src^2) + sum(dst^2) - sum(src * dst) ; but with fewer dependancies psubd m7, m1 paddd m1, m0 ; [sum(src^2), sum(dst^2)] psubd m0, m7 ; sse (Needs reducing; len 2) psubd m1, m3 ; [src variance, dst variance] ; Scale up the variances up to 8x8 ; TODO: this can be handled inside ssim boost in the future %if %1 != 0 pslld m1, %1 %endif movq [ret_ptrq], m1 ; Final reduce for sse pshuflw m2, m0, q3232 paddd m0, m2 movd [ret_ptrq+8], m0 %endmacro INIT_XMM sse2 cglobal cdef_dist_kernel_4x4, 5, 5, 9, \ src, src_stride, dst, dst_stride, ret_ptr pxor m0, m0 movd m1, [srcq] movd m2, [srcq+src_strideq] punpckldq m1, m2 movd m2, [dstq] movd m8, [dstq+dst_strideq] lea srcq, [srcq+2*src_strideq] lea dstq, [dstq+2*dst_strideq] punpckldq m2, m8 psadbw m3, m1, m0 punpcklbw m1, m0 pmaddwd m4, m1, m1 psadbw m5, m2, m0 punpcklbw m2, m0 pmaddwd m6, m2, m2 pmaddwd m7, m1, m2 movd m1, [srcq] movd m2, [srcq+src_strideq] punpckldq m1, m2 movd m2, [dstq] movd m8, [dstq+dst_strideq] punpckldq m2, m8 CDEF_DIST_W8_SSE2 CDEF_DIST_REFINE_SSE2 2 RET cglobal cdef_dist_kernel_4x8, 5, 7, 9, \ src, src_stride, dst, dst_stride, ret_ptr, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] pxor m0, m0 movd m1, [srcq] movd m2, [srcq+src_strideq] punpckldq m1, m2 movd m2, [dstq] movd m8, [dstq+dst_strideq] punpckldq m2, m8 psadbw m3, m1, m0 punpcklbw m1, m0 pmaddwd m4, m1, m1 psadbw m5, m2, m0 punpcklbw m2, m0 pmaddwd m6, m2, m2 pmaddwd m7, m1, m2 movd m1, [srcq+2*src_strideq] movd m2, [srcq+src_stride3q] punpckldq m1, m2 movd m2, [dstq+2*dst_strideq] movd m8, [dstq+dst_stride3q] lea srcq, [srcq+4*src_strideq] lea dstq, [dstq+4*dst_strideq] punpckldq m2, m8 CDEF_DIST_W8_SSE2 movd m1, [srcq] movd m2, [srcq+src_strideq] punpckldq m1, m2 movd m2, [dstq] movd m8, [dstq+dst_strideq] punpckldq m2, m8 CDEF_DIST_W8_SSE2 movd m1, [srcq+2*src_strideq] movd m2, [srcq+src_stride3q] punpckldq m1, m2 movd m2, [dstq+2*dst_strideq] movd m8, [dstq+dst_stride3q] punpckldq m2, m8 CDEF_DIST_W8_SSE2 CDEF_DIST_REFINE_SSE2 1 RET cglobal cdef_dist_kernel_8x4, 5, 5, 9, \ src, src_stride, dst, dst_stride, ret_ptr pxor m0, m0 movq m1, [srcq] psadbw m3, m1, m0 punpcklbw m1, m0 pmaddwd m4, m1, m1 movq m2, [dstq] psadbw m5, m2, m0 punpcklbw m2, m0 pmaddwd 
m6, m2, m2 pmaddwd m7, m1, m2 movq m1, [srcq+src_strideq] movq m2, [dstq+dst_strideq] lea srcq, [srcq+2*src_strideq] lea dstq, [dstq+2*dst_strideq] CDEF_DIST_W8_SSE2 movq m1, [srcq] movq m2, [dstq] CDEF_DIST_W8_SSE2 movq m1, [srcq+src_strideq] movq m2, [dstq+dst_strideq] CDEF_DIST_W8_SSE2 CDEF_DIST_REFINE_SSE2 1 RET cglobal cdef_dist_kernel_8x8, 5, 7, 9, \ src, src_stride, dst, dst_stride, ret_ptr, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] pxor m0, m0 movq m1, [srcq] psadbw m3, m1, m0 punpcklbw m1, m0 pmaddwd m4, m1, m1 movq m2, [dstq] psadbw m5, m2, m0 punpcklbw m2, m0 pmaddwd m6, m2, m2 pmaddwd m7, m1, m2 movq m1, [srcq+src_strideq] movq m2, [dstq+dst_strideq] CDEF_DIST_W8_SSE2 movq m1, [srcq+2*src_strideq] movq m2, [dstq+2*dst_strideq] CDEF_DIST_W8_SSE2 movq m1, [srcq+src_stride3q] movq m2, [dstq+dst_stride3q] CDEF_DIST_W8_SSE2 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] movq m1, [srcq] movq m2, [dstq] CDEF_DIST_W8_SSE2 movq m1, [srcq+src_strideq] movq m2, [dstq+dst_strideq] CDEF_DIST_W8_SSE2 movq m1, [srcq+2*src_strideq] movq m2, [dstq+2*dst_strideq] CDEF_DIST_W8_SSE2 movq m1, [srcq+src_stride3q] movq m2, [dstq+dst_stride3q] CDEF_DIST_W8_SSE2 CDEF_DIST_REFINE_SSE2 0 RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef_rav1e.asm000064400000000000000000000217241046102023000145720ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
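;
; Comment-only reference sketch (identifiers below are illustrative, not
; symbols defined in this file) of the per-tap update that ACCUMULATE_TAP
; emulates with saturating 8-bit arithmetic:
;
;   constrain(diff, strength, shift) =
;       sign(diff) * min(|diff|, max(0, strength - (|diff| >> shift)))
;   sum += tap * constrain(p - px, strength, shift)
;
; and of the write-back performed after the k-loop:
;
;   dst = clamp(px + ((8 + sum - (sum < 0)) >> 4), min, max)
;
; where min/max are the minimum and maximum of px and the in-bounds taps
; gathered for that pixel.
;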
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 tap_table: ; masks for 8 bit shifts db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01 ; weights db 4, 2, 3, 3, 2, 1 db 0, 0 ; padding db -1, -2, 0, 0, 1, 2, 0, 0 db 0, -1, 0, 1, 1, 2, 0, 0 db 0, 0, 1, 2, -1, -2, 0, 0 db 0, 1, 1, 2, 0, -1, 0, 0 db 1, 2, 1, 2, 0, 0, 0, 0 db 1, 2, 1, 2, 0, 1, 0, 0 db 1, 2, -1, -2, 1, 2, 0, 0 db 1, 2, 0, -1, 1, 2, 0, 0 db 1, 2, 1, 2, 0, 0, 0, 0 db 1, 2, 1, 2, 0, -1, 0, 0 db 1, 2, 1, 2, 1, 2, 0, 0 db 1, 2, 0, 1, 1, 2, 0, 0 db 1, 2, 0, 0, 1, 2, 0, 0 db 0, 1, 0, -1, 1, 2, 0, 0 db 0, 0, 1, 2, 1, 2, 0, 0 db 0, -1, 1, 2, 0, 1, 0, 0 pw_128: times 2 dw 128 pw_2048: times 2 dw 2048 SECTION .text ; stride unused %macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride ; load p0/p1 movsx offq, dword [offsets+kq*4+%1] ; off1 %if %6 == 4 movq xm5, [tmp0q+offq*2] ; p0 movq xm6, [tmp2q+offq*2] movhps xm5, [tmp1q+offq*2] movhps xm6, [tmp3q+offq*2] vinserti128 m5, xm6, 1 %else movu xm5, [tmp0q+offq*2] ; p0 vinserti128 m5, [tmp1q+offq*2], 1 %endif neg offq ; -off1 %if %6 == 4 movq xm6, [tmp0q+offq*2] ; p1 movq xm9, [tmp2q+offq*2] movhps xm6, [tmp1q+offq*2] movhps xm9, [tmp3q+offq*2] vinserti128 m6, xm9, 1 %else movu xm6, [tmp0q+offq*2] ; p1 vinserti128 m6, [tmp1q+offq*2], 1 %endif ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. ; use signed max and unsigned min to remove them pmaxsw m7, m5 ; max after p0 pminuw m8, m5 ; min after p0 pmaxsw m7, m6 ; max after p1 pminuw m8, m6 ; min after p1 ; accumulate sum[m15] over p0/p1 ; calculate difference before converting psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) ; convert to 8-bits with signed saturation ; saturating to large diffs has no impact on the results packsswb m5, m6 ; group into pairs so we can accumulate using maddubsw pshufb m5, m12 pabsb m9, m5 psignb m10, %5, m5 psrlw m5, m9, %2 ; emulate 8-bit shift pand m5, %3 psubusb m5, %4, m5 ; use unsigned min since abs diff can equal 0x80 pminub m5, m9 pmaddubsw m5, m10 paddw m15, m5 %endmacro %macro CDEF_FILTER 2 ; w, h INIT_YMM avx2 %if %1*%2*2/mmsize > 1 %if %1 == 4 cglobal cdef_filter_%1x%2, 4, 13, 16, 64 %else cglobal cdef_filter_%1x%2, 4, 11, 16, 64 %endif %else cglobal cdef_filter_%1x%2, 4, 12, 16, 64 %endif %define offsets rsp+32 DEFINE_ARGS dst, dst_stride, tmp, tmp_stride, pri, sec, pridmp, table, \ secdmp, damping, dir lea tableq, [tap_table] ; off1/2/3[k] [6 total] movd xm2, tmp_strided vpbroadcastd m2, xm2 mov dird, r6m pmovsxbd m3, [tableq+dirq*8+16] pmulld m2, m3 pmovsxbd m4, [tableq+dirq*8+16+64] paddd m2, m4 mova [offsets], m2 ; register to shuffle values into after packing vbroadcasti128 m12, [shufb_lohi] movifnidn prid, prim mov dampingd, r7m lzcnt pridmpd, prid %if UNIX64 movd xm0, prid movd xm1, secd %endif lzcnt secdmpd, secm sub dampingd, 31 DEFINE_ARGS dst, dst_stride, tmp, tmp_stride, pri, sec, pridmp, table, \ secdmp, damping, zero xor zerod, zerod add pridmpd, dampingd cmovl pridmpd, zerod add secdmpd, dampingd cmovl secdmpd, zerod mov [rsp+0], pridmpq ; pri_shift mov [rsp+8], secdmpq ; sec_shift DEFINE_ARGS dst, dst_stride, tmp, tmp_stride, pri, sec, pridmp, table, \ secdmp vpbroadcastb m13, [tableq+pridmpq] ; 
pri_shift_mask vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask ; pri/sec_taps[k] [4 total] DEFINE_ARGS dst, dst_stride, tmp, tmp_stride, pri, sec, dummy, table, \ secdmp %if UNIX64 vpbroadcastb m0, xm0 ; pri_strength vpbroadcastb m1, xm1 ; sec_strength %else vpbroadcastb m0, prim vpbroadcastb m1, secm %endif and prid, 1 lea priq, [tableq+priq*2+8] ; pri_taps lea secq, [tableq+12] ; sec_taps %if %1*%2*2/mmsize > 1 %if %1 == 4 DEFINE_ARGS dst, dst_stride, tmp0, tmp_stride, pri, sec, dst_stride3, h, off, k, tmp1, tmp2, tmp3 lea dst_stride3q, [dst_strideq*3] %else DEFINE_ARGS dst, dst_stride, tmp0, tmp_stride, pri, sec, h, off, k, tmp1 %endif mov hd, %1*%2*2/mmsize %else DEFINE_ARGS dst, dst_stride, tmp0, tmp_stride, pri, sec, dst_stride3, off, k, tmp1, tmp2, tmp3 lea dst_stride3q, [dst_strideq*3] %endif pxor m11, m11 %if %1*%2*2/mmsize > 1 .v_loop: %endif lea tmp1q, [tmp0q+tmp_strideq*2] %if %1 == 4 lea tmp2q, [tmp0q+tmp_strideq*4] lea tmp3q, [tmp1q+tmp_strideq*4] %endif mov kd, 1 %if %1 == 4 movq xm4, [tmp0q] movhps xm4, [tmp1q] movq xm5, [tmp2q] movhps xm5, [tmp3q] vinserti128 m4, xm5, 1 %else mova xm4, [tmp0q] ; px vinserti128 m4, [tmp1q], 1 %endif pxor m15, m15 ; sum mova m7, m4 ; max mova m8, m4 ; min .k_loop: vpbroadcastb m2, [priq+kq] ; pri_taps vpbroadcastb m3, [secq+kq] ; sec_taps ACCUMULATE_TAP 0*8, [rsp+0], m13, m0, m2, %1, %3 ACCUMULATE_TAP 1*8, [rsp+8], m14, m1, m3, %1, %3 ACCUMULATE_TAP 2*8, [rsp+8], m14, m1, m3, %1, %3 dec kq jge .k_loop vpbroadcastd m10, [pw_2048] pcmpgtw m9, m11, m15 paddw m15, m9 pmulhrsw m15, m10 paddw m4, m15 pminsw m4, m7 pmaxsw m4, m8 packuswb m4, m4 vextracti128 xm5, m4, 1 %if %1 == 4 movd [dstq+dst_strideq*0], xm4 pextrd [dstq+dst_strideq*1], xm4, 1 movd [dstq+dst_strideq*2], xm5 pextrd [dstq+dst_stride3q], xm5, 1 %else movq [dstq+dst_strideq*0], xm4 movq [dstq+dst_strideq*1], xm5 %endif %if %1*%2*2/mmsize > 1 %define vloop_lines (mmsize/(%1*2)) lea dstq, [dstq+dst_strideq*vloop_lines] lea tmp0q, [tmp0q+tmp_strideq*2*vloop_lines] dec hd jg .v_loop %endif RET %endmacro CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/cdef_sse.asm000064400000000000000000001174221046102023000143470ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2019, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 %macro DUP8 1-* %rep %0 times 8 db %1 %rotate 1 %endrep %endmacro div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105 dd 420, 210, 140, 105, 105, 105, 105, 105 div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 dw 168, 168, 140, 140, 120, 120, 105, 105 dw 420, 420, 210, 210, 140, 140, 105, 105 dw 105, 105, 105, 105, 105, 105, 105, 105 const shufw_6543210x, \ db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 pw_8: times 8 dw 8 pw_128: times 8 dw 128 pw_256: times 8 dw 256 pw_2048: times 8 dw 2048 pw_0x7FFF: times 8 dw 0x7FFF pw_0x8000: times 8 dw 0x8000 tap_table: ; masks for 8-bit shift emulation DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 ; weights DUP8 4, 2, 3, 3, 2, 1 ; taps indices db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 db 1 * 16 + 0, 2 * 16 + 0 db 1 * 16 + 0, 2 * 16 - 1 ; the last 6 are repeats of the first 6 so we don't need to & 7 db -1 * 16 + 1, -2 * 16 + 2 db 0 * 16 + 1, -1 * 16 + 2 db 0 * 16 + 1, 0 * 16 + 2 db 0 * 16 + 1, 1 * 16 + 2 db 1 * 16 + 1, 2 * 16 + 2 db 1 * 16 + 0, 2 * 16 + 1 SECTION .text %macro movif32 2 %if ARCH_X86_32 mov %1, %2 %endif %endmacro %macro PMOVZXBW 2-3 0 ; %3 = half %if cpuflag(sse4) && %3 == 0 pmovzxbw %1, %2 %else %if %3 == 1 movd %1, %2 %else movq %1, %2 %endif punpcklbw %1, m7 %endif %endmacro %macro PSHUFB_0 2 %if cpuflag(ssse3) pshufb %1, %2 %else punpcklbw %1, %1 pshuflw %1, %1, q0000 punpcklqdq %1, %1 %endif %endmacro %macro MOVDDUP 2 %if cpuflag(ssse3) movddup %1, %2 %else movq %1, %2 punpcklqdq %1, %1 %endif %endmacro %macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax ; load p0/p1 movsx offq, byte [dirq+kq+%1+14*8] ; off1 %if %6 == 4 movq m5, [stkq+offq*2+32*0] ; p0 movhps m5, [stkq+offq*2+32*1] %else movu m5, [stkq+offq*2+32*0] ; p0 %endif neg offq ; -off1 %if %6 == 4 movq m6, [stkq+offq*2+32*0] ; p1 movhps m6, [stkq+offq*2+32*1] %else movu m6, [stkq+offq*2+32*0] ; p1 %endif %if %7 %if cpuflag(sse4) ; out of bounds values are set to a value that is a both a large unsigned ; value and a negative signed value. 
; use signed max and unsigned min to remove them pmaxsw m7, m5 pminuw m8, m5 pmaxsw m7, m6 pminuw m8, m6 %else pcmpeqw m3, m14, m5 pminsw m8, m5 ; min after p0 pandn m3, m5 pmaxsw m7, m3 ; max after p0 pcmpeqw m3, m14, m6 pminsw m8, m6 ; min after p1 pandn m3, m6 pmaxsw m7, m3 ; max after p1 %endif %endif ; accumulate sum[m13] over p0/p1 psubw m5, m4 ; diff_p0(p0 - px) psubw m6, m4 ; diff_p1(p1 - px) packsswb m5, m6 ; convert pixel diff to 8-bit %if cpuflag(ssse3) pshufb m5, m13 ; group diffs p0 and p1 into pairs pabsb m6, m5 psignb m3, %5, m5 %else movlhps m6, m5 punpckhbw m6, m5 pxor m5, m5 pcmpgtb m5, m6 paddb m6, m5 pxor m6, m5 paddb m3, %5, m5 pxor m3, m5 %endif pand m9, %3, m6 ; emulate 8-bit shift psrlw m9, %2 psubusb m5, %4, m9 pminub m5, m6 ; constrain(diff_p) %if cpuflag(ssse3) pmaddubsw m5, m3 ; constrain(diff_p) * taps %else psrlw m9, m5, 8 psraw m6, m3, 8 psllw m5, 8 psllw m3, 8 pmullw m9, m6 pmulhw m5, m3 paddw m5, m9 %endif paddw m0, m5 %endmacro %macro LOAD_BODY 3 ; dst, src, block_width %if %3 == 4 PMOVZXBW m0, [%2+strideq*0] PMOVZXBW m1, [%2+strideq*1] PMOVZXBW m2, [%2+strideq*2] PMOVZXBW m3, [%2+stride3q] mova [%1+32*0], m0 mova [%1+32*1], m1 mova [%1+32*2], m2 mova [%1+32*3], m3 %else movu m0, [%2+strideq*0] movu m1, [%2+strideq*1] movu m2, [%2+strideq*2] movu m3, [%2+stride3q] punpcklbw m4, m0, m7 punpckhbw m0, m7 mova [%1+32*0+ 0], m4 mova [%1+32*0+16], m0 punpcklbw m4, m1, m7 punpckhbw m1, m7 mova [%1+32*1+ 0], m4 mova [%1+32*1+16], m1 punpcklbw m4, m2, m7 punpckhbw m2, m7 mova [%1+32*2+ 0], m4 mova [%1+32*2+16], m2 punpcklbw m4, m3, m7 punpckhbw m3, m7 mova [%1+32*3+ 0], m4 mova [%1+32*3+16], m3 %endif %endmacro %macro CDEF_FILTER_END 2 ; w, minmax pxor m6, m6 pcmpgtw m6, m0 paddw m0, m6 %if cpuflag(ssse3) pmulhrsw m0, m15 %else paddw m0, m15 psraw m0, 4 %endif paddw m4, m0 %if %2 pminsw m4, m7 pmaxsw m4, m8 %endif packuswb m4, m4 %if %1 == 4 movd [dstq+strideq*0], m4 psrlq m4, 32 movd [dstq+strideq*1], m4 add stkq, 32*2 lea dstq, [dstq+strideq*2] %else movq [dstq], m4 add stkq, 32 add dstq, strideq %endif %endmacro %macro CDEF_FILTER 2 ; w, h %if ARCH_X86_64 cglobal cdef_filter_%1x%2_8bpc, 5, 9, 16, 3 * 16 + (%2+4)*32, \ dst, stride, left, top, bot, pri, dst4, edge, \ stride3 %define px rsp+3*16+2*32 %define base 0 %else cglobal cdef_filter_%1x%2_8bpc, 2, 7, 8, - 7 * 16 - (%2+4)*32, \ dst, stride, left, edge, stride3 %define topq r2 %define botq r2 %define dst4q r2 LEA r5, tap_table %define px esp+7*16+2*32 %define base r5-tap_table %endif mov edged, r9m %if cpuflag(sse4) %define OUT_OF_BOUNDS_MEM [base+pw_0x8000] %else %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF] %endif mova m6, OUT_OF_BOUNDS_MEM pxor m7, m7 ; prepare pixel buffers - body/right %if %2 == 8 lea dst4q, [dstq+strideq*4] %endif lea stride3q, [strideq*3] test edgeb, 2 ; have_right jz .no_right LOAD_BODY px, dstq, %1 %if %2 == 8 LOAD_BODY px+4*32, dst4q, %1 %endif jmp .body_done .no_right: PMOVZXBW m0, [dstq+strideq*0], %1 == 4 PMOVZXBW m1, [dstq+strideq*1], %1 == 4 PMOVZXBW m2, [dstq+strideq*2], %1 == 4 PMOVZXBW m3, [dstq+stride3q ], %1 == 4 mova [px+32*0], m0 mova [px+32*1], m1 mova [px+32*2], m2 mova [px+32*3], m3 movd [px+32*0+%1*2], m6 movd [px+32*1+%1*2], m6 movd [px+32*2+%1*2], m6 movd [px+32*3+%1*2], m6 %if %2 == 8 PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 PMOVZXBW m1, [dst4q+strideq*1], %1 == 4 PMOVZXBW m2, [dst4q+strideq*2], %1 == 4 PMOVZXBW m3, [dst4q+stride3q ], %1 == 4 mova [px+32*4], m0 mova [px+32*5], m1 mova [px+32*6], m2 mova [px+32*7], m3 movd [px+32*4+%1*2], m6 movd [px+32*5+%1*2], m6 
movd [px+32*6+%1*2], m6 movd [px+32*7+%1*2], m6 %endif .body_done: ; top movifnidn topq, r3mp test edgeb, 4 ; have_top jz .no_top test edgeb, 1 ; have_left jz .top_no_left test edgeb, 2 ; have_right jz .top_no_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-2] PMOVZXBW m1, [topq+strideq*1-2] %else movu m0, [topq+strideq*0-4] movu m1, [topq+strideq*1-4] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movu [px-32*2+8], m2 movu [px-32*1+8], m3 %endif movu [px-32*2-%1], m0 movu [px-32*1-%1], m1 jmp .top_done .top_no_right: %if %1 == 4 PMOVZXBW m0, [topq+strideq*0-%1] PMOVZXBW m1, [topq+strideq*1-%1] movu [px-32*2-8], m0 movu [px-32*1-8], m1 %else movu m0, [topq+strideq*0-%1] movu m1, [topq+strideq*1-%2] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px-32*2-16], m0 mova [px-32*2+ 0], m2 mova [px-32*1-16], m1 mova [px-32*1+ 0], m3 %endif movd [px-32*2+%1*2], m6 movd [px-32*1+%1*2], m6 jmp .top_done .top_no_left: test edgeb, 2 ; have_right jz .top_no_left_right %if %1 == 4 PMOVZXBW m0, [topq+strideq*0] PMOVZXBW m1, [topq+strideq*1] %else movu m0, [topq+strideq*0] movu m1, [topq+strideq*1] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movd [px-32*2+16], m2 movd [px-32*1+16], m3 %endif movd [px-32*2- 4], m6 movd [px-32*1- 4], m6 mova [px-32*2+ 0], m0 mova [px-32*1+ 0], m1 jmp .top_done .top_no_left_right: PMOVZXBW m0, [topq+strideq*0], %1 == 4 PMOVZXBW m1, [topq+strideq*1], %1 == 4 movd [px-32*2-4], m6 movd [px-32*1-4], m6 mova [px-32*2+0], m0 mova [px-32*1+0], m1 movd [px-32*2+%1*2], m6 movd [px-32*1+%1*2], m6 jmp .top_done .no_top: movu [px-32*2- 4], m6 movu [px-32*1- 4], m6 %if %1 == 8 movq [px-32*2+12], m6 movq [px-32*1+12], m6 %endif .top_done: ; left test edgeb, 1 ; have_left jz .no_left movifnidn leftq, leftmp %if %2 == 4 movq m0, [leftq] %else movu m0, [leftq] %endif %if %2 == 4 punpcklbw m0, m7 %else punpckhbw m1, m0, m7 punpcklbw m0, m7 movhlps m3, m1 movd [px+32*4-4], m1 movd [px+32*6-4], m3 psrlq m1, 32 psrlq m3, 32 movd [px+32*5-4], m1 movd [px+32*7-4], m3 %endif movhlps m2, m0 movd [px+32*0-4], m0 movd [px+32*2-4], m2 psrlq m0, 32 psrlq m2, 32 movd [px+32*1-4], m0 movd [px+32*3-4], m2 jmp .left_done .no_left: movd [px+32*0-4], m6 movd [px+32*1-4], m6 movd [px+32*2-4], m6 movd [px+32*3-4], m6 %if %2 == 8 movd [px+32*4-4], m6 movd [px+32*5-4], m6 movd [px+32*6-4], m6 movd [px+32*7-4], m6 %endif .left_done: ; bottom movifnidn botq, r4mp test edgeb, 8 ; have_bottom jz .no_bottom test edgeb, 1 ; have_left jz .bottom_no_left test edgeb, 2 ; have_right jz .bottom_no_right %if %1 == 4 PMOVZXBW m0, [botq+strideq*0-(%1/2)] PMOVZXBW m1, [botq+strideq*1-(%1/2)] %else movu m0, [botq+strideq*0-4] movu m1, [botq+strideq*1-4] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 movu [px+32*(%2+0)+8], m2 movu [px+32*(%2+1)+8], m3 %endif movu [px+32*(%2+0)-%1], m0 movu [px+32*(%2+1)-%1], m1 jmp .bottom_done .bottom_no_right: %if %1 == 4 PMOVZXBW m0, [botq+strideq*0-4] PMOVZXBW m1, [botq+strideq*1-4] movu [px+32*(%2+0)-8], m0 movu [px+32*(%2+1)-8], m1 %else movu m0, [botq+strideq*0-8] movu m1, [botq+strideq*1-8] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px+32*(%2+0)-16], m0 mova [px+32*(%2+0)+ 0], m2 mova [px+32*(%2+1)-16], m1 mova [px+32*(%2+1)+ 0], m3 movd [px+32*(%2-1)+16], m6 ; overwritten by first mova %endif movd [px+32*(%2+0)+%1*2], m6 movd [px+32*(%2+1)+%1*2], m6 jmp .bottom_done .bottom_no_left: test edgeb, 2 ; have_right jz 
.bottom_no_left_right %if %1 == 4 PMOVZXBW m0, [botq+strideq*0] PMOVZXBW m1, [botq+strideq*1] %else movu m0, [botq+strideq*0] movu m1, [botq+strideq*1] punpckhbw m2, m0, m7 punpcklbw m0, m7 punpckhbw m3, m1, m7 punpcklbw m1, m7 mova [px+32*(%2+0)+16], m2 mova [px+32*(%2+1)+16], m3 %endif mova [px+32*(%2+0)+ 0], m0 mova [px+32*(%2+1)+ 0], m1 movd [px+32*(%2+0)- 4], m6 movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .bottom_no_left_right: PMOVZXBW m0, [botq+strideq*0], %1 == 4 PMOVZXBW m1, [botq+strideq*1], %1 == 4 mova [px+32*(%2+0)+ 0], m0 mova [px+32*(%2+1)+ 0], m1 movd [px+32*(%2+0)+%1*2], m6 movd [px+32*(%2+1)+%1*2], m6 movd [px+32*(%2+0)- 4], m6 movd [px+32*(%2+1)- 4], m6 jmp .bottom_done .no_bottom: movu [px+32*(%2+0)- 4], m6 movu [px+32*(%2+1)- 4], m6 %if %1 == 8 movq [px+32*(%2+0)+12], m6 movq [px+32*(%2+1)+12], m6 %endif .bottom_done: ; actual filter %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, pridmp, damping, pri, sec mova m13, [shufb_lohi] %if cpuflag(ssse3) mova m15, [pw_2048] %else mova m15, [pw_8] %endif mova m14, m6 %else DEFINE_ARGS dst, pridmp, sec, damping, pri, tap %xdefine m8 m1 %xdefine m9 m2 %xdefine m10 m0 %xdefine m13 [base+shufb_lohi] %xdefine m14 OUT_OF_BOUNDS_MEM %if cpuflag(ssse3) %xdefine m15 [base+pw_2048] %else %xdefine m15 [base+pw_8] %endif %endif movifnidn prid, r5m movifnidn secd, r6m mov dampingd, r8m movif32 [esp+0x3C], r1d test prid, prid jz .sec_only movd m1, r5m bsr pridmpd, prid test secd, secd jz .pri_only movd m10, r6m tzcnt secd, secd and prid, 1 sub pridmpd, dampingd sub secd, dampingd xor dampingd, dampingd add prid, prid neg pridmpd cmovs pridmpd, dampingd neg secd PSHUFB_0 m1, m7 PSHUFB_0 m10, m7 %if ARCH_X86_64 DEFINE_ARGS dst, stride, _, pridmp, tap, pri, sec lea tapq, [tap_table] MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask mov [rsp+0x00], pridmpq ; pri_shift mov [rsp+0x10], secq ; sec_shift DEFINE_ARGS dst, stride, h, dir, tap, pri, stk, k, off %else MOVDDUP m2, [tapq+pridmpq*8] MOVDDUP m3, [tapq+secq*8] mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP mov [esp+0x00], pridmpd mov [esp+0x30], secd DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %define offq dstq %define kd strided %define kq strideq mova [esp+0x10], m2 mova [esp+0x40], m3 mova [esp+0x20], m1 mova [esp+0x50], m10 %endif mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] ; pri_taps mov hd, %1*%2/8 lea dirq, [tapq+dirq*2] .v_loop: movif32 [esp+0x38], dstd mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] ; px %endif pxor m0, m0 ; sum mova m7, m4 ; max mova m8, m4 ; min .k_loop: MOVDDUP m2, [priq+kq*8] %if ARCH_X86_64 ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1 ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1 %else ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1 %endif dec kd jge .k_loop movif32 dstq, [esp+0x38] movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 1 dec hd jg .v_loop RET .pri_only: %if ARCH_X86_64 DEFINE_ARGS dst, stride, zero, pridmp, damping, pri, tap lea tapq, [tap_table] %else DEFINE_ARGS dst, pridmp, zero, damping, pri, tap %endif and prid, 1 xor zerod, zerod sub dampingd, pridmpd cmovs dampingd, zerod add prid, 
prid PSHUFB_0 m1, m7 MOVDDUP m7, [tapq+dampingq*8] mov [rsp+0x00], dampingq %if ARCH_X86_64 DEFINE_ARGS dst, stride, h, dir, stk, pri, tap, k, off %else mov [rsp+0x04], zerod DEFINE_ARGS dst, stride, dir, stk, pri, tap, h %endif mov dird, r7m lea stkq, [px] lea priq, [tapq+8*8+priq*8] mov hd, %1*%2/8 lea dirq, [tapq+dirq*2] .pri_v_loop: movif32 [esp+0x38], dstd mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] %endif pxor m0, m0 .pri_k_loop: MOVDDUP m2, [priq+kq*8] ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0 dec kd jge .pri_k_loop movif32 dstq, [esp+0x38] movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 0 dec hd jg .pri_v_loop RET .sec_only: %if ARCH_X86_64 DEFINE_ARGS dst, stride, zero, dir, damping, tap, sec %else DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero %endif movd m1, r6m tzcnt secd, secd mov dird, r7m xor zerod, zerod sub dampingd, secd cmovs dampingd, zerod PSHUFB_0 m1, m7 %if ARCH_X86_64 lea tapq, [tap_table] %else mov [rsp+0x04], zerod %endif mov [rsp+0x00], dampingq MOVDDUP m7, [tapq+dampingq*8] lea dirq, [tapq+dirq*2] %if ARCH_X86_64 DEFINE_ARGS dst, stride, h, dir, stk, tap, off, k %else DEFINE_ARGS dst, stride, off, stk, dir, tap, h %endif lea stkq, [px] mov hd, %1*%2/8 .sec_v_loop: mov kd, 1 %if %1 == 4 movq m4, [stkq+32*0] movhps m4, [stkq+32*1] %else mova m4, [stkq+32*0] %endif pxor m0, m0 .sec_k_loop: MOVDDUP m2, [tapq+12*8+kq*8] ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0 %if ARCH_X86_32 MOVDDUP m2, [tapq+12*8+kq*8] %endif ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0 dec kd jge .sec_k_loop movif32 strideq, [esp+0x3C] CDEF_FILTER_END %1, 0 dec hd jg .sec_v_loop RET %endmacro %macro MULLD 2 %if cpuflag(sse4) pmulld %1, %2 %else %if ARCH_X86_32 %define m15 m1 %endif pmulhuw m15, %1, %2 pmullw %1, %2 pslld m15, 16 paddd %1, m15 %endif %endmacro %macro CDEF_DIR 0 %if ARCH_X86_64 cglobal cdef_dir_8bpc, 3, 7, 16, src, stride, var lea r6, [strideq*3] movq m1, [srcq+strideq*0] movhps m1, [srcq+strideq*1] movq m3, [srcq+strideq*2] movhps m3, [srcq+r6 ] lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+r6 ] pxor m8, m8 psadbw m9, m1, m8 psadbw m2, m3, m8 psadbw m4, m5, m8 psadbw m6, m7, m8 packssdw m9, m2 packssdw m4, m6 packssdw m9, m4 punpcklbw m0, m1, m8 punpckhbw m1, m8 punpcklbw m2, m3, m8 punpckhbw m3, m8 punpcklbw m4, m5, m8 punpckhbw m5, m8 punpcklbw m6, m7, m8 punpckhbw m7, m8 cglobal_label .main mova m8, [pw_128] psubw m0, m8 psubw m1, m8 psubw m2, m8 psubw m3, m8 psubw m4, m8 psubw m5, m8 psubw m6, m8 psubw m7, m8 psllw m8, 3 psubw m9, m8 ; partial_sum_hv[0] paddw m8, m0, m1 paddw m10, m2, m3 paddw m8, m4 paddw m10, m5 paddw m8, m6 paddw m10, m7 paddw m8, m10 ; partial_sum_hv[1] pmaddwd m8, m8 pmaddwd m9, m9 phaddd m9, m8 SWAP m8, m9 MULLD m8, [div_table%+SUFFIX+48] pslldq m9, m1, 2 psrldq m10, m1, 14 pslldq m11, m2, 4 psrldq m12, m2, 12 pslldq m13, m3, 6 psrldq m14, m3, 10 paddw m9, m0 paddw m10, m12 paddw m11, m13 paddw m10, m14 ; partial_sum_diag[0] top/right half paddw m9, m11 ; partial_sum_diag[0] top/left half pslldq m11, m4, 8 psrldq m12, m4, 8 pslldq m13, m5, 10 psrldq m14, m5, 6 paddw m9, m11 paddw m10, m12 paddw m9, m13 paddw m10, m14 pslldq m11, m6, 12 psrldq m12, m6, 4 pslldq m13, m7, 14 psrldq m14, m7, 2 paddw m9, m11 paddw m10, m12 paddw m9, m13 ; partial_sum_diag[0][0-7] paddw m10, m14 ; partial_sum_diag[0][8-14,zero] pshufb m10, [shufw_6543210x] punpckhwd m11, m9, m10 punpcklwd m9, m10 pmaddwd m11, m11 pmaddwd m9, m9 MULLD m11, 
[div_table%+SUFFIX+16] MULLD m9, [div_table%+SUFFIX+0] paddd m9, m11 ; cost[0a-d] pslldq m10, m0, 14 psrldq m11, m0, 2 pslldq m12, m1, 12 psrldq m13, m1, 4 pslldq m14, m2, 10 psrldq m15, m2, 6 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 pslldq m12, m3, 8 psrldq m13, m3, 8 pslldq m14, m4, 6 psrldq m15, m4, 10 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 pslldq m12, m5, 4 psrldq m13, m5, 12 pslldq m14, m6, 2 psrldq m15, m6, 14 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m15 ; partial_sum_diag[1][8-14,zero] paddw m10, m7 ; partial_sum_diag[1][0-7] pshufb m11, [shufw_6543210x] punpckhwd m12, m10, m11 punpcklwd m10, m11 pmaddwd m12, m12 pmaddwd m10, m10 MULLD m12, [div_table%+SUFFIX+16] MULLD m10, [div_table%+SUFFIX+0] paddd m10, m12 ; cost[4a-d] phaddd m9, m10 ; cost[0a/b,4a/b] paddw m10, m0, m1 paddw m11, m2, m3 paddw m12, m4, m5 paddw m13, m6, m7 phaddw m0, m4 phaddw m1, m5 phaddw m2, m6 phaddw m3, m7 ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1) pslldq m4, m11, 2 psrldq m5, m11, 14 pslldq m6, m12, 4 psrldq m7, m12, 12 pslldq m14, m13, 6 psrldq m15, m13, 10 paddw m4, m10 paddw m5, m7 paddw m4, m6 paddw m5, m15 ; partial_sum_alt[3] right paddw m4, m14 ; partial_sum_alt[3] left pshuflw m6, m5, q3012 punpckhwd m5, m4 punpcklwd m4, m6 pmaddwd m5, m5 pmaddwd m4, m4 MULLD m5, [div_table%+SUFFIX+48] MULLD m4, [div_table%+SUFFIX+32] paddd m4, m5 ; cost[7a-d] pslldq m5, m10, 6 psrldq m6, m10, 10 pslldq m7, m11, 4 psrldq m10, m11, 12 pslldq m11, m12, 2 psrldq m12, 14 paddw m5, m7 paddw m6, m10 paddw m5, m11 paddw m6, m12 paddw m5, m13 pshuflw m7, m6, q3012 punpckhwd m6, m5 punpcklwd m5, m7 pmaddwd m6, m6 pmaddwd m5, m5 MULLD m6, [div_table%+SUFFIX+48] MULLD m5, [div_table%+SUFFIX+32] paddd m5, m6 ; cost[5a-d] pslldq m6, m1, 2 psrldq m7, m1, 14 pslldq m10, m2, 4 psrldq m11, m2, 12 pslldq m12, m3, 6 psrldq m13, m3, 10 paddw m6, m0 paddw m7, m11 paddw m6, m10 paddw m7, m13 ; partial_sum_alt[3] right paddw m6, m12 ; partial_sum_alt[3] left pshuflw m10, m7, q3012 punpckhwd m7, m6 punpcklwd m6, m10 pmaddwd m7, m7 pmaddwd m6, m6 MULLD m7, [div_table%+SUFFIX+48] MULLD m6, [div_table%+SUFFIX+32] paddd m6, m7 ; cost[1a-d] pshufd m0, m0, q1032 pshufd m1, m1, q1032 pshufd m2, m2, q1032 pshufd m3, m3, q1032 pslldq m10, m0, 6 psrldq m11, m0, 10 pslldq m12, m1, 4 psrldq m13, m1, 12 pslldq m14, m2, 2 psrldq m2, 14 paddw m10, m12 paddw m11, m13 paddw m10, m14 paddw m11, m2 paddw m10, m3 pshuflw m12, m11, q3012 punpckhwd m11, m10 punpcklwd m10, m12 pmaddwd m11, m11 pmaddwd m10, m10 MULLD m11, [div_table%+SUFFIX+48] MULLD m10, [div_table%+SUFFIX+32] paddd m10, m11 ; cost[3a-d] phaddd m9, m8 ; cost[0,4,2,6] phaddd m6, m10 phaddd m5, m4 phaddd m6, m5 ; cost[1,3,5,7] pshufd m4, m9, q3120 ; now find the best cost %if cpuflag(sse4) pmaxsd m9, m6 pshufd m0, m9, q1032 pmaxsd m0, m9 pshufd m1, m0, q2301 pmaxsd m0, m1 ; best cost %else pcmpgtd m0, m9, m6 pand m9, m0 pandn m0, m6 por m9, m0 pshufd m1, m9, q1032 pcmpgtd m0, m9, m1 pand m9, m0 pandn m0, m1 por m9, m0 pshufd m1, m9, q2301 pcmpgtd m0, m9, m1 pand m9, m0 pandn m0, m1 por m0, m9 %endif ; get direction and variance punpckhdq m1, m4, m6 punpckldq m4, m6 psubd m2, m0, m1 psubd m3, m0, m4 %if WIN64 WIN64_RESTORE_XMM %define tmp rsp+stack_offset+8 %else %define tmp rsp-40 %endif mova [tmp+0x00], m2 ; emulate ymm in stack mova [tmp+0x10], m3 pcmpeqd m1, m0 ; compute best cost mask pcmpeqd m4, m0 packssdw m4, m1 pmovmskb eax, m4 ; get byte-idx from mask tzcnt eax, eax mov r1d, [tmp+rax*2] ; get idx^4 
complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [varq], r1d %else cglobal cdef_dir_8bpc, 2, 4, 8, 96, src, stride, var, stride3 %define base r2-shufw_6543210x LEA r2, shufw_6543210x pxor m0, m0 lea stride3q, [strideq*3] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] mova m1, [base+pw_128] psadbw m2, m5, m0 psadbw m3, m7, m0 packssdw m2, m3 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 mova [esp+0x00], m4 mova [esp+0x10], m5 mova [esp+0x20], m6 mova [esp+0x50], m7 lea srcq, [srcq+strideq*4] movq m5, [srcq+strideq*0] movhps m5, [srcq+strideq*1] movq m7, [srcq+strideq*2] movhps m7, [srcq+stride3q] psadbw m3, m5, m0 psadbw m0, m7 packssdw m3, m0 pxor m0, m0 punpcklbw m4, m5, m0 punpckhbw m5, m0 punpcklbw m6, m7, m0 punpckhbw m7, m0 cglobal_label .main psubw m4, m1 psubw m5, m1 psubw m6, m1 psubw m7, m1 packssdw m2, m3 psllw m1, 3 psubw m2, m1 ; partial_sum_hv[0] pmaddwd m2, m2 mova m3, [esp+0x50] mova m0, [esp+0x00] paddw m0, [esp+0x10] paddw m1, m3, [esp+0x20] paddw m0, m4 paddw m1, m5 paddw m0, m6 paddw m1, m7 paddw m0, m1 ; partial_sum_hv[1] pmaddwd m0, m0 phaddd m2, m0 MULLD m2, [base+div_table%+SUFFIX+48] mova [esp+0x30], m2 mova m1, [esp+0x10] pslldq m0, m1, 2 psrldq m1, 14 paddw m0, [esp+0x00] pslldq m2, m3, 6 psrldq m3, 10 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x20] pslldq m2, m3, 4 psrldq m3, 12 paddw m0, m2 ; partial_sum_diag[0] top/left half paddw m1, m3 ; partial_sum_diag[0] top/right half pslldq m2, m4, 8 psrldq m3, m4, 8 paddw m0, m2 paddw m1, m3 pslldq m2, m5, 10 psrldq m3, m5, 6 paddw m0, m2 paddw m1, m3 pslldq m2, m6, 12 psrldq m3, m6, 4 paddw m0, m2 paddw m1, m3 pslldq m2, m7, 14 psrldq m3, m7, 2 paddw m0, m2 ; partial_sum_diag[0][0-7] paddw m1, m3 ; partial_sum_diag[0][8-14,zero] mova m3, [esp+0x50] pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+16] MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; cost[0a-d] mova [esp+0x40], m0 mova m1, [esp+0x00] pslldq m0, m1, 14 psrldq m1, 2 paddw m0, m7 pslldq m2, m3, 8 psrldq m3, 8 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x20] pslldq m2, m3, 10 psrldq m3, 6 paddw m0, m2 paddw m1, m3 mova m3, [esp+0x10] pslldq m2, m3, 12 psrldq m3, 4 paddw m0, m2 paddw m1, m3 pslldq m2, m4, 6 psrldq m3, m4, 10 paddw m0, m2 paddw m1, m3 pslldq m2, m5, 4 psrldq m3, m5, 12 paddw m0, m2 paddw m1, m3 pslldq m2, m6, 2 psrldq m3, m6, 14 paddw m0, m2 ; partial_sum_diag[1][0-7] paddw m1, m3 ; partial_sum_diag[1][8-14,zero] mova m3, [esp+0x50] pshufb m1, [base+shufw_6543210x] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+16] MULLD m0, [base+div_table%+SUFFIX+ 0] paddd m0, m2 ; cost[4a-d] phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b] phaddd m1, [esp+0x30] ; cost[0,4,2,6] mova [esp+0x30], m1 phaddw m0, [esp+0x00], m4 phaddw m1, [esp+0x10], m5 paddw m4, m5 mova m2, [esp+0x20] paddw m5, m2, m3 phaddw m2, m6 paddw m6, m7 phaddw m3, m7 mova m7, [esp+0x00] paddw m7, [esp+0x10] mova [esp+0x00], m0 mova [esp+0x10], m1 mova [esp+0x20], m2 pslldq m1, m4, 4 pslldq m2, m6, 6 pslldq m0, m5, 2 paddw m1, m2 paddw m0, m7 psrldq m2, m5, 14 paddw m0, m1 ; partial_sum_alt[3] left psrldq m1, m4, 12 paddw m1, m2 psrldq m2, m6, 10 paddw m1, m2 ; partial_sum_alt[3] right pshuflw m1, m1, q3012 punpckhwd m2, m0, m1 punpcklwd m0, m1 pmaddwd m2, m2 
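; Scalar sketch of the direction search that both the x86-64 path above and this
; x86-32 path implement (assuming the usual CDEF semantics implied by the
; div_table / shufw_6543210x constants):
;   dir = argmax_i cost[i]                        ; 8 costs from squared partial sums
;   var = (cost[dir] - cost[dir ^ 4]) >> 10       ; dir ^ 4 == (dir + 4) & 7
; The "idx^4 complement" parked in the emulated ymm is what exposes the orthogonal
; direction's cost once the best byte-index has been found via pmovmskb/tzcnt.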
pmaddwd m0, m0 MULLD m2, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m2 ; cost[7a-d] mova [esp+0x40], m0 pslldq m0, m7, 6 psrldq m7, 10 pslldq m1, m5, 4 psrldq m5, 12 pslldq m2, m4, 2 psrldq m4, 14 paddw m0, m6 paddw m7, m5 paddw m0, m1 paddw m7, m4 paddw m0, m2 pshuflw m2, m7, q3012 punpckhwd m7, m0 punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 MULLD m7, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[5a-d] mova [esp+0x50], m0 mova m7, [esp+0x10] mova m2, [esp+0x20] pslldq m0, m7, 2 psrldq m7, 14 pslldq m4, m2, 4 psrldq m2, 12 pslldq m5, m3, 6 psrldq m6, m3, 10 paddw m0, [esp+0x00] paddw m7, m2 paddw m4, m5 paddw m7, m6 ; partial_sum_alt[3] right paddw m0, m4 ; partial_sum_alt[3] left pshuflw m2, m7, q3012 punpckhwd m7, m0 punpcklwd m0, m2 pmaddwd m7, m7 pmaddwd m0, m0 MULLD m7, [base+div_table%+SUFFIX+48] MULLD m0, [base+div_table%+SUFFIX+32] paddd m0, m7 ; cost[1a-d] SWAP m0, m4 pshufd m0, [esp+0x00], q1032 pshufd m1, [esp+0x10], q1032 pshufd m2, [esp+0x20], q1032 pshufd m3, m3, q1032 mova [esp+0x00], m4 pslldq m4, m0, 6 psrldq m0, 10 pslldq m5, m1, 4 psrldq m1, 12 pslldq m6, m2, 2 psrldq m2, 14 paddw m4, m3 paddw m0, m1 paddw m5, m6 paddw m0, m2 paddw m4, m5 pshuflw m2, m0, q3012 punpckhwd m0, m4 punpcklwd m4, m2 pmaddwd m0, m0 pmaddwd m4, m4 MULLD m0, [base+div_table%+SUFFIX+48] MULLD m4, [base+div_table%+SUFFIX+32] paddd m4, m0 ; cost[3a-d] mova m1, [esp+0x00] mova m2, [esp+0x50] mova m0, [esp+0x30] ; cost[0,4,2,6] phaddd m1, m4 phaddd m2, [esp+0x40] ; cost[1,3,5,7] phaddd m1, m2 pshufd m2, m0, q3120 ; now find the best cost %if cpuflag(sse4) pmaxsd m0, m1 pshufd m3, m0, q1032 pmaxsd m3, m0 pshufd m0, m3, q2301 pmaxsd m0, m3 %else pcmpgtd m3, m0, m1 pand m0, m3 pandn m3, m1 por m0, m3 pshufd m4, m0, q1032 pcmpgtd m3, m0, m4 pand m0, m3 pandn m3, m4 por m0, m3 pshufd m4, m0, q2301 pcmpgtd m3, m0, m4 pand m0, m3 pandn m3, m4 por m0, m3 %endif ; get direction and variance mov vard, varm punpckhdq m3, m2, m1 punpckldq m2, m1 psubd m1, m0, m3 psubd m4, m0, m2 mova [esp+0x00], m1 ; emulate ymm in stack mova [esp+0x10], m4 pcmpeqd m3, m0 ; compute best cost mask pcmpeqd m2, m0 packssdw m2, m3 pmovmskb eax, m2 ; get byte-idx from mask tzcnt eax, eax mov r1d, [esp+eax*2] ; get idx^4 complement from emulated ymm shr eax, 1 ; get direction by converting byte-idx to word-idx shr r1d, 10 mov [vard], r1d %endif RET %endmacro INIT_XMM sse4 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM ssse3 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 CDEF_DIR INIT_XMM sse2 CDEF_FILTER 8, 8 CDEF_FILTER 4, 8 CDEF_FILTER 4, 4 rav1e-0.7.1/src/x86/filmgrain16_avx2.asm000064400000000000000000002215401046102023000156500ustar 00000000000000; Copyright © 2021-2022, VideoLAN and dav1d authors ; Copyright © 2021-2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" %if ARCH_X86_64 SECTION_RODATA 16 pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 gen_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_27_17_17_27: dw 27, 17, 17, 27 pw_23_22: dw 23, 22, 0, 32 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 gen_ar0_shift: times 4 db 128 times 4 db 64 times 4 db 32 times 4 db 16 pd_16: dd 16 pd_m65536: dd -65536 pb_1: times 4 db 1 grain_max: times 2 dw 511 times 2 dw 2047 grain_min: times 2 dw -512 times 2 dw -2048 fg_max: times 2 dw 1023 times 2 dw 4095 times 2 dw 960 times 2 dw 3840 times 2 dw 940 times 2 dw 3760 fg_min: times 2 dw 0 times 2 dw 64 times 2 dw 256 uv_offset_mul: dd 256 dd 1024 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 mul_bits: dw 256, 128, 64, 32, 16, 8 round_vals: dw 32, 64, 128, 256, 512, 1024 pb_8_9_0_1: db 8, 9, 0, 1 %macro JMP_TABLE 1-* %xdefine %1_table %%table %xdefine %%base %1_table %xdefine %%prefix mangle(private_prefix %+ _%1) %%table: %rep %0 - 1 dd %%prefix %+ .ar%2 - %%base %rotate 1 %endrep %endmacro JMP_TABLE generate_grain_y_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422_16bpc_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444_16bpc_avx2, 0, 1, 2, 3 SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) INIT_YMM avx2 cglobal generate_grain_y_16bpc, 3, 9, 14, buf, fg_data, bdmax %define base r4-generate_grain_y_16bpc_avx2_table lea r4, [generate_grain_y_16bpc_avx2_table] vpbroadcastw xm0, [fg_dataq+FGData.seed] mov r6d, [fg_dataq+FGData.grain_scale_shift] movq xm1, [base+next_upperbit_mask] mov r3, -73*82*2 movsxd r5, [fg_dataq+FGData.ar_coeff_lag] lea r7d, [bdmaxq+1] movq xm4, [base+mul_bits] shr r7d, 11 ; 0 for 10bpc, 2 for 12bpc movq xm5, [base+hmul_bits] sub r6, r7 mova xm6, [base+pb_mask] sub bufq, r3 vpbroadcastw xm7, [base+round+r6*2-2] lea r6, [gaussian_sequence] movsxd r5, [r4+r5*4] .loop: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw xm2, xm4 ; bits 0x0f00 are set pmulhuw xm0, xm5 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm2, xm0 ; aggregate each bit into next seed's high bit por xm3, xm2 ; 4 next output seeds pshuflw xm0, xm3, q3333 psrlw xm3, 5 pand xm2, xm0, xm1 movq r7, xm3 psrlw xm3, xm2, 10 por xm2, xm3 pmullw xm2, xm4 pmulhuw xm0, xm5 movzx r8d, r7w pshufb xm3, xm6, xm2 psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm0, xm2 movd xm2, [r6+r8*2] rorx r8, r7, 32 por xm3, xm0 shr r7d, 16 
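; What the .loop around this point is doing, in short: four 16-bit seeds are stepped
; in parallel (the pand/pmullw/pshufb sequence reproduces the bit-feedback seed update
; described by the comments above), the high 11 bits of each updated seed
; (psrlw xm3, 5) index gaussian_sequence[] to fetch raw grain, and pmulhrsw with the
; preloaded round constant applies grain_scale_shift -- the paddw doubling below
; exists only because pmulhrsw cannot express a shift by 0.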
pinsrw xm2, [r6+r7*2], 1 pshuflw xm0, xm3, q3333 movzx r7d, r8w psrlw xm3, 5 pinsrw xm2, [r6+r7*2], 2 shr r8d, 16 movq r7, xm3 pinsrw xm2, [r6+r8*2], 3 movzx r8d, r7w pinsrw xm2, [r6+r8*2], 4 rorx r8, r7, 32 shr r7d, 16 pinsrw xm2, [r6+r7*2], 5 movzx r7d, r8w pinsrw xm2, [r6+r7*2], 6 shr r8d, 16 pinsrw xm2, [r6+r8*2], 7 paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support mova [bufq+r3], xm2 add r3, 8*2 jl .loop ; auto-regression code add r5, r4 jmp r5 .ar1: DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd xm4, [fg_dataq+FGData.ar_coeffs_y] DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 pinsrb xm4, [base+pb_1], 3 pmovsxbw xm4, xm4 pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd sub bufq, 2*(82*73-(82*3+79)) mov hd, 70 sar maxd, 1 mov mind, maxd xor mind, -1 .y_loop_ar1: mov xq, -76 movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu xm0, [bufq+xq*2-82*2-2] ; top/left psrldq xm2, xm0, 2 ; top psrldq xm1, xm0, 4 ; top/right punpcklwd xm0, xm2 punpcklwd xm1, xm3 pmaddwd xm0, xm4 pmaddwd xm1, xm5 paddd xm0, xm1 .x_loop_ar1_inner: movd val0d, xm0 psrldq xm0, 4 imul val3d, cf3d add val3d, val0d sarx val3d, val3d, shiftd movsx val0d, word [bufq+xq*2] add val3d, val0d cmp val3d, maxd cmovg val3d, maxd cmp val3d, mind cmovl val3d, mind mov word [bufq+xq*2], val3w ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xb, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82*2 dec hd jg .y_loop_ar1 .ar0: RET .ar2: DEFINE_ARGS buf, fg_data, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movq xm0, [fg_dataq+FGData.ar_coeffs_y+5] ; cf5-11 vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+0], 1 ; cf0-4 vpbroadcastw xm10, [base+round_vals-12+shiftq*2] pxor m1, m1 punpcklwd xm10, xm1 pcmpgtb m1, m0 punpcklbw m0, m1 ; cf5-11,0-4 vpermq m1, m0, q3333 ; cf4 vbroadcasti128 m11, [base+gen_shufA] pshufd m6, m0, q0000 ; cf[5,6], cf[0-1] vbroadcasti128 m12, [base+gen_shufB] pshufd m7, m0, q1111 ; cf[7,8], cf[2-3] punpckhwd xm1, xm0 pshufhw xm9, xm0, q2121 pshufd xm8, xm1, q0000 ; cf[4,9] sar bdmaxd, 1 punpckhqdq xm9, xm9 ; cf[10,11] movd xm4, bdmaxd ; max_grain pcmpeqd xm5, xm5 sub bufq, 2*(82*73-(82*3+79)) pxor xm5, xm4 ; min_grain DEFINE_ARGS buf, fg_data, h, x mov hd, 70 .y_loop_ar2: mov xq, -76 .x_loop_ar2: vbroadcasti128 m2, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] vinserti128 m1, m2, [bufq+xq*2-82*2-4], 0 ; y=-1,x=[-2,+5] pshufb m0, m1, m11 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] pmaddwd m0, m6 punpckhwd xm2, xm1 ; y=-2/-1 interleaved, x=[+2,+5] pshufb m1, m12 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] pmaddwd m1, m7 pmaddwd xm2, xm8 paddd m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm10 paddd xm2, xm0 movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] paddd xm2, xm1 pmovsxwd xm1, [bufq+xq*2] ; in dwords, y=0,x=[0,3] .x_loop_ar2_inner: pmaddwd xm3, xm9, xm0 psrldq xm0, 2 paddd xm3, xm2 psrldq xm2, 4 ; shift top to next pixel psrad xm3, [fg_dataq+FGData.ar_coeff_shift] ; skip packssdw because we only care about one value paddd xm3, xm1 pminsd xm3, xm4 psrldq xm1, 4 pmaxsd xm3, xm5 pextrw [bufq+xq*2], xm3, 0 punpcklwd xm3, xm3 pblendw xm0, xm3, 0010b inc xq jz .x_loop_ar2_end test xb, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82*2 dec hd jg .y_loop_ar2 RET .ar3: DEFINE_ARGS buf, fg_data, bdmax, shift mov shiftd, 
[fg_dataq+FGData.ar_coeff_shift] sar bdmaxd, 1 movq xm7, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-6 movd xm0, [fg_dataq+FGData.ar_coeffs_y+14] ; cf14-16 pinsrb xm7, [fg_dataq+FGData.ar_coeffs_y+13], 7 ; cf0-6,13 pinsrb xm0, [base+pb_1], 3 ; cf14-16,pb_1 movd xm1, [fg_dataq+FGData.ar_coeffs_y+21] ; cf21-23 vinserti128 m7, [fg_dataq+FGData.ar_coeffs_y+ 7], 1 ; cf7-13 vinserti128 m0, [fg_dataq+FGData.ar_coeffs_y+17], 1 ; cf17-20 vpbroadcastw xm11, [base+round_vals+shiftq*2-12] movd xm12, bdmaxd ; max_grain punpcklbw m7, m7 ; sign-extension punpcklbw m0, m0 ; sign-extension punpcklbw xm1, xm1 REPX {psraw x, 8}, m7, m0, xm1 pshufd m4, m7, q0000 ; cf[0,1] | cf[7,8] pshufd m5, m7, q1111 ; cf[2,3] | cf[9,10] pshufd m6, m7, q2222 ; cf[4,5] | cf[11,12] pshufd xm7, xm7, q3333 ; cf[6,13] pshufd m8, m0, q0000 ; cf[14,15] | cf[17,18] pshufd m9, m0, q1111 ; cf[16],pw_1 | cf[19,20] paddw xm0, xm11, xm11 pcmpeqd xm13, xm13 pblendw xm10, xm1, xm0, 00001000b pxor xm13, xm12 ; min_grain DEFINE_ARGS buf, fg_data, h, x sub bufq, 2*(82*73-(82*3+79)) mov hd, 70 .y_loop_ar3: mov xq, -76 .x_loop_ar3: movu xm0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] vinserti128 m0, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] palignr m3, m1, m0, 2 ; y=-3/-2,x=[-2,+5] palignr m1, m0, 12 ; y=-3/-2,x=[+3,+6] punpckhwd m2, m0, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] pmaddwd m0, m4 pmaddwd m2, m6 pmaddwd m3, m5 paddd m0, m2 movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] paddd m0, m3 psrldq m3, m2, 2 punpcklwd m3, m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] pmaddwd m3, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] paddd m0, m3 psrldq m3, m2, 4 psrldq m2, 6 vpblendd m2, m11, 0x0f ; rounding constant punpcklwd m3, m2 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] pmaddwd m3, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] vextracti128 xm2, m1, 1 punpcklwd xm1, xm2 pmaddwd xm1, xm7 ; y=-3/-2 interleaved,x=[+3,+4,+5,+6] paddd m0, m3 vextracti128 xm2, m0, 1 paddd xm0, xm1 movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] paddd xm0, xm2 .x_loop_ar3_inner: pmaddwd xm2, xm1, xm10 pshuflw xm3, xm2, q1032 paddd xm2, xm0 ; add top paddd xm2, xm3 ; left+cur psrldq xm0, 4 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] ; skip packssdw because we only care about one value pminsd xm2, xm12 pmaxsd xm2, xm13 pextrw [bufq+xq*2], xm2, 0 pslldq xm2, 4 psrldq xm1, 2 pblendw xm1, xm2, 0100b inc xq jz .x_loop_ar3_end test xb, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82*2 dec hd jg .y_loop_ar3 RET %macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 cglobal generate_grain_uv_%1_16bpc, 4, 11, 8, buf, bufy, fg_data, uv, bdmax %define base r8-generate_grain_uv_%1_16bpc_avx2_table lea r8, [generate_grain_uv_%1_16bpc_avx2_table] movifnidn bdmaxd, bdmaxm vpbroadcastw xm0, [fg_dataq+FGData.seed] mov r5d, [fg_dataq+FGData.grain_scale_shift] movq xm1, [base+next_upperbit_mask] lea r6d, [bdmaxq+1] movq xm4, [base+mul_bits] shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc movq xm5, [base+hmul_bits] sub r5, r6 mova xm6, [base+pb_mask] vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] vpbroadcastw xm7, [base+round+r5*2-2] pxor xm0, xm2 lea r6, [gaussian_sequence] %if %2 mov r7d, 73-35*%3 add bufq, 44*2 .loop_y: mov r5, -44*2 %else mov r5, -82*73*2 sub bufq, r5 %endif .loop_x: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 por xm2, xm3 
; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw xm2, xm4 ; bits 0x0f00 are set pmulhuw xm0, xm5 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm2, xm0 ; aggregate each bit into next seed's high bit por xm2, xm3 ; 4 next output seeds pshuflw xm0, xm2, q3333 psrlw xm2, 5 movq r10, xm2 movzx r9d, r10w movd xm2, [r6+r9*2] rorx r9, r10, 32 shr r10d, 16 pinsrw xm2, [r6+r10*2], 1 movzx r10d, r9w pinsrw xm2, [r6+r10*2], 2 shr r9d, 16 pinsrw xm2, [r6+r9*2], 3 paddw xm2, xm2 ; otherwise bpc=12 w/ grain_scale_shift=0 pmulhrsw xm2, xm7 ; shifts by 0, which pmulhrsw does not support movq [bufq+r5], xm2 add r5, 8 jl .loop_x %if %2 add bufq, 82*2 dec r7d jg .loop_y %endif ; auto-regression code movsxd r6, [fg_dataq+FGData.ar_coeff_lag] movsxd r6, [r8+r6*4] add r6, r8 jmp r6 INIT_YMM avx2 .ar0: DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] vpbroadcastb m0, [fg_dataq+FGData.ar_coeffs_uv+uvq] sar bdmaxd, 1 vpbroadcastd m4, [base+gen_ar0_shift-24+shiftq*4] movd xm6, bdmaxd pcmpeqw m7, m7 pmaddubsw m4, m0 ; ar_coeff << (14 - shift) vpbroadcastw m6, xm6 ; max_gain pxor m7, m6 ; min_grain DEFINE_ARGS buf, bufy, h, x %if %2 vpbroadcastw m5, [base+hmul_bits+2+%3*2] sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) %else sub bufq, 2*(82*70-3) %endif add bufyq, 2*(3+82*3) mov hd, 70-35*%3 .y_loop_ar0: %if %2 ; first 32 pixels movu xm0, [bufyq+16*0] vinserti128 m0, [bufyq+16*2], 1 movu xm1, [bufyq+16*1] vinserti128 m1, [bufyq+16*3], 1 %if %3 movu xm2, [bufyq+82*2+16*0] vinserti128 m2, [bufyq+82*2+16*2], 1 movu xm3, [bufyq+82*2+16*1] vinserti128 m3, [bufyq+82*2+16*3], 1 paddw m0, m2 paddw m1, m3 %endif phaddw m0, m1 movu xm1, [bufyq+16*4] vinserti128 m1, [bufyq+16*6], 1 movu xm2, [bufyq+16*5] vinserti128 m2, [bufyq+16*7], 1 %if %3 movu xm3, [bufyq+82*2+16*4] vinserti128 m3, [bufyq+82*2+16*6], 1 paddw m1, m3 movu xm3, [bufyq+82*2+16*5] vinserti128 m3, [bufyq+82*2+16*7], 1 paddw m2, m3 %endif phaddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 %else xor xd, xd .x_loop_ar0: movu m0, [bufyq+xq*2] movu m1, [bufyq+xq*2+32] %endif paddw m0, m0 paddw m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 %if %2 paddw m0, [bufq+ 0] paddw m1, [bufq+32] %else paddw m0, [bufq+xq*2+ 0] paddw m1, [bufq+xq*2+32] %endif pminsw m0, m6 pminsw m1, m6 pmaxsw m0, m7 pmaxsw m1, m7 %if %2 movu [bufq+ 0], m0 movu [bufq+32], m1 ; last 6 pixels movu xm0, [bufyq+32*4] movu xm1, [bufyq+32*4+16] %if %3 paddw xm0, [bufyq+32*4+82*2] paddw xm1, [bufyq+32*4+82*2+16] %endif phaddw xm0, xm1 movu xm1, [bufq+32*2] pmulhrsw xm0, xm5 paddw xm0, xm0 pmulhrsw xm0, xm4 paddw xm0, xm1 pminsw xm0, xm6 pmaxsw xm0, xm7 vpblendd xm0, xm1, 0x08 movu [bufq+32*2], xm0 %else movu [bufq+xq*2+ 0], m0 movu [bufq+xq*2+32], m1 add xd, 32 cmp xd, 64 jl .x_loop_ar0 ; last 12 pixels movu m0, [bufyq+64*2] movu m1, [bufq+64*2] paddw m0, m0 pmulhrsw m0, m4 paddw m0, m1 pminsw m0, m6 pmaxsw m0, m7 vpblendd m0, m1, 0xc0 movu [bufq+64*2], m0 %endif add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar0 RET INIT_XMM avx2 .ar1: DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x, shift imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 DEFINE_ARGS buf, bufy, h, val0, max, cf3, min, val3, x, shift pmovsxbw xm4, xm4 pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd vpbroadcastw xm6, 
[base+hmul_bits+2+%3*2] vpbroadcastd xm3, xm3 %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 sar maxd, 1 mov mind, maxd xor mind, -1 .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu xm0, [bufq+xq*2-82*2-2] ; top/left %if %2 movu xm2, [bufyq+xq*4] %else movq xm2, [bufyq+xq*2] %endif %if %2 %if %3 phaddw xm2, [bufyq+xq*4+82*2] punpckhqdq xm1, xm2, xm2 paddw xm2, xm1 %else phaddw xm2, xm2 %endif pmulhrsw xm2, xm6 %endif psrldq xm1, xm0, 4 ; top/right punpcklwd xm1, xm2 psrldq xm2, xm0, 2 ; top punpcklwd xm0, xm2 pmaddwd xm1, xm5 pmaddwd xm0, xm4 paddd xm1, xm3 paddd xm0, xm1 .x_loop_ar1_inner: movd val0d, xm0 psrldq xm0, 4 imul val3d, cf3d add val3d, val0d sarx val3d, val3d, shiftd movsx val0d, word [bufq+xq*2] add val3d, val0d cmp val3d, maxd cmovg val3d, maxd cmp val3d, mind cmovl val3d, mind mov word [bufq+xq*2], val3w ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xb, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 RET INIT_YMM avx2 .ar2: %if WIN64 ; xmm6 and xmm7 already saved %assign xmm_regs_used 13 + %2 %assign stack_size_padded 136 SUB rsp, stack_size_padded movaps [rsp+16*2], xmm8 movaps [rsp+16*3], xmm9 movaps [rsp+16*4], xmm10 movaps [rsp+16*5], xmm11 movaps [rsp+16*6], xmm12 %if %2 movaps [rsp+16*7], xmm13 %endif %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 vbroadcasti128 m10, [base+gen_shufA] sar bdmaxd, 1 vbroadcasti128 m11, [base+gen_shufB] movd xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 5] pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+12], 4 pinsrb xm7, [base+pb_1], 5 pinsrw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+10], 3 movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 9], 13 pmovsxbw m7, xm7 movd xm8, bdmaxd ; max_grain pshufd m4, m7, q0000 vpbroadcastw xm12, [base+round_vals-12+shiftq*2] pshufd m5, m7, q1111 pcmpeqd xm9, xm9 pshufd m6, m7, q2222 pxor xm9, xm8 ; min_grain pshufd xm7, xm7, q3333 DEFINE_ARGS buf, bufy, fg_data, h, x %if %2 vpbroadcastw xm13, [base+hmul_bits+2+%3*2] sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: vbroadcasti128 m3, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] vinserti128 m2, m3, [bufq+xq*2-82*4-4], 1 ; y=-2,x=[-2,+5] pshufb m0, m2, m10 ; y=-1/-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] pmaddwd m0, m4 pshufb m1, m2, m11 ; y=-1/-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] pmaddwd m1, m5 punpckhwd m2, m3 ; y=-2/-1 interleaved, x=[+2,+5] %if %2 movu xm3, [bufyq+xq*4] %if %3 paddw xm3, [bufyq+xq*4+82*2] %endif phaddw xm3, xm3 pmulhrsw xm3, xm13 %else movq xm3, [bufyq+xq*2] %endif punpcklwd xm3, xm12 ; luma, round interleaved vpblendd m2, m3, 0x0f pmaddwd m2, m6 paddd m1, m0 movu xm0, [bufq+xq*2-4] ; y=0,x=[-2,+5] paddd m2, m1 vextracti128 xm1, m2, 1 paddd xm2, xm1 pshufd xm1, xm0, q3321 pmovsxwd xm1, xm1 ; y=0,x=[0,3] in dword .x_loop_ar2_inner: pmaddwd xm3, xm7, xm0 paddd xm3, xm2 psrldq xm2, 4 ; shift top to next pixel psrad xm3, [fg_dataq+FGData.ar_coeff_shift] ; we do not need to packssdw since we only care about one value paddd xm3, xm1 psrldq xm1, 4 pminsd xm3, xm8 pmaxsd xm3, xm9 pextrw [bufq+xq*2], xm3, 0 psrldq xm0, 2 pslldq xm3, 2 pblendw xm0, xm3, 00000010b inc xq jz .x_loop_ar2_end test xb, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: 
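; Common shape of the chroma .ar1/.ar2/.ar3 paths: every new grain sample is a
; signed-coefficient prediction from already generated neighbours plus the
; (phaddw/pmulhrsw-averaged, when subsampled) co-located luma grain, rounded by
; ar_coeff_shift and clamped to the +/- grain range derived from bdmax. The
; .x_loop_ar*_inner loops advance one pixel at a time -- each output feeds the next
; prediction -- and only re-vectorize the top-row contribution every 4 pixels
; (test xb, 3).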
add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 RET .ar3: %if WIN64 ; xmm6 and xmm7 already saved %assign stack_offset 32 %assign xmm_regs_used 14 + %2 %assign stack_size_padded 152 SUB rsp, stack_size_padded movaps [rsp+16*2], xmm8 movaps [rsp+16*3], xmm9 movaps [rsp+16*4], xmm10 movaps [rsp+16*5], xmm11 movaps [rsp+16*6], xmm12 movaps [rsp+16*7], xmm13 %if %2 movaps [rsp+16*8], xmm14 %endif %endif DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 vpbroadcastw xm11, [base+round_vals-12+shiftq*2] sar bdmaxd, 1 movq xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pinsrb xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+24], 7 ; luma movhps xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 7] pmovsxbw m7, xm7 %if %2 vpbroadcastw xm14, [base+hmul_bits+2+%3*2] %endif pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 movd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+14] pinsrb xm0, [base+pb_1], 3 pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+21], 1 pinsrd xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+17], 2 pmovsxbw m0, xm0 movd xm12, bdmaxd ; max_grain pshufd m8, m0, q0000 pshufd m9, m0, q1111 pcmpeqd xm13, xm13 punpckhqdq xm10, xm0, xm0 pxor xm13, xm12 ; min_grain pinsrw xm10, [base+round_vals-10+shiftq*2], 3 DEFINE_ARGS buf, bufy, fg_data, h, unused, x %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: movu xm2, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] vinserti128 m2, [bufq+xq*2-82*4-6+ 0], 1 ; y=-3/-2,x=[-3,+4] movq xm1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+8] vinserti128 m1, [bufq+xq*2-82*4-6+16], 1 ; y=-3/-2,x=[+5,+12] palignr m3, m1, m2, 2 ; y=-3/-2,x=[-2,+5] palignr m1, m2, 12 ; y=-3/-2,x=[+3,+6] punpcklwd m0, m2, m3 ; y=-3/-2,x=[-3/-2,-2/-1,-1/+0,+0/+1] punpckhwd m2, m3 ; y=-3/-2,x=[+1/+2,+2/+3,+3/+4,+4/+5] shufps m3, m0, m2, q1032 ; y=-3/-2,x=[-1/+0,+0/+1,+1/+2,+2/+3] pmaddwd m0, m4 pmaddwd m2, m6 pmaddwd m3, m5 paddd m0, m2 paddd m0, m3 movu xm2, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] vinserti128 m2, [bufq+xq*2-82*2-6+ 6], 1 ; y=-1,x=[+1,+8] %if %2 movu xm3, [bufyq+xq*4] %if %3 paddw xm3, [bufyq+xq*4+82*2] %endif phaddw xm3, xm3 pmulhrsw xm3, xm14 %else movq xm3, [bufyq+xq*2] %endif punpcklwd m1, m3 pmaddwd m1, m7 paddd m0, m1 psrldq m1, m2, 4 psrldq m3, m2, 6 vpblendd m3, m11, 0x0f ; rounding constant punpcklwd m1, m3 ; y=-1,x=[-1/rnd,+0/rnd,+1/rnd,+2/rnd] pmaddwd m1, m9 ; x=[+2/+3,+3/+4,+4/+5,+5,+6] psrldq m3, m2, 2 punpcklwd m2, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] pmaddwd m2, m8 ; x=[+0/+1,+1/+2,+2/+3,+3/+4] paddd m0, m1 movu xm1, [bufq+xq*2-6] ; y=0,x=[-3,+4] paddd m0, m2 vextracti128 xm2, m0, 1 paddd xm0, xm2 .x_loop_ar3_inner: pmaddwd xm2, xm1, xm10 pshuflw xm3, xm2, q1032 paddd xm2, xm0 ; add top paddd xm2, xm3 ; left+cur psrldq xm0, 4 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] psrldq xm1, 2 ; no need to packssdw since we only care about one value pminsd xm2, xm12 pmaxsd xm2, xm13 pextrw [bufq+xq*2], xm2, 0 pslldq xm2, 4 pblendw xm1, xm2, 00000100b inc xq jz .x_loop_ar3_end test xb, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar3 RET %endmacro cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, unused, sby, see %define base r11-grain_min lea r11, [grain_min] mov r6d, r9m ; bdmax mov r9d, [fg_dataq+FGData.clip_to_restricted_range] mov r7d, [fg_dataq+FGData.scaling_shift] mov sbyd, sbym vpbroadcastd m8, 
r9m shr r6d, 11 ; is_12bpc vpbroadcastd m9, [base+grain_min+r6*4] shlx r10d, r9d, r6d vpbroadcastd m10, [base+grain_max+r6*4] lea r9d, [r6+r9*4] vpbroadcastw m11, [base+mul_bits+r7*2-12] vpbroadcastd m12, [base+fg_min+r10*4] vpbroadcastd m13, [base+fg_max+r9*4] test sbyd, sbyd setnz r7b vpbroadcastd m14, [base+pd_16] test r7b, [fg_dataq+FGData.overlap_flag] jnz .vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak mov grain_lutq, grain_lutmp mov hd, hm .loop_y: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m4, m2, 0x55 psrld m2, m1, 16 mova m9, m6 pand m2, m8 vpgatherdd m5, [scalingq+m2-2], m6 pblendw m5, m3, 0x55 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m4, [grain_lutq+offxyq*2] pmulhrsw m5, [grain_lutq+offxyq*2+32] ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hd jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp byte [fg_dataq+FGData.overlap_flag], 0 je .loop_x movq xm7, [pw_27_17_17_27] cmp dword r8m, 0 ; sby jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m4, m2, 0x55 psrld m2, m1, 16 mova m9, m6 pand m2, m8 vpgatherdd m5, [scalingq+m2-2], m6 pblendw m5, m3, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] movd xm6, [grain_lutq+left_offxyq*2] punpcklwd xm6, xm3 pmaddwd xm6, xm7 paddd xm6, xm14 psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm9 pminsw xm6, xm10 vpblendd m3, m6, 0x01 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m4, m3 pmulhrsw m5, [grain_lutq+offxyq*2+32] ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, 
strideq add grain_lutq, 82*2 dec hd jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp dword r8m, 0 ; sby jne .loop_x_hv_overlap jmp .loop_x_h_overlap .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see, src_bak movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq .loop_x_v_overlap: vpbroadcastd m15, [pw_27_17_17_27] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, unused, top_offxy rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, unused, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_v_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m9, m6 pand m4, m8 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq*2] movu m5, [grain_lutq+top_offxyq*2] punpcklwd m4, m5, m6 punpckhwd m5, m6 pmaddwd m4, m15 pmaddwd m5, m15 movu m7, [grain_lutq+offxyq*2+32] movu m6, [grain_lutq+top_offxyq*2+32] paddd m4, m14 paddd m5, m14 psrad m4, 5 psrad m5, 5 packssdw m4, m5 punpcklwd m5, m6, m7 punpckhwd m6, m7 pmaddwd m5, m15 pmaddwd m6, m15 paddd m5, m14 paddd m6, m14 psrad m5, 5 psrad m6, 5 packssdw m5, m6 pmaxsw m4, m9 pmaxsw m5, m9 pminsw m4, m10 pminsw m5, m10 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m11 pmaddubsw m3, m11 paddw m2, m2 paddw m3, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hb jz .end_y_v_overlap vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: add wq, 32 jge .end lea srcq, [src_bakq+wq*2] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap .loop_x_hv_overlap: vpbroadcastd m15, [pw_27_17_17_27] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS 
dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+32] lea left_offxyd, [offyq+32] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_hv_overlap: ; scaling[src] mova m0, [srcq+ 0] mova m1, [srcq+32] pand m4, m8, m0 psrld m3, m0, 16 mova m6, m9 vpgatherdd m2, [scalingq+m4-0], m9 pand m3, m8 mova m9, m6 vpgatherdd m4, [scalingq+m3-2], m6 pand m5, m8, m1 mova m6, m9 vpgatherdd m3, [scalingq+m5-0], m9 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m9, m6 pand m4, m8 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m7, [grain_lutq+offxyq*2] movd xm6, [grain_lutq+left_offxyq*2] movu m5, [grain_lutq+top_offxyq*2] movd xm4, [grain_lutq+topleft_offxyq*2] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklwd xm6, xm7 punpcklwd xm4, xm5 punpcklqdq xm6, xm4 movddup xm4, [pw_27_17_17_27] pmaddwd xm6, xm4 paddd xm6, xm14 psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm9 pminsw xm6, xm10 pshuflw xm4, xm6, q1032 vpblendd m6, m7, 0xfe vpblendd m4, m5, 0xfe ; followed by v interpolation (top | cur -> cur) punpckhwd m5, m7 pmaddwd m5, m15 punpcklwd m4, m6 pmaddwd m4, m15 movu m7, [grain_lutq+offxyq*2+32] movu m6, [grain_lutq+top_offxyq*2+32] paddd m5, m14 paddd m4, m14 psrad m5, 5 psrad m4, 5 packssdw m4, m5 punpcklwd m5, m6, m7 punpckhwd m6, m7 pmaddwd m5, m15 pmaddwd m6, m15 paddd m5, m14 paddd m6, m14 psrad m5, 5 psrad m6, 5 packssdw m5, m6 pmaxsw m4, m9 pmaxsw m5, m9 pminsw m4, m10 pminsw m5, m10 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m11 pmaddubsw m3, m11 paddw m2, m2 paddw m3, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq+srcq+ 0], m0 mova [dstq+srcq+32], m1 add srcq, strideq add grain_lutq, 82*2 dec hb jz .end_y_hv_overlap vpbroadcastd m15, [pw_27_17_17_27+4] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_hv_overlap movq xm7, [pw_27_17_17_27] jmp .loop_y_h_overlap .end_y_hv_overlap: add wq, 32 lea srcq, [src_bakq+wq*2] jl .loop_x_hv_overlap .end: RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r12-grain_min lea r12, [grain_min] mov r9d, r13m ; bdmax mov r7d, [fg_dataq+FGData.scaling_shift] mov r11d, is_idm mov sbyd, sbym vpbroadcastw m11, [base+mul_bits+r7*2-12] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] shr r9d, 11 ; is_12bpc vpbroadcastd m8, [base+grain_min+r9*4] shlx r10d, r6d, r9d vpbroadcastd m9, [base+grain_max+r9*4] vpbroadcastw m10, r13m shlx r6d, r6d, r11d vpbroadcastd m12, [base+fg_min+r10*4] lea r6d, [r9+r6*2] vpbroadcastd m13, [base+fg_max+r6*4] test sbyd, sbyd setnz r7b cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused, sby, 
see, overlap %if %1 mov r6d, r11m vpbroadcastd m0, [base+pb_8_9_0_1] vpbroadcastd m1, [base+uv_offset_mul+r9*4] vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] vpbroadcastd m15, [fg_dataq+FGData.uv_offset+r6*4] pshufb m14, m0 ; { uv_luma_mult, uv_mult } pmaddwd m15, m1 %else %if %2 vpbroadcastq m15, [base+pw_23_22] %else vpbroadcastq m15, [base+pw_27_17_17_27] %endif vpbroadcastd m14, [base+pd_16] %endif test r7b, [fg_dataq+FGData.overlap_flag] jnz %%vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride mov lumaq, r9mp mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r9mp, r10 mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, unused2, unused3, luma, lstride rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq+ 0] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 pminuw m3, m10 ; clip_pixel() %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m4, [grain_lutq+offxyq*2] %if %2 pmulhrsw m5, [grain_lutq+offxyq*2+82*2] %else pmulhrsw m5, [grain_lutq+offxyq*2+32] %endif ; dst = clip_pixel(src, noise) %if %1 paddw m0, m4 paddw m1, m5 %else paddw m0, m4, [srcq] %if %2 paddw m1, m5, [srcq+strideq] %else paddw m1, m5, [srcq+32] %endif %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 %else dec hb %endif jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp 
lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp byte [fg_dataq+FGData.overlap_flag], 0 je %%loop_x cmp dword r8m, 0 ; sby jne %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m2, [grain_lutq+offxyq*2] %if %2 movu m3, [grain_lutq+offxyq*2+82*2] %else movu m3, [grain_lutq+offxyq*2+32] %endif movd xm6, [grain_lutq+left_offxyq*2] %if %2 pinsrw xm6, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1} punpckldq xm7, xm2, xm3 ; {cur0, cur1} punpcklwd xm6, xm7 ; {left0, cur0, left1, cur1} %else punpcklwd xm6, xm2 %endif %if %1 %if %2 vpbroadcastq xm7, [pw_23_22] %else movq xm7, [pw_27_17_17_27] %endif pmaddwd xm6, xm7 vpbroadcastd xm7, [pd_16] paddd xm6, xm7 %else pmaddwd xm6, xm15 paddd xm6, xm14 %endif psrad xm6, 5 packssdw xm6, xm6 pmaxsw xm6, xm8 pminsw xm6, xm9 vpblendd m2, m6, 0x01 %if %2 pshuflw xm6, xm6, q1032 vpblendd m3, m6, 0x01 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) %if %1 paddw m0, m2 paddw m1, m3 %else paddw m0, m2, [srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, r10mp %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 %else dec hb %endif jg %%loop_y_h_overlap add wq, 32>>%2 jge .end 
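; Overlap blending used by the *_h_overlap / *_v_overlap paths: in the overlapped
; columns/rows the previous block's grain and the current block's grain are combined
; as round2(old*w0 + new*w1, 5) (hence the paddd with pd_16 followed by psrad 5) and
; then clamped back to the grain_min/grain_max range. The weight pairs come from
; pw_27_17_17_27 for luma and non-subsampled chroma and from pw_23_22 for subsampled
; chroma, matching the %if %2 selection earlier in this macro.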
mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp dword r8m, 0 ; sby jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %%vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, unused1, unused2, unused3, lstride movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, top_offxy, unused2, luma, lstride mov lumaq, r9mp mov lstrideq, r10mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r9mp, r10 mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x_v_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 == 0 lea r10, [pw_27_17_17_27] %endif %%loop_y_v_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq*2] movu m3, [grain_lutq+top_offxyq*2] punpcklwd m2, m3, m6 punpckhwd m3, m6 ; { top, cur } %if %3 vpbroadcastd m0, [pw_23_22] %elif %2 vpbroadcastd m0, [pw_27_17_17_27] %else vpbroadcastd m0, [r10] %endif REPX {pmaddwd x, m0}, m2, m3 %if %1 vpbroadcastd m1, [pd_16] REPX {paddd x, m1}, m2, m3 %else REPX {paddd x, m14}, m2, m3 %endif REPX {psrad x, 5}, m2, m3 packssdw m2, m3 %if %2 movu m3, [grain_lutq+offxyq*2+82*2] %else movu m3, [grain_lutq+offxyq*2+32] %endif %if %3 pmaxsw m2, m8 pminsw m2, m9 %else %if %2 movu m7, 
[grain_lutq+top_offxyq*2+82*2] punpckhwd m6, m3, m7 ; { cur, top } punpcklwd m3, m7 %else movu m7, [grain_lutq+top_offxyq*2+32] punpckhwd m6, m7, m3 punpcklwd m3, m7, m3 ; { top, cur } %endif pmaddwd m6, m0 pmaddwd m3, m0 %if %1 paddd m6, m1 paddd m3, m1 %else paddd m6, m14 paddd m3, m14 %endif psrad m6, 5 psrad m3, 5 packssdw m3, m6 pmaxsw m2, m8 pmaxsw m3, m8 pminsw m2, m9 pminsw m3, m9 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m2, [srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 sub hb, 2 %else mova [dstq+32], m1 dec hb %endif jle %%end_y_v_overlap %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82*(2<<%2) %if %2 jmp %%loop_y %else add hd, 0x80000000 jc %%loop_y add r10, 4 jmp %%loop_y_v_overlap %endif %%end_y_v_overlap: add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %%loop_x_hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride %if %2 == 0 lea r14, [pw_27_17_17_27] %endif lea topleft_offxyq, [top_offxyq+(32>>%2)] lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %%loop_y_hv_overlap: ; luma_src %if %2 mova xm2, [lumaq+lstrideq*0+ 0] vinserti128 m2, [lumaq+lstrideq*0+32], 1 mova xm4, [lumaq+lstrideq*0+16] vinserti128 m4, [lumaq+lstrideq*0+48], 1 mova xm3, [lumaq+lstrideq*(1<<%3)+ 0] vinserti128 m3, [lumaq+lstrideq*(1<<%3)+32], 1 mova xm5, [lumaq+lstrideq*(1<<%3)+16] vinserti128 m5, [lumaq+lstrideq*(1<<%3)+48], 1 phaddw m2, m4 phaddw m3, m5 pxor m4, m4 pavgw m2, m4 pavgw m3, m4 %elif %1 mova m2, [lumaq] mova m3, [lumaq+32] %endif %if %1 mova m0, [srcq] %if %2 mova m1, [srcq+strideq] %else mova m1, [srcq+32] %endif punpckhwd m4, m2, m0 punpcklwd m2, m0 punpckhwd m5, m3, m1 punpcklwd m3, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m4, m2, m5, m3 REPX {paddd x, m15}, m4, m2, m5, m3 REPX {psrad x, 6 }, m4, m2, m5, m3 packusdw m2, m4 packusdw m3, m5 pminuw m2, m10 ; clip_pixel() pminuw m3, m10 %elif %2 pand m2, m10 pand m3, m10 %else pand m2, m10, [lumaq+ 0] pand m3, m10, [lumaq+32] %endif ; scaling[luma_src] vpbroadcastd m7, [pd_m65536] pandn m4, m7, m2 mova m6, m7 vpgatherdd m5, [scalingq+m4-0], m7 psrld m2, 16 mova m7, m6 
vpgatherdd m4, [scalingq+m2-2], m6 pblendw m4, m5, 0x55 pandn m5, m7, m3 mova m6, m7 vpgatherdd m2, [scalingq+m5-0], m7 psrld m3, 16 vpgatherdd m5, [scalingq+m3-2], m6 pblendw m5, m2, 0x55 ; grain = grain_lut[offy+y][offx+x] movu m0, [grain_lutq+offxyq*2] movd xm2, [grain_lutq+left_offxyq*2] movu m6, [grain_lutq+top_offxyq*2] %if %2 pinsrw xm2, [grain_lutq+left_offxyq*2+82*2], 2 movu m3, [grain_lutq+offxyq*2+82*2] punpckldq xm1, xm0, xm3 ; { cur0, cur1 } %if %3 vinserti128 m2, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left } vinserti128 m1, [grain_lutq+top_offxyq*2], 1 ; { cur0, cur1, top0 } %else vinserti128 m2, [grain_lutq+topleft_offxyq*2+82*2], 1 vpbroadcastd m7, [grain_lutq+topleft_offxyq*2] vpblendd m2, m7, 0x20 movd xm7, [grain_lutq+top_offxyq*2+82*2] punpckldq xm7, xm6 vinserti128 m1, xm7, 1 movu m7, [grain_lutq+top_offxyq*2+82*2] %endif punpcklwd m2, m1 ; { cur, left } %if %1 vpbroadcastq m1, [pw_23_22] pmaddwd m2, m1 vpbroadcastd m1, [pd_16] paddd m2, m1 psrad m2, 5 packssdw m2, m2 vpermq m2, m2, q3120 %else pmaddwd m2, m15 paddd m2, m14 psrad m2, 5 vextracti128 xm1, m2, 1 packssdw xm2, xm1 %endif %else pinsrd xm2, [grain_lutq+topleft_offxyq*2], 1 movu m3, [grain_lutq+offxyq*2+32] movu m7, [grain_lutq+top_offxyq*2+32] punpckldq xm1, xm0, xm6 punpcklwd xm2, xm1 ; { cur, left } %if %1 movddup xm1, [pw_27_17_17_27] pmaddwd xm2, xm1 vpbroadcastd m1, [pd_16] paddd xm2, xm1 %else pmaddwd xm2, xm15 paddd xm2, xm14 %endif psrad xm2, 5 packssdw xm2, xm2 %endif pmaxsw xm2, xm8 pminsw xm2, xm9 vpblendd m0, m2, 0x01 %if %2 pshufd xm2, xm2, q0321 vpblendd m3, m2, 0x01 %if %3 == 0 pshufd xm2, xm2, q0321 vpblendd m7, m2, 0x01 %endif %endif pshuflw xm2, xm2, q1032 vpblendd m2, m6, 0xfe punpckhwd m6, m0 ; { top, cur } punpcklwd m2, m0 %if %3 vpbroadcastd m0, [pw_23_22] %elif %2 vpbroadcastd m0, [pw_27_17_17_27] %else vpbroadcastd m0, [r14] %endif pmaddwd m6, m0 pmaddwd m2, m0 %if %1 paddd m6, m1 paddd m2, m1 %else paddd m6, m14 paddd m2, m14 %endif psrad m6, 5 psrad m2, 5 packssdw m2, m6 %if %3 pmaxsw m2, m8 pminsw m2, m9 %else %if %2 punpckhwd m6, m3, m7 punpcklwd m3, m7 ; { cur, top } %else punpckhwd m6, m7, m3 punpcklwd m3, m7, m3 ; { top, cur } %endif REPX {pmaddwd x, m0}, m6, m3 %if %1 REPX {paddd x, m1}, m6, m3 %else REPX {paddd x, m14}, m6, m3 %endif REPX {psrad x, 5}, m6, m3 packssdw m3, m6 pmaxsw m2, m8 pmaxsw m3, m8 pminsw m2, m9 pminsw m3, m9 %endif ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m4, m11 pmaddubsw m5, m11 paddw m4, m4 paddw m5, m5 pmulhrsw m2, m4 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m2, [srcq] %if %2 paddw m1, m3, [srcq+strideq] %else paddw m1, m3, [srcq+32] %endif pmaxsw m0, m12 pmaxsw m1, m12 pminsw m0, m13 pminsw m1, m13 mova [dstq], m0 %if %2 mova [dstq+strideq], m1 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else mova [dstq+32], m1 add srcq, strideq add dstq, strideq add lumaq, r10mp %endif add grain_lutq, 82*(2<<%2) %if %2 sub hb, 2 jg %%loop_y_h_overlap %else dec hb jle %%end_y_hv_overlap add hd, 0x80000000 jc %%loop_y_h_overlap add r14, 4 jmp %%loop_y_hv_overlap %endif %%end_y_hv_overlap: add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] jmp %%loop_x_hv_overlap %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro GEN_GRAIN_UV_FN 420, 1, 1 FGUV_FN 420, 1, 1 GEN_GRAIN_UV_FN 422, 1, 0 FGUV_FN 422, 1, 0 GEN_GRAIN_UV_FN 444, 0, 0 
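; Instantiation map for the two macros above: the trailing (ss_x/ss_hor, ss_y/ss_ver)
; arguments are (1, 1) for 4:2:0, (1, 0) for 4:2:2 and (0, 0) for 4:4:4, giving each
; chroma layout its own generate_grain_uv_*_16bpc and fguv_32x32xn_i*_16bpc entry point.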
FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/filmgrain16_avx512.asm000064400000000000000000000770101046102023000160170ustar 00000000000000; Copyright © 2022, VideoLAN and dav1d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" %if ARCH_X86_64 SECTION_RODATA 64 pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1 scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4 pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27 pw_23_22: dw 92, 88, 0, 128, 23, 22, 0, 32 fg_min: times 2 dw 0 times 2 dw 64 times 2 dw 256 fg_max: times 2 dw 1023 times 2 dw 4095 times 2 dw 960 times 2 dw 3840 times 2 dw 940 times 2 dw 3760 scale_rnd: dd 64 dd 16 uv_offset_mul: dd 256 dd 1024 pb_8_9_0_1: db 8, 9, 0, 1 SECTION .text INIT_ZMM avx512icl cglobal fgy_32x32xn_16bpc, 6, 15, 21, dst, src, stride, fg_data, w, scaling, \ grain_lut, offx, sby, see, offy, src_bak %define base r11-fg_min lea r11, [fg_min] mov r6d, r9m ; bdmax mov r9d, [fg_dataq+FGData.clip_to_restricted_range] mov r7d, [fg_dataq+FGData.scaling_shift] mov sbyd, sbym vpbroadcastd m6, r9m shr r6d, 11 ; is_12bpc vbroadcasti32x4 m7, [base+scale_mask] shlx r10d, r9d, r6d vpbroadcastd m10, [base+scale_shift+r7*4-32] lea r9d, [r6+r9*4] vpbroadcastd m8, [base+fg_min+r10*4] kxnorw k1, k1, k1 ; 0xffff vpbroadcastd m9, [base+fg_max+r9*4] mov r12, 0xeeeeeeeeeeeeeeee vpbroadcastd m19, [base+scale_rnd+r6*4] kshiftrb k2, k1, 4 ; 0xf vpbroadcastq xm20, [base+pw_27_17_17_27+r6*8] kmovq k3, r12 vpbroadcastd m11, [base+scale_shift+r6*8+4] test sbyd, sbyd setnz r7b vpbroadcastd m12, [base+pw_27_17_17_27+r6*8+0] vpbroadcastd m13, [base+pw_27_17_17_27+r6*8+4] test r7b, [fg_dataq+FGData.overlap_flag] jnz .v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, 
seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak mov grain_lutq, grain_lutmp mov hd, hm .loop_y: movu m4, [grain_lutq+offxyq*2+82*0] movu m5, [grain_lutq+offxyq*2+82*2] call .add_noise sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] cmp byte [fg_dataq+FGData.overlap_flag], 0 je .loop_x test sbyd, sbyd jnz .hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, left_offxy lea left_offxyd, [offyq+73] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: movu m4, [grain_lutq+offxyq*2+82*0] movu m5, [grain_lutq+offxyq*2+82*2] movd xm17, [grain_lutq+left_offxyq*2-82*1] pinsrd xm17, [grain_lutq+left_offxyq*2+82*1], 1 punpckldq xm16, xm4, xm5 punpcklwd xm17, xm16 mova xm16, xm19 vpdpwssd xm16, xm20, xm17 psrad xm16, 1 packssdw xm16, xm16 vpsravw xm16, xm11 vmovdqu8 m4{k2}, m16 vpalignr m5{k2}, m16, m16, 4 call .add_noise sub hb, 2 jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq*2] test sbyd, sbyd jnz .hv_overlap jmp .loop_x_h_overlap .v_overlap: movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed lea src_bakq, [srcq+wq*2] neg wq sub dstq, srcq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, _, top_offxy rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, _, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu m16, [grain_lutq+offxyq*2+82*0] movu m0, [grain_lutq+top_offxyq*2+82*0] movu m17, [grain_lutq+offxyq*2+82*2] movu m1, [grain_lutq+top_offxyq*2+82*2] punpckhwd m4, m0, m16 punpcklwd m0, m16 punpckhwd m5, m1, m17 punpcklwd m1, m17 call .add_noise_v sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq*2] ; since fg_dataq.overlap is guaranteed to be set, we never jump back ; to .v_overlap, and instead always fall-through to .hv_overlap .hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, 
r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, offx, \ sby, see, offy, src_bak, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+73] lea left_offxyd, [offyq+73] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, h, \ sby, see, offxy, src_bak, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu m5, [grain_lutq+offxyq*2+82*0] movu m0, [grain_lutq+top_offxyq*2+82*0] movd xm17, [grain_lutq+left_offxyq*2-82*1] pinsrd xm17, [grain_lutq+topleft_offxyq*2-82*1], 1 movu m2, [grain_lutq+offxyq*2+82*2] movu m1, [grain_lutq+top_offxyq*2+82*2] movd xm18, [grain_lutq+left_offxyq*2+82*1] pinsrd xm18, [grain_lutq+topleft_offxyq*2+82*1], 1 punpckldq xm16, xm5, xm0 punpcklwd xm17, xm16 mova xm16, xm19 vpdpwssd xm16, xm20, xm17 punpckldq xm17, xm2, xm1 punpcklwd xm18, xm17 mova xm17, xm19 vpdpwssd xm17, xm20, xm18 punpckhwd m4, m0, m5 punpcklwd m0, m5 punpckhwd m5, m1, m2 punpcklwd m1, m2 psrad xm16, 1 psrad xm17, 1 packssdw xm16, xm17 vpsravw xm16, xm11 vpshuflw m0{k2}, m16, q1302 punpckhqdq xm16, xm16 vpshuflw m1{k2}, m16, q1302 call .add_noise_v sub hb, 2 jg .loop_y_h_overlap add wq, 32 lea srcq, [src_bakq+wq*2] jl .hv_overlap .end: RET ALIGN function_align .add_noise_v: mova m2, m19 vpdpwssd m2, m12, m4 mova m3, m19 vpdpwssd m3, m13, m5 mova m4, m19 vpdpwssd m4, m12, m0 mova m5, m19 vpdpwssd m5, m13, m1 REPX {psrad x, 1}, m2, m3, m4, m5 packssdw m4, m2 packssdw m5, m3 vpsravw m4, m11 vpsravw m5, m11 .add_noise: mova m0, [srcq+strideq*0] mova m1, [srcq+strideq*1] kmovw k4, k1 pand m16, m6, m0 psrld m3, m0, 16 vpgatherdd m2{k4}, [scalingq+m16] vpcmpud k4, m3, m6, 2 ; px <= bdmax vpgatherdd m16{k4}, [scalingq+m3] kmovw k4, k1 pand m17, m6, m1 vpgatherdd m3{k4}, [scalingq+m17] vpshufb m2{k3}, m16, m7 psrld m16, m1, 16 vpcmpud k4, m16, m6, 2 vpgatherdd m17{k4}, [scalingq+m16] vpshufb m3{k3}, m17, m7 vpsllvw m2, m10 vpsllvw m3, m10 pmulhrsw m4, m2 pmulhrsw m5, m3 add grain_lutq, 82*4 paddw m0, m4 paddw m1, m5 pmaxsw m0, m8 pmaxsw m1, m8 pminsw m0, m9 pminsw m1, m9 mova [dstq+srcq], m0 add srcq, strideq mova [dstq+srcq], m1 add srcq, strideq ret %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r12-fg_min lea r12, [fg_min] mov r9d, r13m ; bdmax mov r7d, [fg_dataq+FGData.scaling_shift] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] mov r11d, is_idm kxnorw k1, k1, k1 ; 0xffff vpbroadcastd m5, r13m mov r13, 0xeeeeeeeeeeeeeeee vbroadcasti32x4 m6, [base+scale_mask] shr r9d, 11 ; is_12bpc vpbroadcastd m7, [base+scale_shift+r7*4-32] shlx r10d, r6d, r9d mov sbyd, sbym shlx r6d, r6d, r11d vpbroadcastd m8, [base+fg_min+r10*4] lea r6d, [r9+r6*2] vpbroadcastd m9, [base+fg_max+r6*4] kmovq k2, r13 vpbroadcastd m20, [base+scale_rnd+r9*4] packssdw m4, m5, m5 vpbroadcastd m21, [base+scale_shift+r9*8+4] %if %2 mova m12, [base+pb_0to63] ; pw_even mov r13d, 0x0101 vpbroadcastq m10, [base+pw_23_22+r9*8] kmovw k3, r13d %if %3 pshufd m11, m10, q0000 %else vpbroadcastd ym16, [base+pw_27_17_17_27+r9*8+0] vpbroadcastd m11, [base+pw_27_17_17_27+r9*8+4] vmovdqu16 m11{k1}, m16 %endif psrlw m13, m12, 8 ; pw_odd %else vpbroadcastq m10, 
[base+pw_27_17_17_27+r9*8] kshiftrb k3, k1, 7 ; 0x01 kshiftrb k4, k1, 4 ; 0x0f pshufd m11, m10, q0000 %endif mov lstrideq, r10mp test sbyd, sbyd setnz r7b cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ _, sby, see, lstride %if %1 mov r6d, r11m vpbroadcastd m0, [base+uv_offset_mul+r9*4] vpbroadcastd m1, [base+pb_8_9_0_1] vpbroadcastd m14, [fg_dataq+FGData.uv_offset+r6*4] vbroadcasti32x4 m15, [fg_dataq+FGData.uv_mult+r6*4] pmaddwd m14, m0 pshufb m15, m1 ; { uv_luma_mult, uv_mult } %endif test r7b, [fg_dataq+FGData.overlap_flag] jnz %%v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, lstride, luma mov lumaq, r9mp lea r12, [srcq+wq*2] lea r13, [dstq+wq*2] lea r14, [lumaq+wq*(2<<%2)] mov r9mp, r12 mov r10mp, r13 mov r11mp, r14 neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, lstride, luma mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: %if %2 movu ym18, [grain_lutq+offxyq*2+82*0] vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 movu ym19, [grain_lutq+offxyq*2+82*4] vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 %else movu m18, [grain_lutq+offxyq*2+82*0] movu m19, [grain_lutq+offxyq*2+82*2] %endif call %%add_noise sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r10mp mov lumaq, r11mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp byte [fg_dataq+FGData.overlap_flag], 0 je %%loop_x cmp dword r8m, 0 ; sby jne %%hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, lstride, luma, left_offxy lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, lstride, luma, left_offxy mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: %if %2 movu ym18, [grain_lutq+offxyq*2+82*0] vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 movu ym19, [grain_lutq+offxyq*2+82*4] vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 movd xm16, [grain_lutq+left_offxyq*2+82*0] vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 movd xm17, [grain_lutq+left_offxyq*2+82*4] vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 punpckldq m16, m17 punpckldq m17, m18, m19 punpcklwd m16, m17 mova m17, m20 vpdpwssd m17, m16, m10 psrad m17, 1 packssdw m17, m17 vpsravw m17, m21 %else movu m18, [grain_lutq+offxyq*2+82*0] movu m19, [grain_lutq+offxyq*2+82*2] movd xm16, [grain_lutq+left_offxyq*2+82*0] pinsrd xm16, [grain_lutq+left_offxyq*2+82*2], 1 punpckldq xm17, xm18, xm19 punpcklwd xm16, xm17 mova xm17, xm20 vpdpwssd xm17, xm16, xm10 psrad xm17, 1 packssdw xm17, xm17 vpsravw xm17, xm21 %endif vmovdqa32 m18{k3}, m17 vpshufd m19{k3}, 
m17, q0321 call %%add_noise sub hb, 2<<%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r10mp mov lumaq, r11mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] cmp dword r8m, 0 ; sby jne %%hv_overlap jmp %%loop_x_h_overlap %%v_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ _, sby, see, lstride movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, lstride, luma, _, top_offxy mov lumaq, r9mp lea r12, [srcq+wq*2] lea r13, [dstq+wq*2] lea r14, [lumaq+wq*(2<<%2)] mov r9mp, r12 mov r10mp, r13 mov r11mp, r14 neg wq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, lstride, luma, _, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %3 movu ym16, [grain_lutq+offxyq*2+82*0] movu ym1, [grain_lutq+top_offxyq*2+82*0] vbroadcasti32x8 m18, [grain_lutq+offxyq*2+82*2] movu ym19, [grain_lutq+offxyq*2+82*4] vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 punpcklwd ym17, ym1, ym16 punpckhwd ym1, ym16 %elif %2 movu ym18, [grain_lutq+offxyq*2+82*0] vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 movu ym17, [grain_lutq+top_offxyq*2+82*0] vinserti32x8 m17, [grain_lutq+top_offxyq*2+82*2], 1 movu ym19, [grain_lutq+offxyq*2+82*4] vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 punpcklwd m16, m17, m18 punpckhwd m17, m18 %else movu m18, [grain_lutq+offxyq*2+82*0] movu m19, [grain_lutq+top_offxyq*2+82*0] movu m2, [grain_lutq+offxyq*2+82*2] movu m16, [grain_lutq+top_offxyq*2+82*2] punpckhwd m1, m19, m18 punpcklwd m19, m18 punpckhwd m18, m2, m16 punpcklwd m2, m16 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r10mp mov lumaq, r11mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] ; since fg_dataq.overlap is guaranteed to be set, we never jump back ; to %%v_overlap, and instead always fall-through to %%hv_overlap %%hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy lea topleft_offxyq, [top_offxyq+(32>>%2)] lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, lstride, luma, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 ; grain = grain_lut[offy+y][offx+x] %if %2 movd xm16, [grain_lutq+left_offxyq*2+82*0] vinserti32x4 m16, [grain_lutq+left_offxyq*2+82*2], 2 movd xm17, [grain_lutq+left_offxyq*2+82*4] vinserti32x4 m17, [grain_lutq+left_offxyq*2+82*6], 2 movu ym18, [grain_lutq+offxyq*2+82*0] vinserti32x8 m18, [grain_lutq+offxyq*2+82*2], 1 movu ym19, [grain_lutq+offxyq*2+82*4] vinserti32x8 m19, [grain_lutq+offxyq*2+82*6], 1 punpckldq m16, m17 punpckldq m17, m18, m19 punpcklwd m16, m17 movu ym1, [grain_lutq+top_offxyq*2+82*0] movd xm17, [grain_lutq+topleft_offxyq*2+82*0] mova m0, m20 vpdpwssd m0, m16, m10 %if %3 punpcklwd xm17, xm1 mova xm16, xm20 vpdpwssd xm16, xm17, xm10 psrad xm16, 1 %else vinserti32x8 m1, [grain_lutq+top_offxyq*2+82*2], 1 vinserti32x4 m17, [grain_lutq+topleft_offxyq*2+82*2], 2 punpcklwd m17, m1 mova m16, m20 vpdpwssd m16, m17, m10 psrad m16, 1 %endif psrad m0, 1 packssdw m0, m16 vpsravw m0, m21 vmovdqa32 m18{k3}, m0 vpshufd m19{k3}, m0, q0321 %if %3 vpunpckhdq ym1{k3}, ym0, ym0 punpcklwd ym17, ym1, ym18 punpckhwd ym1, ym18 %else vpunpckhdq m1{k3}, m0, m0 punpcklwd m16, m1, m18 punpckhwd m17, m1, m18 %endif %else movu m18, [grain_lutq+offxyq*2+82*0] movu m19, [grain_lutq+top_offxyq*2+82*0] movd xm17, [grain_lutq+left_offxyq*2+82*0] pinsrd xm17, [grain_lutq+topleft_offxyq*2+82*0], 1 punpckldq xm16, xm18, xm19 punpcklwd xm17, xm16 movu m2, [grain_lutq+offxyq*2+82*2] movu m0, [grain_lutq+top_offxyq*2+82*2] movd xm16, [grain_lutq+left_offxyq*2+82*2] pinsrd xm16, [grain_lutq+topleft_offxyq*2+82*2], 1 punpckldq xm1, xm2, xm0 punpcklwd xm1, xm16, xm1 mova xm16, xm20 vpdpwssd xm16, xm17, xm10 mova xm17, xm20 vpdpwssd xm17, xm1, xm10 punpckhwd m1, m19, m18 punpcklwd m19, m18 punpckhwd m18, m2, m0 punpcklwd m2, m0 psrad xm16, 1 psrad xm17, 1 packssdw xm16, xm17 vpsravw xm16, xm21 vpshuflw m19{k4}, m16, q1302 punpckhqdq xm16, xm16 vpshuflw m2{k4}, m16, q3120 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r9mp mov dstq, r10mp mov lumaq, r11mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] jmp %%hv_overlap ALIGN function_align %%add_noise_v: %if %3 mova ym16, ym20 vpdpwssd ym16, ym17, ym11 mova ym17, ym20 vpdpwssd ym17, ym1, ym11 psrad ym16, 1 psrad ym17, 1 packssdw ym16, ym17 vpsravw m18{k1}, m16, m21 %elif %2 mova m18, m20 vpdpwssd m18, m16, m11 mova m16, m20 vpdpwssd m16, m17, m11 psrad m18, 1 psrad m16, 1 packssdw m18, m16 vpsravw m18, m21 %else mova m16, m20 vpdpwssd m16, m1, m11 mova m17, m20 vpdpwssd m17, m18, m11 mova m18, m20 vpdpwssd m18, m19, m11 mova m19, m20 vpdpwssd m19, m2, m11 REPX {psrad x, 1}, m16, m17, m18, m19 packssdw m18, m16 packssdw m19, m17 vpsravw m18, m21 vpsravw m19, m21 %endif %%add_noise: %if %2 mova m2, [lumaq+lstrideq*(0<<%3)] mova m0, [lumaq+lstrideq*(1<<%3)] lea lumaq, [lumaq+lstrideq*(2<<%3)] mova m3, [lumaq+lstrideq*(0<<%3)] mova m1, [lumaq+lstrideq*(1<<%3)] mova m16, m12 vpermi2w m16, m2, m0 vpermt2w m2, m13, m0 mova m17, m12 vpermi2w m17, m3, m1 vpermt2w m3, m13, m1 pavgw m2, m16 pavgw m3, m17 %elif %1 mova m2, [lumaq+lstrideq*0] mova m3, [lumaq+lstrideq*1] %endif %if %2 mova ym16, [srcq+strideq*0] vinserti32x8 m16, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] %else mova m16, [srcq+strideq*0] %endif %if %1 punpckhwd m17, m2, m16 mova m0, m14 vpdpwssd m0, m17, m15 punpcklwd m17, m2, 
m16 mova m2, m14 vpdpwssd m2, m17, m15 %endif %if %2 mova ym17, [srcq+strideq*0] vinserti32x8 m17, [srcq+strideq*1], 1 %else mova m17, [srcq+strideq*1] %endif %if %1 psrad m0, 6 psrad m2, 6 packusdw m2, m0 punpckhwd m0, m3, m17 mova m1, m14 vpdpwssd m1, m15, m0 punpcklwd m0, m3, m17 mova m3, m14 vpdpwssd m3, m15, m0 psrad m1, 6 psrad m3, 6 packusdw m3, m1 pminuw m2, m4 pminuw m3, m4 .add_noise_main: ; scaling[luma_src] kmovw k5, k1 pand m1, m5, m2 vpgatherdd m0{k5}, [scalingq+m1] kmovw k5, k1 psrld m2, 16 vpgatherdd m1{k5}, [scalingq+m2] vpshufb m0{k2}, m1, m6 kmovw k5, k1 psrld m1, m3, 16 vpgatherdd m2{k5}, [scalingq+m1] kmovw k5, k1 pand m3, m5 vpgatherdd m1{k5}, [scalingq+m3] vpshufb m1{k2}, m2, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) vpsllvw m0, m7 vpsllvw m1, m7 pmulhrsw m18, m0 pmulhrsw m19, m1 add grain_lutq, 82*(4<<%2) lea lumaq, [lumaq+lstrideq*(2<<%3)] lea srcq, [srcq+strideq*2] paddw m16, m18 paddw m17, m19 pmaxsw m16, m8 pmaxsw m17, m8 pminsw m16, m9 pminsw m17, m9 %if %2 mova [dstq+strideq*0], ym16 vextracti32x8 [dstq+strideq*1], m16, 1 lea dstq, [dstq+strideq*2] mova [dstq+strideq*0], ym17 vextracti32x8 [dstq+strideq*1], m17, 1 %else mova [dstq+strideq*0], m16 mova [dstq+strideq*1], m17 %endif lea dstq, [dstq+strideq*2] ret %else %if %2 pand m2, m4 pand m3, m4 %else pand m2, m4, [lumaq+lstrideq*0] pand m3, m4, [lumaq+lstrideq*1] %endif jmp .add_noise_main %endif %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro FGUV_FN 420, 1, 1 FGUV_FN 422, 1, 0 FGUV_FN 444, 0, 0 %endif rav1e-0.7.1/src/x86/filmgrain16_sse.asm000064400000000000000000002740041046102023000155650ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
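; A minimal scalar sketch (C-style, kept in comments) of the per-pixel
; operation that the SIMD routines below implement, pieced together from the
; inline comments used throughout these film grain files; the helper names
; (round2, clip) and the min/max bounds are illustrative only, and the real
; code additionally handles block overlap blending, chroma subsampling and
; the 10/12-bit clipping ranges:
;
;   int grain = grain_lut[offy + y][offx + x];
;   int noise = round2(scaling[src[x]] * grain, scaling_shift);
;               /* round2(v, s) = (v + (1 << (s - 1))) >> s */
;   dst[x]    = clip(src[x] + noise, min_value, max_value);
;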
%include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" SECTION_RODATA 16 pd_16: times 4 dd 16 pw_1: times 8 dw 1 pw_16384: times 8 dw 16384 pw_8192: times 8 dw 8192 pw_23_22: dw 23, 22 times 3 dw 0, 32 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 pw_27_17_17_27: dw 27, 17, 17, 27 times 2 dw 0, 32 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 mul_bits: dw 256, 128, 64, 32, 16 round_vals: dw 32, 64, 128, 256, 512, 1024 max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16 min: dw 0, 16*4, 16*16 ; these two should be next to each other pw_4: times 2 dw 4 pw_16: times 2 dw 16 %macro JMP_TABLE 1-* %xdefine %1_table %%table %xdefine %%base %1_table %xdefine %%prefix mangle(private_prefix %+ _%1) %%table: %rep %0 - 1 dd %%prefix %+ .ar%2 - %%base %rotate 1 %endrep %endmacro JMP_TABLE generate_grain_y_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422_16bpc_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444_16bpc_ssse3, 0, 1, 2, 3 SECTION .text %if ARCH_X86_32 %undef base %define PIC_ptr(a) base+a %else %define PIC_ptr(a) a %endif %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg %assign %%idx 0 %define %%tmp %2 %if %0 == 8 %define %%tmp %8 %endif %rep (%6/2) %if %%idx == 0 movd %5 %+ d, %2 pshuflw %%tmp, %2, q3232 %else movd %5 %+ d, %%tmp %if %6 == 8 %if %%idx == 2 punpckhqdq %%tmp, %%tmp %elif %%idx == 4 psrlq %%tmp, 32 %endif %endif %endif movzx %4 %+ d, %5 %+ w shr %5 %+ d, 16 %if %%idx == 0 movd %1, [%3+%4*%7] %else pinsrw %1, [%3+%4*%7], %%idx + 0 %endif pinsrw %1, [%3+%5*%7], %%idx + 1 %assign %%idx %%idx+2 %endrep %endmacro %macro SPLATD 2 ; dst, src %ifnidn %1, %2 movd %1, %2 %endif pshufd %1, %1, q0000 %endmacro %macro SPLATW 2 ; dst, src %ifnidn %1, %2 movd %1, %2 %endif pshuflw %1, %1, q0000 punpcklqdq %1, %1 %endmacro INIT_XMM ssse3 %if ARCH_X86_64 cglobal generate_grain_y_16bpc, 3, 8, 16, buf, fg_data, bdmax lea r4, [pb_mask] %define base r4-pb_mask %else cglobal generate_grain_y_16bpc, 3, 6, 8, buf, fg_data, bdmax LEA r4, $$ %define base r4-$$ %endif movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r3d, [fg_dataq+FGData.grain_scale_shift] lea r5d, [bdmaxq+1] shr r5d, 11 ; 0 for 10bpc, 2 for 12bpc sub r3, r5 SPLATW m6, [base+round+r3*2-2] mova m5, [base+pb_mask] SPLATW m0, [fg_dataq+FGData.seed] mov r3, -73*82*2 sub bufq, r3 %if ARCH_X86_64 lea r6, [gaussian_sequence] %endif .loop: pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set pshufb m3, m5, m2 ; set 15th bit for next 4 seeds psllq m2, m3, 30 por m2, m3 psllq m3, m2, 15 por m2, m3 ; aggregate each bit into next seed's high bit pmulhuw m3, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 %if ARCH_X86_64 vpgatherdw m3, m2, r6, r5, r7, 4, 2 %else vpgatherdw m3, m2, base+gaussian_sequence, r5, r2, 4, 2 %endif paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 ; shifts by 0, which pmulhrsw does not support pmulhrsw m3, m6 movq [bufq+r3], m3 add r3, 4*2 jl .loop ; auto-regression code movsxd r3, [fg_dataq+FGData.ar_coeff_lag] movsxd r3, [base+generate_grain_y_16bpc_ssse3_table+r3*4] lea r3, 
[r3+base+generate_grain_y_16bpc_ssse3_table] jmp r3 .ar1: %if WIN64 DEFINE_ARGS shift, fg_data, max, buf, val3, min, cf3, x, val0 lea bufq, [r0-2*(82*73-(82*3+79))] PUSH r8 %else %if ARCH_X86_64 DEFINE_ARGS buf, fg_data, max, shift, val3, min, cf3, x, val0 %else ; x86-32 DEFINE_ARGS buf, fg_data, min, val3, x, cf3, val0 PUSH r6 %define shiftd r1d %endif sub bufq, 2*(82*73-(82*3+79)) %endif movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd m4, [fg_dataq+FGData.ar_coeffs_y] mov shiftd, [fg_dataq+FGData.ar_coeff_shift] %if WIN64 DEFINE_ARGS shift, h, max, buf, val3, min, cf3, x, val0 %elif ARCH_X86_64 DEFINE_ARGS buf, h, max, shift, val3, min, cf3, x, val0 %else ; x86-32 %undef shiftd DEFINE_ARGS buf, shift, min, val3, x, cf3, val0 %define hd dword r0m %define maxd dword minm %endif %if cpuflag(sse4) pmovsxbw m4, m4 %else pxor m3, m3 pcmpgtb m3, m4 punpcklbw m4, m3 %endif pinsrw m4, [base+pw_1], 3 pshufd m5, m4, q1111 pshufd m4, m4, q0000 SPLATW m3, [base+round_vals+shiftq*2-12] ; rnd mov hd, 70 sar maxd, 1 mov mind, maxd xor mind, -1 .y_loop_ar1: mov xq, -76 movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu m0, [bufq+xq*2-82*2-2] ; top/left psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 punpcklwd m1, m3 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 imul val3d, cf3d add val3d, val0d sar val3d, shiftb movsx val0d, word [bufq+xq*2] add val3d, val0d cmp val3d, maxd cmovg val3d, maxd cmp val3d, mind cmovl val3d, mind mov word [bufq+xq*2], val3w ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82*2 dec hd jg .y_loop_ar1 %if WIN64 POP r8 %elif ARCH_X86_32 POP r6 %undef maxd %undef hd %endif .ar0: RET .ar2: %if ARCH_X86_32 %assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, bdmax, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m0, [base+round_vals-12+shiftq*2] pshuflw m0, m0, q0000 movu m6, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-11 pxor m2, m2 punpcklwd m0, m2 pcmpgtb m2, m6 punpckhbw m3, m6, m2 punpcklbw m6, m2 pshufd m2, m6, q3333 pshufd m1, m6, q2222 pshufd m7, m6, q1111 pshufd m6, m6, q0000 pshufd m4, m3, q1111 pshufd m3, m3, q0000 %if ARCH_X86_64 SWAP 0, 12 SWAP 1, 8 SWAP 2, 9 SWAP 3, 10 SWAP 4, 11 %else %define m12 [rsp+0*16] %define m8 [rsp+1*16] %define m9 [rsp+2*16] %define m10 [rsp+3*16] %define m11 [rsp+4*16] mova m12, m0 mova m8, m1 mova m9, m2 mova m10, m3 mova m11, m4 mov bdmaxd, bdmaxm %endif sar bdmaxd, 1 SPLATW m0, bdmaxd ; max_grain pcmpeqw m1, m1 %if !cpuflag(sse4) pcmpeqw m2, m2 psrldq m2, 14 pslldq m2, 2 pxor m2, m1 %endif pxor m1, m0 ; min_grain %if ARCH_X86_64 SWAP 0, 13 SWAP 1, 14 SWAP 2, 15 %else %define m13 [rsp+5*16] %define m14 [rsp+6*16] mova m13, m0 mova m14, m1 %if !cpuflag(sse4) %define m15 [rsp+7*16] mova m15, m2 %endif %endif sub bufq, 2*(82*73-(82*3+79)) DEFINE_ARGS buf, fg_data, h, x mov hd, 70 .y_loop_ar2: mov xq, -76 .x_loop_ar2: movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] movu m1, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] psrldq m2, m0, 2 psrldq m3, m0, 4 psrldq m4, m0, 6 psrldq m5, m0, 8 punpcklwd m0, m2 punpcklwd m3, m4 punpcklwd m5, m1 psrldq m2, m1, 2 psrldq m4, m1, 4 punpcklwd m2, m4 psrldq m4, m1, 6 psrldq m1, 8 punpcklwd m4, m1 pmaddwd m0, m6 pmaddwd m3, m7 pmaddwd m5, m8 pmaddwd m2, m9 pmaddwd m4, m10 paddd m0, m3 paddd m5, m2 paddd m0, m4 paddd m0, m5 ; accumulated top 2 rows paddd m0, m12 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] pshufd m4, m1, 
q3321 pxor m2, m2 pcmpgtw m2, m4 punpcklwd m4, m2 ; in dwords, y=0,x=[0,3] .x_loop_ar2_inner: pmaddwd m2, m1, m11 paddd m2, m0 psrldq m0, 4 ; shift top to next pixel psrad m2, [fg_dataq+FGData.ar_coeff_shift] paddd m2, m4 packssdw m2, m2 pminsw m2, m13 pmaxsw m2, m14 psrldq m4, 4 pslldq m2, 2 psrldq m1, 2 %if cpuflag(sse4) pblendw m1, m2, 00000010b %else pand m1, m15 pandn m3, m15, m2 por m1, m3 %endif ; overwrite previous pixel, this should be ok movd [bufq+xq*2-2], m1 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82*2 dec hd jg .y_loop_ar2 %if ARCH_X86_32 %undef m8 %undef m9 %undef m10 %undef m11 %undef m12 %undef m13 %undef m14 %undef m15 %endif RET .ar3: DEFINE_ARGS buf, fg_data, bdmax, shift %if WIN64 mov r6, rsp and rsp, ~15 sub rsp, 64 %define tmp rsp %elif ARCH_X86_64 %define tmp rsp+stack_offset-72 %else %assign stack_offset stack_offset_old ALLOC_STACK -16*12 %define tmp rsp mov bdmaxd, bdmaxm %endif sar bdmaxd, 1 SPLATW m7, bdmaxd ; max_grain pcmpeqw m6, m6 %if !cpuflag(sse4) pcmpeqw m4, m4 psrldq m4, 14 pslldq m4, 4 pxor m4, m6 %endif pxor m6, m7 ; min_grain mov shiftd, [fg_dataq+FGData.ar_coeff_shift] %if ARCH_X86_64 SWAP 6, 14 SWAP 7, 15 %else %define m14 [rsp+10*16] %define m15 [esp+11*16] mova m14, m6 mova m15, m7 %endif ; build cf0-1 until 18-19 in m5-12 and r0/1 pxor m1, m1 movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 %if cpuflag(sse4) pshufd m4, m2, q3333 %else pshufd m5, m2, q3333 mova [tmp+48], m5 %endif pshufd m3, m2, q2222 pshufd m1, m2, q0000 pshufd m2, m2, q1111 pshufd m7, m0, q2222 pshufd m6, m0, q1111 pshufd m5, m0, q0000 pshufd m0, m0, q3333 %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 2, 10 SWAP 3, 11 SWAP 4, 12 %else %define m8 [rsp+4*16] %define m9 [esp+5*16] %define m10 [rsp+6*16] %define m11 [esp+7*16] %define m12 [rsp+8*16] mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 mova m12, m4 %endif ; build cf20,round in r2 ; build cf21-23,round*2 in m13 pxor m1, m1 movq m0, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 pcmpgtb m1, m0 punpcklbw m0, m1 pshufd m1, m0, q0000 pshufd m2, m0, q1111 mova [tmp+ 0], m1 mova [tmp+16], m2 psrldq m3, m0, 10 pinsrw m3, [base+round_vals+shiftq*2-10], 3 %if ARCH_X86_64 SWAP 3, 13 %else %define m13 [esp+9*16] mova m13, m3 %endif pinsrw m0, [base+round_vals+shiftq*2-12], 5 pshufd m3, m0, q2222 mova [tmp+32], m3 DEFINE_ARGS buf, fg_data, h, x sub bufq, 2*(82*73-(82*3+79)) mov hd, 70 .y_loop_ar3: mov xq, -76 .x_loop_ar3: movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] pmaddwd m0, m5 pmaddwd m2, m6 pmaddwd m3, m7 paddd m0, m2 paddd m0, m3 ; m0 = top line first 6 multiplied by cf, m1 = top line last entry movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] palignr m4, m3, m2, 2 ; y=-3,x=[-2,+5] palignr m3, m3, m2, 4 ; y=-3,x=[-1,+6] punpckhwd m2, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] shufps m3, m4, m2, q1032 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] pmaddwd m1, m8 pmaddwd m4, m9 pmaddwd m3, m10 pmaddwd m2, m11 paddd m1, m4 paddd m3, m2 paddd m0, m1 paddd m0, m3 ; m0 = top 2 lines multiplied by 
cf movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] punpcklwd m2, [base+pw_1] %if cpuflag(sse4) pmaddwd m1, m12 %else pmaddwd m1, [tmp+48] %endif pmaddwd m3, [tmp+ 0] pmaddwd m4, [tmp+16] pmaddwd m2, [tmp+32] paddd m1, m3 paddd m4, m2 paddd m0, m1 paddd m0, m4 ; m0 = top 3 lines multiplied by cf plus rounding for downshift movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] .x_loop_ar3_inner: pmaddwd m2, m1, m13 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] packssdw m2, m2 pminsw m2, m15 pmaxsw m2, m14 pslldq m2, 4 psrldq m1, 2 %if cpuflag(sse4) pblendw m1, m2, 00000100b %else pand m1, m12 pandn m3, m12, m2 por m1, m3 %endif ; overwrite a couple of pixels, should be ok movq [bufq+xq*2-4], m1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82*2 dec hd jg .y_loop_ar3 %if WIN64 mov rsp, r6 %elif ARCH_X86_32 %undef m8 %undef m9 %undef m10 %undef m11 %undef m12 %undef m13 %undef m14 %undef m15 %endif RET %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 %if ARCH_X86_64 cglobal generate_grain_uv_%1_16bpc, 4, 11, 16, buf, bufy, fg_data, uv, bdmax, x, gaussian_reg, h, pic_reg %define base r8-pb_mask lea r8, [pb_mask] movifnidn bdmaxd, bdmaxm lea r6d, [bdmaxq+1] %else cglobal generate_grain_uv_%1_16bpc, 1, 7, 8, buf, x, pic_reg, fg_data, h %define base r2-$$ LEA r2, $$ mov fg_dataq, r2m mov r6d, r4m inc r6d %endif movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r5d, [fg_dataq+FGData.grain_scale_shift] shr r6d, 11 ; 0 for 10bpc, 2 for 12bpc sub r5, r6 SPLATW m6, [base+round+r5*2-2] mova m5, [base+pb_mask] SPLATW m0, [fg_dataq+FGData.seed] %if ARCH_X86_64 SPLATW m2, [base+pw_seed_xor+uvq*4] %else mov r5d, r3m SPLATW m2, [base+pw_seed_xor+r5*4] %endif pxor m0, m2 %if ARCH_X86_64 lea r6, [gaussian_sequence] %endif %if %2 mov hd, 73-35*%3 add bufq, 44*2 .loop_y: mov xq, -44 %else mov xq, -82*73 add bufq, 82*73*2 %endif .loop_x: pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set pshufb m3, m5, m2 ; set 15th bit for next 4 seeds psllq m2, m3, 30 por m2, m3 psllq m3, m2, 15 por m2, m3 ; aggregate each bit into next seed's high bit pmulhuw m3, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 %if ARCH_X86_64 vpgatherdw m3, m2, r6, r9, r10, 4, 2 %else vpgatherdw m3, m2, base+gaussian_sequence, r5, r6, 4, 2 %endif paddw m3, m3 ; otherwise bpc=12 w/ grain_scale_shift=0 ; shifts by 0, which pmulhrsw does not support pmulhrsw m3, m6 movq [bufq+xq*2], m3 add xq, 4 jl .loop_x %if %2 add bufq, 82*2 dec hd jg .loop_y %endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] movsxd r5, [base+generate_grain_uv_%1_16bpc_ssse3_table+r5*4] lea r5, [r5+base+generate_grain_uv_%1_16bpc_ssse3_table] jmp r5 .ar0: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift %assign stack_offset_old stack_offset ALLOC_STACK -16*2 mov bufyq, r1m mov uvd, r3m %endif imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] SPLATW m3, 
[base+hmul_bits+shiftq*2-10] %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m1, bdmaxd ; max_gain %else SPLATW m1, r4m psraw m1, 1 %endif pcmpeqw m7, m7 pxor m7, m1 ; min_grain %if ARCH_X86_64 SWAP 1, 14 DEFINE_ARGS buf, bufy, h, x %else %define m14 [rsp+0*16] mova m14, m1 DEFINE_ARGS buf, bufy, pic_reg, h, x %endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 %if %2 SPLATW m6, [base+hmul_bits+2+%3*2] %endif SPLATW m4, m4 pxor m5, m5 %if %2 %if !cpuflag(sse4) pcmpeqw m2, m2 pslldq m2, 12 %if ARCH_X86_64 SWAP 2, 12 %else %define m12 [rsp+1*16] mova m12, m2 %endif %endif %endif %if %2 sub bufq, 2*(82*(73-35*%3)+82-(82*3+41)) %else sub bufq, 2*(82*70-3) %endif add bufyq, 2*(3+82*3) mov hd, 70-35*%3 .y_loop_ar0: ; first 32 pixels xor xd, xd .x_loop_ar0: movu m0, [bufyq+xq*(2<<%2)] %if %2 %if %3 movu m2, [bufyq+xq*4+82*2] paddw m0, m2 %endif movu m1, [bufyq+xq*4 +16] %if %3 movu m2, [bufyq+xq*4+82*2+16] paddw m1, m2 %endif phaddw m0, m1 pmulhrsw m0, m6 %endif punpckhwd m1, m0, m5 punpcklwd m0, m5 REPX {pmaddwd x, m4}, m0, m1 REPX {psrad x, 5}, m0, m1 packssdw m0, m1 pmulhrsw m0, m3 movu m1, [bufq+xq*2] paddw m0, m1 pminsw m0, m14 pmaxsw m0, m7 cmp xd, 72-40*%2 je .end movu [bufq+xq*2], m0 add xd, 8 jmp .x_loop_ar0 ; last 6/4 pixels .end: %if %2 %if cpuflag(sse4) pblendw m0, m1, 11000000b %else pand m1, m12 pandn m2, m12, m0 por m0, m1, m2 %endif movu [bufq+xq*2], m0 %else movq [bufq+xq*2], m0 %endif add bufq, 82*2 add bufyq, 82*(2<<%3) dec hd jg .y_loop_ar0 %if ARCH_X86_32 %undef m12 %undef m14 %endif RET .ar1: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, max, cf3, min, val3, x %else %assign stack_offset stack_offset_old %xdefine rstk rsp %assign stack_size_padded 0 DEFINE_ARGS buf, shift, pic_reg, fg_data, uv, bufy, cf3 mov bufyq, r1m mov uvd, r3m %endif imul uvd, 28 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movq m4, [fg_dataq+FGData.ar_coeffs_uv+uvq] %if WIN64 DEFINE_ARGS shift, bufy, h, buf, max, cf3, min, val3, x, val0 %if %2 lea bufq, [r0-2*(82*(73-35*%3)+44-(82*3+41))] %else lea bufq, [r0-2*(82*69+3)] %endif %else %if ARCH_X86_64 DEFINE_ARGS buf, bufy, h, shift, max, cf3, min, val3, x, val0 %else DEFINE_ARGS buf, shift, pic_reg, fg_data, val0, bufy, cf3 %define hd dword r1m %define mind dword r3m %define maxd dword r4m %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif %endif %if ARCH_X86_64 mov shiftd, [r2+FGData.ar_coeff_shift] %else mov shiftd, [r3+FGData.ar_coeff_shift] %endif pxor m5, m5 pcmpgtb m5, m4 punpcklbw m4, m5 ; cf0-4 in words pshuflw m4, m4, q2100 psrldq m4, 2 ; cf0-3,4 in words pshufd m5, m4, q1111 pshufd m4, m4, q0000 movd m3, [base+round_vals+shiftq*2-12] ; rnd pxor m6, m6 punpcklwd m3, m6 %if %2 SPLATW m6, [base+hmul_bits+2+%3*2] %endif SPLATD m3, m3 add bufyq, 2*(79+82*3) mov hd, 70-35*%3 sar maxd, 1 %if ARCH_X86_64 mov mind, maxd xor mind, -1 %else DEFINE_ARGS buf, shift, val3, x, val0, bufy, cf3 mov r2, maxd xor r2, -1 mov mind, r2 %endif .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, word [bufq+xq*2-2] .x_loop_ar1: movu m0, [bufq+xq*2-82*2-2] ; top/left %if %2 movu m7, [bufyq+xq*4] %if %3 movu m1, [bufyq+xq*4+82*2] phaddw m7, m1 %else phaddw m7, m7 %endif %else movq m7, [bufyq+xq*2] %endif psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 %if %2 %if %3 pshufd m2, m7, q3232 paddw m7, m2 %endif pmulhrsw m7, m6 %endif punpcklwd m1, m7 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 paddd m0, m3 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 imul val3d, cf3d add val3d, val0d sar val3d, shiftb movsx val0d, word 
[bufq+xq*2] add val3d, val0d cmp val3d, maxd cmovg val3d, maxd cmp val3d, mind cmovl val3d, mind mov word [bufq+xq*2], val3w ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar1 %if ARCH_X86_32 %undef maxd %undef mind %undef hd %endif RET .ar2: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift ALLOC_STACK -16*8 mov bufyq, r1m mov uvd, r3m %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m5, bdmaxd ; max_grain %else SPLATW m5, r4m psraw m5, 1 %endif pcmpeqw m6, m6 %if !cpuflag(sse4) pcmpeqw m7, m7 psrldq m7, 14 pslldq m7, 2 pxor m7, m6 %endif pxor m6, m5 ; min_grain %if %2 && cpuflag(sse4) SPLATW m7, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 SWAP 5, 13 SWAP 6, 14 SWAP 7, 15 %else %define m13 [rsp+5*16] %define m14 [rsp+6*16] %define m15 [rsp+7*16] mova m13, m5 mova m14, m6 mova m15, m7 %endif ; coef values movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 pinsrw m2, [base+round_vals-12+shiftq*2], 5 pshufd m6, m0, q0000 pshufd m7, m0, q1111 pshufd m1, m0, q3333 pshufd m0, m0, q2222 pshufd m3, m2, q1111 pshufd m4, m2, q2222 pshufd m2, m2, q0000 %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 2, 10 SWAP 3, 11 SWAP 4, 12 %else %define m8 [rsp+0*16] %define m9 [rsp+1*16] %define m10 [rsp+2*16] %define m11 [rsp+3*16] %define m12 [rsp+4*16] mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 mova m12, m4 %endif %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, h, x %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: movu m0, [bufq+xq*2-82*4-4] ; y=-2,x=[-2,+5] movu m5, [bufq+xq*2-82*2-4] ; y=-1,x=[-2,+5] psrldq m4, m0, 2 ; y=-2,x=[-1,+5] psrldq m1, m0, 4 ; y=-2,x=[-0,+5] psrldq m3, m0, 6 ; y=-2,x=[+1,+5] psrldq m2, m0, 8 ; y=-2,x=[+2,+5] punpcklwd m0, m4 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] punpcklwd m1, m3 ; y=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] punpcklwd m2, m5 ; y=-2/-1,x=[+2/-2,+3/-1,+4/+0,+5/+1] pmaddwd m0, m6 pmaddwd m1, m7 pmaddwd m2, m8 paddd m0, m1 paddd m0, m2 psrldq m3, m5, 2 ; y=-1,x=[-1,+5] psrldq m1, m5, 4 ; y=-1,x=[-0,+5] psrldq m4, m5, 6 ; y=-1,x=[+1,+5] psrldq m2, m5, 8 ; y=-1,x=[+2,+5] punpcklwd m3, m1 punpcklwd m4, m2 pmaddwd m3, m9 pmaddwd m4, m10 paddd m3, m4 paddd m0, m3 ; luma component & rounding %if %2 movu m1, [bufyq+xq*4] %if %3 movu m2, [bufyq+xq*4+82*2] phaddw m1, m2 pshufd m2, m1, q3232 paddw m1, m2 %else phaddw m1, m1 %endif %if cpuflag(sse4) pmulhrsw m1, m15 %elif %3 pmulhrsw m1, [base+pw_8192] %else pmulhrsw m1, [base+pw_16384] %endif %else movq m1, [bufyq+xq*2] %endif punpcklwd m1, [base+pw_1] pmaddwd m1, m12 paddd m0, m1 movu m1, [bufq+xq*2-4] ; y=0,x=[-2,+5] pshufd m2, m1, q3321 pxor m3, m3 pcmpgtw m3, m2 punpcklwd m2, m3 ; y=0,x=[0,3] in dword .x_loop_ar2_inner: pmaddwd m3, m1, m11 paddd m3, m0 psrldq m0, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; we do not need to packssdw since we only care about one value paddd m3, m2 packssdw m3, m3 pminsw m3, m13 pmaxsw m3, m14 psrldq m1, 2 pslldq m3, 2 psrldq m2, 4 %if cpuflag(sse4) pblendw m1, m3, 00000010b %else pand m1, m15 pandn m4, m15, m3 por m1, m4 %endif ; overwrite previous pixel, should be ok movd 
[bufq+xq*2-2], m1 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar2 %if ARCH_X86_32 %undef m13 %undef m14 %undef m15 %endif RET .ar3: %if ARCH_X86_64 DEFINE_ARGS buf, bufy, fg_data, uv, bdmax, shift %if WIN64 mov r6, rsp and rsp, ~15 sub rsp, 96 %define tmp rsp %else %define tmp rsp+stack_offset-120 %endif %else DEFINE_ARGS buf, bufy, pic_reg, fg_data, uv, shift %assign stack_offset stack_offset_old ALLOC_STACK -16*14 mov bufyq, r1m mov uvd, r3m %define tmp rsp %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 SPLATW m4, [base+round_vals-12+shiftq*2] pxor m5, m5 pcmpgtw m5, m4 punpcklwd m4, m5 %if ARCH_X86_64 sar bdmaxd, 1 SPLATW m6, bdmaxd ; max_grain %else SPLATW m6, r4m psraw m6, 1 %endif pcmpeqw m7, m7 %if !cpuflag(sse4) pcmpeqw m3, m3 psrldq m3, 14 pslldq m3, 4 pxor m3, m7 %endif pxor m7, m6 ; min_grain %if %2 && cpuflag(sse4) SPLATW m3, [base+hmul_bits+2+%3*2] %endif %if ARCH_X86_64 SWAP 3, 11 SWAP 4, 12 SWAP 6, 14 SWAP 7, 15 %else %define m11 [rsp+ 9*16] %define m12 [rsp+10*16] %define m14 [rsp+12*16] %define m15 [rsp+13*16] mova m11, m3 mova m12, m4 mova m14, m6 mova m15, m7 %endif ; cf from y=-3,x=-3 until y=-3,x=-2 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 punpcklbw m0, m1 pshufd m1, m0, q0000 pshufd m3, m0, q1111 pshufd m4, m0, q2222 pshufd m0, m0, q3333 pshufd m5, m2, q0000 pshufd m6, m2, q1111 mova [tmp+16*0], m1 mova [tmp+16*1], m3 mova [tmp+16*2], m4 mova [tmp+16*3], m0 mova [tmp+16*4], m5 mova [tmp+16*5], m6 pshufd m6, m2, q2222 pshufd m7, m2, q3333 ; cf from y=-1,x=-1 to y=0,x=-1 + luma component movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] pxor m1, m1 pcmpgtb m1, m0 punpckhbw m2, m0, m1 ; luma punpcklbw m0, m1 pshufd m3, m0, q3232 psrldq m5, m0, 10 ; y=0,x=[-3 to -1] + "1.0" for current pixel pinsrw m5, [base+round_vals-10+shiftq*2], 3 ; y=-1,x=[-1 to +2] pshufd m1, m0, q0000 pshufd m0, m0, q1111 ; y=-1,x=+3 + luma punpcklwd m3, m2 pshufd m3, m3, q0000 %if ARCH_X86_64 SWAP 1, 8 SWAP 0, 9 SWAP 3, 10 SWAP 5, 13 DEFINE_ARGS buf, bufy, fg_data, h, x %else %define m8 [rsp+ 6*16] %define m9 [rsp+ 7*16] %define m10 [rsp+ 8*16] %define m13 [rsp+11*16] mova m8, m1 mova m9, m0 mova m10, m3 mova m13, m5 DEFINE_ARGS buf, bufy, pic_reg, fg_data, h, x %endif %if %2 sub bufq, 2*(82*(73-35*%3)+44-(82*3+41)) %else sub bufq, 2*(82*69+3) %endif add bufyq, 2*(79+82*3) mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: ; first line movu m0, [bufq+xq*2-82*6-6+ 0] ; y=-3,x=[-3,+4] movd m1, [bufq+xq*2-82*6-6+16] ; y=-3,x=[+5,+6] palignr m2, m1, m0, 2 ; y=-3,x=[-2,+5] palignr m1, m1, m0, 12 ; y=-3,x=[+3,+6] punpckhwd m3, m0, m2 ; y=-3,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m0, m2 ; y=-3,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m2, m0, m3, q1032 ; y=-3,x=[-1/+0,+0/+1,+1/+2,+2/+3] pmaddwd m0, [tmp+0*16] pmaddwd m2, [tmp+1*16] pmaddwd m3, [tmp+2*16] paddd m0, m2 paddd m0, m3 ; first 6 x of top y ; second line [m0/1 are busy] movu m2, [bufq+xq*2-82*4-6+ 0] ; y=-2,x=[-3,+4] movd m3, [bufq+xq*2-82*4-6+16] ; y=-2,x=[+5,+6] punpcklwd m1, m2 ; y=-3/-2,x=[+3/-3,+4/-2,+5/-1,+6/+0] palignr m4, m3, m2, 2 ; y=-2,x=[-2,+5] palignr m3, m3, m2, 4 ; y=-2,x=[-2,+5] punpckhwd m5, m4, m3 ; y=-2,x=[+2/+3,+3/+4,+4/+5,+5/+6] punpcklwd m4, m3 ; y=-2,x=[-2/-1,-1/+0,+0/+1,+1/+2] shufps m3, m4, m5, q1032 ; t=-2,x=[+0/+1,+1/+2,+2/+3,+3/+4] pmaddwd m1, [tmp+3*16] pmaddwd m4, [tmp+4*16] pmaddwd m3, [tmp+5*16] pmaddwd m5, m6 paddd m1, m4 paddd m3, m5 paddd 
m0, m1 paddd m0, m3 ; top 2 lines ; third line [m0 is busy] & luma + round movu m1, [bufq+xq*2-82*2-6+ 0] ; y=-1,x=[-3,+4] movd m2, [bufq+xq*2-82*2-6+16] ; y=-1,x=[+5,+6] %if %2 movu m5, [bufyq+xq*4] %if %3 movu m4, [bufyq+xq*4+82*2] phaddw m5, m4 %else phaddw m5, m5 %endif %else movq m5, [bufyq+xq*2] %endif palignr m3, m2, m1, 2 ; y=-1,x=[-2,+5] palignr m2, m2, m1, 12 ; y=-1,x=[+3,+6] %if %3 pshufd m4, m5, q3232 paddw m5, m4 %endif %if %2 %if cpuflag(sse4) pmulhrsw m5, m11 %elif %3 pmulhrsw m5, [base+pw_8192] %else pmulhrsw m5, [base+pw_16384] %endif %endif punpckhwd m4, m1, m3 ; y=-1,x=[+1/+2,+2/+3,+3/+4,+4/+5] punpcklwd m1, m3 ; y=-1,x=[-3/-2,-2/-1,-1/+0,+0/+1] shufps m3, m1, m4, q1032 ; y=-1,x=[-1/+0,+0/+1,+1/+2,+2/+3] punpcklwd m2, m5 pmaddwd m1, m7 pmaddwd m3, m8 pmaddwd m4, m9 pmaddwd m2, m10 paddd m1, m3 paddd m4, m2 paddd m0, m12 ; += round paddd m1, m4 paddd m0, m1 movu m1, [bufq+xq*2-6] ; y=0,x=[-3,+4] .x_loop_ar3_inner: pmaddwd m2, m1, m13 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] packssdw m2, m2 pminsw m2, m14 pmaxsw m2, m15 pslldq m2, 4 psrldq m1, 2 %if cpuflag(sse4) pblendw m1, m2, 00000100b %else pand m1, m11 pandn m3, m11, m2 por m1, m3 %endif ; overwrite previous pixels, should be ok movq [bufq+xq*2-4], m1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82*2 add bufyq, 82*2<<%3 dec hd jg .y_loop_ar3 %if WIN64 mov rsp, r6 %elif ARCH_X86_32 %undef m8 %undef m9 %undef m10 %undef m11 %undef m12 %undef m13 %undef m14 %undef m15 %endif RET %endmacro generate_grain_uv_fn 420, 1, 1 generate_grain_uv_fn 422, 1, 0 generate_grain_uv_fn 444, 0, 0 %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro INIT_XMM ssse3 %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize cglobal fgy_32x32xn_16bpc, 0, 7, 8, 0-(8 * mmsize + 12 * gprsize), \ dst, src, scaling, unused1, fg_data, picptr, unused2 ; copy stack arguments to new position post-alignment, so that we ; don't have to keep the old stack location in a separate register mov r0, r0m mov r1, r2m mov r2, r4m mov r3, r6m mov r4, r7m mov r5, r8m %define r0m [rsp+8*mmsize+ 3*gprsize] %define r2m [rsp+8*mmsize+ 5*gprsize] %define r4m [rsp+8*mmsize+ 7*gprsize] %define r6m [rsp+8*mmsize+ 9*gprsize] %define r7m [rsp+8*mmsize+10*gprsize] %define r8m [rsp+8*mmsize+11*gprsize] mov r0m, r0 mov r2m, r1 mov r4m, r2 mov r6m, r3 mov r7m, r4 mov r8m, r5 %else cglobal fgy_32x32xn_16bpc, 0, 7, 8, 8 * mmsize + 4 * gprsize, \ dst, src, scaling, unused1, fg_data, picptr, unused2 %endif mov srcq, srcm mov scalingq, r5m mov fg_dataq, r3m %if STACK_ALIGNMENT < mmsize mov r6, r9m %define r9m [rsp+8*mmsize+ 4*gprsize] %define r3m [rsp+8*mmsize+ 6*gprsize] %define r5m [rsp+8*mmsize+ 8*gprsize] mov r9m, r6 %endif LEA r5, $$ %define base r5-$$ mov r5m, picptrq %else cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut lea r8, [pb_mask] %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] SPLATW m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] %if ARCH_X86_32 DECLARE_REG_TMP 0, 3 %else DECLARE_REG_TMP 9, 10 %endif mov t0d, r9m ; bdmax sar t0d, 11 ; is_12bpc inc t0d mov t1d, r6d imul t1d, t0d dec t0d SPLATW m5, [base+min+t1*2] lea t0d, [t0d*3] lea t0d, [r6d*2+t0d] SPLATW m4, [base+max+t0*2] SPLATW m2, r9m pcmpeqw m1, m1 psraw m7, m2, 1 ; max_grain pxor m1, m7 ; min_grain SPLATD m6, [base+pd_16] 
SCRATCH 1, 9, 0 SCRATCH 2, 10, 1 SCRATCH 3, 11, 2 SCRATCH 4, 12, 3 SCRATCH 5, 13, 4 SCRATCH 6, 14, 5 SCRATCH 7, 15, 6 mova m6, [base+pw_27_17_17_27] ; for horizontal filter %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2 DECLARE_REG_TMP 0 %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see DECLARE_REG_TMP 7 %endif mov sbyd, r8m movzx t0d, byte [fg_dataq+FGData.overlap_flag] test t0d, t0d jz .no_vertical_overlap test sbyd, sbyd jnz .vertical_overlap .no_vertical_overlap: mov dword r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, src_bak %endif lea src_bakq, [srcq+wq*2] mov r9mp, src_bakq neg wq sub dstmp, srcq %if ARCH_X86_32 mov r4m, wq %endif .loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak %endif .loop_x_odd: movzx hd, word r7m mov grain_lutq, grain_lutmp .loop_y: ; src pand m0, m10, [srcq+ 0] pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m4 vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m4 %else vpgatherdw m2, m0, scalingq-1, r11, r13, 8, 1, m4 vpgatherdw m3, m1, scalingq-1, r11, r13, 8, 1, m4 %endif REPX {psrlw x, 8}, m2, m3 ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2] movu m5, [grain_lutq+offxyq*2+16] ; noise = round2(scaling[src] * grain, scaling_shift) REPX {pmullw x, m11}, m2, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 add srcq, r2mp ; src += stride add grain_lutq, 82*2 dec hd jg .loop_y %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r9mp add srcq, r4mp add srcq, r4mp %else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] %endif btc dword r8m, 2 jc .next_blk add offxyd, 16 test dword r8m, 2 jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r12d, 16 ; top_offxy += 16 %endif jmp .loop_x_odd_v_overlap .next_blk: test dword r8m, 1 jz .loop_x ; r8m = sbym test dword r8m, 2 jnz .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: %if ARCH_X86_32 add offxyd, 16 mov [rsp+8*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx 
mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy %endif mov hd, dword r7m mov grain_lutq, grain_lutmp .loop_y_h_overlap: ; grain = grain_lut[offy+y][offx+x] movu m5, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] movd m4, [grain_lutq+r5*2] %else movd m4, [grain_lutq+left_offxyq*2] %endif punpcklwd m4, m5 pmaddwd m4, m6 paddd m4, m14 psrad m4, 5 packssdw m4, m4 pminsw m4, m15 pmaxsw m4, m9 shufps m4, m5, q3210 ; src pand m0, m10, [srcq+ 0] pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m2, m0, scalingq-1, r0, r5, 8, 1, m5 vpgatherdw m3, m1, scalingq-1, r0, r5, 8, 1, m5 %else vpgatherdw m2, m0, scalingq-1, r13, r14, 8, 1, m5 vpgatherdw m3, m1, scalingq-1, r13, r14, 8, 1, m5 %endif REPX {psrlw x, 8}, m2, m3 ; noise = round2(scaling[src] * grain, scaling_shift) movu m5, [grain_lutq+offxyq*2+16] REPX {pmullw x, m11}, m2, m3 pmulhrsw m4, m2 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 add srcq, r2mp add grain_lutq, 82*2 dec hd jg .loop_y_h_overlap %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r9mp add srcq, r4mp add srcq, r4mp %else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] %endif or dword r8m, 4 add offxyd, 16 ; r8m = sbym test dword r8m, 2 jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r12d, 16 ; top_offxy += 16 %endif jmp .loop_x_odd_v_overlap .end: RET .vertical_overlap: or t0d, 2 mov r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \ sby, see %endif movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS dst, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul t0d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add t0d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and t0d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, t0d %if ARCH_X86_32 xor sbyd, seed DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, src_bak %endif lea src_bakq, [srcq+wq*2] mov r9mp, src_bakq neg wq sub dstmp, srcq %if ARCH_X86_32 mov r4m, wq %endif .loop_x_v_overlap: %if ARCH_X86_32 mov r5, r5m SPLATD m7, [base+pw_27_17_17_27] mov seed, r3m %else SPLATD m7, [pw_27_17_17_27] %endif ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else 
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, unused, top_offxy mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, unused, top_offxy %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 .loop_x_odd_v_overlap: %if ARCH_X86_32 mov r5, r5m %endif SPLATD m7, [PIC_ptr(pw_27_17_17_27)] mov hd, dword r7m mov grain_lutq, grain_lutmp .loop_y_v_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+1*gprsize] movu m2, [grain_lutq+r5*2] %else movu m2, [grain_lutq+top_offxyq*2] %endif punpckhwd m4, m2, m3 punpcklwd m2, m3 REPX {pmaddwd x, m7}, m4, m2 REPX {paddd x, m14}, m4, m2 REPX {psrad x, 5}, m4, m2 packssdw m2, m4 pminsw m2, m15 pmaxsw m2, m9 movu m4, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 movu m3, [grain_lutq+r5*2+16] %else movu m3, [grain_lutq+top_offxyq*2+16] %endif punpckhwd m5, m3, m4 punpcklwd m3, m4 REPX {pmaddwd x, m7}, m5, m3 REPX {paddd x, m14}, m5, m3 REPX {psrad x, 5}, m5, m3 packssdw m3, m5 pminsw m3, m15 pmaxsw m3, m9 ; src pand m0, m10, [srcq+ 0] ; m0-1: src as word pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] ; noise = round2(scaling[src] * grain, scaling_shift) %if ARCH_X86_32 vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 %else vpgatherdw m4, m0, scalingq-1, r11, r13, 8, 1, m5 %endif psrlw m4, 8 pmullw m4, m11 pmulhrsw m4, m2 %if ARCH_X86_32 vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m2 %else vpgatherdw m5, m1, scalingq-1, r11, r13, 8, 1, m2 %endif psrlw m5, 8 pmullw m5, m11 pmulhrsw m5, m3 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m5 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines %if ARCH_X86_32 mov r5, r5m %endif SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] xor hd, 0x10000 test hd, 0x10000 jnz .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov srcq, r9mp add srcq, r4mp add srcq, r4mp %else mov src_bakq, r9mp lea srcq, [src_bakq+wq*2] %endif btc dword r8m, 2 jc .next_blk_v %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif add offxyd, 16 jmp .loop_x_odd_v_overlap .next_blk_v: ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap .loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r0, [rsp+8*mmsize+1*gprsize] add r3, 16 add r0, 16 mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy mov [rsp+8*mmsize+2*gprsize], r0 ; topleft_offxy mov seed, r3m xor r0, r0 %else ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov 
seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, src_bak, left_offxy, top_offxy, topleft_offxy lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, w, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, src_bak, left_offxy, top_offxy, topleft_offxy %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 %if ARCH_X86_32 mov r5, r5m %endif SPLATD m7, [PIC_ptr(pw_27_17_17_27)] movzx hd, word r7m mov grain_lutq, grain_lutmp .loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movu m2, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy movu m4, [grain_lutq+r0*2] movd m5, [grain_lutq+r5*2] mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy movd m3, [grain_lutq+r5*2] %else movu m4, [grain_lutq+top_offxyq*2] movd m5, [grain_lutq+left_offxyq*2] movd m3, [grain_lutq+topleft_offxyq*2] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklwd m5, m2 punpcklwd m3, m4 REPX {pmaddwd x, m6}, m5, m3 REPX {paddd x, m14}, m5, m3 REPX {psrad x, 5}, m5, m3 packssdw m5, m3 pminsw m5, m15 pmaxsw m5, m9 shufps m3, m5, m2, q3210 shufps m5, m4, q3232 ; followed by v interpolation (top | cur -> cur) movu m0, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 movu m1, [grain_lutq+r0*2+16] %else movu m1, [grain_lutq+top_offxyq*2+16] %endif punpcklwd m2, m5, m3 punpckhwd m5, m3 punpcklwd m3, m1, m0 punpckhwd m1, m0 REPX {pmaddwd x, m7}, m2, m5, m3, m1 REPX {paddd x, m14}, m2, m5, m3, m1 REPX {psrad x, 5}, m2, m5, m3, m1 packssdw m2, m5 packssdw m3, m1 REPX {pminsw x, m15}, m2, m3 REPX {pmaxsw x, m9}, m2, m3 ; src pand m0, m10, [srcq+ 0] pand m1, m10, [srcq+16] ; m0-1: src as word ; scaling[src] ; noise = round2(scaling[src] * grain, scaling_shift) %if ARCH_X86_32 vpgatherdw m4, m0, scalingq-1, r0, r5, 8, 1, m5 %else vpgatherdw m4, m0, scalingq-1, r14, r10, 8, 1, m5 %endif psrlw m4, 8 pmullw m4, m11 pmulhrsw m2, m4 %if ARCH_X86_32 vpgatherdw m5, m1, scalingq-1, r0, r5, 8, 1, m4 %else vpgatherdw m5, m1, scalingq-1, r14, r10, 8, 1, m4 %endif psrlw m5, 8 pmullw m5, m11 pmulhrsw m3, m5 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+srcq+ 0], m0 mova [dstq+srcq+16], m1 add srcq, r2mp add grain_lutq, 82*2 dec hw jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines %if ARCH_X86_32 mov r5, r5m %endif SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4] xor hd, 0x10000 test hd, 0x10000 jnz .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: or dword r8m, 4 %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov r5, r5m add offxyd, 16 add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16 mov srcq, r9mp add srcq, r4mp add srcq, r4mp %else add offxyd, 16 add top_offxyd, 16 mov src_bakq, r9mp lea 
srcq, [src_bakq+wq*2] %endif jmp .loop_x_odd_v_overlap .end_hv: RET %if ARCH_X86_32 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 %endif %macro FGUV_FN 3 ; name, ss_hor, ss_ver INIT_XMM ssse3 %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 0-(8 * mmsize + 16 * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r1m mov r2, r2m mov r4, r3m mov r3, r4m mov r5, r5m %define r0m [rsp+8*mmsize+ 3*gprsize] %define r1m [rsp+8*mmsize+ 4*gprsize] %define r2m [rsp+8*mmsize+ 5*gprsize] %define r3m [rsp+8*mmsize+ 6*gprsize] %define r4m [rsp+8*mmsize+ 7*gprsize] %define r5m [rsp+8*mmsize+ 8*gprsize] mov r0m, r0 mov r2m, r2 mov r4m, r3 mov r5m, r5 mov r0, r6m mov r2, r7m mov r3, r8m mov r5, r9m %define r6m [rsp+8*mmsize+ 9*gprsize] %define r7m [rsp+8*mmsize+10*gprsize] %define r8m [rsp+8*mmsize+11*gprsize] %define r9m [rsp+8*mmsize+12*gprsize] mov r6m, r0 mov r7m, r2 mov r8m, r3 mov r9m, r5 mov r2, r10m mov r3, r11m mov r5, r12m mov r0, r13m %define r10m [rsp+8*mmsize+13*gprsize] %define r11m [rsp+8*mmsize+14*gprsize] %define r12m [rsp+8*mmsize+15*gprsize] mov r10m, r2 mov r11m, r3 mov r12m, r5 SPLATW m2, r13m %else cglobal fguv_32x32xn_i%1_16bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused mov srcq, srcm mov fg_dataq, r3m %endif LEA r5, $$ %define base r5-$$ DECLARE_REG_TMP 0, 2, 3 %else cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, lstride, uv_pl, is_id %define base r8-pb_mask lea r8, [pb_mask] DECLARE_REG_TMP 9, 10, 11 %endif mov r6d, [fg_dataq+FGData.scaling_shift] SPLATW m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] %if STACK_ALIGNMENT >= mmsize mov t0d, r13m ; bdmax %endif sar t0d, 11 ; is_12bpc inc t0d mov t1d, r6d imul t1d, t0d dec t0d SPLATW m5, [base+min+t1*2] lea t1d, [t0d*3] mov t2d, r12m inc t2d imul r6d, t2d add t1d, r6d SPLATW m4, [base+max+t1*2] %if STACK_ALIGNMENT >= mmsize SPLATW m2, r13m %endif SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 %define mzero m7 %if %3 SPLATD m2, [base+pw_23_22] %endif %if ARCH_X86_32 mov scalingq, r5m mov r5m, r5 %else mov r13mp, strideq %endif pcmpeqw m0, m0 psraw m1, m10, 1 pxor m0, m1 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_h, ss_v %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap DECLARE_REG_TMP 0 %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap DECLARE_REG_TMP 9 %endif %if %1 mov r6d, r11m SPLATW m0, [fg_dataq+FGData.uv_mult+r6*4] SPLATW m1, [fg_dataq+FGData.uv_luma_mult+r6*4] punpcklwd m6, m1, m0 SPLATW m5, [fg_dataq+FGData.uv_offset+r6*4] SPLATD m7, [base+pw_4+t0*4] pmullw m5, m7 %else SPLATD m6, [base+pd_16] %if %2 mova m5, [base+pw_23_22] %else mova m5, [base+pw_27_17_17_27] %endif %endif SCRATCH 6, 14, 6 SCRATCH 5, 15, 7 %if ARCH_X86_32 DECLARE_REG_TMP 0 %else DECLARE_REG_TMP 7 %endif mov sbyd, r8m mov t0d, [fg_dataq+FGData.overlap_flag] test t0d, t0d jz %%no_vertical_overlap test sbyd, sbyd jnz %%vertical_overlap %%no_vertical_overlap: mov r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, see, w, picptr, 
luma mov dstq, r0mp mov lumaq, r9mp mov wq, r4m lea r3, [srcq+wq*2] mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 %if %3 shl r10mp, 1 %endif %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused2, unused3, see, unused4, unused5, unused6, luma, lstride mov lstrideq, r10mp %if %3 add lstrideq, lstrideq %endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 %endif neg wq %if ARCH_X86_32 mov r4mp, wq %endif %%loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, unused2, unused3, luma, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, unused2, unused3, luma, lstride %endif %if %2 == 0 %%loop_x_odd: %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: ; src mova m0, [srcq] mova m1, [srcq+16] ; m0-1: src as word ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9m %endif mova m4, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif %if %2 pavgw m4, mzero pavgw m6, mzero %endif %if %1 punpckhwd m3, m4, m0 punpcklwd m4, m0 punpckhwd m5, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m3, m4, m5, m6 REPX {psrad x, 6}, m3, m4, m5, m6 packssdw m4, m3 packssdw m6, m5 REPX {paddw x, m15}, m4, m6 REPX {pmaxsw x, mzero}, m4, m6 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() %else REPX {pand x, m10}, m4, m6 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m3, m4, scalingq-1, r0, r5, 8, 1 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m3, m4, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 %endif REPX {psrlw x, 8}, m3, m5 ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2] movu m6, [grain_lutq+offxyq*2+16] ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m3, m5 pmulhrsw m4, m3 pmulhrsw m6, m5 ; dst = clip_pixel(src, noise) paddw m0, m4 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd jg %%loop_y %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, luma mov wq, r4mp %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0m, dstq mov r9m, lumaq mov r4m, wq %endif %if %2 == 0 btc dword r8m, 2 jc %%next_blk add offxyd, 16 test dword r8m, 2 jz %%loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 %endif jmp %%loop_x_odd_v_overlap %%next_blk: %endif test dword r8m, 1 je 
%%loop_x ; r8m = sbym test dword r8m, 2 jnz %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: %if ARCH_X86_32 add offxyd, 16 mov [rsp+8*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, luma, lstride lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx mov offxd, seed mov offyd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, luma, lstride %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_h_overlap: mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9m %endif mova m4, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m4, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9m, lumaq %endif %if %2 pavgw m4, mzero pavgw m6, mzero %endif %if %1 punpckhwd m3, m4, m0 punpcklwd m4, m0 punpckhwd m5, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m3, m4, m5, m6 REPX {psrad x, 6}, m3, m4, m5, m6 packssdw m4, m3 packssdw m6, m5 REPX {paddw x, m15}, m4, m6 REPX {pmaxsw x, mzero}, m4, m6 REPX {pminsw x, m10}, m4, m6 ; clip_pixel() %else REPX {pand x, m10}, m4, m6 %endif ; grain = grain_lut[offy+y][offx+x] movu m7, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] movd m5, [grain_lutq+r5*2] %else movd m5, [grain_lutq+left_offxyq*2+ 0] %endif punpcklwd m5, m7 ; {left0, cur0} %if %1 %if ARCH_X86_32 mov r5, r5m %endif %if %2 pmaddwd m5, [PIC_ptr(pw_23_22)] %else pmaddwd m5, [PIC_ptr(pw_27_17_17_27)] %endif paddd m5, [PIC_ptr(pd_16)] %else pmaddwd m5, m15 paddd m5, m14 %endif psrad m5, 5 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 shufps m5, m7, q3210 movu m3, [grain_lutq+offxyq*2+16] ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5, 8, 1 vpgatherdw m4, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m7, m4, scalingq-1, r2, r12, 8, 1 vpgatherdw m4, m6, scalingq-1, r2, r12, 8, 1 %endif REPX {psrlw x, 8}, m7, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m4 pmulhrsw m5, m7 pmulhrsw m3, m4 ; dst = clip_pixel(src, noise) paddw m0, m5 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hd jg %%loop_y_h_overlap %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4mp %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 ; r8m = sbym test dword r8m, 2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %else or 
dword r8m, 4 add offxyd, 16 ; r8m = sbym test dword r8m, 2 jz %%loop_x_odd %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxy += 16 %endif jmp %%loop_x_odd_v_overlap %endif %%end: RET %%vertical_overlap: or t0d, 2 mov r8m, t0d %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, unused1, unused2, unused3, lstride %endif movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul t0d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add t0d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and t0d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, t0d %if ARCH_X86_32 xor sbyd, seed DEFINE_ARGS dst, src, scaling, see, w, picptr, luma mov r3m, seed mov dstq, r0mp mov lumaq, r9mp mov wq, r4m lea r3, [srcq+wq*2] mov r1mp, r3 lea r3, [dstq+wq*2] mov r11mp, r3 lea r3, [lumaq+wq*(2<<%2)] mov r12mp, r3 %if %3 shl r10mp, 1 %endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused1, unused2, see, unused3, unused4, unused5, luma, lstride mov lstrideq, r10mp %if %3 add lstrideq, lstrideq %endif mov lumaq, r9mp lea r10, [srcq+wq*2] lea r11, [dstq+wq*2] lea r12, [lumaq+wq*(2<<%2)] mov r10mp, r10 mov r11mp, r11 mov r12mp, r12 %endif neg wq %if ARCH_X86_32 mov r4m, wq %endif %%loop_x_v_overlap: %if ARCH_X86_32 mov seed, r3m xor t0d, t0d %else ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ offx, offy, see, unused1, top_offxy, unused2, luma, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, unused1, top_offxy, unused2, luma, lstride %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 %if %2 == 0 %%loop_x_odd_v_overlap: %endif %if %3 == 0 %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)] %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_v_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r0, [rsp+mmsize*8+gprsize*1] ; top_offxy movu m5, [grain_lutq+r0*2] %else movu m5, [grain_lutq+top_offxyq*2] %endif punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 %if ARCH_X86_32 mov r5, r5m %endif REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 packssdw m3, m5, m7 pmaxsw m3, m8 pminsw m3, m9 ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq*2+16] %if 
ARCH_X86_32 movu m5, [grain_lutq+r0*2+16] %else movu m5, [grain_lutq+top_offxyq*2+16] %endif punpckhwd m7, m5, m4 punpcklwd m5, m4 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5 %else REPX {paddd x, m14}, m7, m5 %endif REPX {psrad x, 5}, m7, m5 packssdw m4, m5, m7 pmaxsw m4, m8 pminsw m4, m9 ; src mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9mp %endif mova m5, [lumaq+ 0] mova m6, [lumaq+(16<<%2)] %if %2 phaddw m5, [lumaq+16] phaddw m6, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif %if %2 pavgw m5, mzero pavgw m6, mzero %endif %if %1 punpckhwd m7, m5, m0 punpcklwd m5, m0 REPX {pmaddwd x, m14}, m7, m5 REPX {psrad x, 6}, m7, m5 packssdw m5, m7 punpckhwd m7, m6, m1 punpcklwd m6, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m7, m6 REPX {psrad x, 6}, m7, m6 packssdw m6, m7 pxor mzero, mzero REPX {paddw x, m15}, m5, m6 REPX {pmaxsw x, mzero}, m5, m6 REPX {pminsw x, m10}, m5, m6 ; clip_pixel() %else REPX {pand x, m10}, m5, m6 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m5, scalingq-1, r0, r5, 8, 1 vpgatherdw m5, m6, scalingq-1, r0, r5, 8, 1 %else vpgatherdw m7, m5, scalingq-1, r10, r12, 8, 1 vpgatherdw m5, m6, scalingq-1, r10, r12, 8, 1 %endif REPX {psrlw x, 8}, m7, m5 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m5 pmulhrsw m3, m7 pmulhrsw m4, m5 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 dec hw jle %%end_y_v_overlap %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 %if %3 jmp %%loop_y %else btc hd, 16 jc %%loop_y %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] jmp %%loop_y_v_overlap %endif %%end_y_v_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov r0mp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %else btc dword r8m, 2 jc %%loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 %endif jmp %%loop_x_odd_v_overlap %endif %%loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, picptr, grain_lut mov t0d, [rsp+mmsize*8+gprsize*1] ; top_offxy add offxyd, 16 add t0d, 16 mov [rsp+mmsize*8+gprsize*0], offxyd ; left_offxyd mov [rsp+mmsize*8+gprsize*2], t0d ; topleft_offxyd DEFINE_ARGS dst, src, scaling, see, w, picptr, grain_lut mov seed, r3m xor t0d, t0d %else ; we assume from the block above that bits 8-15 of r7d are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp t0b ; parity of top_seed shr seed, 16 shl t0d, 16 test seeb, seeh setp t0b ; parity of cur_seed or r6d, 0x00010001 xor t0d, r6d mov seed, t0d ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ 
offx, offy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, top_offxy %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, luma, lstride %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+8*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 %if %3 == 0 %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)] %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy movd m5, [grain_lutq+r5*2] %else movd m5, [grain_lutq+left_offxyq*2] %endif movu m7, [grain_lutq+offxyq*2] %if ARCH_X86_32 mov r5, [rsp+8*mmsize+2*gprsize] movu m4, [grain_lutq+r0*2] %if %2 pinsrw m5, [grain_lutq+r5*2], 2 %else movd m3, [grain_lutq+r5*2] %endif %else movu m4, [grain_lutq+top_offxyq*2] %if %2 pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left } %else movd m3, [grain_lutq+topleft_offxyq*2] %endif %endif %if %2 == 0 punpckldq m5, m3 %endif punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 } punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 } %if %1 %if ARCH_X86_32 mov r5, r5m %endif %if %2 movddup m0, [PIC_ptr(pw_23_22)] %else movddup m0, [PIC_ptr(pw_27_17_17_27)] %endif %else pshufd m0, m15, q1010 %endif pmaddwd m5, m0 %if %1 paddd m5, [PIC_ptr(pd_16)] %else paddd m5, m14 %endif psrad m5, 5 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, m9 shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3 shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter shufps m5, m4, q3231 ; top0-7 post-h_filter punpckhwd m7, m5, m3 punpcklwd m5, m3 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m7, m5 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7 %else REPX {paddd x, m14}, m5, m7 %endif REPX {psrad x, 5}, m5, m7 packssdw m3, m5, m7 pmaxsw m3, m8 pminsw m3, m9 ; right half movu m4, [grain_lutq+offxyq*2+16] %if ARCH_X86_32 movu m0, [grain_lutq+r0*2+16] %else movu m0, [grain_lutq+top_offxyq*2+16] %endif punpckhwd m1, m0, m4 punpcklwd m0, m4 ; {top/cur interleaved} REPX {pmaddwd x, m2}, m1, m0 %if %1 REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0 %else REPX {paddd x, m14}, m1, m0 %endif REPX {psrad x, 5}, m1, m0 packssdw m4, m0, m1 pmaxsw m4, m8 pminsw m4, m9 ; src mova m0, [srcq] mova m1, [srcq+16] ; luma_src pxor mzero, mzero %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut mov lumaq, r9mp %endif mova m6, [lumaq+ 0] mova m5, [lumaq+(16<<%2)] %if %2 phaddw m6, [lumaq+16] phaddw m5, [lumaq+48] %endif %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif %if %2 pavgw m6, mzero pavgw m5, mzero %endif %if %1 punpckhwd m7, m6, m0 punpcklwd m6, m0 REPX {pmaddwd x, m14}, m7, m6 REPX {psrad x, 6}, m7, m6 packssdw m6, m7 punpckhwd m7, m5, m1 punpcklwd m5, m1 ; { luma, chroma } REPX {pmaddwd x, m14}, m7, m5 REPX {psrad x, 6}, m7, m5 packssdw m5, m7 pxor mzero, mzero REPX {paddw x, m15}, m6, m5 REPX {pmaxsw x, mzero}, m6, m5 REPX {pminsw x, m10}, m6, m5 ; clip_pixel() %else REPX {pand x, m10}, m6, m5 %endif ; scaling[luma_src] %if 
ARCH_X86_32 vpgatherdw m7, m6, scalingq-1, r0, r5, 8, 1 vpgatherdw m6, m5, scalingq-1, r0, r5, 8, 1 %else %if %3 == 0 ; register shortage :) push r12 %endif vpgatherdw m7, m6, scalingq-1, r2, r12, 8, 1 vpgatherdw m6, m5, scalingq-1, r2, r12, 8, 1 %if %3 == 0 pop r12 %endif %endif REPX {psrlw x, 8}, m7, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) REPX {pmullw x, m11}, m7, m6 pmulhrsw m3, m7 pmulhrsw m4, m6 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 movifnidn dstq, dstmp mova [dstq+ 0], m0 mova [dstq+16], m1 %if ARCH_X86_32 add srcq, r2mp add dstq, r2mp mov dstmp, dstq %else add srcq, r13mp add dstq, r13mp add lumaq, lstrideq %endif add grain_lutq, 82*2 dec hw %if %3 jg %%loop_y_h_overlap %else jle %%end_y_hv_overlap btc hd, 16 jc %%loop_y_h_overlap %if ARCH_X86_32 mov r5, r5m %endif SPLATD m2, [PIC_ptr(pw_27_17_17_27)+4] jmp %%loop_y_hv_overlap %%end_y_hv_overlap: %endif %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, w, luma, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp %else mov srcq, r10mp %endif mov dstq, r11mp mov lumaq, r12mp lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] lea lumaq, [lumaq+wq*(2<<%2)] %if ARCH_X86_32 mov dstmp, dstq mov r9mp, lumaq mov r4m, wq %endif %if %2 jmp %%loop_x_hv_overlap %else or dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 add dword [rsp+8*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxy += 16 %endif jmp %%loop_x_odd_v_overlap %endif %%end_hv: RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif %endmacro FGUV_FN 420, 1, 1 FGUV_FN 422, 1, 0 FGUV_FN 444, 0, 0 rav1e-0.7.1/src/x86/filmgrain_avx2.asm000064400000000000000000002003441046102023000155000ustar 00000000000000; Copyright © 2019-2022, VideoLAN and dav1d authors ; Copyright © 2019-2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
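;
; Overview of the routines below (summarised from the inline comments):
;   generate_grain_y_8bpc and the generate_grain_uv_* variants fill the
;   82x73 grain_lut from the seeded gaussian_sequence PRNG, then run the
;   optional .ar1/.ar2/.ar3 auto-regression filter over it.
;   fgy_32x32xn_8bpc and the fguv_32x32xn_i* variants apply that grain to
;   32x32 blocks whose (offx, offy) into grain_lut is derived from a
;   per-block seed; per pixel:
;     noise = round2(scaling[src] * grain_lut[offy+y][offx+x], scaling_shift)
;     dst   = clip_pixel(src + noise)
;   (the chroma functions index the scaling LUT by the collocated luma,
;   optionally mixed with chroma via uv_mult/uv_luma_mult). In the
;   *_overlap code paths, block edges are blended with the left/top
;   neighbour using the pb_27_17 and pb_23_22 weights.
;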
%include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" %if ARCH_X86_64 SECTION_RODATA 32 pb_mask: db 0,128,128, 0,128, 0, 0,128,128, 0, 0,128, 0,128,128, 0 gen_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 gen_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 gen_shufB: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 gen_shufC: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 gen_shufD: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 ; note: the order of (some of) the following constants matter pb_27_17: times 2 db 27, 17 byte_blend: db 0, 0, 0, -1 pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 pb_17_27: times 2 db 17, 27 pb_1: times 4 db 1 pb_23_22: db 23, 22, 0, 32, 0, 32, 0, 32 next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 fg_min: times 4 db 0 times 4 db 16 fg_max: times 4 db 255 times 4 db 240 times 4 db 235 pd_m65536: dd -65536 pw_8: times 2 dw 8 pw_1024: times 2 dw 1024 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 mul_bits: dw 256, 128, 64, 32, 16 round_vals: dw 32, 64, 128, 256, 512 pw_1: dw 1 %macro JMP_TABLE 2-* %1_8bpc_%2_table: %xdefine %%base %1_8bpc_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %rep %0 - 2 dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro JMP_TABLE generate_grain_y, avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420, avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422, avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444, avx2, 0, 1, 2, 3 SECTION .text INIT_YMM avx2 cglobal generate_grain_y_8bpc, 2, 9, 8, buf, fg_data %define base r4-generate_grain_y_8bpc_avx2_table lea r4, [generate_grain_y_8bpc_avx2_table] vpbroadcastw xm0, [fg_dataq+FGData.seed] mov r6d, [fg_dataq+FGData.grain_scale_shift] movq xm1, [base+next_upperbit_mask] movsxd r5, [fg_dataq+FGData.ar_coeff_lag] movq xm4, [base+mul_bits] movq xm5, [base+hmul_bits] mov r7, -73*82 mova xm6, [base+pb_mask] sub bufq, r7 vpbroadcastw xm7, [base+round+r6*2] lea r6, [gaussian_sequence] movsxd r5, [r4+r5*4] .loop: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw xm2, xm4 ; bits 0x0f00 are set pmulhuw xm0, xm5 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm2, xm0 ; aggregate each bit into next seed's high bit por xm3, xm2 ; 4 next output seeds pshuflw xm0, xm3, q3333 psrlw xm3, 5 pand xm2, xm0, xm1 movq r2, xm3 psrlw xm3, xm2, 10 por xm2, xm3 pmullw xm2, xm4 pmulhuw xm0, xm5 movzx r3d, r2w pshufb xm3, xm6, xm2 psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm0, xm2 movd xm2, [r6+r3*2] rorx r3, r2, 32 por xm3, xm0 shr r2d, 16 pinsrw xm2, [r6+r2*2], 1 pshuflw xm0, xm3, q3333 movzx r2d, r3w psrlw xm3, 5 pinsrw xm2, [r6+r2*2], 2 shr r3d, 16 movq r2, xm3 pinsrw xm2, [r6+r3*2], 3 movzx r3d, r2w pinsrw xm2, [r6+r3*2], 4 rorx r3, r2, 32 shr r2d, 16 pinsrw xm2, [r6+r2*2], 5 movzx r2d, r3w pinsrw xm2, [r6+r2*2], 6 shr r3d, 16 pinsrw xm2, [r6+r3*2], 7 pmulhrsw xm2, xm7 packsswb xm2, xm2 movq [bufq+r7], xm2 add r7, 8 jl .loop ; auto-regression code add r5, r4 jmp r5 .ar1: DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd xm5, [fg_dataq+FGData.ar_coeffs_y] mova xm2, [base+gen_shufC] DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 pinsrb xm5, [base+pb_1], 3 vpbroadcastw xm3, [base+round_vals+shiftq*2-12] ; rnd pmovsxbw 
xm5, xm5 pshufd xm4, xm5, q0000 pshufd xm5, xm5, q1111 sub bufq, 82*73-(82*3+79) mov hd, 70 mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -76 movsx val3d, byte [bufq+xq-1] .x_loop_ar1: pmovsxbw xm1, [bufq+xq-82-3] pshufb xm0, xm1, xm2 punpckhwd xm1, xm3 pmaddwd xm0, xm4 pmaddwd xm1, xm5 paddd xm0, xm1 .x_loop_ar1_inner: movd val0d, xm0 psrldq xm0, 4 imul val3d, cf3d add val3d, val0d movsx val0d, byte [bufq+xq] sarx val3d, val3d, shiftd add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xb, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 dec hd jg .y_loop_ar1 .ar0: RET .ar2: %if WIN64 ; xmm6 and xmm7 already saved %assign xmm_regs_used 16 %assign stack_size_padded 168 SUB rsp, stack_size_padded movaps [rsp+16*2], xmm8 movaps [rsp+16*3], xmm9 movaps [rsp+16*4], xmm10 movaps [rsp+16*5], xmm11 movaps [rsp+16*6], xmm12 movaps [rsp+16*7], xmm13 movaps [rsp+16*8], xmm14 movaps [rsp+16*9], xmm15 %endif DEFINE_ARGS buf, fg_data, h, x mov r6d, [fg_dataq+FGData.ar_coeff_shift] pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 movd xm9, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 vpbroadcastd xm10, [base+round_vals-14+r6*2] movd xm11, [base+byte_blend+1] pmovsxbw xm9, xm9 pshufd xm4, xm7, q0000 mova xm12, [base+gen_shufA] pshufd xm5, xm7, q3333 mova xm13, [base+gen_shufB] pshufd xm6, xm7, q1111 mova xm14, [base+gen_shufC] pshufd xm7, xm7, q2222 mova xm15, [base+gen_shufD] pshufd xm8, xm9, q0000 psrld xm10, 16 pshufd xm9, xm9, q1111 sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar2: mov xq, -76 .x_loop_ar2: pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pshufb xm2, xm0, xm12 pmaddwd xm2, xm4 pshufb xm3, xm1, xm13 pmaddwd xm3, xm5 paddd xm2, xm3 pshufb xm3, xm0, xm14 pmaddwd xm3, xm6 punpckhqdq xm0, xm0 punpcklwd xm0, xm1 pmaddwd xm0, xm7 pshufb xm1, xm15 pmaddwd xm1, xm8 paddd xm2, xm10 paddd xm2, xm3 paddd xm0, xm1 paddd xm2, xm0 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] .x_loop_ar2_inner: pmovsxbw xm1, xm0 pmaddwd xm3, xm9, xm1 psrldq xm1, 4 ; y=0,x=0 paddd xm3, xm2 psrldq xm2, 4 ; shift top to next pixel psrad xm3, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value paddw xm3, xm1 packsswb xm3, xm3 pextrb [bufq+xq], xm3, 0 pslldq xm3, 2 vpblendvb xm0, xm3, xm11 psrldq xm0, 1 inc xq jz .x_loop_ar2_end test xb, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 dec hd jg .y_loop_ar2 RET INIT_YMM avx2 .ar3: %if WIN64 ; xmm6 and xmm7 already saved %assign stack_offset 16 ALLOC_STACK 16*14 %assign stack_size stack_size - 16*4 %assign xmm_regs_used 12 movaps [rsp+16*12], xmm8 movaps [rsp+16*13], xmm9 movaps [rsp+16*14], xmm10 movaps [rsp+16*15], xmm11 %else ALLOC_STACK 16*12 %endif mov r6d, [fg_dataq+FGData.ar_coeff_shift] movd xm11, [base+byte_blend] pmovsxbw m1, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 pshufd m0, m1, q0000 mova [rsp+16* 0], m0 pshufd m0, m1, q1111 mova [rsp+16* 2], m0 pshufd m0, m1, q2222 mova [rsp+16* 4], m0 pshufd m1, m1, q3333 mova [rsp+16* 6], m1 pshufd xm0, xm2, q0000 mova [rsp+16* 8], xm0 pshufd xm0, xm2, q1111 mova [rsp+16* 9], xm0 psrldq xm7, xm2, 10 mova m8, [base+gen_shufA] pinsrw xm2, [base+pw_1], 5 mova m9, [base+gen_shufC] pshufd xm2, xm2, q2222 movu m10, [base+gen_shufE] vpbroadcastw xm6, [base+round_vals-12+r6*2] pinsrw xm7, [base+round_vals+r6*2-10], 3 mova 
[rsp+16*10], xm2 DEFINE_ARGS buf, fg_data, h, x sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar3: mov xq, -76 .x_loop_ar3: movu xm5, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] vinserti128 m5, [bufq+xq-82*2-3], 1 ; y=-2,x=[-3,+12] movu xm4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] punpcklbw m3, m5, m5 punpckhwd m5, m4 psraw m3, 8 punpcklbw m5, m5 psraw m5, 8 punpcklbw xm4, xm4 psraw xm4, 8 pshufb m0, m3, m8 pmaddwd m0, [rsp+16*0] pshufb m1, m3, m9 pmaddwd m1, [rsp+16*2] shufps m2, m3, m5, q1032 paddd m0, m1 pshufb m1, m2, m8 vperm2i128 m3, m4, 0x21 pmaddwd m1, [rsp+16*4] shufps xm2, xm3, q1021 vpblendd m2, m3, 0xf0 pshufb m2, m10 paddd m0, m1 pmaddwd m2, [rsp+16*6] pshufb xm1, xm4, xm9 pmaddwd xm1, [rsp+16*8] shufps xm4, xm5, q1132 paddd m0, m2 pshufb xm2, xm4, xm8 pshufd xm4, xm4, q2121 pmaddwd xm2, [rsp+16*9] punpcklwd xm4, xm6 pmaddwd xm4, [rsp+16*10] vextracti128 xm3, m0, 1 paddd xm0, xm1 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] paddd xm2, xm4 paddd xm0, xm2 paddd xm0, xm3 .x_loop_ar3_inner: pmovsxbw xm2, xm1 pmaddwd xm2, xm7 pshufd xm3, xm2, q1111 paddd xm2, xm0 ; add top paddd xm2, xm3 ; left+cur psrldq xm0, 4 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value packsswb xm2, xm2 pextrb [bufq+xq], xm2, 0 pslldq xm2, 3 vpblendvb xm1, xm2, xm11 psrldq xm1, 1 inc xq jz .x_loop_ar3_end test xb, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82 dec hd jg .y_loop_ar3 RET %macro GEN_GRAIN_UV_FN 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 cglobal generate_grain_uv_%1_8bpc, 4, 10, 16, buf, bufy, fg_data, uv %define base r4-generate_grain_uv_%1_8bpc_avx2_table lea r4, [generate_grain_uv_%1_8bpc_avx2_table] vpbroadcastw xm0, [fg_dataq+FGData.seed] mov r6d, [fg_dataq+FGData.grain_scale_shift] movq xm1, [base+next_upperbit_mask] movq xm4, [base+mul_bits] movq xm5, [base+hmul_bits] mova xm6, [base+pb_mask] vpbroadcastw xm7, [base+round+r6*2] vpbroadcastd xm2, [base+pw_seed_xor+uvq*4] pxor xm0, xm2 lea r6, [gaussian_sequence] %if %2 mov r7d, 73-35*%3 add bufq, 44 .loop_y: mov r5, -44 %else mov r5, -73*82 sub bufq, r5 %endif .loop: pand xm2, xm0, xm1 psrlw xm3, xm2, 10 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw xm2, xm4 ; bits 0x0f00 are set pmulhuw xm0, xm5 pshufb xm3, xm6, xm2 ; set 15th bit for next 4 seeds psllq xm2, xm3, 30 por xm2, xm3 psllq xm3, xm2, 15 por xm2, xm0 ; aggregate each bit into next seed's high bit por xm2, xm3 ; 4 next output seeds pshuflw xm0, xm2, q3333 psrlw xm2, 5 movq r8, xm2 movzx r9d, r8w movd xm2, [r6+r9*2] rorx r9, r8, 32 shr r8d, 16 pinsrw xm2, [r6+r8*2], 1 movzx r8d, r9w pinsrw xm2, [r6+r8*2], 2 shr r9d, 16 pinsrw xm2, [r6+r9*2], 3 pmulhrsw xm2, xm7 packsswb xm2, xm2 movd [bufq+r5], xm2 add r5, 4 jl .loop %if %2 add bufq, 82 dec r7d jg .loop_y %endif ; auto-regression code movsxd r6, [fg_dataq+FGData.ar_coeff_lag] movsxd r6, [base+generate_grain_uv_%1_8bpc_avx2_table+r6*4] add r6, r4 jmp r6 INIT_YMM avx2 .ar0: DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq] movd xm3, [base+hmul_bits+shiftq*2] DEFINE_ARGS buf, bufy, h pmovsxbw xm2, xm2 %if %2 vpbroadcastd m7, [base+pb_1] vpbroadcastw m6, [base+hmul_bits+2+%3*2] %endif vpbroadcastw m2, xm2 vpbroadcastw m3, xm3 pxor m12, m12 %if %2 sub bufq, 82*(73-35*%3)+82-(82*3+41) %else sub bufq, 82*70-3 %endif add bufyq, 3+82*3 mov hd, 70-35*%3 .y_loop_ar0: %if %2 ; first 32 pixels movu xm4, [bufyq] vinserti128 m4, [bufyq+32], 1 %if %3 movu xm0, [bufyq+82] vinserti128 m0, 
[bufyq+82+32], 1 %endif movu xm5, [bufyq+16] vinserti128 m5, [bufyq+48], 1 %if %3 movu xm1, [bufyq+82+16] vinserti128 m1, [bufyq+82+48], 1 %endif pmaddubsw m4, m7, m4 %if %3 pmaddubsw m0, m7, m0 %endif pmaddubsw m5, m7, m5 %if %3 pmaddubsw m1, m7, m1 paddw m4, m0 paddw m5, m1 %endif pmulhrsw m4, m6 pmulhrsw m5, m6 %else xor r3d, r3d ; first 32x2 pixels .x_loop_ar0: movu m4, [bufyq+r3] pcmpgtb m0, m12, m4 punpckhbw m5, m4, m0 punpcklbw m4, m0 %endif pmullw m4, m2 pmullw m5, m2 pmulhrsw m4, m3 pmulhrsw m5, m3 %if %2 movu m1, [bufq] %else movu m1, [bufq+r3] %endif pcmpgtb m8, m12, m1 punpcklbw m0, m1, m8 punpckhbw m1, m8 paddw m0, m4 paddw m1, m5 packsswb m0, m1 %if %2 movu [bufq], m0 %else movu [bufq+r3], m0 add r3d, 32 cmp r3d, 64 jl .x_loop_ar0 %endif ; last 6/12 pixels movu xm4, [bufyq+32*2] %if %2 %if %3 movu xm5, [bufyq+32*2+82] %endif pmaddubsw xm4, xm7, xm4 %if %3 pmaddubsw xm5, xm7, xm5 paddw xm4, xm5 %endif movq xm0, [bufq+32] pmulhrsw xm4, xm6 pmullw xm4, xm2 pmulhrsw xm4, xm3 pcmpgtb xm5, xm12, xm0 punpcklbw xm5, xm0, xm5 paddw xm4, xm5 packsswb xm4, xm4 pblendw xm0, xm4, xm0, 1000b movq [bufq+32], xm0 %else movu xm0, [bufq+64] pcmpgtb xm1, xm12, xm4 punpckhbw xm5, xm4, xm1 punpcklbw xm4, xm1 pmullw xm5, xm2 pmullw xm4, xm2 vpblendd xm1, xm3, xm12, 0x0c pmulhrsw xm5, xm1 pmulhrsw xm4, xm3 pcmpgtb xm1, xm12, xm0 punpckhbw xm8, xm0, xm1 punpcklbw xm0, xm1 paddw xm5, xm8 paddw xm0, xm4 packsswb xm0, xm5 movu [bufq+64], xm0 %endif add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar0 RET INIT_XMM avx2 .ar1: DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq] pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3 DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift pmovsxbw xm4, xm4 pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd %if %2 vpbroadcastd xm7, [base+pb_1] vpbroadcastw xm6, [base+hmul_bits+2+%3*2] %endif vpbroadcastd xm3, xm3 %if %2 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*70-(82-3) %endif add bufyq, 79+82*3 mov hd, 70-35*%3 mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, byte [bufq+xq-1] .x_loop_ar1: pmovsxbw xm0, [bufq+xq-82-1] ; top/left %if %2 movq xm8, [bufyq+xq*2] %if %3 movq xm9, [bufyq+xq*2+82] %endif %endif psrldq xm2, xm0, 2 ; top psrldq xm1, xm0, 4 ; top/right %if %2 pmaddubsw xm8, xm7, xm8 %if %3 pmaddubsw xm9, xm7, xm9 paddw xm8, xm9 %endif pmulhrsw xm8, xm6 %else pmovsxbw xm8, [bufyq+xq] %endif punpcklwd xm0, xm2 punpcklwd xm1, xm8 pmaddwd xm0, xm4 pmaddwd xm1, xm5 paddd xm0, xm1 paddd xm0, xm3 .x_loop_ar1_inner: movd val0d, xm0 psrldq xm0, 4 imul val3d, cf3d add val3d, val0d sarx val3d, val3d, shiftd movsx val0d, byte [bufq+xq] add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov byte [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar1 RET .ar2: DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 vpbroadcastw xm13, [base+round_vals-12+shiftq*2] pmovsxbw xm7, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 pinsrw xm0, [base+pw_1], 5 %if %2 vpbroadcastw xm12, [base+hmul_bits+2+%3*2] vpbroadcastd 
xm11, [base+pb_1] %endif DEFINE_ARGS buf, bufy, fg_data, h, unused, x pshufd xm4, xm7, q0000 pshufd xm5, xm7, q3333 pshufd xm6, xm7, q1111 pshufd xm7, xm7, q2222 pshufd xm8, xm0, q0000 pshufd xm9, xm0, q1111 pshufd xm10, xm0, q2222 %if %2 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*70-(82-3) %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pshufb xm2, xm0, [base+gen_shufA] pmaddwd xm2, xm4 pshufb xm3, xm1, [base+gen_shufB] pmaddwd xm3, xm5 paddd xm2, xm3 pshufb xm3, xm0, [base+gen_shufC] pmaddwd xm3, xm6 punpckhqdq xm0, xm0 ; y=-2,x=[+2,+5] punpcklwd xm0, xm1 pmaddwd xm0, xm7 pshufb xm1, [gen_shufD] pmaddwd xm1, xm8 paddd xm2, xm3 paddd xm0, xm1 paddd xm2, xm0 %if %2 movq xm0, [bufyq+xq*2] %if %3 movq xm3, [bufyq+xq*2+82] %endif pmaddubsw xm0, xm11, xm0 %if %3 pmaddubsw xm3, xm11, xm3 paddw xm0, xm3 %endif pmulhrsw xm0, xm12 %else pmovsxbw xm0, [bufyq+xq] %endif punpcklwd xm0, xm13 pmaddwd xm0, xm10 paddd xm2, xm0 movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5] .x_loop_ar2_inner: pmovsxbw xm0, xm0 pmaddwd xm3, xm0, xm9 psrldq xm0, 2 paddd xm3, xm2 psrldq xm2, 4 ; shift top to next pixel psrad xm3, [fg_dataq+FGData.ar_coeff_shift] pslldq xm3, 2 paddw xm3, xm0 pblendw xm0, xm3, 00000010b packsswb xm0, xm0 pextrb [bufq+xq], xm0, 1 inc xq jz .x_loop_ar2_end test xb, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar2 RET INIT_YMM avx2 .ar3: DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 pmovsxbw m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23 vpbroadcastb xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma] movd xm13, [base+round_vals-10+shiftq*2] vpbroadcastd xm14, [base+round_vals-14+shiftq*2] pshufd m6, m0, q0000 pshufd m7, m0, q1111 pshufd m8, m0, q2222 pshufd m9, m0, q3333 pshufd xm10, xm1, q0000 pshufd xm11, xm1, q1111 pshufhw xm12, xm1, q0000 psraw xm2, 8 palignr xm13, xm1, 10 punpckhwd xm12, xm2 ; interleave luma cf psrld xm14, 16 DEFINE_ARGS buf, bufy, fg_data, h, unused, x %if %2 vpbroadcastw xm15, [base+hmul_bits+2+%3*2] sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*70-(82-3) %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: vbroadcasti128 m3, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12 palignr xm1, xm3, [bufq+xq-82*3-9], 6 ; y=-3,x=[-3,+12] vbroadcasti128 m4, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] vpblendd m3, m1, 0x0f pxor m0, m0 pcmpgtb m2, m0, m3 pcmpgtb m0, m4 punpcklbw m1, m3, m2 punpckhbw m3, m2 punpcklbw m2, m4, m0 punpckhbw xm4, xm0 pshufb m0, m1, [base+gen_shufA] pmaddwd m0, m6 pshufb m5, m1, [base+gen_shufC] pmaddwd m5, m7 shufps m1, m3, q1032 paddd m0, m5 pshufb m5, m1, [base+gen_shufA] pmaddwd m5, m8 shufps xm1, xm3, q2121 vpblendd m1, m2, 0xf0 pshufb m1, [base+gen_shufE] pmaddwd m1, m9 paddd m0, m5 pshufb xm3, xm2, [base+gen_shufC] paddd m0, m1 pmaddwd xm3, xm10 palignr xm1, xm4, xm2, 2 punpckhwd xm1, xm2, xm1 pmaddwd xm1, xm11 palignr xm4, xm2, 12 paddd xm3, xm1 %if %2 vpbroadcastd xm5, [base+pb_1] movq xm1, [bufyq+xq*2] pmaddubsw xm1, xm5, xm1 %if %3 movq xm2, [bufyq+xq*2+82] pmaddubsw xm5, xm2 paddw xm1, xm5 %endif pmulhrsw xm1, xm15 %else pmovsxbw xm1, [bufyq+xq] %endif punpcklwd xm4, xm1 pmaddwd xm4, xm12 movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4] vextracti128 xm2, m0, 1 paddd xm0, xm14 paddd xm3, xm4 paddd xm0, xm3 
paddd xm0, xm2 .x_loop_ar3_inner: pmovsxbw xm1, xm1 pmaddwd xm2, xm13, xm1 pshuflw xm3, xm2, q1032 paddd xm2, xm0 ; add top paddd xm2, xm3 ; left+cur psrldq xm0, 4 psrad xm2, [fg_dataq+FGData.ar_coeff_shift] psrldq xm1, 2 ; don't packssdw, we only care about one value punpckldq xm2, xm2 pblendw xm1, xm2, 0100b packsswb xm1, xm1 pextrb [bufq+xq], xm1, 2 inc xq jz .x_loop_ar3_end test xb, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar3 RET %endmacro INIT_YMM avx2 cglobal fgy_32x32xn_8bpc, 6, 13, 15, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, see, overlap %define base r9-pd_m65536 lea r9, [pd_m65536] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] vpbroadcastd m8, [base+pd_m65536] vpbroadcastw m9, [base+mul_bits+r6*2-14] vpbroadcastd m10, [base+fg_min+r7*4] vpbroadcastd m11, [base+fg_max+r7*8] vpbroadcastd m12, [base+pw_1024] movq xm13, [base+pb_27_17_17_27] test sbyd, sbyd setnz r7b pxor m7, m7 test r7b, overlapb jnz .vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, overlap mov hd, hm mov grain_lutq, grain_lutmp .loop_y: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m5, [grain_lutq+offxyq] punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hd jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq] test overlapd, overlapd jz .loop_x ; r8m = sbym cmp dword r8m, 0 jne .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy lea left_offxyd, [offyq+32] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offyd, [offyq+offxq*2+747] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw 
m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m5, [grain_lutq+offxyq] movd xm4, [grain_lutq+left_offxyq] punpcklbw xm4, xm5 pmaddubsw xm4, xm13, xm4 pmulhrsw xm4, xm12 packsswb xm4, xm4 vpblendd m4, m5, 0xfe punpckhbw m5, m7 punpcklbw m4, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hd jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq] ; r8m = sbym cmp dword r8m, 0 jne .loop_x_hv_overlap jmp .loop_x_h_overlap .vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ unused, sby, see, overlap movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq .loop_x_v_overlap: vpbroadcastd m14, [pb_27_17] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_v_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq] movu m4, [grain_lutq+top_offxyq] punpcklbw m5, m4, m6 punpckhbw m4, m6 pmaddubsw m5, m14, m5 pmaddubsw m4, m14, m4 pmulhrsw m5, m12 pmulhrsw m4, m12 packsswb m5, m4 punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hb jz .end_y_v_overlap vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: add wq, 32 jge .end lea srcq, [src_bakq+wq] ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap .loop_x_hv_overlap: vpbroadcastd m14, [pb_27_17] ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or 
seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy lea topleft_offxyd, [top_offxyq+32] lea left_offxyd, [offyq+32] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*2+0x10001*747+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 .loop_y_hv_overlap: ; src mova m2, [srcq] punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 ; scaling[src] pandn m4, m8, m0 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, m0, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m1 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 pblendw m2, m4, 0xaa psrld m4, m1, 16 mova m8, m6 vpgatherdd m5, [scalingq+m4-2], m6 pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] movu m6, [grain_lutq+offxyq] movd xm7, [grain_lutq+left_offxyq] movu m4, [grain_lutq+top_offxyq] movd xm5, [grain_lutq+topleft_offxyq] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw xm7, xm6 punpcklbw xm5, xm4 pmaddubsw xm7, xm13, xm7 pmaddubsw xm5, xm13, xm5 pmulhrsw xm7, xm12 pmulhrsw xm5, xm12 packsswb xm7, xm7 packsswb xm5, xm5 vpblendd m7, m6, 0xfe vpblendd m5, m4, 0xfe ; followed by v interpolation (top | cur -> cur) punpckhbw m4, m6 punpcklbw m5, m7 pmaddubsw m4, m14, m4 pmaddubsw m5, m14, m5 pmulhrsw m4, m12 pmulhrsw m5, m12 pxor m7, m7 packsswb m5, m4 punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 mova [dstq+srcq], m0 add srcq, strideq add grain_lutq, 82 dec hb jz .end_y_hv_overlap vpbroadcastd m14, [pb_17_27] ; swap weights for second v-overlap line ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines add hd, 0x80000000 jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: add wq, 32 lea srcq, [src_bakq+wq] jl .loop_x_hv_overlap .end: RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, luma, overlap, uv_pl, is_id %define base r11-pd_m65536 lea r11, [pd_m65536] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, is_idm mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] vpbroadcastd m8, [base+pd_m65536] vpbroadcastw m9, [base+mul_bits+r6*2-14] vpbroadcastd m10, [base+fg_min+r7*4] shlx r7d, r7d, r9d vpbroadcastd m11, [base+fg_max+r7*4] vpbroadcastd m12, [base+pw_1024] pxor m7, m7 test sbyd, sbyd setnz r7b cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, \ h, sby, see, overlap, uv_pl %if %1 mov r6d, uv_plm vpbroadcastd m0, [base+pw_8] vbroadcasti128 m14, [fg_dataq+FGData.uv_mult+r6*4] vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] pshufb m14, m0 ; uv_luma_mult, uv_mult %elif %2 vpbroadcastq 
m15, [base+pb_23_22] %else vpbroadcastq xm15, [base+pb_27_17_17_27] %endif %if %3 vpbroadcastw m13, [base+pb_23_22] %elif %2 pshufd m13, [base+pb_27_17], q0000 ; 8x27_17, 8x17_27 %endif test r7b, overlapb jnz %%vertical_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ unused2, unused3, see, overlap, unused4, unused5, lstride mov lumaq, r9mp lea r12, [srcq+wq] lea r13, [dstq+wq] lea r14, [lumaq+wq*(1+%2)] mov r11mp, r12 mov r12mp, r13 mov lstrideq, r10mp neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, unused1, unused2, lstride rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, unused1, unused2, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: ; src %if %2 mova xm3, [lumaq+lstrideq*0+ 0] vinserti128 m3, [lumaq+lstrideq*(1+%3) +0], 1 vpbroadcastd m2, [pb_1] mova xm0, [lumaq+lstrideq*0+16] vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 mova xm1, [srcq] vinserti128 m1, [srcq+strideq], 1 pmaddubsw m3, m2 pmaddubsw m0, m2 pavgw m3, m7 pavgw m0, m7 %else mova m2, [lumaq] mova m1, [srcq] %endif %if %1 %if %2 packuswb m2, m3, m0 ; luma %endif punpckhbw m3, m2, m1 punpcklbw m2, m1 ; { luma, chroma } pmaddubsw m3, m14 pmaddubsw m2, m14 psraw m3, 6 psraw m2, 6 paddw m3, m15 paddw m2, m15 packuswb m2, m3 ; pack+unpack = clip %endif %if %1 || %2 == 0 punpcklbw m3, m2, m7 punpckhbw m0, m2, m7 %endif ; scaling[luma_src] pandn m4, m8, m3 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m0 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 psrld m0, 16 mova m8, m6 vpgatherdd m5, [scalingq+m0-2], m6 pblendw m2, m4, 0xaa pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] %if %2 movu xm5, [grain_lutq+offxyq+ 0] vinserti128 m5, [grain_lutq+offxyq+82], 1 %else movu m5, [grain_lutq+offxyq] %endif punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; unpack chroma_source punpcklbw m0, m1, m7 punpckhbw m1, m7 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 %if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 %else mova [dstq], m0 %endif %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82<<%2 sub hb, 1+%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq test overlapd, overlapd jz %%loop_x ; r8m = sbym cmp dword r8m, 0 jne %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xEFF4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, 
[offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, lstride mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: ; src %if %2 mova xm3, [lumaq+lstrideq*0+ 0] vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 vpbroadcastd m2, [pb_1] mova xm0, [lumaq+lstrideq*0+16] vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 mova xm1, [srcq] vinserti128 m1, [srcq+strideq], 1 pmaddubsw m3, m2 pmaddubsw m0, m2 pavgw m3, m7 pavgw m0, m7 %else mova m2, [lumaq] mova m1, [srcq] %endif %if %1 %if %2 packuswb m2, m3, m0 ; luma %endif punpckhbw m3, m2, m1 punpcklbw m2, m1 ; { luma, chroma } pmaddubsw m3, m14 pmaddubsw m2, m14 psraw m3, 6 psraw m2, 6 paddw m3, m15 paddw m2, m15 packuswb m2, m3 ; pack+unpack = clip %endif %if %1 || %2 == 0 punpcklbw m3, m2, m7 punpckhbw m0, m2, m7 %endif ; scaling[luma_src] pandn m4, m8, m3 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m0 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 psrld m0, 16 mova m8, m6 vpgatherdd m5, [scalingq+m0-2], m6 pblendw m2, m4, 0xaa pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] %if %2 movu xm5, [grain_lutq+offxyq+ 0] vinserti128 m5, [grain_lutq+offxyq+82], 1 movd xm4, [grain_lutq+left_offxyq+ 0] vinserti128 m4, [grain_lutq+left_offxyq+82], 1 punpcklbw m4, m5 %if %1 vpbroadcastq m0, [pb_23_22] pmaddubsw m4, m0, m4 %else pmaddubsw m4, m15, m4 %endif pmulhrsw m4, m12 packsswb m4, m4 vpblendd m4, m5, 0xee %else movu m5, [grain_lutq+offxyq] movd xm4, [grain_lutq+left_offxyq] punpcklbw xm4, xm5 %if %1 movq xm0, [pb_27_17_17_27] pmaddubsw xm4, xm0, xm4 %else pmaddubsw xm4, xm15, xm4 %endif pmulhrsw xm4, xm12 packsswb xm4, xm4 vpblendd m4, m5, 0xfe %endif punpckhbw m5, m7 punpcklbw m4, m7 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; unpack chroma_source punpcklbw m0, m1, m7 punpckhbw m1, m7 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 %if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 %else mova [dstq], m0 %endif %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82*(1+%2) sub hb, 1+%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq ; r8m = sbym cmp dword r8m, 0 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %%vertical_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, \ sby, see, overlap, unused1, unused2, lstride movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ unused1, unused2, see, overlap, unused3, unused4, lstride mov lumaq, r9mp lea r12, [srcq+wq] lea r13, [dstq+wq] lea r14, [lumaq+wq*(1+%2)] mov r11mp, r12 mov r12mp, r13 mov lstrideq, r10mp neg wq %%loop_x_v_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of 
cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, top_offxy, unused, lstride rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy, unused, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 == 0 vpbroadcastd m13, [pb_27_17] %endif %%loop_y_v_overlap: ; src %if %2 mova xm3, [lumaq+lstrideq*0+ 0] vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 vpbroadcastd m2, [pb_1] mova xm0, [lumaq+lstrideq*0+16] vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 mova xm1, [srcq] vinserti128 m1, [srcq+strideq], 1 pmaddubsw m3, m2 pmaddubsw m0, m2 pavgw m3, m7 pavgw m0, m7 %else mova m2, [lumaq] mova m1, [srcq] %endif %if %1 %if %2 packuswb m2, m3, m0 ; luma %endif punpckhbw m3, m2, m1 punpcklbw m2, m1 ; { luma, chroma } pmaddubsw m3, m14 pmaddubsw m2, m14 psraw m3, 6 psraw m2, 6 paddw m3, m15 paddw m2, m15 packuswb m2, m3 ; pack+unpack = clip %endif %if %1 || %2 == 0 punpcklbw m3, m2, m7 punpckhbw m0, m2, m7 %endif ; scaling[luma_src] pandn m4, m8, m3 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m0 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 psrld m0, 16 mova m8, m6 vpgatherdd m5, [scalingq+m0-2], m6 pblendw m2, m4, 0xaa pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] %if %3 == 0 %if %2 movu xm0, [grain_lutq+offxyq] vinserti128 m0, [grain_lutq+offxyq+82], 1 movu xm4, [grain_lutq+top_offxyq] vinserti128 m4, [grain_lutq+top_offxyq+82], 1 %else movu m0, [grain_lutq+offxyq] movu m4, [grain_lutq+top_offxyq] %endif punpcklbw m5, m4, m0 punpckhbw m4, m0 pmaddubsw m5, m13, m5 pmaddubsw m4, m13, m4 pmulhrsw m5, m12 pmulhrsw m4, m12 packsswb m5, m4 %else movq xm4, [grain_lutq+offxyq] vinserti128 m4, [grain_lutq+offxyq+8], 1 movq xm5, [grain_lutq+top_offxyq] vinserti128 m5, [grain_lutq+top_offxyq+8], 1 punpcklbw m5, m4 pmaddubsw m5, m13, m5 pmulhrsw m5, m12 vextracti128 xm4, m5, 1 packsswb xm5, xm4 ; only interpolate first line, insert second line unmodified vinserti128 m5, [grain_lutq+offxyq+82], 1 %endif punpcklbw m4, m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; unpack chroma_source punpcklbw m0, m1, m7 punpckhbw m1, m7 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 %if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 %else mova [dstq], m0 %endif sub hb, 1+%2 jle %%end_y_v_overlap %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82<<%2 %if %2 == 0 vpbroadcastd m13, [pb_17_27] add hd, 0x80000000 jnc %%loop_y_v_overlap %endif jmp %%loop_y %%end_y_v_overlap: add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %%loop_x_hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, 
seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride lea topleft_offxyd, [top_offxyq+(32>>%2)] lea left_offxyd, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 == 0 vpbroadcastd m13, [pb_27_17] %endif %%loop_y_hv_overlap: ; src %if %2 mova xm3, [lumaq+lstrideq*0+ 0] vinserti128 m3, [lumaq+lstrideq*(1+%3)+ 0], 1 vpbroadcastd m2, [pb_1] mova xm0, [lumaq+lstrideq*0+16] vinserti128 m0, [lumaq+lstrideq*(1+%3)+16], 1 mova xm1, [srcq] vinserti128 m1, [srcq+strideq], 1 pmaddubsw m3, m2 pmaddubsw m0, m2 pavgw m3, m7 pavgw m0, m7 %else mova m2, [lumaq] mova m1, [srcq] %endif %if %1 %if %2 packuswb m2, m3, m0 ; luma %endif punpckhbw m3, m2, m1 punpcklbw m2, m1 ; { luma, chroma } pmaddubsw m3, m14 pmaddubsw m2, m14 psraw m3, 6 psraw m2, 6 paddw m3, m15 paddw m2, m15 packuswb m2, m3 ; pack+unpack = clip %endif %if %1 || %2 == 0 punpcklbw m3, m2, m7 punpckhbw m0, m2, m7 %endif ; scaling[luma_src] pandn m4, m8, m3 mova m6, m8 vpgatherdd m2, [scalingq+m4-0], m8 psrld m3, 16 mova m8, m6 vpgatherdd m4, [scalingq+m3-2], m6 pandn m5, m8, m0 mova m6, m8 vpgatherdd m3, [scalingq+m5-0], m8 psrld m0, 16 mova m8, m6 vpgatherdd m5, [scalingq+m0-2], m6 pblendw m2, m4, 0xaa pblendw m3, m5, 0xaa ; grain = grain_lut[offy+y][offx+x] %if %2 movu xm4, [grain_lutq+offxyq] vinserti128 m4, [grain_lutq+offxyq+82], 1 movd xm0, [grain_lutq+left_offxyq] vinserti128 m0, [grain_lutq+left_offxyq+82], 1 movd xm6, [grain_lutq+topleft_offxyq] %if %3 movq xm5, [grain_lutq+top_offxyq] vinserti128 m5, [grain_lutq+top_offxyq+8], 1 %else vinserti128 m6, [grain_lutq+topleft_offxyq+82], 1 movu xm5, [grain_lutq+top_offxyq] vinserti128 m5, [grain_lutq+top_offxyq+82], 1 %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m0, m4 %if %3 punpcklbw xm6, xm5 %else punpcklbw m6, m5 %endif punpcklqdq m0, m6 %if %1 vpbroadcastq m6, [pb_23_22] pmaddubsw m0, m6, m0 %else pmaddubsw m0, m15, m0 %endif pmulhrsw m0, m12 packsswb m0, m0 vpblendd m4, m0, 0x11 %if %3 pshuflw xm0, xm0, q1032 vpblendd m5, m0, 0x01 %else pshuflw m0, m0, q1032 vpblendd m5, m0, 0x11 %endif %else movu m4, [grain_lutq+offxyq] movd xm0, [grain_lutq+left_offxyq] movu m5, [grain_lutq+top_offxyq] movd xm6, [grain_lutq+topleft_offxyq] punpcklbw xm0, xm4 punpcklbw xm6, xm5 punpcklqdq xm0, xm6 %if %1 vpbroadcastq xm6, [pb_27_17_17_27] pmaddubsw xm0, xm6, xm0 %else pmaddubsw xm0, xm15, xm0 %endif pmulhrsw xm0, xm12 packsswb xm0, xm0 vpblendd m4, m0, 0x01 pshuflw xm0, xm0, q1032 vpblendd m5, m0, 0x01 %endif ; followed by v interpolation (top | cur -> cur) %if %3 vpermq m0, m4, q3120 punpcklbw m5, m0 pmaddubsw m5, m13, m5 pmulhrsw m5, m12 vextracti128 xm0, m5, 1 packsswb xm5, xm0 vpblendd m5, m4, 0xf0 %else punpckhbw m0, m5, m4 punpcklbw m5, m4 pmaddubsw m4, m13, m0 pmaddubsw m5, m13, m5 pmulhrsw m4, m12 pmulhrsw m5, m12 packsswb m5, m4 %endif punpcklbw m4, 
m5, m7 punpckhbw m5, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmaddubsw m2, m4 pmaddubsw m3, m5 pmulhrsw m2, m9 pmulhrsw m3, m9 ; unpack chroma source punpcklbw m0, m1, m7 punpckhbw m1, m7 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 packuswb m0, m1 pmaxub m0, m10 pminub m0, m11 %if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 %else mova [dstq], m0 %endif %if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] lea lumaq, [lumaq+lstrideq*(2<<%3)] %else add srcq, strideq add dstq, strideq add lumaq, lstrideq %endif add grain_lutq, 82<<%2 sub hb, 1+%2 %if %2 jg %%loop_y_h_overlap %else je %%end_y_hv_overlap vpbroadcastd m13, [pb_17_27] add hd, 0x80000000 jnc %%loop_y_hv_overlap jmp %%loop_y_h_overlap %endif %%end_y_hv_overlap: add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq jmp %%loop_x_hv_overlap %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro GEN_GRAIN_UV_FN 420, 1, 1 FGUV_FN 420, 1, 1 GEN_GRAIN_UV_FN 422, 1, 0 FGUV_FN 422, 1, 0 GEN_GRAIN_UV_FN 444, 0, 0 FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/filmgrain_avx512.asm000064400000000000000000000700401046102023000156440ustar 00000000000000; Copyright © 2022, VideoLAN and dav1d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
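; ----------------------------------------------------------------------------
; Rough overview (editorial sketch inferred from the in-code comments below;
; not part of the upstream dav1d sources):
;
; fgy_32x32xn_8bpc and the fguv_32x32xn_i*_8bpc variants add synthesized film
; grain to 32-pixel-wide (luma) column strips. Ignoring the pb_27_17 /
; pb_17_27 / pb_23_22 edge-overlap blending, the per-pixel math is roughly:
;
;   grain = grain_lut[offy+y][offx+x]                   ; signed 8-bit grain
;   noise = round2(scaling[src] * grain, scaling_shift)
;   dst   = clamp(src + noise, fg_min, fg_max)          ; fg_min/fg_max select
;                                                       ; full vs. restricted range
;
; In this AVX-512 version the scaling[] lookup is done with vpermt2b/vpermi2b
; over the four 64-byte scaling rows preloaded into m0-m3, and the
; multiply+round uses pmaddubsw/pmulhrsw with the noise_rnd constants.
; ----------------------------------------------------------------------------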
%include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" %if ARCH_X86_64 SECTION_RODATA 64 pb_even: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 pb_odd: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 interleave_hl: db 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7 pb_27_17_17_27: db 27, 17, 17, 27, 0, 32, 0, 32 pb_23_22_0_32: db 23, 22, 0, 32, 0, 32, 0, 32 pb_27_17: times 2 db 27, 17 pb_23_22: times 2 db 23, 22 pw_8: times 2 dw 8 pw_1024: times 2 dw 1024 pb_17_27: times 2 db 17, 27 fg_max: times 4 db 255 times 4 db 240 times 4 db 235 fg_min: times 4 db 0 times 4 db 16 noise_rnd: times 2 dw 128 times 2 dw 64 times 2 dw 32 times 2 dw 16 SECTION .text INIT_ZMM avx512icl cglobal fgy_32x32xn_8bpc, 6, 13, 22, dst, src, stride, fg_data, w, scaling, \ grain_lut, h, sby, see, overlap %define base r11-fg_min lea r11, [fg_min] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] mov r12, 0x0000000f0000000f ; h_overlap mask mova m0, [scalingq+64*0] mova m1, [scalingq+64*1] mova m2, [scalingq+64*2] mova m3, [scalingq+64*3] kmovq k1, r12 vbroadcasti32x4 m4, [base+interleave_hl] vpbroadcastd ym16, [base+pb_27_17] vpbroadcastd m12, [base+pb_17_27] vpbroadcastd m6, [base+noise_rnd+r6*4-32] test sbyd, sbyd setnz r6b vpbroadcastd m7, [base+fg_min+r7*4] vpbroadcastd m8, [base+fg_max+r7*8] pxor m5, m5 vpbroadcastd m9, [base+pw_1024] vpbroadcastq m10, [base+pb_27_17_17_27] vmovdqa64 m12{k1}, m16 test r6b, overlapb jnz .v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ h, sby, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq .loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offxd, [offyq+offxq*2+829] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ h, sby, see, overlap mov grain_lutq, grain_lutmp mov hd, hm .loop_y: movu ym21, [grain_lutq+offxyq-82] vinserti32x8 m21, [grain_lutq+offxyq+ 0], 1 call .add_noise sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq] test overlapd, overlapd jz .loop_x test sbyd, sbyd jnz .hv_overlap .loop_x_h_overlap: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ h, sby, see, left_offxy rorx offyd, seed, 8 mov left_offxyd, offxd ; previous column's offy*stride rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164 lea offxd, [offyq+offxq*2+829] ; offy*stride+offx DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ h, sby, see, left_offxy mov grain_lutq, grain_lutmp mov hd, hm .loop_y_h_overlap: movu ym20, [grain_lutq+offxyq-82] vinserti32x8 m20, [grain_lutq+offxyq+ 0], 1 movd xm19, [grain_lutq+left_offxyq-50] vinserti32x4 m19, [grain_lutq+left_offxyq+32], 2 punpcklbw m19, m20 
pmaddubsw m19, m10, m19 pmulhrsw m19, m9 punpckhbw m21, m20, m5 packsswb m20{k1}, m19, m19 punpcklbw m20, m5, m20 call .add_noise_h sub hb, 2 jg .loop_y_h_overlap add wq, 32 jge .end lea srcq, [src_bakq+wq] test sbyd, sbyd jnz .hv_overlap jmp .loop_x_h_overlap .v_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, offy, offx, \ h, sby, see, overlap movzx r6d, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, r6d, 173 * 0x00010001 imul r6d, 37 * 0x01000100 add r7d, (105 << 16) | 188 add r6d, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and r6d, 0xff00ff00 xor seed, r7d xor seed, r6d ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ h, sby, see, overlap lea src_bakq, [srcq+wq] neg wq sub dstq, srcq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offxd, [offyq+offxq*2+0x10001*829+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ h, sby, see, overlap, top_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu ym19, [grain_lutq+offxyq-82] vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 movu ym21, [grain_lutq+top_offxyq-82] vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 punpckhbw m20, m21, m19 punpcklbw m21, m19 call .add_noise_v sub hb, 2 jg .loop_y add wq, 32 jge .end lea srcq, [src_bakq+wq] ; since fg_dataq.overlap is guaranteed to be set, we never jump back ; to .v_overlap, and instead always fall-through to h+v overlap .hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, offx, offy, \ h, sby, see, left_offxy, top_offxy, topleft_offxy mov topleft_offxyd, top_offxyd rorx offyd, seed, 8 mov left_offxyd, offxd rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offxd, [offyq+offxq*2+0x10001*829+32*82] DEFINE_ARGS dst, src, stride, src_bak, w, offxy, grain_lut, \ h, sby, see, left_offxy, top_offxy, topleft_offxy mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 movu ym19, [grain_lutq+offxyq-82] vinserti32x8 m19, [grain_lutq+offxyq+ 0], 1 movd xm16, [grain_lutq+left_offxyq-50] vinserti32x4 m16, [grain_lutq+left_offxyq+32], 2 movu ym21, [grain_lutq+top_offxyq-82] vinserti32x8 m21, [grain_lutq+top_offxyq+ 0], 1 movd xm17, [grain_lutq+topleft_offxyq-50] vinserti32x4 m17, [grain_lutq+topleft_offxyq+32], 2 ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m16, m19 pmaddubsw m16, m10, m16 punpcklbw m17, m21 pmaddubsw m17, m10, m17 punpckhbw m20, m21, m19 pmulhrsw m16, m9 pmulhrsw m17, m9 packsswb m19{k1}, m16, m16 packsswb m21{k1}, m17, m17 ; followed by v interpolation (top | cur -> cur) punpcklbw m21, m19 call .add_noise_v sub hb, 2 jg .loop_y_h_overlap add wq, 32 lea srcq, [src_bakq+wq] jl .hv_overlap .end: RET ALIGN function_align .add_noise_v: pmaddubsw 
m20, m12, m20 pmaddubsw m21, m12, m21 pmulhrsw m20, m9 pmulhrsw m21, m9 packsswb m21, m20 .add_noise: punpcklbw m20, m5, m21 punpckhbw m21, m5 .add_noise_h: mova ym18, [srcq+strideq*0] vinserti32x8 m18, [srcq+strideq*1], 1 mova m19, m0 punpcklbw m16, m18, m5 vpermt2b m19, m18, m1 ; scaling[ 0..127] vpmovb2m k2, m18 punpckhbw m17, m18, m5 vpermi2b m18, m2, m3 ; scaling[128..255] vmovdqu8 m19{k2}, m18 ; scaling[src] pshufb m19, m4 pmaddubsw m18, m19, m20 pmaddubsw m19, m21 add grain_lutq, 82*2 pmulhrsw m18, m6 ; noise pmulhrsw m19, m6 paddw m16, m18 paddw m17, m19 packuswb m16, m17 pmaxub m16, m7 pminub m16, m8 mova [dstq+srcq], ym16 add srcq, strideq vextracti32x8 [dstq+srcq], m16, 1 add srcq, strideq ret %macro FGUV_FN 3 ; name, ss_hor, ss_ver cglobal fguv_32x32xn_i%1_8bpc, 6, 14+%2, 22, dst, src, stride, fg_data, w, \ scaling, grain_lut, h, sby, luma, \ overlap, uv_pl, is_id, _, stride3 lea r11, [fg_min] mov r6d, [fg_dataq+FGData.scaling_shift] mov r7d, [fg_dataq+FGData.clip_to_restricted_range] mov r9d, is_idm mov sbyd, sbym mov overlapd, [fg_dataq+FGData.overlap_flag] %if %2 mov r12, 0x000f000f000f000f ; h_overlap mask vpbroadcastq m10, [base+pb_23_22_0_32] lea stride3q, [strideq*3] %else mov r12, 0x0000000f0000000f vpbroadcastq m10, [base+pb_27_17_17_27] %endif mova m0, [scalingq+64*0] mova m1, [scalingq+64*1] mova m2, [scalingq+64*2] mova m3, [scalingq+64*3] kmovq k1, r12 vbroadcasti32x4 m4, [base+interleave_hl] vpbroadcastd m6, [base+noise_rnd+r6*4-32] vpbroadcastd m7, [base+fg_min+r7*4] shlx r7d, r7d, r9d vpbroadcastd m8, [base+fg_max+r7*4] test sbyd, sbyd setnz r7b vpbroadcastd m9, [base+pw_1024] mova m11, [base+pb_even] mova m12, [base+pb_odd] pxor m5, m5 mov r5, r10mp ; lstride cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ h, sby, see, overlap, uv_pl, _, _, stride3 %if %1 mov r6d, uv_plm vpbroadcastd m16, [base+pw_8] vbroadcasti32x4 m14, [fg_dataq+FGData.uv_mult+r6*4] vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r6*4] pshufb m14, m16 ; uv_luma_mult, uv_mult %endif test r7b, overlapb jnz %%v_overlap imul seed, sbyd, (173 << 24) | 37 add seed, (105 << 24) | 178 rorx seed, seed, 24 movzx seed, seew xor seed, [fg_dataq+FGData.seed] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, overlap, _, _, _, stride3 mov lumaq, r9mp lea r11, [srcq+wq] lea r12, [dstq+wq] lea r13, [lumaq+wq*(1+%2)] mov r11mp, r11 mov r12mp, r12 neg wq %%loop_x: rorx r6, seeq, 1 or seed, 0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, overlap, _, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm %%loop_y: %if %2 movu xm21, [grain_lutq+offxyq+82*0] vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 %endif call %%add_noise sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq test overlapd, overlapd jz %%loop_x cmp dword r8m, 0 ; sby jne %%hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: rorx r6, seeq, 1 or seed, 
0xeff4 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, left_offxy, _, _, _, stride3 lea left_offxyd, [offyq+(32>>%2)] ; previous column's offy*stride+offx rorx offyd, seed, 8 rorx offxq, seeq, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyd, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, left_offxy, _, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm %%loop_y_h_overlap: %if %2 movu xm20, [grain_lutq+offxyq +82*0] movd xm19, [grain_lutq+left_offxyq+82*0] vinserti32x4 ym20, [grain_lutq+offxyq +82*1], 1 vinserti32x4 ym19, [grain_lutq+left_offxyq+82*1], 1 vinserti32x4 m20, [grain_lutq+offxyq +82*2], 2 vinserti32x4 m19, [grain_lutq+left_offxyq+82*2], 2 vinserti32x4 m20, [grain_lutq+offxyq +82*3], 3 vinserti32x4 m19, [grain_lutq+left_offxyq+82*3], 3 %else movu ym20, [grain_lutq+offxyq + 0] movd xm19, [grain_lutq+left_offxyq+ 0] vinserti32x8 m20, [grain_lutq+offxyq +82], 1 vinserti32x4 m19, [grain_lutq+left_offxyq+82], 2 %endif punpcklbw m19, m20 pmaddubsw m19, m10, m19 punpckhbw m21, m20, m5 pmulhrsw m19, m9 vpacksswb m20{k1}, m19, m19 punpcklbw m20, m5, m20 call %%add_noise_h sub hb, 2<<%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq cmp dword r8m, 0 ; sby jne %%hv_overlap jmp %%loop_x_h_overlap %%v_overlap: DEFINE_ARGS dst, src, stride, fg_data, w, lstride, grain_lut, \ _, sby, see, overlap, _, _, _, stride3 movzx sbyd, sbyb imul seed, [fg_dataq+FGData.seed], 0x00010001 imul r7d, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add r7d, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and r7d, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, r7d xor seed, sbyd ; (cur_seed << 16) | top_seed %if %3 vpbroadcastd m13, [base+pb_23_22] kxnorw k3, k3, k3 ; v_overlap mask %elif %2 vbroadcasti32x8 m13, [base+pb_27_17] kxnord k3, k3, k3 pshufd m13, m13, q0000 ; 8x27_17, 8x17_27 %else vpbroadcastd ym16, [base+pb_27_17] vpbroadcastd m13, [base+pb_17_27] vmovdqa64 m13{k1}, m16 %endif DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, overlap, top_offxy, _, _, stride3 mov lumaq, r9mp lea r11, [srcq+wq] lea r12, [dstq+wq] lea r13, [lumaq+wq*(1<<%2)] mov r11mp, r11 mov r12mp, r12 neg wq ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0x000f000f and offxd, 0x000f000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, overlap, top_offxy, _, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %3 movu xm18, [grain_lutq+offxyq+82*0] movu xm20, [grain_lutq+top_offxyq+82*0] ; only interpolate first line, insert remaining line unmodified vbroadcasti128 ym21, [grain_lutq+offxyq+82*1] vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 punpcklbw xm19, xm20, xm18 punpckhbw xm20, xm18 %elif %2 movu xm18, [grain_lutq+offxyq+82*0] vinserti128 ym18, 
[grain_lutq+offxyq+82*1], 1 movu xm20, [grain_lutq+top_offxyq+82*0] vinserti32x4 ym20, [grain_lutq+top_offxyq+82*1], 1 vbroadcasti32x4 m21, [grain_lutq+offxyq+82*2] vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 punpcklbw ym19, ym20, ym18 punpckhbw ym20, ym18 %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 movu ym20, [grain_lutq+top_offxyq+82*0] vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq %%hv_overlap: ; we assume from the block above that bits 8-15 of r7d are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp r7b ; parity of top_seed shr seed, 16 shl r7d, 16 test seeb, seeh setp r7b ; parity of cur_seed or r6d, 0x00010001 xor r7d, r6d rorx seed, r7d, 1 ; updated (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 lea topleft_offxyd, [top_offxyq+(32>>%2)] lea left_offxyd, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0x000f000f and offxd, 0x000f000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyd, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, lstride, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, _, stride3 mov grain_lutq, grain_lutmp mov hd, hm movzx top_offxyd, offxyw shr offxyd, 16 %if %2 movu xm21, [grain_lutq+offxyq+82*0] movd xm16, [grain_lutq+left_offxyq+82*0] vinserti128 ym21, [grain_lutq+offxyq+82*1], 1 vinserti128 ym16, [grain_lutq+left_offxyq+82*1], 1 vinserti32x4 m21, [grain_lutq+offxyq+82*2], 2 vinserti32x4 m16, [grain_lutq+left_offxyq+82*2], 2 vinserti32x4 m21, [grain_lutq+offxyq+82*3], 3 vinserti32x4 m16, [grain_lutq+left_offxyq+82*3], 3 movd xm18, [grain_lutq+topleft_offxyq+82*0] movu xm20, [grain_lutq+top_offxyq] ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m16, m21 %if %3 punpcklbw xm18, xm20 %else vinserti128 ym18, [grain_lutq+topleft_offxyq+82*1], 1 vinserti128 ym20, [grain_lutq+top_offxyq+82*1], 1 punpcklbw ym18, ym20 %endif punpcklqdq m16, m18 pmaddubsw m16, m10, m16 pmulhrsw m16, m9 packsswb m16, m16 vmovdqu8 m21{k1}, m16 %if %3 vpalignr xm20{k1}, xm16, xm16, 4 punpcklbw xm19, xm20, xm21 punpckhbw xm20, xm21 %else vpalignr ym20{k1}, ym16, ym16, 4 punpcklbw ym19, ym20, ym21 punpckhbw ym20, ym21 %endif %else movu ym21, [grain_lutq+offxyq+82*0] vinserti32x8 m21, [grain_lutq+offxyq+82*1], 1 movd xm16, [grain_lutq+left_offxyq+82*0] vinserti32x4 m16, [grain_lutq+left_offxyq+82*1], 2 movu ym20, [grain_lutq+top_offxyq+82*0] vinserti32x8 m20, [grain_lutq+top_offxyq+82*1], 1 movd xm18, [grain_lutq+topleft_offxyq+82*0] vinserti32x4 m18, [grain_lutq+topleft_offxyq+82*1], 2 punpcklbw m16, m21 punpcklbw m18, m20 punpcklqdq m16, m18 pmaddubsw m16, m10, m16 pmulhrsw m16, m9 packsswb m16, m16 vpalignr m20{k1}, m16, m16, 4 vmovdqu8 m21{k1}, m16 %endif call %%add_noise_v sub hb, 2<<%2 jg %%loop_y_h_overlap add wq, 32>>%2 jge .end mov srcq, r11mp mov dstq, r12mp lea lumaq, [r13+wq*(1<<%2)] add srcq, wq add dstq, wq jmp %%hv_overlap ALIGN function_align %%add_noise_v: %if %3 pmaddubsw xm19, xm13, xm19 pmaddubsw xm20, xm13, xm20 pmulhrsw xm19, xm9 pmulhrsw xm20, xm9 vpacksswb m21{k3}, m19, m20 %elif %2 pmaddubsw ym19, ym13, ym19 pmaddubsw ym20, ym13, ym20 pmulhrsw ym19, ym9 pmulhrsw ym20, ym9 
vpacksswb m21{k3}, m19, m20 %else punpcklbw m19, m20, m21 punpckhbw m20, m21 pmaddubsw m19, m13, m19 pmaddubsw m20, m13, m20 pmulhrsw m19, m9 pmulhrsw m20, m9 packsswb m21, m19, m20 %endif %%add_noise: punpcklbw m20, m5, m21 punpckhbw m21, m5 %%add_noise_h: mova ym18, [lumaq+lstrideq*(0<<%3)] vinserti32x8 m18, [lumaq+lstrideq*(1<<%3)], 1 %if %2 lea lumaq, [lumaq+lstrideq*(2<<%3)] mova ym16, [lumaq+lstrideq*(0<<%3)] vinserti32x8 m16, [lumaq+lstrideq*(1<<%3)], 1 mova xm17, [srcq+strideq*0] mova m19, m11 vpermi2b m19, m18, m16 vinserti128 ym17, [srcq+strideq*1], 1 vpermt2b m18, m12, m16 vinserti32x4 m17, [srcq+strideq*2], 2 pavgb m18, m19 vinserti32x4 m17, [srcq+stride3q ], 3 %else mova ym17, [srcq+strideq*0] vinserti32x8 m17, [srcq+strideq*1], 1 %endif %if %1 punpckhbw m19, m18, m17 punpcklbw m18, m17 ; { luma, chroma } pmaddubsw m19, m14 pmaddubsw m18, m14 psraw m19, 6 psraw m18, 6 paddw m19, m15 paddw m18, m15 packuswb m18, m19 .add_noise_main: mova m19, m0 vpermt2b m19, m18, m1 ; scaling[ 0..127] vpmovb2m k2, m18 vpermi2b m18, m2, m3 ; scaling[128..255] vmovdqu8 m19{k2}, m18 ; scaling[src] pshufb m19, m4 pmaddubsw m18, m19, m20 pmaddubsw m19, m21 add grain_lutq, 82*2<<%2 lea lumaq, [lumaq+lstrideq*(2<<%3)] lea srcq, [srcq+strideq*(2<<%2)] pmulhrsw m18, m6 ; noise pmulhrsw m19, m6 punpcklbw m16, m17, m5 ; chroma punpckhbw m17, m5 paddw m16, m18 paddw m17, m19 packuswb m16, m17 pmaxub m16, m7 pminub m16, m8 %if %2 mova [dstq+strideq*0], xm16 vextracti128 [dstq+strideq*1], ym16, 1 vextracti32x4 [dstq+strideq*2], m16, 2 vextracti32x4 [dstq+stride3q ], m16, 3 %else mova [dstq+strideq*0], ym16 vextracti32x8 [dstq+strideq*1], m16, 1 %endif lea dstq, [dstq+strideq*(2<<%2)] ret %else jmp .add_noise_main %endif %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 .end: RET %endmacro FGUV_FN 420, 1, 1 FGUV_FN 422, 1, 0 FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/filmgrain_common.asm000064400000000000000000000041521046102023000161070ustar 00000000000000; Copyright © 2019-2022, VideoLAN and dav1d authors ; Copyright © 2019-2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
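; FGData below is the film grain parameter block handed in by the caller:
; PRNG seed, piecewise scaling points for luma and chroma, auto-regression
; lag/coefficients/shift, grain scale shift, the chroma uv_mult /
; uv_luma_mult / uv_offset values, and the overlap and restricted-range
; clipping flags. Fields are read from assembly as [fg_dataq+FGData.<field>],
; so the layout must stay in sync with the struct built on the caller's side.
; (Editorial summary, inferred from the field names and their uses.)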
struc FGData
    .seed:                      resd 1
    .num_y_points:              resd 1
    .y_points:                  resb 14 * 2
    .chroma_scaling_from_luma:  resd 1
    .num_uv_points:             resd 2
    .uv_points:                 resb 2 * 10 * 2
    .scaling_shift:             resd 1
    .ar_coeff_lag:              resd 1
    .ar_coeffs_y:               resb 24
    .ar_coeffs_uv:              resb 2 * 28 ; includes padding
    .ar_coeff_shift:            resq 1
    .grain_scale_shift:         resd 1
    .uv_mult:                   resd 2
    .uv_luma_mult:              resd 2
    .uv_offset:                 resd 2
    .overlap_flag:              resd 1
    .clip_to_restricted_range:  resd 1
endstruc

cextern gaussian_sequence

rav1e-0.7.1/src/x86/filmgrain_sse.asm
; Copyright © 2019-2021, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
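; ----------------------------------------------------------------------------
; Rough overview (editorial sketch inferred from the code below; not part of
; the upstream dav1d sources):
;
; generate_grain_y_8bpc and generate_grain_uv_{420,422,444}_8bpc (SSSE3) fill
; the grain look-up buffers (82-byte row stride) that the fgy/fguv functions
; sample from. In outline:
;
;   1. the seed is advanced with the rnd_next_upperbit_mask / mul_bits
;      update, four outputs at a time, and the resulting values index
;      gaussian_sequence[]; the looked-up samples are scaled down according
;      to FGData.grain_scale_shift.
;   2. depending on FGData.ar_coeff_lag (jump table .ar0-.ar3), an
;      auto-regressive pass adds round2(sum(ar_coeffs * causal neighbours),
;      ar_coeff_shift) to each sample and clamps it back to signed 8-bit.
;
; The uv variants also mix in the co-located luma grain (averaged down for
; 4:2:0/4:2:2) through the extra luma coefficient in ar_coeffs_uv.
; ----------------------------------------------------------------------------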
%include "config.asm" %include "ext/x86/x86inc.asm" %include "x86/filmgrain_common.asm" SECTION_RODATA pw_1024: times 8 dw 1024 pb_27_17_17_27: db 27, 17, 17, 27 times 6 db 0, 32 pb_23_22_h: db 23, 22 times 7 db 0, 32 pb_27_17: times 8 db 27, 17 pb_17_27: times 8 db 17, 27 pb_23_22: times 8 db 23, 22 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0 pw_seed_xor: times 2 dw 0xb524 times 2 dw 0x49d8 pb_1: times 4 db 1 hmul_bits: dw 32768, 16384, 8192, 4096 round: dw 2048, 1024, 512 mul_bits: dw 256, 128, 64, 32, 16 round_vals: dw 32, 64, 128, 256, 512 max: dw 255, 240, 235 min: dw 0, 16 pw_1: dw 1 %macro JMP_TABLE 2-* %xdefine %1_8bpc_%2_table %%table %xdefine %%base %1_8bpc_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .ar%3 - %%base %rotate 1 %endrep %endmacro JMP_TABLE generate_grain_y, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_422, ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_444, ssse3, 0, 1, 2, 3 SECTION .text %if ARCH_X86_32 %define PIC_ptr(a) base+a %else %define PIC_ptr(a) a %endif %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro INIT_XMM ssse3 cglobal generate_grain_y_8bpc, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data LEA r4, $$ %define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r2d, [fg_dataq+FGData.grain_scale_shift] movd m2, [base+round+r2*2] movd m0, [fg_dataq+FGData.seed] mova m5, [base+pb_mask] pshuflw m2, m2, q0000 pshuflw m0, m0, q0000 mov r2, -73*82 sub bufq, r2 lea r3, [base+gaussian_sequence] .loop: pand m6, m0, m1 psrlw m3, m6, 10 por m6, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m6, m4 ; bits 0x0f00 are set pshufb m3, m5, m6 ; set 15th bit for next 4 seeds psllq m6, m3, 30 por m3, m6 psllq m6, m3, 15 por m3, m6 ; aggregate each bit into next seed's high bit pmulhuw m6, m0, m7 por m3, m6 ; 4 next output seeds pshuflw m0, m3, q3333 psrlw m3, 5 %if ARCH_X86_64 movq r6, m3 mov r8, r6 movzx r5d, r6w shr r6d, 16 shr r8, 32 movzx r7, r8w shr r8, 16 movd m6, [r3+r5*2] pinsrw m6, [r3+r6*2], 1 pinsrw m6, [r3+r7*2], 2 pinsrw m6, [r3+r8*2], 3 %else movd r6, m3 pshuflw m3, m3, q3232 movzx r5, r6w shr r6, 16 movd m6, [r3+r5*2] pinsrw m6, [r3+r6*2], 1 movd r6, m3 movzx r5, r6w shr r6, 16 pinsrw m6, [r3+r5*2], 2 pinsrw m6, [r3+r6*2], 3 %endif pmulhrsw m6, m2 packsswb m6, m6 movd [bufq+r2], m6 add r2, 4 jl .loop ; auto-regression code movsxd r2, [fg_dataq+FGData.ar_coeff_lag] movsxd r2, [base+generate_grain_y_8bpc_ssse3_table+r2*4] lea r2, [r2+base+generate_grain_y_8bpc_ssse3_table] jmp r2 .ar1: %if ARCH_X86_32 DEFINE_ARGS buf, fg_data, cf3, unused, val3, min, max %elif WIN64 DEFINE_ARGS shift, fg_data, cf3, buf, val3, min, max, x, val0 mov bufq, r0 %else DEFINE_ARGS buf, fg_data, cf3, shift, val3, min, max, x, val0 %endif movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_y+3] movd m4, [fg_dataq+FGData.ar_coeffs_y] mov ecx, [fg_dataq+FGData.ar_coeff_shift] %if ARCH_X86_32 mov r1m, cf3d DEFINE_ARGS buf, shift, val3, min, max, x, val0 %define hd r0mp %define cf3d r1mp %elif WIN64 DEFINE_ARGS shift, h, cf3, buf, val3, min, max, x, val0 %else DEFINE_ARGS buf, h, cf3, shift, val3, min, max, x, val0 %endif pxor m6, m6 pcmpgtb m7, m6, m4 punpcklbw m4, m7 pinsrw m4, [base+pw_1], 3 pshufd m5, m4, q1111 pshufd m4, m4, q0000 
movd m3, [base+round_vals+shiftq*2-12] ; rnd pshuflw m3, m3, q0000 sub bufq, 82*73-(82*3+79) mov hd, 70 mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -76 movsx val3d, byte [bufq+xq-1] .x_loop_ar1: movq m0, [bufq+xq-82-1] ; top/left pcmpgtb m7, m6, m0 punpcklbw m0, m7 psrldq m2, m0, 2 ; top psrldq m1, m0, 4 ; top/right punpcklwd m0, m2 punpcklwd m1, m3 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 imul val3d, cf3d add val3d, val0d sar val3d, shiftb movsx val0d, byte [bufq+xq] add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov byte [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 dec hd jg .y_loop_ar1 .ar0: RET .ar2: %if ARCH_X86_32 %assign stack_offset_old stack_offset ALLOC_STACK -16*8 %endif DEFINE_ARGS buf, fg_data, shift mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m6, [base+round_vals-12+shiftq*2] movd m7, [base+byte_blend+1] SCRATCH 7, 15, 7 movq m0, [fg_dataq+FGData.ar_coeffs_y+0] ; cf0-7 movd m1, [fg_dataq+FGData.ar_coeffs_y+8] ; cf8-11 pxor m7, m7 pshuflw m6, m6, q0000 punpcklwd m6, m7 pcmpgtb m4, m7, m0 pcmpgtb m5, m7, m1 punpcklbw m0, m4 punpcklbw m1, m5 DEFINE_ARGS buf, fg_data, h, x pshufd m4, m1, q0000 pshufd m5, m1, q1111 pshufd m3, m0, q3333 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 SCRATCH 6, 14, 6 sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar2: mov xq, -76 .x_loop_ar2: movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pcmpgtb m2, m7, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] psrldq m3, m1, 2 ; y=-1,x=[-1,+5] psrldq m4, m1, 4 ; y=-1,x=[+0,+5] punpcklwd m2, m0, m5 punpcklwd m3, m4 pmaddwd m2, m8 pmaddwd m3, m11 paddd m2, m3 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] psrldq m5, m0, 6 ; y=-2,x=[+1,+5] psrldq m6, m0, 8 ; y=-2,x=[+2,+5] punpcklwd m4, m5 punpcklwd m6, m1 psrldq m5, m1, 6 ; y=-1,x=[+1,+5] psrldq m1, m1, 8 ; y=-1,x=[+2,+5] punpcklwd m5, m1 pmaddwd m4, m9 pmaddwd m6, m10 pmaddwd m5, m12 paddd m4, m6 paddd m2, m5 paddd m2, m4 paddd m2, m14 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] .x_loop_ar2_inner: pcmpgtb m4, m7, m0 punpcklbw m1, m0, m4 pmaddwd m3, m1, m13 paddd m3, m2 psrldq m1, 4 ; y=0,x=0 psrldq m2, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value paddw m3, m1 packsswb m3, m3 pslldq m3, 2 pand m3, m15 pandn m1, m15, m0 por m0, m1, m3 psrldq m0, 1 ; overwrite 2 pixels, but that's ok movd [bufq+xq-1], m0 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 dec hd jg .y_loop_ar2 RET .ar3: DEFINE_ARGS buf, fg_data, shift %if ARCH_X86_32 %assign stack_offset stack_offset_old ALLOC_STACK -16*14 %elif WIN64 SUB rsp, 16*6 %assign stack_size_padded (stack_size_padded+16*6) %assign stack_size (stack_size+16*6) %else ALLOC_STACK -16*6 %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m6, [base+round_vals-12+shiftq*2] movd m7, [base+byte_blend] movu m0, [fg_dataq+FGData.ar_coeffs_y+ 0] ; cf0-15 movq m2, [fg_dataq+FGData.ar_coeffs_y+16] ; cf16-23 pxor m3, m3 pcmpgtb m4, m3, m0 pcmpgtb m3, m2 pshuflw m6, m6, q0000 SCRATCH 6, 14, 12 SCRATCH 7, 15, 13 punpckhbw m1, m0, m4 punpcklbw m0, m4 punpcklbw m2, m3 pshufd m3, m0, q1111 pshufd m4, m0, q2222 pshufd m5, 
m0, q3333 pshufd m0, m0, q0000 mova [rsp+ 0*16], m0 mova [rsp+ 1*16], m3 mova [rsp+ 2*16], m4 mova [rsp+ 3*16], m5 pshufd m6, m1, q1111 pshufd m7, m1, q2222 pshufd m5, m1, q3333 pshufd m1, m1, q0000 pshufd m3, m2, q1111 psrldq m0, m2, 10 pinsrw m2, [base+pw_1], 5 pshufd m4, m2, q2222 pshufd m2, m2, q0000 pinsrw m0, [base+round_vals+shiftq*2-10], 3 mova [rsp+ 4*16], m1 mova [rsp+ 5*16], m6 SCRATCH 7, 8, 6 SCRATCH 5, 9, 7 SCRATCH 2, 10, 8 SCRATCH 3, 11, 9 SCRATCH 4, 12, 10 SCRATCH 0, 13, 11 DEFINE_ARGS buf, fg_data, h, x sub bufq, 82*73-(82*3+79) mov hd, 70 .y_loop_ar3: mov xq, -76 .x_loop_ar3: movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] pxor m3, m3 pcmpgtb m3, m0 punpckhbw m2, m0, m3 punpcklbw m0, m3 psrldq m5, m0, 2 psrldq m6, m0, 4 psrldq m7, m0, 6 punpcklwd m4, m0, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 0*16] pmaddwd m6, [rsp+ 1*16] paddd m4, m6 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] pxor m5, m5 pcmpgtb m5, m1 punpckhbw m3, m1, m5 punpcklbw m1, m5 palignr m6, m2, m0, 10 palignr m7, m2, m0, 12 psrldq m0, 8 punpcklwd m0, m6 punpcklwd m7, m1 pmaddwd m0, [rsp+ 2*16] pmaddwd m7, [rsp+ 3*16] paddd m0, m7 paddd m0, m4 psrldq m4, m1, 2 psrldq m5, m1, 4 psrldq m6, m1, 6 psrldq m7, m1, 8 punpcklwd m4, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 4*16] pmaddwd m6, [rsp+ 5*16] paddd m4, m6 paddd m0, m4 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] pxor m7, m7 pcmpgtb m7, m2 punpckhbw m5, m2, m7 punpcklbw m2, m7 palignr m7, m3, m1, 10 palignr m3, m1, 12 psrldq m1, m2, 2 punpcklwd m7, m3 punpcklwd m3, m2, m1 pmaddwd m7, m8 pmaddwd m3, m9 paddd m7, m3 paddd m0, m7 psrldq m6, m2, 4 psrldq m1, m2, 6 psrldq m3, m2, 8 palignr m4, m5, m2, 10 palignr m5, m5, m2, 12 punpcklwd m6, m1 punpcklwd m3, m4 punpcklwd m5, m14 pmaddwd m6, m10 pmaddwd m3, m11 pmaddwd m5, m12 paddd m0, m6 paddd m3, m5 paddd m0, m3 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] .x_loop_ar3_inner: pxor m5, m5 pcmpgtb m5, m1 punpcklbw m2, m1, m5 pmaddwd m2, m13 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw since we only care about one value packsswb m2, m2 pslldq m2, 3 pand m2, m15 pandn m3, m15, m1 por m1, m2, m3 movd [bufq+xq-3], m1 psrldq m1, 1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82 dec hd jg .y_loop_ar3 RET %macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 cglobal generate_grain_uv_%1_8bpc, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv movifnidn r2, r2mp movifnidn r3, r3mp LEA r4, $$ %define base r4-$$ movq m1, [base+rnd_next_upperbit_mask] movq m4, [base+mul_bits] movq m7, [base+hmul_bits] mov r5d, [fg_dataq+FGData.grain_scale_shift] movd m6, [base+round+r5*2] mova m5, [base+pb_mask] movd m0, [fg_dataq+FGData.seed] movd m2, [base+pw_seed_xor+uvq*4] pxor m0, m2 pshuflw m6, m6, q0000 pshuflw m0, m0, q0000 lea r6, [base+gaussian_sequence] %if %2 %if ARCH_X86_64 mov r7d, 73-35*%3 %else mov r3mp, 73-35*%3 %endif add bufq, 44 .loop_y: mov r5, -44 .loop_x: %else mov r5, -82*73 sub bufq, r5 .loop: %endif pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set pmullw m2, m4 ; bits 0x0f00 are set pshufb m3, m5, m2 ; set 15th bit for next 4 seeds psllq m2, m3, 30 por m3, m2 psllq m2, m3, 15 por m3, m2 ; aggregate each bit into next seed's high bit pmulhuw m2, m0, m7 por m2, m3 ; 4 next output seeds pshuflw m0, m2, q3333 psrlw m2, 5 %if ARCH_X86_64 movd r9d, m2 pshuflw m2, m2, q3232 movzx r8, r9w shr r9, 16 movd m3, [r6+r8*2] pinsrw m3, [r6+r9*2], 1 movd 
r9d, m2 movzx r8, r9w shr r9, 16 pinsrw m3, [r6+r8*2], 2 pinsrw m3, [r6+r9*2], 3 %else movd r2, m2 pshuflw m2, m2, q3232 movzx r1, r2w shr r2, 16 movd m3, [r6+r1*2] pinsrw m3, [r6+r2*2], 1 movd r2, m2 movzx r1, r2w shr r2, 16 pinsrw m3, [r6+r1*2], 2 pinsrw m3, [r6+r2*2], 3 %endif pmulhrsw m3, m6 packsswb m3, m3 movd [bufq+r5], m3 add r5, 4 %if %2 jl .loop_x add bufq, 82 %if ARCH_X86_64 dec r7d %else dec r3mp %endif jg .loop_y %else jl .loop %endif %if ARCH_X86_32 mov r2, r2mp %endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] movsxd r5, [base+generate_grain_uv_%1_8bpc_ssse3_table+r5*4] lea r5, [r5+base+generate_grain_uv_%1_8bpc_ssse3_table] jmp r5 .ar0: DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 %assign stack_offset_old stack_offset ALLOC_STACK -2*16 %endif imul uvd, 28 mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] movd m4, [base+hmul_bits+shiftq*2] DEFINE_ARGS buf, bufy, h, x pxor m0, m0 pcmpgtb m0, m5 punpcklbw m5, m0 movd m7, [base+pb_1] %if %2 movd m6, [base+hmul_bits+2+%3*2] %endif pshuflw m5, m5, q0000 pshuflw m4, m4, q0000 pshufd m7, m7, q0000 %if %2 pshuflw m6, m6, q0000 %endif punpcklqdq m5, m5 punpcklqdq m4, m4 %if %2 punpcklqdq m6, m6 %endif pcmpeqw m1, m1 pslldq m1, 12>>%2 SCRATCH 1, 8, 0 SCRATCH 4, 9, 1 %if %2 sub bufq, 82*(73-35*%3)+82-(82*3+41) %else sub bufq, 82*70-3 %endif add bufyq, 3+82*3 mov hd, 70-35*%3 .y_loop_ar0: xor xd, xd .x_loop_ar0: ; first 32 pixels %if %2 movu m1, [bufyq+xq*2] %if %3 movu m2, [bufyq+xq*2+82] %endif movu m3, [bufyq+xq*2+16] %if %3 movu m4, [bufyq+xq*2+82+16] %endif pmaddubsw m0, m7, m1 %if %3 pmaddubsw m1, m7, m2 %endif pmaddubsw m2, m7, m3 %if %3 pmaddubsw m3, m7, m4 paddw m0, m1 paddw m2, m3 %endif pmulhrsw m0, m6 pmulhrsw m2, m6 %else movu m0, [bufyq+xq] pxor m6, m6 pcmpgtb m6, m0 punpckhbw m2, m0, m6 punpcklbw m0, m6 %endif pmullw m0, m5 pmullw m2, m5 pmulhrsw m0, m9 pmulhrsw m2, m9 movu m1, [bufq+xq] pxor m4, m4 pcmpgtb m4, m1 punpckhbw m3, m1, m4 %if %2 punpcklbw m1, m4 paddw m2, m3 paddw m0, m1 %else punpcklbw m6, m1, m4 paddw m2, m3 paddw m0, m6 %endif packsswb m0, m2 %if %2 movu [bufq+xq], m0 add xd, 16 cmp xd, 32 jl .x_loop_ar0 ; last 6/12 pixels movu m1, [bufyq+xq*(1+%2)] %if %3 movu m2, [bufyq+xq*2+82] %endif pmaddubsw m0, m7, m1 %if %3 pmaddubsw m1, m7, m2 paddw m0, m1 %endif pmulhrsw m0, m6 pmullw m0, m5 pmulhrsw m0, m9 movq m1, [bufq+xq] pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 paddw m0, m2 packsswb m0, m0 pandn m2, m8, m0 pand m1, m8 por m2, m1 movq [bufq+xq], m2 %else add xd, 16 cmp xd, 80 je .y_loop_final_ar0 movu [bufq+xq-16], m0 jmp .x_loop_ar0 .y_loop_final_ar0: pandn m2, m8, m0 pand m1, m8 por m2, m1 movu [bufq+xq-16], m2 %endif add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar0 RET .ar1: %if ARCH_X86_32 %assign stack_offset stack_offset_old %assign stack_size_padded 0 %xdefine rstk rsp %endif DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x imul uvd, 28 movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3] movd m4, [fg_dataq+FGData.ar_coeffs_uv+uvq-1] pinsrw m4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 2 %if ARCH_X86_32 mov r3mp, cf3d DEFINE_ARGS buf, shift, fg_data, val3, min, max, x %elif WIN64 DEFINE_ARGS shift, bufy, fg_data, buf, val3, cf3, min, max, x mov bufq, r0 %else DEFINE_ARGS buf, bufy, fg_data, shift, val3, cf3, min, max, x %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m3, [base+round_vals+shiftq*2-12] ; rnd %if %2 movd m7, [base+pb_1] movd m6, [base+hmul_bits+2+%3*2] 
%endif psrldq m4, 1 %if ARCH_X86_32 DEFINE_ARGS buf, shift, val0, val3, min, max, x %elif WIN64 DEFINE_ARGS shift, bufy, h, buf, val3, cf3, min, max, x, val0 %else DEFINE_ARGS buf, bufy, h, shift, val3, cf3, min, max, x, val0 %endif pxor m5, m5 punpcklwd m3, m5 %if %2 punpcklwd m6, m6 %endif pcmpgtb m5, m4 punpcklbw m4, m5 pshufd m5, m4, q1111 pshufd m4, m4, q0000 pshufd m3, m3, q0000 %if %2 pshufd m7, m7, q0000 pshufd m6, m6, q0000 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif %if ARCH_X86_32 add r1mp, 79+82*3 mov r0mp, 70-35*%3 %else add bufyq, 79+82*3 mov hd, 70-35*%3 %endif mov mind, -128 mov maxd, 127 .y_loop_ar1: mov xq, -(76>>%2) movsx val3d, byte [bufq+xq-1] .x_loop_ar1: %if %2 %if ARCH_X86_32 mov r2, r1mp movq m0, [r2+xq*2] %if %3 movq m1, [r2+xq*2+82] %endif %else movq m0, [bufyq+xq*2] %if %3 movq m1, [bufyq+xq*2+82] %endif %endif pmaddubsw m2, m7, m0 %if %3 pmaddubsw m0, m7, m1 paddw m2, m0 %endif pmulhrsw m2, m6 %else %if ARCH_X86_32 mov r2, r1mp movd m2, [r2+xq] %else movd m2, [bufyq+xq] %endif pxor m0, m0 pcmpgtb m0, m2 punpcklbw m2, m0 %endif movq m0, [bufq+xq-82-1] ; top/left pxor m1, m1 pcmpgtb m1, m0 punpcklbw m0, m1 psrldq m1, m0, 4 ; top/right punpcklwd m1, m2 psrldq m2, m0, 2 ; top punpcklwd m0, m2 pmaddwd m0, m4 pmaddwd m1, m5 paddd m0, m1 paddd m0, m3 .x_loop_ar1_inner: movd val0d, m0 psrldq m0, 4 %if ARCH_X86_32 imul val3d, r3mp %else imul val3d, cf3d %endif add val3d, val0d sar val3d, shiftb movsx val0d, byte [bufq+xq] add val3d, val0d cmp val3d, maxd cmovns val3d, maxd cmp val3d, mind cmovs val3d, mind mov byte [bufq+xq], val3b ; keep val3d in-place as left for next x iteration inc xq jz .x_loop_ar1_end test xq, 3 jnz .x_loop_ar1_inner jmp .x_loop_ar1 .x_loop_ar1_end: add bufq, 82 %if ARCH_X86_32 add r1mp, 82<<%3 dec r0mp %else add bufyq, 82<<%3 dec hd %endif jg .y_loop_ar1 RET .ar2: %if ARCH_X86_32 %assign stack_offset stack_offset_old %assign stack_size_padded 0 %xdefine rstk rsp ALLOC_STACK -8*16 %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 movd m7, [base+round_vals-12+shiftq*2] movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-12 pxor m2, m2 pcmpgtb m2, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 pinsrw m1, [base+pw_1], 5 punpcklwd m7, m7 pshufd m7, m7, q0000 DEFINE_ARGS buf, bufy, fg_data, h, unused, x pshufd m4, m1, q0000 pshufd m5, m1, q1111 pshufd m6, m1, q2222 pshufd m3, m0, q3333 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 SCRATCH 0, 8, 0 SCRATCH 1, 9, 1 SCRATCH 2, 10, 2 SCRATCH 3, 11, 3 SCRATCH 4, 12, 4 SCRATCH 5, 13, 5 SCRATCH 6, 14, 6 SCRATCH 7, 15, 7 %if %2 movd m7, [base+hmul_bits+2+%3*2] movd m6, [base+pb_1] punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar2: mov xq, -(76>>%2) .x_loop_ar2: pxor m2, m2 movq m0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] movhps m0, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5] pcmpgtb m2, m0 punpckhbw m1, m0, m2 punpcklbw m0, m2 psrldq m5, m0, 2 ; y=-2,x=[-1,+5] psrldq m3, m1, 2 ; y=-1,x=[-1,+5] psrldq m4, m1, 4 ; y=-1,x=[+0,+5] punpcklwd m2, m0, m5 punpcklwd m3, m4 pmaddwd m2, m8 pmaddwd m3, m11 paddd m2, m3 psrldq m4, m0, 4 ; y=-2,x=[+0,+5] psrldq m5, m0, 6 ; y=-2,x=[+1,+5] psrldq m0, 8 ; y=-2,x=[+2,+5] punpcklwd m4, m5 punpcklwd m0, m1 psrldq m3, m1, 6 ; y=-1,x=[+1,+5] psrldq m1, m1, 8 ; y=-1,x=[+2,+5] punpcklwd m3, m1 pmaddwd m4, m9 pmaddwd m0, m10 pmaddwd m3, m12 paddd m4, m0 
paddd m2, m3 paddd m2, m4 %if %2 movq m1, [bufyq+xq*2] %if %3 movq m3, [bufyq+xq*2+82] %endif pmaddubsw m0, m6, m1 %if %3 pmaddubsw m1, m6, m3 paddw m0, m1 %endif pmulhrsw m0, m7 %else movd m0, [bufyq+xq] pxor m1, m1 pcmpgtb m1, m0 punpcklbw m0, m1 %endif punpcklwd m0, m15 pmaddwd m0, m14 paddd m2, m0 movq m0, [bufq+xq-2] ; y=0,x=[-2,+5] pxor m4, m4 movd m5, [base+byte_blend+1] punpcklbw m5, m5 .x_loop_ar2_inner: pcmpgtb m1, m4, m0 punpcklbw m0, m1 pmaddwd m3, m0, m13 paddd m3, m2 psrldq m2, 4 ; shift top to next pixel psrad m3, [fg_dataq+FGData.ar_coeff_shift] pslldq m3, 4 pand m3, m5 paddw m0, m3 packsswb m0, m0 movd [bufq+xq-2], m0 psrldq m0, 1 inc xq jz .x_loop_ar2_end test xq, 3 jnz .x_loop_ar2_inner jmp .x_loop_ar2 .x_loop_ar2_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar2 RET .ar3: %if ARCH_X86_32 %assign stack_offset stack_offset_old %assign stack_size_padded 0 %xdefine rstk rsp %endif DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift movifnidn bufyq, bufymp %if ARCH_X86_32 ALLOC_STACK -15*16 %else SUB rsp, 16*7 %assign stack_size_padded (stack_size_padded+16*7) %assign stack_size (stack_size+16*7) %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] imul uvd, 28 movu m0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-15 pxor m3, m3 pcmpgtb m3, m0 punpckhbw m1, m0, m3 punpcklbw m0, m3 pshufd m2, m0, q1111 pshufd m3, m0, q2222 pshufd m4, m0, q3333 pshufd m0, m0, q0000 pshufd m5, m1, q1111 pshufd m6, m1, q2222 pshufd m7, m1, q3333 pshufd m1, m1, q0000 mova [rsp+ 0*16], m0 mova [rsp+ 1*16], m2 mova [rsp+ 2*16], m3 mova [rsp+ 3*16], m4 mova [rsp+ 4*16], m1 mova [rsp+ 5*16], m5 mova [rsp+ 6*16], m6 SCRATCH 7, 8, 7 movu m2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-24 [24=luma] pxor m4, m4 pcmpgtb m4, m2 punpckhbw m5, m2, m4 punpcklbw m2, m4 pshufd m4, m2, q3232 punpcklwd m3, m4, m5 pshuflw m5, m4, q3321 pshufd m4, m3, q0000 pshufd m3, m2, q1111 pshufd m2, m2, q0000 pinsrw m5, [base+round_vals+shiftq*2-10], 3 SCRATCH 2, 9, 8 SCRATCH 3, 10, 9 SCRATCH 4, 11, 10 SCRATCH 5, 12, 11 movd m2, [base+round_vals-12+shiftq*2] %if %2 movd m1, [base+pb_1] movd m3, [base+hmul_bits+2+%3*2] %endif pxor m0, m0 punpcklwd m2, m0 %if %2 punpcklwd m3, m3 %endif pshufd m2, m2, q0000 %if %2 pshufd m1, m1, q0000 pshufd m3, m3, q0000 SCRATCH 1, 13, 12 %endif SCRATCH 2, 14, 13 %if %2 SCRATCH 3, 15, 14 %endif DEFINE_ARGS buf, bufy, fg_data, h, unused, x %if %2 sub bufq, 82*(73-35*%3)+44-(82*3+41) %else sub bufq, 82*69+3 %endif add bufyq, 79+82*3 mov hd, 70-35*%3 .y_loop_ar3: mov xq, -(76>>%2) .x_loop_ar3: movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] pxor m4, m4 pcmpgtb m4, m0 punpckhbw m3, m0, m4 punpcklbw m0, m4 psrldq m5, m0, 2 psrldq m6, m0, 4 psrldq m7, m0, 6 punpcklwd m4, m0, m5 punpcklwd m6, m7 pmaddwd m4, [rsp+ 0*16] pmaddwd m6, [rsp+ 1*16] paddd m4, m6 palignr m2, m3, m0, 10 palignr m3, m0, 12 psrldq m0, 8 movu m1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12] pxor m6, m6 pcmpgtb m6, m1 punpckhbw m5, m1, m6 punpcklbw m1, m6 punpcklwd m0, m2 punpcklwd m3, m1 pmaddwd m0, [rsp+ 2*16] pmaddwd m3, [rsp+ 3*16] paddd m0, m3 paddd m0, m4 movu m2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12] pxor m7, m7 pcmpgtb m7, m2 punpckhbw m6, m2, m7 punpcklbw m2, m7 palignr m3, m5, m1, 10 palignr m5, m1, 12 psrldq m4, m2, 2 punpcklwd m3, m5 punpcklwd m5, m2, m4 pmaddwd m3, [rsp+ 6*16] pmaddwd m5, m8 paddd m3, m5 paddd m0, m3 psrldq m3, m1, 2 psrldq m4, m1, 4 psrldq m5, m1, 6 psrldq m1, 8 punpcklwd m3, m4 punpcklwd m5, m1 pmaddwd m3, [rsp+ 4*16] pmaddwd m5, [rsp+ 5*16] paddd m3, m5 paddd m0, m3 %if %2 movq m1, [bufyq+xq*2] %if %3 movq m3, 
[bufyq+xq*2+82] %endif pmaddubsw m7, m13, m1 %if %3 pmaddubsw m5, m13, m3 paddw m7, m5 %endif pmulhrsw m7, m15 %else movd m7, [bufyq+xq] pxor m1, m1 pcmpgtb m1, m7 punpcklbw m7, m1 %endif psrldq m1, m2, 4 psrldq m3, m2, 6 palignr m4, m6, m2, 10 palignr m6, m2, 12 psrldq m2, 8 punpcklwd m1, m3 punpcklwd m2, m4 punpcklwd m6, m7 pmaddwd m1, m9 pmaddwd m2, m10 pmaddwd m6, m11 paddd m1, m2 paddd m0, m6 paddd m0, m1 paddd m0, m14 movq m1, [bufq+xq-3] ; y=0,x=[-3,+4] pxor m4, m4 movd m5, [base+byte_blend] .x_loop_ar3_inner: pcmpgtb m2, m4, m1 punpcklbw m3, m1, m2 pmaddwd m2, m3, m12 pshufd m3, m2, q1111 paddd m2, m3 ; left+cur paddd m2, m0 ; add top psrldq m0, 4 psrad m2, [fg_dataq+FGData.ar_coeff_shift] ; don't packssdw, we only care about one value packsswb m2, m2 pandn m3, m5, m1 pslld m2, 24 pand m2, m5 por m1, m2, m3 movd [bufq+xq-3], m1 psrldq m1, 1 inc xq jz .x_loop_ar3_end test xq, 3 jnz .x_loop_ar3_inner jmp .x_loop_ar3 .x_loop_ar3_end: add bufq, 82 add bufyq, 82<<%3 dec hd jg .y_loop_ar3 RET %endmacro generate_grain_uv_fn 420, 1, 1 generate_grain_uv_fn 422, 1, 0 generate_grain_uv_fn 444, 0, 0 %macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg %assign %%idx 0 %define %%tmp %2 %if %0 == 6 %define %%tmp %6 %endif %rep 4 %if %%idx == 0 movd %5 %+ d, %2 pshuflw %%tmp, %2, q3232 %else movd %5 %+ d, %%tmp %if %%idx == 2 punpckhqdq %%tmp, %%tmp %elif %%idx == 4 psrlq %%tmp, 32 %endif %endif movzx %4 %+ d, %5 %+ w shr %5 %+ d, 16 %if %%idx == 0 movd %1, [%3+%4] %else pinsrw %1, [%3+%4], %%idx + 0 %endif pinsrw %1, [%3+%5], %%idx + 1 %assign %%idx %%idx+2 %endrep %endmacro INIT_XMM ssse3 ; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby) %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \ dst, src, scaling, unused1, fg_data, picptr, unused2 ; copy stack arguments to new position post-alignment, so that we ; don't have to keep the old stack location in a separate register mov r0, r0m mov r1, r2m mov r2, r4m mov r3, r6m mov r4, r7m mov r5, r8m mov [rsp+5*mmsize+ 4*gprsize], r0 mov [rsp+5*mmsize+ 6*gprsize], r1 mov [rsp+5*mmsize+ 8*gprsize], r2 mov [rsp+5*mmsize+10*gprsize], r3 mov [rsp+5*mmsize+11*gprsize], r4 mov [rsp+5*mmsize+12*gprsize], r5 %else cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \ dst, src, scaling, unused1, fg_data, picptr, unused2 %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize %define r0m [rsp+5*mmsize+ 4*gprsize] %define r1m [rsp+5*mmsize+ 5*gprsize] %define r2m [rsp+5*mmsize+ 6*gprsize] %define r3m [rsp+5*mmsize+ 7*gprsize] %define r4m [rsp+5*mmsize+ 8*gprsize] %define r5m [rsp+5*mmsize+ 9*gprsize] %define r6m [rsp+5*mmsize+10*gprsize] %define r7m [rsp+5*mmsize+11*gprsize] %define r8m [rsp+5*mmsize+12*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, picptrq %else cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut lea r7, [pb_mask] %define base r7-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] movd m4, [base+max+r6*4] movd m5, [base+min+r6*2] punpcklwd m3, m3 punpcklwd m4, m4 punpcklwd m5, m5 pshufd m3, m3, q0000 pshufd m4, m4, q0000 pshufd m5, m5, q0000 SCRATCH 3, 11, 0 SCRATCH 4, 12, 1 SCRATCH 5, 13, 2 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %endif mov 
sbyd, r8m mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 test overlapd, overlapd jz .no_vertical_overlap mova m6, [base+pw_1024] mova m7, [base+pb_27_17_17_27] SCRATCH 6, 14, 3 SCRATCH 7, 15, 4 test sbyd, sbyd jnz .vertical_overlap ; fall-through .no_vertical_overlap: mov r8m, overlapd %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, unused imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ unused1, unused2, see, unused3 %endif lea src_bakq, [srcq+wq] neg wq sub dstmp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r4m, wq DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 %endif .loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, unused mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, ; r6m=grain_lut, r7m=h, r8m=overlap_v|h DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, unused %endif .loop_x_odd: mov hd, r7m mov grain_lutq, grain_lutmp .loop_y: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else vpgatherdw m4, m0, scalingq-1, r12, r13, m3 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m4 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 add srcq, r2mp add grain_lutq, 82 dec hd jg .loop_y %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp %else lea srcq, [src_bakq+wq] %endif btc dword r8m, 2 jc .next_blk add offxyd, 16 test dword r8m, 2 ; r8m & 2 = have_top_overlap jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif jnz .loop_x_odd_v_overlap .next_blk: test dword r8m, 1 jz .loop_x test dword r8m, 2 jnz .loop_x_hv_overlap ; horizontal overlap (without vertical overlap) .loop_x_h_overlap: %if ARCH_X86_32 ; r0m=dst, r1m=src_bak, r2m=stride, r3m=see, r4m=w, r5m=picptr, ; r6m=grain_lut, r7m=h, r8m=overlap_v|h DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3 add offxyd, 16 ; left_offxyd mov [rsp+5*mmsize+0*gprsize], offxyd DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3 mov seed, r3m %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy lea left_offxyd, [offyd+16] ; previous 
column's offy*stride+offx %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164 lea offyq, [offyq+offxq*2+747] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy %endif mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_h_overlap: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else vpgatherdw m4, m0, scalingq-1, r12, r13, m3 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+5*mmsize+0*gprsize] movd m7, [grain_lutq+r5] %else movd m7, [grain_lutq+left_offxyq] %endif punpcklbw m7, m3 pmaddubsw m6, m15, m7 pmulhrsw m6, m14 packsswb m6, m6 shufps m6, m3, q3210 pcmpgtb m2, m6 punpcklbw m7, m6, m2 punpckhbw m6, m2 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m7, m4 pmullw m6, m5 pmulhrsw m7, m11 pmulhrsw m6, m11 ; dst = clip_pixel(src, noise) paddw m0, m7 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 add srcq, r2mp add grain_lutq, 82 dec hd jg .loop_y_h_overlap %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end %if ARCH_X86_32 mov srcq, r1m add srcq, r4m %else lea srcq, [src_bakq+wq] %endif xor dword r8m, 4 add offxyd, 16 ; since this half-block had left-overlap, the next does not test dword r8m, 2 ; have_top_overlap jz .loop_x_odd %if ARCH_X86_32 add dword [rsp+5*mmsize+1*gprsize], 16 %else add r11d, 16 ; top_offxyd %endif jmp .loop_x_odd_v_overlap .end: RET .vertical_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap %endif or overlapd, 2 ; top_overlap: overlap & 2 mov r8m, overlapd movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul tmpd, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add tmpd, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and tmpd, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, tmpd %if ARCH_X86_32 xor sbyd, seed ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ tmp, unused2, see, unused3 %endif lea src_bakq, [srcq+wq] neg wq sub dstmp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %endif .loop_x_v_overlap: %if ARCH_X86_32 mov seed, r3m %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed, ; because of the 'and tmpd, 0x00ff00ff' above mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated 
(cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, unused, top_offxy mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, unused, top_offxy %endif movzx top_offxyd, offxyw %if ARCH_X86_32 mov [rsp+5*mmsize+1*gprsize], top_offxyd DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif shr offxyd, 16 .loop_x_odd_v_overlap: %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] mov [rsp+5*mmsize+12], r5 %else mova m8, [pb_27_17] %endif mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_v_overlap: ; src mova m0, [srcq] pxor m2, m2 punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m4, m0, scalingq-1, r0, r5, m3 vpgatherdw m5, m1, scalingq-1, r0, r5, m3 %else vpgatherdw m4, m0, scalingq-1, r12, r13, m3 vpgatherdw m5, m1, scalingq-1, r12, r13, m3 %endif REPX {psrlw x, 8}, m4, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+5*mmsize+1*gprsize] movu m7, [grain_lutq+r5] %else movu m7, [grain_lutq+top_offxyq] %endif punpckhbw m6, m7, m3 punpcklbw m7, m3 %if ARCH_X86_32 mov r5, [rsp+5*mmsize+12] pmaddubsw m3, [r5], m6 pmaddubsw m6, [r5], m7 %else pmaddubsw m3, m8, m6 pmaddubsw m6, m8, m7 %endif pmulhrsw m3, m14 pmulhrsw m6, m14 packsswb m6, m3 pcmpgtb m7, m2, m6 punpcklbw m2, m6, m7 punpckhbw m6, m7 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m4 pmullw m6, m5 pmulhrsw m2, m11 pmulhrsw m6, m11 ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m6 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif add srcq, r2mp add grain_lutq, 82 dec hw jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines btc hd, 16 jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp %else lea srcq, [src_bakq+wq] %endif btc dword r8m, 2 jc .loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif jmp .loop_x_odd_v_overlap .loop_x_hv_overlap: %if ARCH_X86_32 mov r5, r5m lea r5, [base+pb_27_17] mov [rsp+5*mmsize+12], r5 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak mov r5, [rsp+5*mmsize+1*gprsize] mov r4, offxyd add r5, 16 add r4, 16 mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak xor tmpd, tmpd mov seed, r3m %else mova m8, [pb_27_17] DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ tmp, unused2, see, unused3 ; we assume from the block above that bits 8-15 of tmpd are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated 
(cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, unused1, unused2, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offyq+16] mov offyd, seed mov offxd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*2+0x10001*747+32*82] %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut movzx r5, offxyw ; top_offxy mov [rsp+5*mmsize+1*gprsize], r5 %else DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy movzx top_offxyd, offxyw %endif shr offxyd, 16 mov hd, r7m mov grain_lutq, grain_lutmp .loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy movu m6, [grain_lutq+r5] mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy movd m4, [grain_lutq+r0] movd m7, [grain_lutq+r5] %else movu m6, [grain_lutq+top_offxyq] movd m4, [grain_lutq+left_offxyq] movd m7, [grain_lutq+topleft_offxyq] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m4, m3 punpcklbw m7, m6 pmaddubsw m2, m15, m4 pmaddubsw m4, m15, m7 pmulhrsw m2, m14 pmulhrsw m4, m14 packsswb m2, m2 packsswb m4, m4 shufps m2, m3, q3210 shufps m4, m6, q3210 ; followed by v interpolation (top | cur -> cur) punpcklbw m3, m4, m2 punpckhbw m4, m2 %if ARCH_X86_32 mov r5, [rsp+5*mmsize+12] pmaddubsw m7, [r5], m4 pmaddubsw m4, [r5], m3 %else pmaddubsw m7, m8, m4 pmaddubsw m4, m8, m3 %endif pmulhrsw m7, m14 pmulhrsw m4, m14 packsswb m4, m7 pxor m2, m2 pcmpgtb m7, m2, m4 punpcklbw m3, m4, m7 punpckhbw m4, m7 ; src mova m0, [srcq] punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; scaling[src] %if ARCH_X86_32 vpgatherdw m5, m0, scalingq-1, r0, r5, m7 vpgatherdw m6, m1, scalingq-1, r0, r5, m7 %else vpgatherdw m5, m0, scalingq-1, r13, r14, m7 vpgatherdw m6, m1, scalingq-1, r13, r14, m7 %endif REPX {psrlw x, 8}, m5, m6 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m3, m5 pmullw m4, m6 pmulhrsw m3, m11 pmulhrsw m4, m11 ; dst = clip_pixel(src, noise) paddw m0, m3 paddw m1, m4 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add dword [rsp+5*mmsize+12], mmsize %else mova m8, [pb_17_27] %endif add srcq, r2mp add grain_lutq, 82 dec hw jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines btc hd, 16 jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: %if ARCH_X86_32 add r4mp, 16 %else add wq, 16 %endif jge .end_hv %if ARCH_X86_32 mov srcq, r1m add srcq, r4m %else lea srcq, [src_bakq+wq] %endif xor dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 add dword [rsp+5*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif jmp .loop_x_odd_v_overlap .end_hv: RET %macro FGUV_FN 3 ; name, ss_hor, ss_ver INIT_XMM ssse3 %if ARCH_X86_32 ; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, ; sby, luma, lstride, uv_pl, is_id) %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov 
r0, r0m mov r1, r2m mov r2, r4m mov r3, r6m mov r4, r7m mov [rsp+7*mmsize+3*gprsize], r0 mov [rsp+7*mmsize+5*gprsize], r1 mov [rsp+7*mmsize+7*gprsize], r2 mov [rsp+7*mmsize+9*gprsize], r3 mov [rsp+7*mmsize+10*gprsize], r4 mov r0, r8m mov r1, r9m mov r2, r10m mov r4, r11m mov r3, r12m mov [rsp+7*mmsize+11*gprsize], r0 mov [rsp+7*mmsize+12*gprsize], r1 mov [rsp+7*mmsize+13*gprsize], r2 mov [rsp+7*mmsize+14*gprsize], r4 %else cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused %endif mov srcq, srcm mov fg_dataq, r3m mov scalingq, r5m %if STACK_ALIGNMENT < mmsize %define r0m [rsp+7*mmsize+ 3*gprsize] %define r1m [rsp+7*mmsize+ 4*gprsize] %define r2m [rsp+7*mmsize+ 5*gprsize] %define r3m [rsp+7*mmsize+ 6*gprsize] %define r4m [rsp+7*mmsize+ 7*gprsize] %define r5m [rsp+7*mmsize+ 8*gprsize] %define r6m [rsp+7*mmsize+ 9*gprsize] %define r7m [rsp+7*mmsize+10*gprsize] %define r8m [rsp+7*mmsize+11*gprsize] %define r9m [rsp+7*mmsize+12*gprsize] %define r10m [rsp+7*mmsize+13*gprsize] %define r11m [rsp+7*mmsize+14*gprsize] %define r12m [rsp+7*mmsize+15*gprsize] %endif LEA r5, pb_mask %define base r5-pb_mask mov r5m, r5 %else cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ grain_lut, tmp, sby, luma, lstride, uv_pl, is_id lea r8, [pb_mask] %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] lea tmpd, [r6d*2] %if ARCH_X86_32 && STACK_ALIGNMENT < mmsize test r3, r3 %else cmp dword r12m, 0 ; is_idm %endif movd m5, [base+min+r6*2] cmovne r6d, tmpd movd m4, [base+max+r6*2] punpcklwd m3, m3 punpcklwd m5, m5 punpcklwd m4, m4 pshufd m3, m3, q0000 pshufd m5, m5, q0000 pshufd m4, m4, q0000 SCRATCH 3, 11, 0 SCRATCH 4, 12, 1 SCRATCH 5, 13, 2 cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl %macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %endif %if %1 mov r6d, dword r11m movd m0, [fg_dataq+FGData.uv_mult+r6*4] movd m1, [fg_dataq+FGData.uv_luma_mult+r6*4] punpcklbw m6, m1, m0 movd m7, [fg_dataq+FGData.uv_offset+r6*4] punpcklwd m6, m6 punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 SCRATCH 6, 14, 3 SCRATCH 7, 15, 4 %endif mov sbyd, r8m mov overlapd, [fg_dataq+FGData.overlap_flag] ; left_overlap: overlap & 1 test overlapd, overlapd jz %%no_vertical_overlap %if ARCH_X86_32 %if %2 mova m1, [base+pb_23_22_h] %else mova m1, [base+pb_27_17_17_27] %endif mova m0, [base+pw_1024] %else %if %2 mova m1, [pb_23_22_h] %else mova m1, [pb_27_17_17_27] %endif mova m0, [pw_1024] %endif SCRATCH 0, 8, 5 SCRATCH 1, 9, 6 test sbyd, sbyd jnz %%vertical_overlap ; fall-through %%no_vertical_overlap: mov r8m, overlapd %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, see, fg_data, picptr, overlap imul seed, (173 << 24) | 37 %else imul seed, sbyd, (173 << 24) | 37 %endif add seed, (105 << 24) | 178 rol seed, 8 movzx seed, seew xor seed, [fg_dataq+FGData.seed] %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak %define luma_bakq lumaq mov wq, r4m %if %3 shl r10mp, 1 %endif %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak mov lstrideq, r10mp %endif mov lumaq, r9mp lea src_bakq, [srcq+wq] lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub 
r0mp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r11m, luma_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq mov r12mp, strideq %endif %%loop_x: %if ARCH_X86_32 mov seed, r3m %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, unused1, unused2, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak %endif %%loop_x_odd: mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: ; src %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq+ 0] pcmpgtb m6, m2, m3 punpcklbw m2, m3, m6 punpckhbw m3, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; we already incremented lumaq above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 dec hw jg %%loop_y %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 == 0 ; adjust top_offxy %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 16 %endif add offxyd, 16 btc dword r8m, 2 jc %%loop_x_even test dword r8m, 2 jz %%loop_x_odd jmp %%loop_x_odd_v_overlap %%loop_x_even: %endif test dword r8m, 1 jz %%loop_x ; r8m = sbym test dword r8m, 2 jne %%loop_x_hv_overlap ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: %if ARCH_X86_32 %if %2 lea r6, [offxyd+16] mov [rsp+7*mmsize+0*gprsize], r6 %else 
mov [rsp+7*mmsize+0*gprsize], offxyd %endif DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut mov seed, r3m %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride %if %2 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx %else mov left_offxyd, offyd %endif %endif mov r6d, seed or seed, 0xEFF4 shr r6d, 1 test seeb, seeh lea seed, [r6+0x8000] cmovp seed, r6d ; updated seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS luma, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride mov offyd, seed mov offxd, seed %endif ror offyd, 8 shr offxd, 12 and offyd, 0xf imul offyd, 164>>%3 lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, lstride, luma_bak %endif mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y_h_overlap: ; src %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] movu m4, [grain_lutq+offxyq+ 0] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+0*gprsize] movd m2, [grain_lutq+r0+ 0] %else movd m2, [grain_lutq+left_offxyq+ 0] %endif punpcklbw m2, m4 pmaddubsw m3, m9, m2 pmulhrsw m3, m8 packsswb m3, m3 shufps m3, m4, q3210 pxor m4, m4 pcmpgtb m4, m3 punpcklbw m2, m3, m4 punpckhbw m3, m4 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 pmaxsw m1, m13 pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; lumaq has already been incremented above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 dec hw jg %%loop_y_h_overlap %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 == 0 xor dword r8m, 4 ; adjust top_offxyd %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add r11d, 
16 %endif add offxyd, 16 %endif ; r8m = sbym test dword r8m, 2 %if %2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap %else jne %%loop_x_odd_v_overlap jmp %%loop_x_odd %endif %%end: RET %%vertical_overlap: %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap %else DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, tmp, sby, see, overlap %endif or overlapd, 2 ; top_overlap: overlap & 2 mov r8m, overlapd movzx sbyd, sbyb %if ARCH_X86_32 imul r4, [fg_dataq+FGData.seed], 0x00010001 DEFINE_ARGS tmp, src, scaling, sby, see, picptr, unused %else imul seed, [fg_dataq+FGData.seed], 0x00010001 %endif imul tmpd, sbyd, 173 * 0x00010001 imul sbyd, 37 * 0x01000100 add tmpd, (105 << 16) | 188 add sbyd, (178 << 24) | (141 << 8) and tmpd, 0x00ff00ff and sbyd, 0xff00ff00 xor seed, tmpd %if ARCH_X86_32 xor sbyd, seed ; (cur_seed << 16) | top_seed DEFINE_ARGS luma, src, scaling, see, w, picptr, src_bak mov r3m, seed mov wq, r4m %if %3 shl r10mp, 1 %endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ tmp, unused2, see, overlap, unused3, src_bak, lstride, luma_bak mov lstrideq, r10mp %endif mov lumaq, r9mp lea src_bakq, [srcq+wq] lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub r0mp, srcq %if ARCH_X86_32 mov r1m, src_bakq mov r11m, luma_bakq mov r4m, wq DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq mov r12mp, strideq %endif %%loop_x_v_overlap: %if ARCH_X86_32 mov seed, r3m xor tmpd, tmpd %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS dst, src, scaling, offy, h, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, overlap, top_offxy, unused, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy, unused, lstride, luma_bak %endif movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 mov [rsp+7*mmsize+1*gprsize], top_offxyd DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %endif %%loop_x_odd_v_overlap: mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m %endif %if %3 mova m1, [PIC_ptr(pb_23_22)] %else mova m1, [PIC_ptr(pb_27_17)] %endif %%loop_y_v_overlap: %if ARCH_X86_32 mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip 
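; informal sketch of the %if %1 (chroma-scaling-from-luma disabled, per the
; macro's "not-csfl" parameter) step just above, assuming m14 holds the
; interleaved {uv_luma_mult, uv_mult} bytes and m15 the uv_offset words set up
; at the top of the macro:
;   merged = clip_to_u8(((luma * uv_luma_mult + chroma * uv_mult) >> 6) + uv_offset)
; it is this merged value, not the raw chroma sample, that indexes the scaling
; LUT in the vpgatherdw gathers that follow.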
punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[luma_src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else vpgatherdw m7, m4, scalingq-1, r12, r2 vpgatherdw m5, m6, scalingq-1, r12, r2 %endif REPX {psrlw x, 8}, m7, m5 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+1*gprsize] movu m4, [grain_lutq+r0] %else movu m4, [grain_lutq+top_offxyq] %endif punpckhbw m6, m4, m3 punpcklbw m4, m3 pmaddubsw m2, m1, m6 pmaddubsw m3, m1, m4 pmulhrsw m2, m8 pmulhrsw m3, m8 packsswb m3, m2 pxor m6, m6 pcmpgtb m6, m3 punpcklbw m2, m3, m6 punpckhbw m3, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 pmullw m3, m5 pmulhrsw m2, m11 pmulhrsw m3, m11 ; unpack chroma_source pxor m4, m4 punpckhbw m6, m0, m4 punpcklbw m0, m4 ; m0-1: src as word %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m6, m3 pmaxsw m0, m13 pmaxsw m6, m13 pminsw m0, m12 pminsw m6, m12 packuswb m0, m6 movifnidn dstq, dstmp mova [dstq+srcq], m0 dec hw je %%end_y_v_overlap %if ARCH_X86_32 add srcq, r2mp ; lumaq has already been incremented above %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*2] %else add lumaq, lstrideq %endif %endif add grain_lutq, 82 %if %3 == 0 btc hd, 16 %if ARCH_X86_32 mov r5, r5m %endif mova m1, [PIC_ptr(pb_17_27)] jnc %%loop_y_v_overlap %endif jmp %%loop_y %%end_y_v_overlap: %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap %else %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif add offxyd, 16 btc dword r8m, 2 jnc %%loop_x_odd_v_overlap %endif %%loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused mov r6, [rsp+7*mmsize+1*gprsize] %if %2 lea r0, [r3d+16] add r6, 16 mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy %else mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy %endif mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused mov seed, r3m xor tmpd, tmpd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride %if %2 lea topleft_offxyq, [top_offxyq+16] lea left_offxyq, [offxyq+16] %else mov topleft_offxyq, top_offxyq mov left_offxyq, offxyq %endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed %endif mov r6d, seed or seed, 0xeff4eff4 test seeb, seeh setp tmpb ; parity of top_seed shr seed, 16 shl tmpd, 16 test seeb, seeh setp tmpb ; parity of cur_seed or r6d, 0x00010001 xor tmpd, r6d mov seed, tmpd ror seed, 1 ; updated (cur_seed << 16) | top_seed %if ARCH_X86_32 mov r3m, seed DEFINE_ARGS tmp, src, scaling, offy, w, picptr, offx mov offxd, offyd %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride mov offxd, seed mov offyd, seed %endif ror offyd, 8 ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy lea 
offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride, luma_bak %endif movzx top_offxyd, offxyw shr offxyd, 16 %if ARCH_X86_32 mov [rsp+7*mmsize+1*gprsize], top_offxyd %endif mov hd, r7m mov grain_lutq, grain_lutmp %if ARCH_X86_32 mov r5, r5m %endif %if %3 mova m3, [PIC_ptr(pb_23_22)] %else mova m3, [PIC_ptr(pb_27_17)] %endif %%loop_y_hv_overlap: ; grain = grain_lut[offy+y][offx+x] %if ARCH_X86_32 mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy movd m1, [grain_lutq+r0] mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy %else movd m1, [grain_lutq+topleft_offxyq] %endif movu m2, [grain_lutq+offxyq] %if ARCH_X86_32 movu m6, [grain_lutq+r5] movd m4, [grain_lutq+r0] %else movu m6, [grain_lutq+top_offxyq] movd m4, [grain_lutq+left_offxyq] %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m1, m6 punpcklbw m4, m2 pmaddubsw m0, m9, m1 pmaddubsw m1, m9, m4 REPX {pmulhrsw x, m8}, m0, m1 packsswb m0, m1 shufps m4, m0, m2, q3232 shufps m0, m6, q3210 ; followed by v interpolation (top | cur -> cur) punpcklbw m2, m0, m4 punpckhbw m0, m4 pmaddubsw m4, m3, m0 pmaddubsw m1, m3, m2 pmulhrsw m4, m8 pmulhrsw m1, m8 packsswb m1, m4 ; src %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov lumaq, r9mp %endif %if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq mov r5, r5m movd m7, [base+pb_1] %else movd m7, [pb_1] %endif pshufd m7, m7, q0000 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 %else mova m4, [lumaq] mova m0, [srcq] %if ARCH_X86_32 add lumaq, r10mp mov r9mp, lumaq %endif pxor m2, m2 %endif %if %1 %if %2 packuswb m4, m6 ; luma %endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 pmaddubsw m4, m14 psraw m6, 6 psraw m4, 6 paddw m6, m15 paddw m4, m15 packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 %elif %2 == 0 punpckhbw m6, m4, m2 punpcklbw m4, m2 %endif ; scaling[src] %if ARCH_X86_32 vpgatherdw m7, m4, scalingq-1, r0, r5 vpgatherdw m5, m6, scalingq-1, r0, r5 %else %if %3 vpgatherdw m7, m4, scalingq-1, r2, r12 vpgatherdw m5, m6, scalingq-1, r2, r12 %else vpgatherdw m7, m4, scalingq-1, r2, r13 vpgatherdw m5, m6, scalingq-1, r2, r13 %endif %endif REPX {psrlw x, 8}, m7, m5 ; unpack grain pxor m4, m4 pcmpgtb m4, m1 punpcklbw m2, m1, m4 punpckhbw m1, m4 ; noise = round2(scaling[src] * grain, scaling_shift) pmullw m2, m7 pmullw m1, m5 pmulhrsw m2, m11 pmulhrsw m1, m11 %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut %endif ; unpack chroma source pxor m4, m4 punpckhbw m5, m0, m4 punpcklbw m0, m4 ; m0-1: src as word ; dst = clip_pixel(src, noise) paddw m0, m2 paddw m5, m1 pmaxsw m0, m13 pmaxsw m5, m13 pminsw m0, m12 pminsw m5, m12 packuswb m0, m5 movifnidn dstq, dstmp mova [dstq+srcq], m0 %if ARCH_X86_32 add srcq, r2mp ; lumaq has been adjusted above already %else add srcq, r12mp %if %3 lea lumaq, [lumaq+lstrideq*(1+%2)] %else add lumaq, r10mp %endif %endif add grain_lutq, 82 dec hw %if %3 jg %%loop_y_h_overlap %else jle %%end_y_hv_overlap %if ARCH_X86_32 mov r5, r5m %endif mova m3, [PIC_ptr(pb_17_27)] btc hd, 16 jnc %%loop_y_hv_overlap %if ARCH_X86_64 mov lstrideq, r10mp %endif jmp %%loop_y_h_overlap 
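; rough recap of the h+v overlap row loop above, in the spirit of the file's
; own comments (the names below are informal, not identifiers from the source):
;   top   = blend_h(topleft_grain, top_grain)
;   grain = blend_h(left_grain, cur_grain)
;   grain = blend_v(top, grain)
;   noise = round2(scaling[luma_src] * grain, scaling_shift)
;   dst   = clip_pixel(chroma_src + noise)
; only the first row or two of each block (depending on chroma subsampling)
; take the vertical blend; the code then continues in %%loop_y_h_overlap for
; the remaining rows.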
%%end_y_hv_overlap: %if ARCH_X86_64 mov lstrideq, r10mp %endif %endif %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut mov wq, r4m %endif add wq, 16 jge %%end_hv %if ARCH_X86_32 mov srcq, r1mp mov lumaq, r11mp %else mov srcq, r11mp %endif lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif %if %2 jmp %%loop_x_hv_overlap %else %if ARCH_X86_32 add dword [rsp+7*mmsize+1*gprsize], 16 %else add top_offxyd, 16 %endif add offxyd, 16 xor dword r8m, 4 jmp %%loop_x_odd_v_overlap %endif %%end_hv: RET %endmacro %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: %%FGUV_32x32xN_LOOP 0, %2, %3 %endmacro FGUV_FN 420, 1, 1 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif FGUV_FN 422, 1, 0 %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 %endif FGUV_FN 444, 0, 0 rav1e-0.7.1/src/x86/ipred16_avx2.asm000064400000000000000000005406721046102023000150150ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
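; The code below implements the 16 bpc AVX2 intra prediction kernels: DC (plus
; its _left/_top/_128 variants), H, V, Paeth, the smooth / smooth_h / smooth_v
; modes, the z1/z2/z3 directional modes, the filter mode, CFL and palette
; prediction, as enumerated by the JMP_TABLE declarations further down.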
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 %macro SMOOTH_WEIGHTS 1-* const smooth_weights_1d_16bpc ; sm_weights[] << 7 %rep %0 dw %1*128 %rotate 1 %endrep const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[] %rep %0 dw %1, 256-%1 %rotate 1 %endrep %endmacro SMOOTH_WEIGHTS 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 %if ARCH_X86_64 ipred_hv_shuf: db 6, 7, 6, 7, 0, 1, 2, 3, 2, 3, 2, 3, 8, 9, 10, 11 db 4, 5, 4, 5, 4, 5, 6, 7, 0, 1, 0, 1, 12, 13, 14, 15 filter_shuf1: db 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 12, 13, -1, -1 filter_shuf2: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 filter_shuf3: db 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 8, 9, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 8*64, 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 pw_m1024: times 2 dw -1024 pw_1to16: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 pw_16to1: dw 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z2_ymul: dw 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4 z2_ymul8: dw 1, 2, 5, 6, 3, 4, 7, 8, 5, 6, 16, 16, 7, 8 pb_90: times 4 db 90 z2_y_shuf_h4: dd 3, 7, 2, 6, 1, 5, 0, 4 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_x_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_y_shuf: db 6, 7, 14, 15, 4, 5, 12, 13, 4, 5, 12, 13, 2, 3, 10, 11 z2_y_shuf_us: db 6, 7, 14, 15, 2, 3, 10, 11, 4, 5, 12, 13, 0, 1, 8, 9 z_filter_k: dw 4, 4, 5, 5, 4, 4 dw 8, 8, 6, 6, 4, 4 dw 0, 0, 0, 0, 2, 2 %define pw_2 (z_filter_k+32) %define pw_4 (z_filter_k+ 0) %define pw_16 (z2_ymul8 +20) pw_1: times 2 dw 1 pw_3: times 2 dw 3 pw_62: times 2 dw 62 pw_512: times 2 dw 512 pw_2048: times 2 dw 2048 pd_8: dd 8 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4) %define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4) JMP_TABLE ipred_dc_16bpc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_16bpc, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, avx2, h4, h8, h16, h32, 
h64 JMP_TABLE ipred_filter_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE ipred_cfl_16bpc, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_YMM avx2 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h movifnidn hd, hm add tlq, 2 movd xm4, wd pxor xm3, xm3 pavgw xm4, xm3 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm sub tlq, hq movd xm4, hd sub tlq, hq pxor xm3, xm3 pavgw xm4, xm3 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea r5, [ipred_dc_left_16bpc_avx2_table] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: paddw m0, [tlq+96] paddw m0, [tlq+64] .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm3 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 lea stride3q, [strideq*3] vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd xm4, r5d tzcnt r5d, r5d movd xm5, r5d lea r5, [ipred_dc_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw xm4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw xm0, xm0 .s4: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm3, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw xm0, xm0 .s8: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm3 punpcklwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova 
[dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 mova m1, m0 .s32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m0 mova [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-128] mova m1, [tlq- 96] paddw m0, [tlq- 64] paddw m1, [tlq- 32] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] paddw m0, [tlq+34] paddw m1, [tlq+66] paddw m0, [tlq+98] paddw m0, m1 vextracti128 xm1, m0, 1 paddw xm0, xm1 punpcklwd xm1, xm0, xm3 punpckhwd xm0, xm3 paddd xm1, xm4 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 64 je .w64_end mov r6d, 0x6667AAAB shrx r6d, r6d, hd movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w64_end: vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m shr r6d, 11 lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+34] movu m2, [tlq+66] movu m3, [tlq+98] lea r5, [ipred_dc_splat_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq %macro IPRED_H 2 ; w, store_type vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mov%2 [dstq+strideq*0], m0 mov%2 [dstq+strideq*1], m1 mov%2 [dstq+strideq*2], m2 mov%2 [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET ALIGN function_align %endmacro cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 movifnidn hd, hm lea r5, [ipred_h_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq INIT_XMM avx2 .w4: IPRED_H 4, q .w8: IPRED_H 8, a INIT_YMM avx2 .w16: IPRED_H 16, a .w32: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] vpbroadcastw m2, [tlq-6] vpbroadcastw m3, [tlq-8] sub tlq, 8 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m2 mova [dstq+strideq*2+32*1], m2 mova [dstq+stride3q +32*0], m3 mova [dstq+stride3q +32*1], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: vpbroadcastw m0, [tlq-2] vpbroadcastw m1, [tlq-4] sub tlq, 4 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*0+32*2], m0 mova [dstq+strideq*0+32*3], m0 
mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m1 mova [dstq+strideq*1+32*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m1 psubw m7, m3, m0 ; tldiff psubw m0, m%1 ; tdiff pabsw m7, m7 pabsw m0, m0 pminsw m7, m0 pcmpeqw m0, m7 pcmpgtw m7, m%3, m7 vpblendvb m0, m3, m%1, m0 vpblendvb m0, m1, m0, m7 %endmacro cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h %define base r5-ipred_paeth_16bpc_avx2_table movifnidn hd, hm lea r5, [ipred_paeth_16bpc_avx2_table] tzcnt wd, wd movsxd wq, [r5+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r5 jmp wq .w4: vpbroadcastq m2, [tlq+2] ; top movsldup m6, [base+ipred_hv_shuf] lea r3, [strideq*3] psubw m4, m2, m3 pabsw m5, m4 .w4_loop: sub tlq, 8 vpbroadcastq m1, [tlq] pshufb m1, m6 ; left PAETH 2, 4, 5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: vbroadcasti128 m2, [tlq+2] movsldup m6, [base+ipred_hv_shuf] psubw m4, m2, m3 pabsw m5, m4 .w8_loop: sub tlq, 4 vpbroadcastd m1, [tlq] pshufb m1, m6 PAETH 2, 4, 5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m2, [tlq+2] psubw m4, m2, m3 pabsw m5, m4 .w16_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq], m0 add dstq, strideq dec hd jg .w16_loop RET ALIGN function_align .w32: movu m2, [tlq+2] movu m6, [tlq+34] %if WIN64 movaps r4m, xmm8 movaps r6m, xmm9 %endif psubw m4, m2, m3 psubw m8, m6, m3 pabsw m5, m4 pabsw m9, m8 .w32_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq+32*0], m0 PAETH 6, 8, 9 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w32_loop %if WIN64 movaps xmm8, r4m movaps xmm9, r6m %endif RET ALIGN function_align .w64: WIN64_SPILL_XMM 16 movu m2, [tlq+ 2] movu m6, [tlq+34] movu m10, [tlq+66] movu m13, [tlq+98] psubw m4, m2, m3 psubw m8, m6, m3 psubw m11, m10, m3 psubw m14, m13, m3 pabsw m5, m4 pabsw m9, m8 pabsw m12, m11 pabsw m15, m14 .w64_loop: sub tlq, 2 vpbroadcastw m1, [tlq] PAETH 2, 4, 5 mova [dstq+32*0], m0 PAETH 6, 8, 9 mova [dstq+32*1], m0 PAETH 10, 11, 12 mova [dstq+32*2], m0 PAETH 13, 14, 15 mova [dstq+32*3], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_16bpc_avx2_table lea r6, [ipred_smooth_v_16bpc_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m5, [tlq+hq*2] ; bottom add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m3 pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movhps [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movq [dstq+r6 ], xm0 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop .ret: RET .w8: vbroadcasti128 m4, [tlq+2] movsldup m3, [base+ipred_hv_shuf] lea r6, [strideq*3] psubw m4, m5 .w8_loop: vpbroadcastd m0, [weightsq+hq*2+0] vpbroadcastd m1, [weightsq+hq*2+4] pshufb m0, m3 pshufb m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 vextracti128 [dstq+strideq*0], m0, 1 mova [dstq+strideq*1], xm0 vextracti128 [dstq+strideq*2], m1, 1 mova [dstq+r6 ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET 
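; note on the smooth_v arithmetic used by the width cases here (an informal
; sketch; it relies on smooth_weights_1d_16bpc storing sm_weights[] << 7, per
; that table's comment, and on pmulhrsw computing (a*b + 0x4000) >> 15):
;   d    = top - bottom                          ; psubw
;   pred = bottom + ((d * (w << 7) + 0x4000) >> 15)
;        = bottom + round(sm_weights[y] * (top - bottom) / 256)
; so each output row interpolates between the top pixels and the bottom sample.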
.w16: movu m4, [tlq+2] lea r6, [strideq*3] psubw m4, m5 .w16_loop: vpbroadcastw m0, [weightsq+hq*2+0] vpbroadcastw m1, [weightsq+hq*2+2] vpbroadcastw m2, [weightsq+hq*2+4] vpbroadcastw m3, [weightsq+hq*2+6] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r6 ], m3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [tlq+ 2] movu m6, [tlq+34] psubw m4, m5 psubw m6, m5 .w32_loop: vpbroadcastw m1, [weightsq+hq*2+0] vpbroadcastw m3, [weightsq+hq*2+2] pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [tlq+ 2] movu m4, [tlq+34] movu m6, [tlq+66] movu m7, [tlq+98] REPX {psubw x, m5}, m3, m4, m6, m7 .w64_loop: vpbroadcastw m2, [weightsq+hq*2] pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw m0, m6, m2 mova [dstq+32*1], m1 pmulhrsw m1, m7, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 %define base r6-ipred_smooth_h_16bpc_avx2_table lea r6, [ipred_smooth_h_16bpc_avx2_table] mov wd, wm movifnidn hd, hm vpbroadcastw m5, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [r6+wq*4] sub tlq, hq lea stride3q, [strideq*3] add wq, r6 jmp wq .w4: vpbroadcastq m4, [base+smooth_weights_1d_16bpc+4*2] movsldup m3, [base+ipred_hv_shuf] .w4_loop: vpbroadcastq m0, [tlq+hq-8] ; left pshufb m0, m3 psubw m0, m5 ; left - right pmulhrsw m0, m4 paddw m0, m5 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: vbroadcasti128 m4, [base+smooth_weights_1d_16bpc+8*2] movsldup m3, [base+ipred_hv_shuf] .w8_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastd m1, [tlq+hq-8] pshufb m0, m3 pshufb m1, m3 psubw m0, m5 psubw m1, m5 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w8_loop RET .w16: movu m4, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: vpbroadcastq m3, [tlq+hq-8] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w16_loop RET .w32: WIN64_SPILL_XMM 7 movu m4, [base+smooth_weights_1d_16bpc+32*2] movu m6, [base+smooth_weights_1d_16bpc+32*3] .w32_loop: vpbroadcastw m1, [tlq+hq-2] vpbroadcastw m3, [tlq+hq-4] psubw m1, m5 psubw m3, m5 pmulhrsw m0, m4, m1 pmulhrsw m1, m6 pmulhrsw m2, m4, m3 pmulhrsw m3, m6 REPX {paddw x, m5}, m0, m1, m2, m3 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w32_loop RET .w64: WIN64_SPILL_XMM 8 movu m3, [base+smooth_weights_1d_16bpc+32*4] movu m4, [base+smooth_weights_1d_16bpc+32*5] movu m6, 
[base+smooth_weights_1d_16bpc+32*6] movu m7, [base+smooth_weights_1d_16bpc+32*7] .w64_loop: vpbroadcastw m2, [tlq+hq-2] psubw m2, m5 pmulhrsw m0, m3, m2 pmulhrsw m1, m4, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*0], m0 pmulhrsw m0, m6, m2 mova [dstq+32*1], m1 pmulhrsw m1, m7, m2 paddw m0, m5 paddw m1, m5 mova [dstq+32*2], m0 mova [dstq+32*3], m1 add dstq, strideq sub hq, 1*2 jg .w64_loop RET %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] pmaddwd m0, m%1, m%3 pmaddwd m1, m%2, m%4 paddd m0, m%5 paddd m1, m%6 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m5 %endmacro cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_16bpc_avx2_table lea r6, [ipred_smooth_16bpc_avx2_table] mov wd, wm vpbroadcastw m4, [tlq+wq*2] ; right tzcnt wd, wd mov hd, hm sub tlq, hq sub tlq, hq movsxd wq, [r6+wq*4] pxor m5, m5 add wq, r6 lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] jmp wq .w4: WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom vpbroadcastq m6, [tlq+hq*2+2] movsldup m7, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+4*4] punpcklwd m6, m0 ; top, bottom punpcklqdq m8, m9, m9 punpckhqdq m9, m9 lea r3, [strideq*3] .w4_loop: vpbroadcastq m3, [tlq+hq*2-8] vbroadcasti128 m1, [v_weightsq] pshufb m3, m7 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m10 pshufb m0, m1, m8 pshufb m1, m9 SMOOTH_2D_END 0, 1, 6, 6, 2, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 16 sub hd, 4 jg .w4_loop RET .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vpbroadcastw m0, [tlq] ; bottom vbroadcasti128 m7, [tlq+hq*2+2] movsldup m8, [base+ipred_hv_shuf] movshdup m9, [base+ipred_hv_shuf] vbroadcasti128 m10, [base+smooth_weights_2d_16bpc+8*4+16*0] vbroadcasti128 m11, [base+smooth_weights_2d_16bpc+8*4+16*1] punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 .w8_loop: vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastq m1, [v_weightsq] pshufb m3, m8 punpcklwd m2, m3, m4 ; left, right punpckhwd m3, m4 pmaddwd m2, m10 pmaddwd m3, m11 pshufb m1, m9 SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hd, 2 jg .w8_loop RET .w16: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 11 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+2] mova xm8, [base+smooth_weights_2d_16bpc+16*4+16*0] mova xm9, [base+smooth_weights_2d_16bpc+16*4+16*1] vinserti128 m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1 vinserti128 m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1 punpcklwd m6, m7, m0 ; top, bottom punpckhwd m7, m0 .w16_loop: vpbroadcastd m3, [tlq+hq*2-4] vpbroadcastd m1, [v_weightsq+0] punpcklwd m3, m4 ; left, right pshufd m2, m3, q1111 pmaddwd m10, m8, m2 pmaddwd m2, m9 pshufd m3, m3, q0000 SMOOTH_2D_END 1, 1, 6, 7, 10, 2 vpbroadcastd m1, [v_weightsq+4] pmaddwd m2, m8, m3 pmaddwd m3, m9 mova [dstq+strideq*0], m0 SMOOTH_2D_END 1, 1, 6, 7, 2, 3 mova [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add v_weightsq, 8 sub hq, 2 jg .w16_loop RET .w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastw m0, [tlq] ; bottom movu m7, [tlq+hq*2+ 2] movu m9, [tlq+hq*2+34] mova xm10, [base+smooth_weights_2d_16bpc+32*4+16*0] mova xm11, [base+smooth_weights_2d_16bpc+32*4+16*1] vinserti128 m10, 
[base+smooth_weights_2d_16bpc+32*4+16*2], 1 vinserti128 m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1 mova xm12, [base+smooth_weights_2d_16bpc+32*4+16*4] mova xm13, [base+smooth_weights_2d_16bpc+32*4+16*5] vinserti128 m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1 vinserti128 m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 .w32_loop: vpbroadcastw m3, [tlq+hq*2-2] vpbroadcastd m14, [v_weightsq] punpcklwd m3, m4 pmaddwd m1, m10, m3 pmaddwd m2, m11, m3 pmaddwd m0, m6, m14 paddd m0, m1 pmaddwd m1, m7, m14 paddd m1, m2 pmaddwd m2, m12, m3 pmaddwd m3, m13 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 14, 14, 8, 9, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec hd jg .w32_loop RET .w64: %assign stack_offset stack_offset - stack_size_padded PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base mov dst_baseq, dstq mov tl_baseq, tlq mov v_weights_baseq, v_weightsq xor xq, xq .w64_loop_x: mov yq, hq lea tlq, [tl_baseq+hq*2] vpbroadcastw m0, [tl_baseq] ; bottom movu m7, [tlq+xq*2+ 2] movu m9, [tlq+xq*2+34] mova xm10, [base+smooth_weights_2d_16bpc+64*4+16*0] mova xm11, [base+smooth_weights_2d_16bpc+64*4+16*1] vinserti128 m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1 vinserti128 m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1 mova xm12, [base+smooth_weights_2d_16bpc+64*4+16*4] mova xm13, [base+smooth_weights_2d_16bpc+64*4+16*5] vinserti128 m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1 vinserti128 m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1 punpcklwd m6, m7, m0 punpckhwd m7, m0 punpcklwd m8, m9, m0 punpckhwd m9, m0 lea tlq, [tl_baseq-2] .w64_loop_y: vpbroadcastw m3, [tlq+yq*2] vpbroadcastd m1, [v_weightsq] punpcklwd m3, m4 pmaddwd m14, m10, m3 pmaddwd m15, m11, m3 pmaddwd m2, m12, m3 pmaddwd m3, m13 pmaddwd m0, m6, m1 paddd m0, m14 pmaddwd m14, m7, m1 paddd m14, m15 psrld m0, 8 psrld m14, 8 packssdw m0, m14 pavgw m0, m5 mova [dstq+32*0], m0 SMOOTH_2D_END 8, 9, 1, 1, 2, 3 mova [dstq+32*1], m0 add dstq, strideq add v_weightsq, 4 dec yq jg .w64_loop_y lea dstq, [dst_baseq+32*2] add r6, 16*8 mov v_weightsq, v_weights_baseq add xq, 32 test xb, 64 jz .w64_loop_x RET cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea r7, [dr_intra_derivative] movsxd wq, [r6+wq*4] add tlq, 2 add wq, r6 mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [r7+dxq] xor angled, 0x4ff ; d = 90 - angle vpbroadcastd m5, [pw_62] jmp wq .w4: ALLOC_STACK -64, 7 cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) vpbroadcastw xm3, [tlq+14] movu xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 palignr xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8 paddw xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd palignr xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8 paddw xm2, xm1 ; -1 * a + 9 * b + 9 * c + -1 * d psubw xm0, xm2, xm0 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 psraw xm0, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 pxor xm4, xm4 paddw xm2, xm0 vpbroadcastw xm0, r8m ; pixel_max mova [rsp+32], xm3 movd xm3, dxd pmaxsw xm2, xm4 mov r3d, dxd pavgw xm2, xm4 vpbroadcastw m3, xm3 pminsw xm2, xm0 punpcklwd xm0, xm1, xm2 punpckhwd xm1, xm2 lea r5, [strideq*3] pslldq m2, m3, 8 mova [rsp+ 0], xm0 mova [rsp+16], xm1 
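; z1 stepping (applies to the upsample loop below as well as to .w4_loop):
; each row advances xpos by dx in 1/64-pel units; base = xpos >> 6 selects a
; pair of adjacent edge samples and frac = xpos & 0x3e their blend weight,
; so the output is a + (((b - a) * frac + 32) >> 6), computed here as a
; pmulhrsw with (frac << 9), i.e. a + (((b - a) * (frac << 9) + 16384) >> 15).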
paddw m6, m3, m3 paddw m3, m2 vpblendd m4, m6, 0xf0 paddw m6, m6 paddw m3, m4 ; xpos0 xpos1 xpos2 xpos3 vbroadcasti128 m4, [z_upsample] .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu xm2, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base2 vinserti128 m1, [rsp+r3*2], 1 ; 0 2 lea r3d, [r2+dxq] shr r2d, 6 ; base3 vinserti128 m2, [rsp+r2*2], 1 ; 1 3 pshufb m1, m4 pshufb m2, m4 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m5, m3 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m3, m6 ; xpos += dx paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r5 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_upsample_loop RET ALIGN function_align .filter_strength: ; w4/w8/w16 %define base r3-z_filter_t0 movd xm0, maxbased lea r3, [z_filter_t0] movd xm1, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m1, xm1 pcmpeqb m0, [base+z_filter_wh] mova xm2, [r3+angleq*8] pand m0, m1 pcmpgtb m0, m2 pmovmskb r5d, m0 ret .w4_no_upsample: mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastw xm3, [tlq+14] mova xm0, [tlq- 2] ; 0 1 2 3 4 5 6 7 vpbroadcastd xm1, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] palignr xm2, xm3, xm0, 4 ; 2 3 4 5 6 7 8 8 pmullw xm1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 paddw xm2, xm0 pmullw xm2, xm4 movd [rsp+16], xm3 cmp r5d, 3 jne .w4_3tap paddw xm1, xm2 palignr xm2, xm3, xm0, 6 ; 3 4 5 6 7 8 8 8 pblendw xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 movzx r3d, word [tlq+14] movzx r2d, word [tlq+12] inc maxbased paddw xm2, xm0 sub r2d, r3d paddw xm2, xm2 lea r2d, [r2+r3*8+4] shr r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3 mov [rsp+16], r2w .w4_3tap: pxor xm0, xm0 paddw xm1, xm2 mov tlq, rsp psrlw xm1, 3 cmp hd, 8 sbb maxbased, -1 pavgw xm0, xm1 mova [tlq], xm0 .w4_main: movd xm3, dxd vpbroadcastq m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] ; top[max_base_x] shl maxbased, 6 vpbroadcastw m3, xm3 movd xm0, maxbased mov r3d, dxd ; xpos vpbroadcastw m0, xm0 paddw m4, m3, m3 psubw m1, m0 ; -max_base_x vpblendd m3, m4, 0xcc paddw m0, m4, m3 vpblendd m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3 paddw m4, m4 paddw m3, m1 .w4_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [tlq+r3*2] lea r3d, [r5+dxq] shr r5d, 6 ; base1 movu xm2, [tlq+r5*2] lea r5d, [r3+dxq] shr r3d, 6 ; base2 vinserti128 m1, [tlq+r3*2], 1 ; 0 2 lea r3d, [r5+dxq] shr r5d, 6 ; base3 vinserti128 m2, [tlq+r5*2], 1 ; 1 3 punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 vpblendd m1, m2, 0xcc pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 ; xpos < max_base_x paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w4_loop lea r6, [strideq*3] .w4_end_loop: movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 movq [dstq+strideq*2], xm6 movq [dstq+r6 ], xm6 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_end_loop .w4_end: RET .w8: %assign stack_offset org_stack_offset ALLOC_STACK -64, 7 lea r3d, [angleq+216] mov r3b, hb 
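; r3d now appears to pack the adjusted angle (including its flag bits) above
; the block height in the low byte, so the single unsigned compare against 8
; below covers both the angle and the h > 8 parts of the condition noted
; after the branch.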
cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m2, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g _ movu m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g _ _ movu m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case vpblendd m0, m2, 0x20 ; 3 4 5 6 7 8 9 a b c c _ _ _ _ _ .w8_upsample_h8: paddw m2, m1 paddw m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f add dxd, dxd psubw m0, m2, m0 psraw m0, 3 pxor m4, m4 paddw m2, m0 vpbroadcastw m0, r8m movd xm3, dxd pmaxsw m2, m4 mov r3d, dxd pavgw m2, m4 vpbroadcastw m3, xm3 pminsw m2, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 vbroadcasti128 m4, [z_upsample] mova [rsp+ 0], xm0 mova [rsp+16], xm1 paddw m6, m3, m3 vextracti128 [rsp+32], m0, 1 vextracti128 [rsp+48], m1, 1 vpblendd m3, m6, 0xf0 ; xpos0 xpos1 .w8_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu xm1, [rsp+r3*2] movu xm2, [rsp+r3*2+16] lea r3d, [r2+dxq] shr r2d, 6 ; base1 vinserti128 m1, [rsp+r2*2], 1 vinserti128 m2, [rsp+r2*2+16], 1 pshufb m1, m4 pshufb m2, m4 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m3, m6 paddw m0, m1 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_upsample_loop RET .w8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: lea maxbased, [hq+7] test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w8_main popcnt r5d, r5d vpbroadcastd m1, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movu m2, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m2 cmp hd, 8 jl .w8_filter_h4 punpckhwd m2, m2 vpblendd m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g je .w8_filter_end ; 8x4 and 8x8 are always 3-tap movzx r3d, word [tlq+30] mov maxbased, 16 mov [rsp+32], r3d cmp r5d, 3 jne .w8_filter_end punpcklwd xm6, xm0, xm0 vpblendd m2, [tlq+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g vpblendd m6, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movzx r5d, word [tlq+28] mov [rsp+34], r3w paddw m2, m6 sub r5d, r3d inc maxbased paddw m2, m2 lea r3d, [r5+r3*8+4] paddw m1, m2 shr r3d, 3 mov [rsp+32], r3w jmp .w8_filter_end .w8_filter_h4: pshuflw m3, m2, q3321 vinserti128 m3, [tlq+2], 0 ; 2 3 4 5 6 7 8 9 a b c c _ _ _ _ .w8_filter_end: paddw m0, m3 pmullw m0, m4 mov tlq, rsp pxor m2, m2 paddw m0, m1 psrlw m0, 3 pavgw m0, m2 mova [tlq], m0 .w8_main: movd xm3, dxd vbroadcasti128 m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m3, xm3 movd xm0, maxbased mov r3d, dxd vpbroadcastw m0, xm0 paddw m4, m3, m3 psubw m1, m0 vpblendd m3, m4, 0xf0 ; xpos0 xpos1 paddw m3, m1 .w8_loop: lea r5d, [r3+dxq] shr r3d, 6 movu xm0, [tlq+r3*2] movu xm1, [tlq+r3*2+2] lea r3d, [r5+dxq] shr r5d, 6 vinserti128 m0, [tlq+r5*2], 1 vinserti128 m1, [tlq+r5*2+2], 1 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w8_loop .w8_end_loop: mova [dstq+strideq*0], xm6 mova [dstq+strideq*1], xm6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_end_loop .w8_end: RET .w16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(h+15, 31) jmp .w16_main .w16: %assign stack_offset org_stack_offset ALLOC_STACK -96, 7 lea maxbased, 
[hq+15] test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w16_main popcnt r5d, r5d mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h cmp r5d, 3 jne .w16_filter_3tap vpbroadcastd m2, [base+pw_3] punpcklwd xm0, xm0 vpblendd m0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m1, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m0, m2 pavgw m0, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m0, m1 psrlw m0, 2 movu m3, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m3, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g cmp hd, 8 jl .w16_filter_5tap_h4 punpckhwd m3, m3 je .w16_filter_5tap_h8 vpblendd m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m3, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h movzx r3d, word [tlq+62] movzx r2d, word [tlq+60] pavgw m2, m4 sub r2d, r3d paddw m1, m3 lea r2d, [r2+r3*8+4] paddw m1, m2 shr r2d, 3 psrlw m1, 2 mov [rsp+66], r3w mov [rsp+64], r2w mov tlq, rsp mov r3d, 33 cmp hd, 16 cmovg maxbased, r3d jmp .w16_filter_end2 .w16_filter_5tap_h8: vpblendd xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 vpblendd xm3, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 pavgw xm2, xm4 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 jmp .w16_filter_end2 .w16_filter_5tap_h4: pshuflw xm4, xm3, q3332 ; 4 5 5 5 pshuflw xm3, xm3, q3321 ; 3 4 5 5 pavgw xm2, xm4 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 jmp .w16_filter_end2 .w16_filter_3tap: vpbroadcastd m3, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] pmullw m0, m3, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g movu m2, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m4 pmullw m3, m2 paddw m0, m1 cmp hd, 8 je .w16_filter_3tap_h8 jl .w16_filter_3tap_h4 punpckhwd m2, m2 vpblendd m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g jmp .w16_filter_end .w16_filter_3tap_h4: pshuflw xm2, xm2, q3321 ; 2 3 4 4 _ _ _ _ jmp .w16_filter_end .w16_filter_3tap_h8: psrldq xm2, 2 pshufhw xm2, xm2, q2210 ; 2 3 4 5 6 7 8 8 .w16_filter_end: paddw m2, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f pmullw m2, m4 psrlw m0, 3 pxor m1, m1 paddw m2, m3 psrlw m2, 3 pavgw m0, m1 pavgw m1, m2 .w16_filter_end2: mov tlq, rsp mova [tlq+ 0], m0 mova [tlq+32], m1 .w16_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r3d, dxd vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] psubw m3, m0 .w16_loop: lea r5d, [r3+dxq] shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] lea r3d, [r5+dxq] shr r5d, 6 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m1, m0 movu m0, [tlq+r5*2] vpblendvb m2, m6, m1, m2 movu m1, [tlq+r5*2+2] mova [dstq+strideq*0], m2 pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+strideq*1], m0 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w16_loop .w16_end_loop: mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_end_loop .w16_end: RET .w32: %assign stack_offset org_stack_offset ALLOC_STACK -160, 8 lea maxbased, [hq+31] mov r3d, 63 cmp hd, 32 cmova maxbased, r3d test angled, 0x400 jnz .w32_main vpbroadcastd m2, [pw_3] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m1, m2 
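; (w32/w64 top-edge filter) the surrounding paddw/pavgw sequence appears to
; work out to a [2 4 4 4 2]/16 smoothing of the top edge: pw_3 supplies the
; rounding bias and pavgw halves the outer taps' partial sum so the
; intermediate values stay within 16 bits; the filtered edge is written to
; the stack buffer and used as the prediction source.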
paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i mov r3, rsp paddw m0, m1 lea r5d, [maxbaseq-31] psrlw m0, 2 mova [r3], m0 .w32_filter_loop: mova m0, [tlq+30] paddw m1, m2, [tlq+28] add tlq, 32 paddw m0, [tlq+0] pavgw m1, [tlq+4] paddw m0, [tlq+2] add r3, 32 paddw m0, m1 psrlw m0, 2 mova [r3], m0 sub r5d, 16 jg .w32_filter_loop movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h punpckhwd m1, m0, m0 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g jl .w32_filter_h8 vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h movzx r5d, word [tlq+62] movzx r2d, word [tlq+60] pavgw m2, m3 sub r2d, r5d paddw m0, m1 lea r2d, [r2+r5*8+4] paddw m0, m2 shr r2d, 3 psrlw m0, 2 mova [r3+32], m0 mov [r3+66], r5w mov [r3+64], r2w mov tlq, rsp mov r3d, 65 cmp hd, 64 cmove maxbased, r3d jmp .w32_main .w32_filter_h8: vpblendd xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9 vpblendd xm1, [tlq+34], 0x07 ; 3 4 5 6 7 8 9 9 pavgw xm2, xm3 paddw xm0, xm1 mov tlq, rsp paddw xm0, xm2 psrlw xm0, 2 mova [r3+32], xm0 .w32_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r5d, dxd vpbroadcastd m7, [pw_m1024] ; -16 * 64 vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] psubw m3, m0 .w32_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 psraw m1, m3, 15 vpblendvb m0, m6, m0, m1 mova [dstq+32*0], m0 movu m0, [tlq+r3*2+32] movu m1, [tlq+r3*2+34] add r5d, dxd psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m7, m3 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+32*1], m0 dec hd jz .w32_end add dstq, strideq cmp r5d, maxbased jb .w32_loop .w32_end_loop: mova [dstq+32*0], m6 mova [dstq+32*1], m6 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [hq+63] test angled, 0x400 jnz .w64_main vpbroadcastd m2, [pw_3] mova m0, [tlq-2] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 vpblendd m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m0, [tlq+0] ; 1 2 3 4 5 6 7 8 9 a b c d e f g paddw m1, m2 paddw m0, [tlq+2] ; 2 3 4 5 6 7 8 9 a b c d e f g h pavgw m1, [tlq+4] ; 3 4 5 6 7 8 9 a b c d e f g h i mov r3, rsp paddw m0, m1 lea r5d, [hq+32] psrlw m0, 2 mova [r3], m0 .w64_filter_loop: mova m0, [tlq+30] paddw m1, m2, [tlq+28] add tlq, 32 paddw m0, [tlq+0] pavgw m1, [tlq+4] paddw m0, [tlq+2] add r3, 32 paddw m0, m1 psrlw m0, 2 mova [r3], m0 sub r5d, 16 jg .w64_filter_loop movu m0, [tlq+32] ; 2 3 4 5 6 7 8 9 a b c d e f g h punpckhwd m1, m0, m0 paddw m2, [tlq+28] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m0, [tlq+30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m1, [tlq+34], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h pavgw m2, m3 paddw m0, m1 paddw m0, m2 mov tlq, rsp psrlw m0, 2 mova [r3+32], m0 .w64_main: movd xm4, dxd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 movd xm0, maxbased mov r5d, dxd vpbroadcastd m7, [pw_m1024] ; -16 * 64 vpbroadcastw m0, xm0 paddw m3, m4, [z_base_inc] paddw m8, m7, m7 ; -32 * 64 psubw m3, m0 paddw m9, m8, m7 ; -48 * 64 .w64_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3*2] movu m1, [tlq+r3*2+2] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 psraw m1, m3, 15 vpblendvb m0, 
m6, m0, m1 mova [dstq+32*0], m0 movu m0, [tlq+r3*2+32] movu m1, [tlq+r3*2+34] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m7, m3 vpblendvb m0, m6, m0, m1 mova [dstq+32*1], m0 movu m0, [tlq+r3*2+64] movu m1, [tlq+r3*2+66] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m8, m3 vpblendvb m0, m6, m0, m1 mova [dstq+32*2], m0 movu m0, [tlq+r3*2+96] movu m1, [tlq+r3*2+98] add r5d, dxd psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m9, m3 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [dstq+32*3], m0 dec hd jz .w64_end add dstq, strideq cmp r5d, maxbased jb .w64_loop .w64_end_loop: mova [dstq+32*0], m6 mova [dstq+32*1], m6 mova [dstq+32*2], m6 mova [dstq+32*3], m6 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_16bpc_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea dxq, [dr_intra_derivative-90] movsxd wq, [r9+wq*4] mova m1, [tlq- 0] movzx dyd, angleb xor angled, 0x400 mova m2, [tlq- 32] mov r8, dxq sub dxq, dyq mova m3, [tlq- 64] add wq, r9 add r9, z_filter_t0-ipred_z2_16bpc_avx2_table mova m4, [tlq- 96] and dyd, ~1 mova m5, [tlq-128] and dxq, ~1 movzx dyd, word [r8+dyq] ; angle - 90 movzx dxd, word [dxq+270] ; 180 - angle vpbroadcastd m11, [base+pw_62] mova [rsp+128], m1 mova [rsp+ 96], m2 mova [rsp+ 64], m3 neg dxd mova [rsp+ 32], m4 neg dyq mova [rsp+ 0], m5 jmp wq .w4: vbroadcasti128 m10, [base+z2_x_shuf] vpbroadcastq m6, [base+z_base_inc+2] lea r8d, [dxq+(65<<6)] ; xpos mov r10d, (63-4)<<6 test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) movq xm0, [tlq+2] ; 1 2 3 4 movq xm1, [tlq+0] ; 0 1 2 3 pshuflw xm2, xm0, q3321 ; 2 3 4 4 pshuflw xm3, xm1, q2100 ; 0 0 1 2 vpbroadcastw xm4, r8m ; pixel_max vbroadcasti128 m10, [base+z_upsample] paddw xm1, xm0 paddw xm2, xm3 lea r8d, [r8+dxq+(1<<6)] psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 pxor xm3, xm3 sub r10d, 3<<6 paddw xm1, xm2 paddw m6, m6 pmaxsw xm1, xm3 sub angled, 1075 ; angle - 53 pavgw xm1, xm3 lea r3d, [hq+3] pminsw xm1, xm4 xor angled, 0x7f ; 180 - angle punpcklwd xm1, xm0 movu [rsp+130], xm1 call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_left: ; h4/h8 mova xm0, [tlq-16] ; 8 7 6 5 4 3 2 1 movu xm1, [tlq-14] ; 7 6 5 4 3 2 1 0 vpbroadcastw xm4, r8m ; pixel_max cmp hd, 8 je .upsample_left_h8 pshufhw xm2, xm0, q2100 ; _ _ _ _ 4 4 3 2 pshufhw xm3, xm1, q3321 ; _ _ _ _ 2 1 0 0 jmp .upsample_left_end .upsample_left_h8: pblendw xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2 pblendw xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0 .upsample_left_end: paddw xm1, xm0 paddw xm2, xm3 psubw xm2, xm1, xm2 add dyq, dyq psraw xm2, 3 pxor xm3, xm3 paddw xm1, xm2 pmaxsw xm1, xm3 pavgw xm1, xm3 pminsw xm1, xm4 punpcklwd xm2, xm0, xm1 punpckhwd xm0, xm1 mova [rsp+ 96+gprsize], xm2 mova [rsp+112+gprsize], xm0 ret .w4_no_upsample_above: lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] psrldq xm0, xm1, 2 ; 1 2 3 4 pshuflw 
xm2, xm1, q2100 ; 0 0 1 2 pmullw xm4, xm0 pshuflw xm3, xm0, q3321 ; 2 3 4 4 paddw xm1, xm3 pshuflw xm3, xm0, q3332 ; 3 4 4 4 pmullw xm1, xm5 vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*2] paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm2, xm5 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 ; clip to byte range since there's no variable word blend pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movq [rsp+130], xm1 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height cmp r3d, 3 je .w4_filter_left_s3 vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w4_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w4_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w4_filter_left_end .w4_upsample_left: call .upsample_left mov r11, -16 vbroadcasti128 m9, [base+z_upsample] jmp .w4_main_upsample_left .w4_filter_left_s3: ; can only be h16 movu m2, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m4, [base+pw_3] paddw m1, m0, m2 punpckhwd m2, m2 vpblendd m2, [tlq-28], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g punpcklwd xm3, xm0, xm0 paddw m2, m4 vpblendd m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e vpblendd m3, [tlq-36], 0xfe ; 0 0 0 1 2 3 4 5 6 8 8 9 a b c d paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 jmp .w4_filter_left_end2 .w4_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w4_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, m2 .w4_filter_left_end2: packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 .w4_main: vbroadcasti128 m9, [base+z2_x_shuf] mov r11, -8 .w4_main_upsample_left: movd xm5, dyd mova m4, [base+z2_y_shuf_h4] mov r2d, r8d movd xm0, dxd vpbroadcastw m5, xm5 rorx r5, dyq, 5 lea r8d, [dyq*3] pmullw m5, [base+z2_ymul] rorx r9, dyq, 4 sar dyd, 6 vpbroadcastw m0, xm0 sar r8d, 6 pand m5, m11 ; frac_y neg dyd psllw m5, 9 add r5d, dyd add r8d, dyd add r9d, dyd paddw m7, m0, m0 lea dyq, [rsp+dyq*2+126] vpblendd m0, m7, 0xcc add dyq, r11 neg r5d paddw m1, m0, m7 neg r8d vpblendd m0, m1, 0xf0 ; xpos0 xpos1 xpos2 xpos3 neg r9d paddw m7, m7 paddw m6, m0 .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm1, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm3, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 vinserti128 m1, [rsp+r2*2], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m3, [rsp+r3*2], 1 pshufb m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3 pshufb m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3 pand m2, m11, m6 punpcklqdq m0, m1, m3 punpckhqdq m1, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w4_toponly movu xm2, [dyq] vinserti128 m2, [dyq+r8*2], 1 movu xm3, [dyq+r5*2] vinserti128 m3, [dyq+r9*2], 1 pshufb m2, m9 pshufb m3, m9 punpckhwd m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m2, m3 psubw m2, m1 pmulhrsw m2, m5 psraw m3, m6, 15 ; base_x < topleft paddw m1, m2 vpermd m1, 
m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 a2 b2 c2 d2 a3 b3 c3 d3 vpblendvb m0, m1, m3 .w4_toponly: paddw m6, m7 ; xpos += dx lea r3, [strideq*3] add dyq, r11 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r10d jge .w4_loop .w4_leftonly_loop: movu xm1, [dyq] vinserti128 m1, [dyq+r8*2], 1 movu xm2, [dyq+r5*2] vinserti128 m2, [dyq+r9*2], 1 add dyq, r11 pshufb m1, m9 pshufb m2, m9 punpckhwd m0, m1, m2 punpcklwd m1, m2 psubw m1, m0 pmulhrsw m1, m5 paddw m0, m1 vpermd m0, m4, m0 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: mov r10d, hd test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] xor r8d, r8d mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 mova xm1, [tlq+0] ; 0 1 2 3 4 5 6 7 pblendw xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 pblendw xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastw xm4, r8m ; pixel_max paddw xm1, xm0 paddw xm2, xm3 not r8d psubw xm2, xm1, xm2 add dxd, dxd psraw xm2, 3 sub angled, 53 ; angle - 53 pxor xm3, xm3 paddw xm2, xm1 lea r3d, [hq+7] pmaxsw xm2, xm3 xor angled, 0x7f ; 180 - angle pavgw xm2, xm3 pminsw xm2, xm4 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 movu [rsp+130], xm1 movu [rsp+146], xm2 call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt r3d, r3d vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd xm5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm6, [base+z_filter_k-4+r3*4+12*2] movu xm0, [tlq+2] ; 1 2 3 4 5 6 7 8 x pblendw xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x pmullw xm4, xm0 pblendw xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x paddw xm1, xm3 vpblendd xm3, [tlq+6], 0x07 ; 3 4 5 6 7 8 8 8 x paddw xm2, xm3 vpbroadcastd xm3, r6m ; max_width pmullw xm1, xm5 pmullw xm2, xm6 packssdw xm3, xm3 paddw xm1, xm4 paddw xm1, xm2 psubw xm3, [base+pw_1to16] pxor xm4, xm4 psrlw xm1, 3 pminsw xm3, xm11 pavgw xm1, xm4 vpblendvb xm1, xm0, xm3 movu [rsp+130], xm1 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d cmp r3d, 3 jne .w8_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 16 ; flags needed for later jmp .filter_left_s3b .w8_upsample_left: call .upsample_left vbroadcasti128 m7, [base+z2_y_shuf_us] lea r11, [rsp+118] mov r8, -8 jmp .w8_main_upsample_left .w16_filter_left_s12: xor r8d, r8d .w8_filter_left_s12: mova m0, [tlq-32] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f vpbroadcastd m5, r7m ; max_height vpbroadcastd m2, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pmullw m2, m0 cmp hd, 8 jl .w8_filter_left_h4 movu m4, [tlq-34] punpcklwd m1, m0, m0 vpblendd m1, m4, 0xee ; 0 0 1 2 3 4 5 6 8 8 9 a b c d e je .w8_filter_left_end vpblendd m1, m4, 0x10 ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e jmp .w8_filter_left_end .w8_filter_left_h4: pshufhw m1, m0, q2100 ; _ _ _ _ _ _ _ _ _ _ _ _ c c d e .w8_filter_left_end: paddw m1, [tlq-30] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pmullw m1, m3 paddw m1, m2 pxor m2, m2 psrlw m1, 3 pavgw m1, 
m2 packssdw m5, m5 psubw m5, [base+pw_16to1] pminsw m5, m11 vpblendvb m1, m0, m5 mova [rsp+96], m1 test r8d, r8d jz .w8_main ; upsample_main vbroadcasti128 m10, [base+z_upsample] vbroadcasti128 m7, [base+z2_y_shuf] lea r5, [rsp+120] movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq paddw m4, m4 pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 lea r2d, [dxq+(66<<6)] ; xpos paddw m4, m2 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 pand m6, m11 punpckhwd xm9, xm8, xm1 psllw m6, 9 punpcklwd xm8, xm1 .w8_upsample_above_loop: lea r3d, [r2+dxq] shr r2d, 6 movu xm1, [rsp+r2*2] movu xm2, [rsp+r2*2+16] lea r2d, [r3+dxq] shr r3d, 6 vinserti128 m1, [rsp+r3*2], 1 vinserti128 m2, [rsp+r3*2+16], 1 pshufb m1, m10 pshufb m2, m10 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 punpckhqdq m1, m2 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_upsample_above_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 pshufb m2, m7 punpckldq m1, m2, m3 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 vpblendvb m0, m1, m2 .w8_upsample_above_toponly: paddw m4, m5 sub r5, 4 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_ret lea dstq, [dstq+strideq*2] jmp .w8_upsample_above_loop .w8_main: vbroadcasti128 m7, [base+z2_y_shuf] lea r11, [rsp+120] mov r8, -4 .w8_main_upsample_left: movd xm1, dyd vbroadcasti128 m4, [base+z_base_inc+2] movd xm2, dxd vpbroadcastw m1, xm1 vpbroadcastw m2, xm2 mov r7, dstq pmullw m0, m1, [base+z2_ymul8] paddw m5, m2, m2 psllw xm1, 3 vpblendd m2, m5, 0xf0 ; xpos0 xpos1 lea r9d, [dxq+(65<<6)] ; xpos paddw m4, m2 movd [rsp+284], xm1 .w8_loop0: mov r2d, r9d mova [rsp+288], m0 mov r5, r11 mova [rsp+320], m4 pshufd m6, m0, q2020 psraw xm0, 6 pxor xm1, xm1 psubw xm8, xm1, xm0 ; base_y pand m6, m11 ; frac_y punpckhwd xm9, xm8, xm1 ; base_y 2 3 6 7 psllw m6, 9 punpcklwd xm8, xm1 ; base_y 0 1 4 5 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2*2] movu xm1, [rsp+r2*2+2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3*2], 1 vinserti128 m1, [rsp+r3*2+2], 1 pand m2, m11, m4 psubw m1, m0 psllw m2, 9 pmulhrsw m1, m2 paddw m0, m1 cmp r3d, 64 jge .w8_toponly mova m1, m5 vpgatherdq m3, [r5+xm9*2], m5 mova m5, m1 vpgatherdq m2, [r5+xm8*2], m1 pshufb m3, m7 ; c0 d0 c1 d1 g0 h0 g1 h1 pshufb m2, m7 ; a0 b0 a1 b1 e0 f0 e1 f1 punpckldq m1, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 psubw m2, m1 pmulhrsw m2, m6 paddw m1, m2 vpermq m1, m1, q3120 psraw m2, m4, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w8_toponly: paddw m4, m5 ; xpos += dx add r5, r8 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r2d, (63-8)<<6 jge .w8_loop .w8_leftonly_loop: mova m0, m5 vpgatherdq m4, [r5+xm9*2], m5 mova m5, m0 vpgatherdq m3, [r5+xm8*2], m0 add r5, r8 pshufb m2, m4, m7 pshufb m1, m3, m7 punpckldq m0, m1, m2 punpckhdq m1, m2 psubw m1, m0 pmulhrsw m1, m6 paddw m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_leftonly_loop .w8_end: sub r10d, 1<<8 jl .w8_ret vpbroadcastd m0, [rsp+284] add r7, 16 paddw m0, [rsp+288] ; base_y += 8*dy add r9d, 8<<6 vpbroadcastd m4, [pw_512] movzx hd, r10b paddw m4, [rsp+320] ; base_x += 8*64 mov 
dstq, r7 jmp .w8_loop0 .w8_ret: RET .w16: movd xm0, [tlq+32] lea r10d, [hq+(1<<8)] movd [rsp+160], xm0 test angled, 0x400 jnz .w8_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m6, [base+z_filter_k-4+r3*4+12*2] movu m0, [tlq+2] ; 1 2 3 4 5 6 7 8 9 a b c d e f g punpcklwd xm2, xm1, xm1 vpblendd m2, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e punpckhwd m3, m0, m0 pmullw m4, m0 vpblendd m3, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g paddw m1, m3 vpblendd m3, [tlq+6], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g g g paddw m2, m3 vpbroadcastd m3, r6m ; max_width pmullw m1, m5 pmullw m2, m6 packssdw m3, m3 paddw m1, m4 paddw m1, m2 psubw m3, [base+pw_1to16] pxor m4, m4 psrlw m1, 3 pminsw m3, m11 pavgw m1, m4 vpblendvb m1, m0, m3 movu [rsp+130], m1 .w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w8_main popcnt r3d, r3d cmp r3d, 3 jne .w16_filter_left_s12 vpbroadcastd m6, [base+pw_3] vpbroadcastd m7, [base+pw_16] cmp hd, 4 jne .filter_left_s3 movq xm0, [tlq-8] ; 0 1 2 3 movq xm1, [tlq-6] ; 1 2 3 4 vpbroadcastd xm5, r7m ; max_height movq xm4, [base+pw_16to1+24] ; 4to1 pshuflw xm2, xm0, q2100 ; 0 0 1 2 pshuflw xm3, xm1, q3321 ; 2 3 4 4 paddw xm1, xm0 paddw xm1, xm2 pshuflw xm2, xm0, q1000 ; 0 0 0 1 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, xm4 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 movq [rsp+120], xm1 jmp .w8_main .w32: mova m2, [tlq+32] movd xm0, [tlq+64] lea r10d, [hq+(3<<8)] mova [rsp+160], m2 movd [rsp+192], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] vpbroadcastd m0, r6m ; max_width vpbroadcastd m7, [base+pw_16] mov r3d, 32 packssdw m0, m0 psubw m0, [base+pw_1to16] pminsw m8, m0, m11 psubw m9, m8, m7 .w32_filter_above: movu m0, [tlq+2] punpcklwd xm4, xm1, xm1 paddw m2, m6, [tlq+6] paddw m1, m0 vpblendd m4, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m1, [tlq+4] movu m3, [tlq+r3+2] paddw m5, m6, [tlq+r3-2] pavgw m2, m4 punpckhwd m4, m3, m3 paddw m1, m2 vpblendd m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b c d e f g h h h vpblendd m4, [tlq+r3+4], 0x7f ; 3 4 5 6 7 8 9 a b c d e f g h h pavgw m2, m5 paddw m5, m3, [tlq+r3] paddw m4, m5 psrlw m1, 2 paddw m2, m4 vpblendvb m1, m0, m8 psrlw m2, 2 vpblendvb m2, m3, m9 movu [rsp+130], m1 movu [rsp+r3+130], m2 .filter_left_s3: cmp hd, 16 jl .filter_left_s3_h8 ; h8 .filter_left_s3b: mova m0, [tlq-32] ; 2 3 4 5 6 7 8 9 a b c d e f g h movu m2, [tlq-30] ; 3 4 5 6 7 8 9 a b c d e f g h i vpbroadcastd m5, r7m ; max_height paddw m1, m0, m2 punpckhwd m2, m2 mov r3d, hd vpblendd m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i packssdw m5, m5 not r3 psubw m5, [base+pw_16to1] paddw m2, m6 pminsw m8, m11, m5 je .filter_left_s3_end ; h16 paddw m1, [tlq-34] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq-36] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m2 psrlw m1, 2 vpblendvb m3, m1, m0, m8 mova m0, [tlq-64] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m0, [tlq-62] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m2, m6, [tlq-60] ; 4 5 6 7 8 9 a b c d e f g h i j psubw m8, m7 mova [rsp+96], m3 jnp .filter_left_s3_end ; h32 mova m5, [tlq-96] paddw m1, [tlq-66] pavgw m2, [tlq-68] paddw m1, m2 paddw m4, m5, [tlq-94] paddw m2, m6, [tlq-92] psrlw m1, 2 paddw m4, [tlq- 98] pavgw m2, [tlq-100] vpblendvb m3, m1, m0, m8 mova m0, [tlq-128] psubw m8, m7 paddw m4, 
m2 paddw m1, m0, [tlq-126] paddw m2, m6, [tlq-124] psrlw m4, 2 mova [rsp+64], m3 vpblendvb m4, m5, m8 psubw m8, m7 mova [rsp+32], m4 .filter_left_s3_end: punpcklwd xm3, xm0, xm0 vpblendd m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, [tlq+r3*2-2], 0xfe ; 2 2 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m4 pavgw m2, m3 paddw m1, m2 psrlw m1, 2 vpblendvb m1, m0, m8 mova [rsp+r3*2+130], m1 jmp .w8_main .filter_left_s3_h8: mova xm0, [tlq-16] ; 0 1 2 3 4 5 6 7 movu xm3, [tlq-14] ; 1 2 3 4 5 6 7 8 pblendw xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6 vpbroadcastd xm5, r7m ; max_height paddw xm1, xm0, xm3 pblendw xm3, [tlq-12], 0x7f ; 2 3 4 5 6 7 8 8 paddw xm1, xm2 vpblendd xm2, [tlq-20], 0x0e ; 0 0 0 1 2 3 4 5 paddw xm3, xm6 packssdw xm5, xm5 pavgw xm2, xm3 psubw xm5, [base+pw_16to1+16] ; 8to1 paddw xm1, xm2 pminsw xm5, xm11 psrlw xm1, 2 vpblendvb xm1, xm0, xm5 mova [rsp+112], xm1 jmp .w8_main .w64: mova m2, [tlq+ 32] mova m3, [tlq+ 64] mova m4, [tlq+ 96] movd xm0, [tlq+128] lea r10d, [hq+(7<<8)] mova [rsp+160], m2 mova [rsp+192], m3 mova [rsp+224], m4 movd [rsp+256], xm0 test angled, 0x400 jnz .w8_main vpbroadcastd m6, [base+pw_3] movu m0, [tlq+34] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m2, [tlq+38] ; 4 5 6 7 8 9 a b c d e f g h h h paddw m5, [tlq+36] ; 3 4 5 6 7 8 9 a b c d e f g h h movu m4, [tlq+66] paddw m3, m6, [tlq+62] paddw m7, m4, [tlq+64] pavgw m3, [tlq+70] paddw m7, [tlq+68] paddw m2, m5 vpbroadcastd m5, r6m ; max_width mov r3d, 96 packssdw m5, m5 paddw m3, m7 psubw m5, [base+pw_1to16] psrlw m2, 2 vpbroadcastd m7, [base+pw_16] psrlw m3, 2 pminsw m8, m11, m5 psubw m9, m8, m7 vpblendvb m2, m0, m9 psubw m9, m7 vpblendvb m3, m4, m9 psubw m9, m7 movu [rsp+162], m2 movu [rsp+194], m3 jmp .w32_filter_above cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_16bpc_avx2_table] tzcnt hd, hm movifnidn angled, anglem lea r7, [dr_intra_derivative+45*2-1] sub tlq, 2 movsxd hq, [r6+hq*4] sub angled, 180 add hq, r6 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e movzx dyd, word [r7+dyq] vpbroadcastd m5, [pw_62] mov org_wd, wd jmp hq .h4: ALLOC_STACK -64, 7 lea r7, [strideq*3] cmp angleb, 40 jae .h4_no_upsample lea r4d, [angleq-1024] sar r4d, 7 add r4d, wd jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) mova xm2, [tlq-14] ; 0 1 2 3 4 5 6 7 pblendw xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 vpblendd xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 pshufd xm3, xm1, q0000 paddw xm1, xm2 paddw xm0, [tlq-12] ; 1 2 3 4 5 6 7 8 vpbroadcastw xm4, r8m ; pixel_max add dyd, dyd psubw xm0, xm1, xm0 mova [rsp+ 0], xm3 movd xm3, dyd psraw xm0, 3 neg dyd paddw xm1, xm0 pxor xm0, xm0 lea r2d, [dyq+(16<<6)+63] ; ypos pmaxsw xm1, xm0 pavgw xm1, xm0 vpbroadcastw m3, xm3 pminsw xm1, xm4 punpckhwd xm0, xm1, xm2 punpcklwd xm1, xm2 paddw m2, m3, m3 mova [rsp+32], xm0 punpcklwd m3, m2 mova [rsp+16], xm1 paddw m4, m2, m2 paddw m2, m3 vpblendd m3, m2, 0xf0 ; ypos0 ypos1 ypos2 ypos3 .h4_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 movu xm1, [rsp+r2*2] lea r2d, [r4+dyq] shr r4d, 6 movu xm2, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 vinserti128 m1, [rsp+r2*2], 1 lea r2d, [r4+dyq] shr r4d, 6 vinserti128 m2, [rsp+r4*2], 1 psrld m0, m1, 16 pblendw m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 pslld m2, 16 pblendw m1, m2, 0xaa pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw 
m1, m2 paddw m3, m4 paddw m1, m0 vextracti128 xm2, m1, 1 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 movhps [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r7 ], xm1 add dstq, 8 sub wd, 4 jg .h4_upsample_loop RET ALIGN function_align .filter_strength: ; h4/h8/h16 %define base r4-z_filter_t0 lea r4, [z_filter_t0] movd xm0, maxbased movd xm1, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m1, xm1 pcmpeqb m0, [base+z_filter_wh] pand m0, m1 mova xm1, [r4+angleq*8] pcmpgtb m0, m1 pmovmskb r5d, m0 ret .h4_no_upsample: mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 popcnt r5d, r5d mova xm0, [tlq-14] ; 0 1 2 3 4 5 6 7 movu xm3, [tlq-12] ; 1 2 3 4 5 6 7 8 vpbroadcastd xm2, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd xm4, [base+z_filter_k-4+r5*4+12*0] pmullw xm2, xm0 pblendw xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6 paddw xm1, xm0, xm3 movd [rsp+12], xm0 pmullw xm1, xm4 cmp r5d, 3 jne .h4_filter_3tap pblendw xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8 vpblendd xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5 movzx r4d, word [tlq-14] movzx r2d, word [tlq-12] inc maxbased paddw xm1, xm2 paddw xm0, xm3 sub r2d, r4d paddw xm2, xm0, xm0 lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+14], r2w .h4_filter_3tap: pxor xm0, xm0 paddw xm1, xm2 lea tlq, [rsp+30] psrlw xm1, 3 cmp wd, 8 sbb maxbased, -1 pavgw xm0, xm1 mova [rsp+16], xm0 .h4_main: movd xm3, dyd neg maxbaseq vbroadcasti128 m1, [z_base_inc] vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m3, xm3 lea r4d, [maxbaseq+3*64] neg dyq movd xm2, r4d sub tlq, 8 lea r4, [dyq+63] ; ypos punpcklwd m1, m1 paddw m0, m3, m3 vpbroadcastw m2, xm2 punpcklwd m3, m0 paddw m4, m0, m0 paddw m0, m3 psubw m2, m1 vpblendd m3, m0, 0xf0 ; ypos0 ypos1 ypos2 ypos3 or maxbased, 63 paddw m3, m2 .h4_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu xm1, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 movu xm2, [tlq+r5*2] lea r5, [r4+dyq] sar r4, 6 ; base2 vinserti128 m1, [tlq+r4*2], 1 lea r4, [r5+dyq] sar r5, 6 ; base3 vinserti128 m2, [tlq+r5*2], 1 punpckhwd m0, m1, m2 punpcklwd m1, m2 pand m2, m5, m3 palignr m0, m1, 4 ; a3 b3 a2 b2 a1 b1 a0 b0 c3 d3 c2 d2 c1 d1 c0 d0 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m3, 15 ; ypos < max_base_y paddw m3, m4 paddw m1, m0 vpblendvb m1, m6, m1, m2 vextracti128 xm2, m1, 1 punpckhdq xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq xm1, xm2 ; a3 b3 c3 d3 a2 b2 c2 d2 movhps [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r7 ], xm1 sub wd, 4 jz .h4_end add dstq, 8 cmp r4d, maxbased jg .h4_loop .h4_end_loop: movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 movq [dstq+strideq*2], xm6 movq [dstq+r7 ], xm6 add dstq, 8 sub wd, 4 jg .h4_end_loop .h4_end: RET .h8: lea r4d, [angleq+216] %assign stack_offset org_stack_offset ALLOC_STACK -64, 8 mov r4b, wb lea r7, [strideq*3] cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m2, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6 7 8 9 a b c d e movu m0, [tlq-34] ; _ _ 0 1 2 3 4 5 6 7 8 9 a b c d cmp wd, 8 je .h8_upsample_w8 pshufhw xm3, xm2, q1000 vpblendd m0, m3, 0x0f ; _ _ _ _ 4 4 4 5 6 7 8 9 a b c d .h8_upsample_w8: paddw m0, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastw m4, r8m ; pixel_max add dyd, dyd psubw m0, m1, m0 movd xm6, dyd psraw 
m0, 3 neg dyd paddw m1, m0 pxor m0, m0 pmaxsw m1, m0 lea r4d, [dyq+(16<<6)+63] ; ypos pavgw m1, m0 vpbroadcastw m6, xm6 pminsw m1, m4 punpckhwd m0, m1, m2 punpcklwd m1, m2 vextracti128 [rsp+48], m0, 1 vextracti128 [rsp+32], m1, 1 paddw m7, m6, m6 mova [rsp+16], xm0 mova [rsp+ 0], xm1 punpcklwd m6, m7 ; ypos0 ypos1 .h8_upsample_loop: lea r2d, [r4+dyq] shr r4d, 6 ; base0 movu m1, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] lea r2d, [r4+dyq] shr r4d, 6 ; base2 movu m3, [rsp+r4*2] lea r4d, [r2+dyq] shr r2d, 6 ; base3 movu m4, [rsp+r2*2] psrld m0, m1, 16 pblendw m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4 a3 b3 a2 b2 a1 b1 a0 b0 pslld m2, 16 pblendw m1, m2, 0xaa psrld m2, m3, 16 pblendw m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4 c3 d3 c2 d2 c1 d1 c0 d0 pslld m4, 16 pblendw m3, m4, 0xaa pand m4, m5, m6 paddw m6, m7 psllw m4, 9 psubw m1, m0 pmulhrsw m1, m4 pand m4, m5, m6 psllw m4, 9 psubw m3, m2 pmulhrsw m3, m4 paddw m6, m7 lea r2, [dstq+strideq*4] paddw m1, m0 paddw m3, m2 punpckhdq m0, m1, m3 ; a5 b5 c5 d5 a4 b4 c4 d4 a1 b1 c1 d1 a0 b0 c0 d0 punpckldq m1, m3 ; a7 b7 c7 d7 a6 b6 c6 d6 a3 b3 c3 d3 a2 b2 c2 d2 vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 movhps [r2 +strideq*0], xm0 movq [r2 +strideq*1], xm0 movhps [r2 +strideq*2], xm1 movq [r2 +r7 ], xm1 movhps [dstq+strideq*0], xm2 movq [dstq+strideq*1], xm2 movhps [dstq+strideq*2], xm3 movq [dstq+r7 ], xm3 add dstq, 8 sub wd, 4 jg .h8_upsample_loop RET .h8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: lea maxbased, [wq+7] test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h8_main popcnt r5d, r5d mova m0, [tlq-30] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movu m3, [tlq-28] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m2, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m4, [base+z_filter_k-4+r5*4+12*0] pmullw m2, m0 cmp wd, 8 jl .h8_filter_w4 punpcklwd xm0, xm0 vpblendd m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movd [rsp+28], xm0 paddw m1, m3 mov r4d, 16 pmullw m1, m4 cmovg maxbased, r4d cmp r5d, 3 jne .h8_filter_3tap punpckhwd m3, m3 vpblendd m0, [tlq-34], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m3, [tlq-26], 0x7f ; 2 3 4 5 6 7 8 9 a b c d e f g g movzx r4d, word [tlq-30] movzx r2d, word [tlq-28] inc maxbased paddw m1, m2 paddw m0, m3 sub r2d, r4d paddw m2, m0, m0 lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+30], r2w jmp .h8_filter_3tap .h8_filter_w4: pshufhw xm1, xm0, q2100 vinserti128 m1, [tlq-16], 1 ; _ _ _ _ 4 4 5 6 7 8 9 a b c d e paddw m1, m3 pmullw m1, m4 .h8_filter_3tap: pxor m0, m0 paddw m1, m2 lea tlq, [rsp+62] psrlw m1, 3 pavgw m0, m1 mova [rsp+32], m0 .h8_main: movd xm4, dyd neg maxbaseq vbroadcasti128 m1, [z_base_inc] vpbroadcastw m7, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 lea r4d, [maxbaseq+7*64] neg dyq movd xm2, r4d sub tlq, 16 lea r4, [dyq+63] paddw m6, m4, m4 vpbroadcastw m2, xm2 vpblendd m4, m6, 0xf0 ; ypos0 ypos1 psubw m2, m1 or maxbased, 63 paddw m4, m2 .h8_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu xm0, [tlq+r4*2+2] movu xm1, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 vinserti128 m0, [tlq+r5*2+2], 1 vinserti128 m1, [tlq+r5*2], 1 lea r5, [r4+dyq] sar r4, 6 ; base2 pand m3, m5, m4 psllw m3, 9 psubw m1, m0 pmulhrsw m1, m3 psraw m3, m4, 15 paddw m4, m6 paddw m0, m1 movu xm1, [tlq+r4*2+2] movu xm2, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base3 vpblendvb m0, m7, m0, m3 vinserti128 m1, [tlq+r5*2+2], 1 vinserti128 m2, [tlq+r5*2], 1 pand m3, m5, m4 psllw m3, 9 psubw m2, m1 pmulhrsw m2, m3 psraw m3, 
m4, 15 paddw m4, m6 lea r5, [dstq+strideq*4] paddw m1, m2 vpblendvb m1, m7, m1, m3 punpckhwd m2, m0, m1 ; a3 c3 a2 c2 a1 c1 a0 c0 b3 d3 b2 d2 b1 d1 b0 d0 vextracti128 xm3, m2, 1 punpcklwd m0, m1 ; a7 c7 a6 c6 a5 c5 a4 c5 b7 d7 b6 d6 b5 d5 b4 d4 punpckhwd xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0 punpcklwd xm2, xm3 ; a3 b3 c3 d3 a2 b2 c2 d2 vextracti128 xm3, m0, 1 movhps [dstq+strideq*0], xm1 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movq [dstq+r7 ], xm2 punpckhwd xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4 punpcklwd xm0, xm3 ; a7 b7 c7 d7 a6 b6 c6 d6 movhps [r5 +strideq*0], xm1 movq [r5 +strideq*1], xm1 movhps [r5 +strideq*2], xm0 movq [r5 +r7 ], xm0 sub wd, 4 jz .h8_end add dstq, 8 cmp r4d, maxbased jg .h8_loop lea r6, [strideq*5] lea r2, [strideq+r7*2] ; stride*7 test wd, 4 jz .h8_end_loop movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 movq [dstq+strideq*2], xm7 movq [dstq+r7 ], xm7 movq [dstq+strideq*4], xm7 movq [dstq+r6 ], xm7 movq [dstq+r7*2 ], xm7 movq [dstq+r2 ], xm7 add dstq, 8 sub wd, 4 jz .h8_end .h8_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 mova [dstq+strideq*2], xm7 mova [dstq+r7 ], xm7 mova [dstq+strideq*4], xm7 mova [dstq+r6 ], xm7 mova [dstq+r7*2 ], xm7 mova [dstq+r2 ], xm7 add dstq, 16 sub wd, 8 jg .h8_end_loop .h8_end: RET .h16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(w+15, 31) jmp .h16_main ALIGN function_align .h16: %assign stack_offset org_stack_offset ALLOC_STACK -96, 10 lea maxbased, [wq+15] lea r7, [strideq*3] test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h16_main ; filter_strength == 0 popcnt r5d, r5d movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i paddw m1, m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpbroadcastd m6, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pmullw m2, m6, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h pmullw m1, m7 paddw m1, m2 cmp wd, 8 jg .h16_filter_w16 mova xm3, [tlq-46] ; 0 1 2 3 4 5 6 7 pmullw xm6, xm3 jl .h16_filter_w4 pblendw xm3, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 cmp r5d, 3 jne .h16_filter_w8_3tap vpblendd xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 .h16_filter_w8_5tap: punpckhwd m0, m0 vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw xm4, [tlq-42] ; 2 3 4 5 6 7 8 9 paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f paddw xm4, xm4 paddw m0, m0 paddw xm6, xm4 paddw m1, m0 .h16_filter_w8_3tap: paddw xm3, [tlq-44] ; 1 2 3 4 5 6 7 8 pmullw xm3, xm7 pxor m0, m0 paddw xm3, xm6 psrlw xm3, 3 pavgw xm3, xm0 mova [rsp+48], xm3 jmp .h16_filter_end .h16_filter_w4: pshufhw xm3, xm3, q2100 ; _ _ _ _ 4 4 5 6 cmp r5d, 3 jne .h16_filter_w8_3tap pshufhw xm4, xm3, q2100 ; _ _ _ _ 4 4 4 5 jmp .h16_filter_w8_5tap .h16_filter_w16: mova m3, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f pmullw m6, m3 punpcklwd xm3, xm3 vpblendd m4, m3, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e paddw m4, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g mov r4d, 32 cmp wd, 16 cmovg maxbased, r4d movd [rsp+28], xm3 pmullw m4, m7 cmp r5d, 3 jne .h16_filter_w16_3tap punpckhwd m0, m0 vpblendd m3, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m0, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m3, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f movzx r4d, word [tlq-62] movzx r2d, word [tlq-60] or maxbased, 1 paddw m3, m3 sub r2d, r4d paddw m0, m0 lea r2d, [r2+r4*8+4] paddw m4, m3 shr r2d, 3 paddw m1, m0 mov [rsp+30], r2w .h16_filter_w16_3tap: pxor m0, m0 paddw m4, 
m6 psrlw m4, 3 pavgw m4, m0 mova [rsp+32], m4 .h16_filter_end: psrlw m1, 3 lea tlq, [rsp+94] pavgw m1, m0 mova [rsp+64], m1 .h16_main: movd xm8, dyd neg maxbaseq vpbroadcastw m9, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m8, xm8 lea r4d, [maxbaseq+dyq+15*64] neg dyq movd xm7, r4d sub tlq, 32 lea r4, [dyq+63] vpbroadcastw m7, xm7 or maxbased, 63 psubw m7, [z_base_inc] .h16_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 movu m0, [tlq+r4*2+2] movu m2, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base1 movu m1, [tlq+r5*2+2] movu m3, [tlq+r5*2] lea r5, [r4+dyq] sar r4, 6 ; base3 pand m6, m5, m7 psllw m6, 9 psubw m2, m0 pmulhrsw m2, m6 psraw m6, m7, 15 paddw m7, m8 paddw m0, m2 movu m2, [tlq+r4*2+2] movu m4, [tlq+r4*2] lea r4, [r5+dyq] sar r5, 6 ; base3 vpblendvb m0, m9, m0, m6 pand m6, m5, m7 psllw m6, 9 psubw m3, m1 pmulhrsw m3, m6 psraw m6, m7, 15 paddw m7, m8 paddw m1, m3 vpblendvb m1, m9, m1, m6 pand m6, m5, m7 psllw m6, 9 psubw m4, m2 pmulhrsw m4, m6 psraw m6, m7, 15 paddw m7, m8 paddw m2, m4 movu m3, [tlq+r5*2+2] movu m4, [tlq+r5*2] vpblendvb m2, m9, m2, m6 pand m6, m5, m7 psllw m6, 9 psubw m4, m3 pmulhrsw m4, m6 psraw m6, m7, 15 paddw m7, m8 lea r5, [dstq+strideq*4] paddw m3, m4 vpblendvb m3, m9, m3, m6 punpckhwd m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8 a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m0, m1 ; af bf ae be ad bd ac bc a7 b7 a6 b6 a5 b5 a4 b4 punpckhwd m1, m2, m3 ; cb db ca da c9 d9 c8 d8 c3 d3 c2 d2 c1 d1 c0 d0 punpcklwd m2, m3 ; cf df ce de cd dd cc dc c7 d7 c6 d6 c5 d5 c4 d4 punpckhdq m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8 a1 b1 c1 d1 a0 b0 c0 d0 vextracti128 xm6, m3, 1 punpckldq m4, m1 ; ab bb cb db aa ba ca da a3 b3 c3 d3 a2 b2 c2 d2 punpckhdq m1, m0, m2 ; ad bd cd dd ac bc cc dc a5 b5 c5 d5 a4 b4 c4 d4 punpckldq m0, m2 ; af bf cf df ae be ce de a7 b7 c7 d7 a6 b6 c6 d6 vextracti128 xm2, m4, 1 movhps [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm6 vextracti128 xm6, m1, 1 movhps [dstq+strideq*2], xm2 movq [dstq+r7 ], xm2 vextracti128 xm2, m0, 1 movhps [r5 +strideq*0], xm6 movq [r5 +strideq*1], xm6 movhps [r5 +strideq*2], xm2 movq [r5 +r7 ], xm2 lea r5, [dstq+strideq*8] movhps [r5 +strideq*0], xm3 movq [r5 +strideq*1], xm3 movhps [r5 +strideq*2], xm4 movq [r5 +r7 ], xm4 lea r5, [r5+strideq*4] movhps [r5 +strideq*0], xm1 movq [r5 +strideq*1], xm1 movhps [r5 +strideq*2], xm0 movq [r5 +r7 ], xm0 sub wd, 4 jz .h16_end add dstq, 8 cmp r4d, maxbased jg .h16_loop mov hd, 4 .h16_end_loop0: mov r6d, wd mov r2, dstq test wb, 4 jz .h16_end_loop movq [dstq+strideq*0], xm9 movq [dstq+strideq*1], xm9 movq [dstq+strideq*2], xm9 movq [dstq+r7 ], xm9 and r6d, 120 jz .h16_end_w4 add dstq, 8 .h16_end_loop: mova [dstq+strideq*0], xm9 mova [dstq+strideq*1], xm9 mova [dstq+strideq*2], xm9 mova [dstq+r7 ], xm9 add dstq, 16 sub r6d, 8 jg .h16_end_loop .h16_end_w4: lea dstq, [r2+strideq*4] dec hd jg .h16_end_loop0 .h16_end: RET .h32: %assign stack_offset org_stack_offset ALLOC_STACK -160, 9 lea maxbased, [wq+31] and maxbased, 31 or maxbased, 32 ; imin(w+31, 63) test angled, 0x400 jnz .h32_main vpbroadcastd m2, [pw_3] movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i punpckhwd m1, m0, m0 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m2 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f lea r4, [rsp+128] paddw m0, m1 lea r5d, [maxbaseq-31] psrlw m0, 2 mova [r4], m0 .h32_filter_loop: mova m0, [tlq-62] paddw m1, m2, [tlq-66] paddw m0, [tlq-64] pavgw m1, [tlq-58] paddw m0, [tlq-60] sub tlq, 32 
sub r4, 32 paddw m0, m1 psrlw m0, 2 mova [r4], m0 sub r5d, 16 jg .h32_filter_loop jl .h32_filter_h8 mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e movzx r5d, word [tlq-62] movzx r2d, word [tlq-60] pavgw m2, m3 sub r2d, r5d paddw m0, m1 lea r2d, [r2+r5*8+4] paddw m0, m2 shr r2d, 3 psrlw m0, 2 mova [r4-32], m0 mov [r4-36], r5w mov [r4-34], r2w lea tlq, [rsp+158] mov r4d, 65 cmp wd, 64 cmove maxbased, r4d jmp .h32_main .h32_filter_h8: mova xm0, [tlq-46] ; 0 1 2 3 4 5 6 7 pblendw xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6 paddw xm2, [tlq-42] ; 2 3 4 5 6 7 8 9 paddw xm0, [tlq-44] ; 1 2 3 4 5 6 7 8 vpblendd xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5 lea tlq, [rsp+158] pavgw xm2, xm3 paddw xm0, xm1 paddw xm0, xm2 psrlw xm0, 2 mova [r4-16], xm0 .h32_main: movd xm6, dyd neg maxbaseq vpbroadcastw m7, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m6, xm6 lea r4d, [maxbaseq+dyq+15*64] neg dyq movd xm4, r4d vpbroadcastd m8, [pw_m1024] lea r4, [dyq+63] vpbroadcastw m4, xm4 or maxbased, 63 psubw m4, [z_base_inc] .h32_loop: mov r5, r4 sar r5, 6 movu m1, [tlq+r5*2-64] movu m0, [tlq+r5*2-62] pand m3, m5, m4 psllw m3, 9 psubw m1, m0 pmulhrsw m1, m3 pcmpgtw m2, m8, m4 paddw m0, m1 vpblendvb m0, m7, m0, m2 movu m2, [tlq+r5*2-32] movu m1, [tlq+r5*2-30] add r4, dyq sub rsp, 64 psubw m2, m1 pmulhrsw m2, m3 psraw m3, m4, 15 paddw m4, m6 mova [rsp+32*0], m0 paddw m1, m2 vpblendvb m1, m7, m1, m3 mova [rsp+32*1], m1 dec wd jz .h32_transpose cmp r4d, maxbased jg .h32_loop .h32_end_loop: sub rsp, 64 mova [rsp+32*0], m7 mova [rsp+32*1], m7 dec wd jg .h32_end_loop .h32_transpose: lea r3, [strideq*3] lea r4, [strideq*5] mov r8, dstq lea r5, [strideq+r3*2] .h32_transpose_loop0: lea r6, [rsp+32] lea r2, [r8+org_wq*2-16] .h32_transpose_loop: mova m0, [r6+64*7] mova m1, [r6+64*6] mova m2, [r6+64*5] mova m3, [r6+64*4] mova m4, [r6+64*3] mova m5, [r6+64*2] mova m6, [r6+64*1] mova m7, [r6+64*0] punpckhwd m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0 punpcklwd m0, m1 ; a7 b7 a6 b6 a5 b5 a4 b4 punpckhwd m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0 punpcklwd m2, m3 ; c7 d7 c6 d6 c5 d5 c4 d4 punpckhwd m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0 punpcklwd m4, m5 ; e7 f7 e6 f6 e5 f5 e4 f4 punpckhwd m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0 punpcklwd m6, m7 ; g7 h7 g6 h6 g5 h5 g4 h4 lea dstq, [r2+strideq*8] sub r6, 32 punpckhdq m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0 punpckldq m8, m1 ; a3 b3 c3 d3 a2 b2 c2 d2 punpckhdq m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0 punpckldq m3, m5 ; e3 f3 g3 h3 e2 f2 g2 h2 punpckhqdq m5, m7, m1 ; 8 0 vextracti128 [r2 +strideq*0], m5, 1 punpcklqdq m7, m1 ; 9 1 mova [dstq+strideq*0], xm5 punpckhqdq m1, m8, m3 ; 10 2 vextracti128 [r2 +strideq*1], m7, 1 punpcklqdq m8, m3 ; 11 3 mova [dstq+strideq*1], xm7 punpckhdq m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4 vextracti128 [r2 +strideq*2], m1, 1 punpckldq m0, m2 ; a7 b7 c7 d7 a6 b6 c6 d6 mova [dstq+strideq*2], xm1 punpckhdq m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4 vextracti128 [r2 +r3 ], m8, 1 punpckldq m4, m6 ; e7 f7 g7 h7 e6 f6 g6 h6 mova [dstq+r3 ], xm8 punpckhqdq m6, m3, m2 ; 12 4 vextracti128 [r2 +strideq*4], m6, 1 punpcklqdq m3, m2 ; 13 5 mova [dstq+strideq*4], xm6 punpckhqdq m2, m0, m4 ; 14 6 vextracti128 [r2 +r4 ], m3, 1 punpcklqdq m0, m4 ; 15 7 mova [dstq+r4 ], xm3 vextracti128 [r2 +r3*2 ], m2, 1 mova [dstq+r3*2 ], xm2 vextracti128 [r2 +r5 ], 
m0, 1 mova [dstq+r5 ], xm0 lea r2, [dstq+strideq*8] cmp r6, rsp jae .h32_transpose_loop add rsp, 64*8 sub org_wd, 8 jg .h32_transpose_loop0 .h32_end: RET .h64: %assign stack_offset org_stack_offset ALLOC_STACK -256, 10 lea maxbased, [wq+63] test angled, 0x400 jnz .h64_main vpbroadcastd m2, [pw_3] movu m0, [tlq-28] ; 3 4 5 6 7 8 9 a b c d e f g h i punpckhwd m1, m0, m0 vpblendd m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b c d e f g h i i paddw m0, [tlq-30] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m1, m2 paddw m0, [tlq-32] ; 1 2 3 4 5 6 7 8 9 a b c d e f g pavgw m1, [tlq-34] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f lea r4, [rsp+224] paddw m0, m1 lea r5d, [wq+32] psrlw m0, 2 mova [r4], m0 .h64_filter_loop: mova m0, [tlq-62] paddw m1, m2, [tlq-66] paddw m0, [tlq-64] pavgw m1, [tlq-58] paddw m0, [tlq-60] sub tlq, 32 sub r4, 32 paddw m0, m1 psrlw m0, 2 mova [r4], m0 sub r5d, 16 jg .h64_filter_loop mova m0, [tlq-62] ; 0 1 2 3 4 5 6 7 8 9 a b c d e f punpcklwd xm1, xm0, xm0 paddw m2, [tlq-58] ; 2 3 4 5 6 7 8 9 a b c d e f g h paddw m0, [tlq-60] ; 1 2 3 4 5 6 7 8 9 a b c d e f g vpblendd m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5 6 7 8 9 a b c d vpblendd m1, [tlq-64], 0xfe ; 0 0 1 2 3 4 5 6 7 8 9 a b c d e lea tlq, [rsp+254] pavgw m2, m3 paddw m0, m1 paddw m0, m2 psrlw m0, 2 mova [r4-32], m0 .h64_main: neg maxbaseq movd xm4, dyd vpbroadcastw m6, [tlq+maxbaseq*2] shl maxbased, 6 vpbroadcastw m4, xm4 lea r4d, [maxbaseq+dyq+15*64] neg dyq vpbroadcastd m7, [pw_m1024] movd xm3, r4d lea r4, [dyq+63] paddw m8, m7, m7 vpbroadcastw m3, xm3 or maxbased, 63 paddw m9, m8, m7 psubw m3, [z_base_inc] .h64_loop: mov r5, r4 sar r5, 6 movu m1, [tlq+r5*2-128] movu m0, [tlq+r5*2-126] pand m2, m5, m3 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 sub rsp, 128 paddw m0, m1 pcmpgtw m1, m9, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*0], m0 movu m1, [tlq+r5*2-96] movu m0, [tlq+r5*2-94] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m8, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*1], m0 movu m1, [tlq+r5*2-64] movu m0, [tlq+r5*2-62] psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 pcmpgtw m1, m7, m3 vpblendvb m0, m6, m0, m1 mova [rsp+32*2], m0 movu m1, [tlq+r5*2-32] movu m0, [tlq+r5*2-30] psubw m1, m0 pmulhrsw m1, m2 add r4, dyq psraw m2, m3, 15 paddw m3, m4 paddw m0, m1 vpblendvb m0, m6, m0, m2 mova [rsp+32*3], m0 dec wd jz .h64_transpose cmp r4d, maxbased jg .h64_loop .h64_end_loop: sub rsp, 128 mova [rsp+32*0], m6 mova [rsp+32*1], m6 mova [rsp+32*2], m6 mova [rsp+32*3], m6 dec wd jg .h64_end_loop .h64_transpose: lea r2, [strideq*3] lea r3, [strideq*5] mov r5, dstq lea r4, [strideq+r2*2] .h64_transpose_loop0: lea r6, [rsp+112] lea dstq, [r5+org_wq*2-32] .h64_transpose_loop: mova xm0, [r6+128*15] vinserti128 m0, [r6+128* 7], 1 mova xm1, [r6+128*14] vinserti128 m1, [r6+128* 6], 1 mova xm2, [r6+128*13] vinserti128 m2, [r6+128* 5], 1 mova xm3, [r6+128*12] vinserti128 m3, [r6+128* 4], 1 mova xm4, [r6+128*11] vinserti128 m4, [r6+128* 3], 1 mova xm5, [r6+128*10] vinserti128 m5, [r6+128* 2], 1 mova xm6, [r6+128* 9] vinserti128 m6, [r6+128* 1], 1 mova xm7, [r6+128* 8] vinserti128 m7, [r6+128* 0], 1 punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m4, m5 punpcklwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 sub r6, 16 punpckhdq m7, m8, m1 punpckldq m8, m1 punpckhdq m1, m3, m5 punpckldq m3, m5 punpckhqdq m5, m7, m1 punpcklqdq m7, m1 punpckhqdq m1, m8, m3 punpcklqdq m8, m3 punpckhdq m3, m0, m2 mova [dstq+strideq*0], m5 punpckldq m0, m2 mova [dstq+strideq*1], m7 punpckhdq m2, m4, m6 mova [dstq+strideq*2], m1 punpckldq m4, 
m6 mova [dstq+r2 ], m8 punpckhqdq m6, m3, m2 mova [dstq+strideq*4], m6 punpcklqdq m3, m2 mova [dstq+r3 ], m3 punpckhqdq m2, m0, m4 mova [dstq+r2*2 ], m2 punpcklqdq m0, m4 mova [dstq+r4 ], m0 lea dstq, [dstq+strideq*8] cmp r6, rsp jae .h64_transpose_loop add rsp, 128*16 sub org_wd, 16 jg .h64_transpose_loop0 .h64_end: RET %macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax %ifnum %4 pshufb xm%2, xm%4 %else pshufb xm%2, %4 %endif vinserti128 m%2, xm%2, 1 pshufd m%1, m%2, q0000 pmaddwd m%1, m2 pshufd m%3, m%2, q1111 pmaddwd m%3, m3 paddd m%1, m1 paddd m%1, m%3 pshufd m%3, m%2, q2222 pmaddwd m%3, m4 paddd m%1, m%3 pshufd m%3, m%2, q3333 pmaddwd m%3, m5 paddd m%1, m%3 psrad m%1, 4 packusdw m%1, m%1 pminsw m%1, m%5 %endmacro %macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax pshufb m%2, m%6 vpermq m%4, m%2, q3232 vinserti128 m%2, xm%2, 1 pshufd m%1, m%2, q0000 pshufd m%3, m%4, q0000 pmaddwd m%1, m2 pmaddwd m%3, m2 paddd m%1, m1 paddd m%3, m1 pshufd m%5, m%2, q1111 pmaddwd m%5, m3 paddd m%1, m%5 pshufd m%5, m%4, q1111 pmaddwd m%5, m3 paddd m%3, m%5 pshufd m%5, m%2, q2222 pmaddwd m%5, m4 paddd m%1, m%5 pshufd m%5, m%4, q2222 pmaddwd m%5, m4 paddd m%3, m%5 pshufd m%5, m%2, q3333 pmaddwd m%5, m5 paddd m%1, m%5 pshufd m%5, m%4, q3333 pmaddwd m%5, m5 paddd m%3, m%5 psrad m%1, 4 psrad m%3, 4 packusdw m%1, m%3 pminsw m%1, m%7 %endmacro ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. One redundant ; block is calculated for w8 and w16, two for w32. ; w4 w8 w16 w32 ; 1 1 2 1 2 3 5 1 2 3 5 b c d f ; 2 2 3 2 4 5 7 2 4 5 7 c e f h ; 3 3 4 4 6 7 9 4 6 7 9 e g h j ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter %assign org_stack_offset stack_offset %define base r6-ipred_filter_16bpc_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 add filterq, r6 lea r6, [ipred_filter_16bpc_avx2_table] vbroadcasti128 m0, [tlq-6] movsxd wq, [r6+wq*4] vpbroadcastd m1, [base+pd_8] pmovsxbw m2, [filterq+16*0] pmovsxbw m3, [filterq+16*1] pmovsxbw m4, [filterq+16*2] pmovsxbw m5, [filterq+16*3] add wq, r6 mov hd, hm jmp wq .w4: WIN64_SPILL_XMM 10 mova xm8, [base+filter_shuf2] vpbroadcastw m9, r8m ; bitdepth_max lea r7, [6+hq*2] sub tlq, r7 jmp .w4_loop_start .w4_loop: pinsrq xm0, [tlq+hq*2], 0 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER_1BLK 6, 0, 7, 8, 9 vextracti128 xm0, m6, 1 movq [dstq+strideq*0], xm6 movq [dstq+strideq*1], xm0 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vbroadcasti128 m14, [base+filter_shuf3] vpbroadcastw m15, r8m ; bitdepth_max FILTER_1BLK 10, 0, 7, [base+filter_shuf2], 15 vpermq m6, m10, q1302 ; ____ ____ | ____ 4321 pslldq m8, m0, 4 psrldq m7, m6, 2 psrldq m0, m6, 10 punpcklwd m7, m0 vpblendd m8, m6, 0x33 ; _0__ 4321 | ____ 4321 vpblendd m8, m7, 0x40 ; _056 4321 | ____ 4321 vpblendd m8, [tlq-6], 0x30 ; _056 4321 | ____ 4321 lea r7, [16+hq*2] sub tlq, r7 jmp .w8_loop_start .w8_loop: vpermq m8, m9, q1302 ; ____ 4321 | ____ 4321 vpermq m6, m9, q2031 psrldq m0, m6, 2 psrldq m6, 10 punpcklwd m6, m0 vpblendd m8, m7, 0x80 ; _0__ 4321 | ____ 4321 vpblendd m8, m6, 0x40 ; _056 4321 | ____ 4321 mova m10, m9 .w8_loop_start: vpblendd m8, [tlq+hq*2], 0x0C ; _056 4321 | _056 4321 call .main vpblendd m10, m9, 0xCC mova [dstq+strideq*0], xm10 vextracti128 
[dstq+strideq*1], m10, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: %assign stack_offset stack_offset - stack_size_padded ALLOC_STACK 32, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 TAIL_CALL .w16_main, 0 .w16_main: mova xm10, [base+filter_shuf2] FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 vpbroadcastq m0, [tlq+10] vpblendd m0, [tlq-16], 0x4C ; ___0 4321 | _056 ____ psrldq m6, m12, 8 vpblendd m0, m6, 0x03 ; ___0 4321 | _056 4321 punpcklwd m6, m12 vpblendd m0, m6, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm8, xm12, 12 vpblendd xm6, xm8, 0x01 pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 8, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 movu m8, [tlq+6] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 lea r7, [20+hq*2] sub tlq, r7 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w16_loop_start .w16_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd m9, m7, 0x40 mova m0, [rsp+8] mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w16_loop_start: mova m13, m12 vpblendd m0, [tlq+hq*2], 0x0C psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+8], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 ret ALIGN function_align .w32: %assign stack_offset org_stack_offset ALLOC_STACK 64, 16 vpbroadcastw m15, r8m ; bitdepth_max sub hd, 2 lea r3, [dstq+32] lea r5d, [hd*2+20] call .w16_main mov dstq, r3 lea tlq, [tlq+r5+32] sub r5d, 20 shr r5d, 1 sub r5d, 2 lea r4, [dstq+strideq*2-2] DEFINE_ARGS dst, stride, tl, stride3, left, h lea stride3q, [strideq*3] movu m8, [tlq-6] ; 4321 0___ mova xm10, [base+filter_shuf2] pinsrw xm0, xm8, [dstq+strideq*0-2], 2 pinsrw xm0, xm0, [dstq+strideq*1-2], 1 ; 4321 056_ pinsrw xm9, [leftq+strideq*0], 5 pinsrw xm9, [leftq+strideq*1], 4 FILTER_1BLK 13, 0, 6, 10, 15 vpermq m12, m13, q3120 mova xm14, [base+filter_shuf3] vinserti128 m14, [base+filter_shuf1], 1 psrldq m6, m12, 8 punpcklwd m7, m6, m12 vpblendd m0, m6, 0x03 ; ___0 ____ | _0__ 4321 vpblendd m0, m7, 0x80 ; 56_0 ____ | _0__ 4321 vpblendd m0, m8, 0x30 ; 56_0 4321 | _0__ 4321 vpblendd m0, m9, 0x04 ; 56_0 
4321 | _056 4321 FILTER_2BLK 12, 0, 6, 7, 8, 14, 15 vpblendd m13, m12, 0xCC pinsrw xm9, [leftq+strideq*2], 3 pinsrw xm9, [leftq+stride3q ], 2 lea leftq, [leftq+strideq*4] pinsrw xm9, [leftq+strideq*0], 1 pinsrw xm9, [leftq+strideq*1], 0 movq [rsp+32], xm9 mov r7d, 1 pslldq m8, m9, 4 vpblendd m0, m8, 0x0C ; ___0 ____ | _056 ____ vpermq m12, m12, q2031 ; 6___ 5___ psrldq xm6, xm12, 2 psrldq xm7, xm12, 12 vpblendd xm6, xm7, 0x01 ; ____ _56_ pblendw xm6, [tlq+10], 0xF8 ; 4321 056_ FILTER_1BLK 11, 6, 7, 10, 15 vpermq m11, m11, q3120 pshufd m9, m11, q1032 vbroadcasti128 m8, [tlq+22] ; __43 210_ | ____ ____ pshufd m8, m8, q3021 ; __0_ 4321 | ____ ____ pshufhw m8, m8, q3201 ; ___0 4321 | ____ ____ vpblendd m9, m8, 0x70 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 vpermq m6, m12, q0123 ; ____ 4321 | ____ 4321 jmp .w32_loop_start .w32_loop_last: mova m0, [rsp+0] jmp .w32_loop .w32_loop_left: mova m0, [rsp+0] vpblendd m0, [rsp+32+r7*4-12], 0x0C dec r7d jg .w32_loop cmp hd, 2 je .w32_loop pinsrw xm6, [rsp+32], 6 pinsrw xm6, [leftq+strideq*2], 5 pinsrw xm6, [leftq+stride3q ], 4 lea leftq, [leftq+strideq*4] pinsrw xm6, [leftq+strideq*0], 3 pinsrw xm6, [leftq+strideq*1], 2 pinsrw xm6, [leftq+strideq*2], 1 pinsrw xm6, [leftq+stride3q ], 0 lea leftq, [leftq+strideq*4] movu [rsp+36], xm6 pinsrw xm6, [leftq+strideq*0], 1 pinsrw xm6, [leftq+strideq*1], 0 movd [rsp+32], xm6 mov r7d, 4 .w32_loop: vpermq m13, m13, q3322 vpermq m11, m9, q2020 vpermq m9, m9, q1302 vpermq m6, m12, q0123 psrldq m7, 4 vpblendd m13, m10, 0xCC vpblendd m9, m7, 0x40 ; ___0 4321 | ____ 4321 mova [dstq+strideq*0], xm13 vextracti128 [dstq+strideq*1], m13, 1 .w32_loop_start: mova m13, m12 psrldq m7, m12, 8 punpcklwd m7, m12 vpblendd m0, m6, 0x33 ; ___0 4321 | _056 4321 vpblendd m0, m7, 0x80 ; 56_0 4321 | _056 4321 FILTER_2BLK 10, 0, 6, 7, 8, 14, 15 vpermq m12, m10, q2031 mova [rsp+0], m0 psrldq m8, m11, 8 psrldq xm6, xm12, 2 psrldq xm7, xm12, 10 psrldq xm0, xm13, 2 punpcklwd m8, m11 punpcklwd xm7, xm6 vpblendd m8, m9, 0x73 ; 56_0 4321 | ____ 4321 vpblendd m8, m7, 0x04 ; 56_0 4321 | __56 4321 vpblendd m8, m0, 0x08 ; 56_0 4321 | _056 4321 call .main vpermq m8, m11, q3120 vpblendd m6, m8, m9, 0xCC mova [dstq+strideq*0+16], xm6 vextracti128 [dstq+strideq*1+16], m6, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop_left jz .w32_loop_last vpermq m8, m9, q3120 vextracti128 xm0, m8, 1 ; 4321 ____ pshufd xm11, xm11, q1032 vpblendd xm0, xm11, 0x02 ; 4321 0___ psrldq xm6, xm8, 2 psrldq xm7, xm8, 12 pblendw xm0, xm6, 0x4 ; 4321 05__ pblendw xm0, xm7, 0x2 ; 4321 056_ FILTER_1BLK 6, 0, 7, [base+filter_shuf2], 15 vpermq m12, m13, q1302 vpblendd m12, m10, 0xCC vpblendd m9, m6, 0xCC mova [dstq+strideq*0+ 0], xm12 mova [dstq+strideq*0+16], xm9 vextracti128 [dstq+strideq*1+ 0], m12, 1 vextracti128 [dstq+strideq*1+16], m9, 1 RET .main: FILTER_2BLK 9, 8, 6, 7, 0, 14, 15 ret %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm add tlq, 2 movd xm4, wd pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt wd, wd movd xm5, wd movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+wq*4] add r6, t0 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, 
stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half sub tlq, hq movd xm4, hd sub tlq, hq pxor m6, m6 vpbroadcastw m7, r7m pavgw xm4, xm6 tzcnt r6d, hd movd xm5, r6d movu m0, [tlq] lea t0, [ipred_cfl_left_16bpc_avx2_table] movsxd r6, [t0+r6*4] add r6, t0 add t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table tzcnt wd, wd movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: paddw m0, [tlq+32] .h16: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h8: psrldq xm1, xm0, 8 paddw xm0, xm1 .h4: punpcklwd xm0, xm6 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 paddd xm0, xm4 psrld xm0, xm5 vpbroadcastw m0, xm0 jmp wq cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea t0d, [wq+hq] movd xm4, t0d tzcnt t0d, t0d movd xm5, t0d lea t0, [ipred_cfl_16bpc_avx2_table] tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw xm4, 1 pxor m6, m6 vpbroadcastw m7, r7m add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movq xm0, [tlq-8] jmp wq .w4: movq xm1, [tlq+2] paddw m0, m4 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrld m1, m0, 16 paddw m0, m1 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: vextracti128 xm1, m0, 1 paddw xm0, xm1 lea r2d, [hq*2] mov r6d, 0xAAAB6667 shrx r6d, r6d, r2d punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 movd xm1, r6d psrld xm0, 2 pmulhuw xm0, xm1 psrlw xm0, 1 .w4_end: vpbroadcastw m0, xm0 .s4: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] IPRED_CFL 4 pmaxsw m4, m6 pminsw m4, m7 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*2], xm5 movhps [dstq+strideq*1], xm4 movhps [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: mova xm0, [tlq-16] jmp wq .w8: vextracti128 xm1, m0, 1 paddw xm0, [tlq+2] paddw xm0, xm4 paddw xm0, xm1 psrld xm1, xm0, 16 paddw xm0, xm1 pblendw xm0, xm6, 0xAA psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w8_end: vpbroadcastw m0, xm0 .s8: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], xm4 mova [dstq+strideq*2], xm5 vextracti128 [dstq+strideq*1], m4, 1 vextracti128 [dstq+r6 ], m5, 1 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova m0, [tlq-32] jmp wq .w16: paddw m0, [tlq+2] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpckhwd xm1, xm0, xm6 punpcklwd xm0, xm6 paddd xm0, xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w16_end: vpbroadcastw m0, xm0 .s16: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+strideq*0], m4 mova [dstq+strideq*1], m5 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-64] paddw m0, [tlq-32] jmp wq .w32: paddw m0, [tlq+ 2] paddw m0, [tlq+34] vextracti128 xm1, m0, 1 paddw xm0, xm4 paddw xm0, xm1 punpcklwd xm1, xm0, xm6 punpckhwd xm0, xm6 paddd xm0, 
xm1 psrlq xm1, xm0, 32 paddd xm0, xm1 psrldq xm1, xm0, 8 paddd xm0, xm1 psrld xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x6667AAAB shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 psrlw xm0, 1 .w32_end: vpbroadcastw m0, xm0 .s32: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 pmaxsw m4, m6 pmaxsw m5, m6 pminsw m4, m7 pminsw m5, m7 mova [dstq+32*0], m4 mova [dstq+32*1], m5 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha mov r6d, r7m shr r6d, 11 lea t0, [ipred_cfl_splat_16bpc_avx2_table] tzcnt wd, wd movifnidn hd, hm movsxd wq, [t0+wq*4] vpbroadcastd m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] pxor m6, m6 vpbroadcastw m7, r7m add wq, t0 movifnidn acq, acmp jmp wq cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_2] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*2] mova xm1, [ypxq+r3 ] vinserti128 m0, [ypxq+strideq*0], 1 vinserti128 m1, [ypxq+strideq*1], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0 mova [acq], xm1 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc vpermq m1, m1, q1111 pslld xm0, 2 .w4_hpad_loop: mova [acq], m1 paddd m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 vextracti128 xm1, m0, 1 paddd m4, m0 packssdw xm1, xm0, xm1 mova [acq], xm1 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc vinserti128 m1, xm1, 1 pslld m0, 2 jmp .hpad .w8_wpad1: pmaddwd xm0, xm5, [ypxq+strideq*0] pmaddwd xm3, xm5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd xm0, xm3 pshufd xm3, xm0, q3333 packssdw xm1, xm0, xm3 paddd xm0, xm3 paddd xm4, xm0 mova [acq], xm1 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad: mova m0, [ypxq+strideq*0+ 0] mova m1, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m2, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m0, m2, 0xf0 vpblendd m1, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m2, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m2, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m2, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] REPX {pmaddwd x, m5}, m0, m1, m2, m3 paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_wpad jmp .w16_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+ 0] pmaddwd m2, m5, [ypxq+strideq*0+32] pmaddwd m1, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m0, m1 paddd m2, m3 packssdw m1, m0, m2 paddd m0, m2 vpermq m1, m1, q3120 paddd m4, m0 mova [acq], m1 add acq, 32 dec hd jg .w16_loop .w16_hpad: add hpadd, hpadd jz .dc paddd m0, m0 .hpad: mova [acq+32*0], m1 paddd m4, m0 mova [acq+32*1], m1 add acq, 32*2 sub hpadd, 4 jg .hpad .dc: vextracti128 xm1, m4, 1 sub r5, acq ; -w*h*2 tzcnt r1d, r5d paddd xm4, xm1 sub r1d, 2 punpckhqdq xm1, xm4, xm4 movd xm0, r1d paddd 
xm1, xm4 pshuflw xm4, xm1, q1032 paddd xm1, xm4 psrld xm1, xm0 pxor xm0, xm0 pavgw xm1, xm0 vpbroadcastw m1, xm1 .dc_loop: mova m0, [acq+r5] psubw m0, m1 mova [acq+r5], m0 add r5, 32 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm vpbroadcastd m5, [pw_4] mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 jg .w16 je .w8 .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: mova xm0, [ypxq+strideq*0] mova xm1, [ypxq+strideq*1] vinserti128 m0, [ypxq+strideq*2], 1 vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] pmaddwd m0, m5 pmaddwd m1, m5 paddd m4, m0 packssdw m0, m1 paddd m4, m1 mova [acq], m0 add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vextracti128 xm1, m1, 1 vpermq m0, m0, q3333 pslld xm1, 2 .w4_hpad_loop: mova [acq], m0 paddd m4, m1 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: mov r5, acq test wpadd, wpadd jnz .w8_wpad1 .w8_loop: pmaddwd m1, m5, [ypxq+strideq*0] pmaddwd m0, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m1, m1, q3131 pslld m0, 2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w8_wpad1: vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m0, [ypxq+strideq*1+12] vinserti128 m1, [ypxq+strideq*0+ 0], 0 vinserti128 m0, [ypxq+strideq*1+ 0], 0 lea ypxq, [ypxq+strideq*2] pmaddwd m1, m5 pmaddwd m0, m5 paddd m4, m1 packssdw m1, m0 paddd m4, m0 vpermq m2, m1, q3120 mova [acq], m2 add acq, 32 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16: mov r5, acq test wpadd, wpadd jnz .w16_wpad .w16_loop: pmaddwd m2, m5, [ypxq+strideq*0+ 0] pmaddwd m1, m5, [ypxq+strideq*0+32] pmaddwd m0, m5, [ypxq+strideq*1+ 0] pmaddwd m3, m5, [ypxq+strideq*1+32] lea ypxq, [ypxq+strideq*2] paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad .w16_wpad: mova m2, [ypxq+strideq*0+ 0] mova m0, [ypxq+strideq*1+ 0] cmp wpadd, 2 jl .w16_wpad1 je .w16_wpad2 vpbroadcastd m1, [ypxq+strideq*0+12] vpbroadcastd m3, [ypxq+strideq*1+12] vpblendd m2, m1, 0xf0 vpblendd m0, m3, 0xf0 jmp .w16_wpad_end .w16_wpad2: vpbroadcastd m1, [ypxq+strideq*0+28] vpbroadcastd m3, [ypxq+strideq*1+28] jmp .w16_wpad_end .w16_wpad1: vpbroadcastd m1, [ypxq+strideq*0+44] vpbroadcastd m3, [ypxq+strideq*1+44] vinserti128 m1, [ypxq+strideq*0+32], 0 vinserti128 m3, [ypxq+strideq*1+32], 0 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] REPX {pmaddwd x, m5}, m2, m0, m1, m3 paddd m4, m2 packssdw m2, m1 paddd m4, m1 packssdw m1, m0, m3 paddd m0, m3 vpermq m2, m2, q3120 paddd m4, m0 vpermq m1, m1, q3120 mova [acq+32*0], m2 mova [acq+32*1], m1 add acq, 32*2 sub hd, 2 jg .w16_wpad jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h lea r6, [ipred_cfl_ac_444_16bpc_avx2_table] tzcnt wd, wm movifnidn hpadd, hpadm vpbroadcastd m5, [pw_1] movsxd wq, [r6+wq*4] shl hpadd, 2 add wq, r6 mov hd, hm pxor m4, m4 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq xm0, [ypxq+strideq*0] movhps xm0, [ypxq+strideq*1] 
vpbroadcastq m1, [ypxq+strideq*2] vpbroadcastq m2, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psllw m0, 3 pmaddwd m1, m0, m5 mova [acq], m0 add acq, 32 paddd m4, m1 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vpermq m0, m0, q3333 paddd m1, m1 mova [acq+32*0], m0 vpermq m1, m1, q3333 mova [acq+32*1], m0 add acq, 32*2 paddd m4, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w8: lea r3, [strideq*3] mov r5, acq .w8_loop: mova xm2, [ypxq+strideq*0] vinserti128 m2, [ypxq+strideq*1], 1 mova xm1, [ypxq+strideq*2] vinserti128 m1, [ypxq+r3 ], 1 lea ypxq, [ypxq+strideq*4] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 4 jg .w8_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc vperm2i128 m1, m1, 0x11 pslld m0, 2 pxor m2, m2 vpblendd m0, m2, 0x0f jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w16_wpad2: vpbroadcastw m3, [ypxq+strideq*0+14] vpbroadcastw m0, [ypxq+strideq*1+14] vpblendd m2, m3, 0xf0 vpblendd m1, m0, 0xf0 jmp .w16_wpad_end .w16: mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] test wpadd, wpadd jnz .w16_wpad2 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] psllw m2, 3 psllw m1, 3 mova [acq+32*0], m2 pmaddwd m2, m5 mova [acq+32*1], m1 pmaddwd m0, m1, m5 add acq, 32*2 paddd m4, m2 paddd m4, m0 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc paddd m0, m0 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad .w32: mov r5, acq test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [ypxq+ 0] mova m1, [ypxq+32] add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_loop .w32_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc paddd m2, m2 .w32_hpad_loop: mova [acq+32*0], m0 mova [acq+32*1], m1 paddd m4, m2 mova [acq+32*2], m0 mova [acq+32*3], m1 add acq, 32*4 sub hpadd, 2 jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc .w32_wpad: mova m0, [ypxq+ 0] cmp wpadd, 4 jl .w32_wpad2 je .w32_wpad4 vpbroadcastw m1, [ypxq+14] vpblendd m0, m1, 0xf0 jmp .w32_wpad_end .w32_wpad4: vpbroadcastw m1, [ypxq+30] jmp .w32_wpad_end .w32_wpad2: vpbroadcastw m1, [ypxq+46] vinserti128 m1, [ypxq+32], 0 .w32_wpad_end: add ypxq, strideq psllw m0, 3 psllw m1, 3 pmaddwd m2, m0, m5 mova [acq+32*0], m0 pmaddwd m3, m1, m5 mova [acq+32*1], m1 add acq, 32*2 paddd m2, m3 paddd m4, m2 dec hd jg .w32_wpad jmp .w32_hpad cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m3, [palq] lea r2, [pal_pred_16bpc_avx2_table] tzcnt wd, wm vbroadcasti128 m4, [pal_pred_shuf] movifnidn hd, hm movsxd wq, [r2+wq*4] pshufb m3, m4 punpckhqdq m4, m3, m3 add wq, r2 DEFINE_ARGS dst, stride, stride3, idx, w, h lea stride3q, [strideq*3] jmp wq .w4: mova xm2, [idxq] add idxq, 16 pshufb xm1, xm3, xm2 pshufb xm2, xm4, xm2 punpcklbw xm0, xm1, xm2 punpckhbw xm1, xm2 movq [dstq+strideq*0], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+strideq*1], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: movu m2, [idxq] ; only 16-byte alignment add idxq, 32 pshufb m1, m3, m2 pshufb m2, m4, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 
vextracti128 [dstq+strideq*2], m0, 1 vextracti128 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: vpermq m2, [idxq+ 0], q3120 vpermq m5, [idxq+32], q3120 add idxq, 64 pshufb m1, m3, m2 pshufb m2, m4, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 pshufb m1, m3, m5 pshufb m2, m4, m5 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpermq m2, [idxq+ 0], q3120 vpermq m5, [idxq+32], q3120 add idxq, 64 pshufb m1, m3, m2 pshufb m2, m4, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0+ 0], m0 mova [dstq+strideq*0+32], m1 pshufb m1, m3, m5 pshufb m2, m4, m5 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*1+ 0], m0 mova [dstq+strideq*1+32], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: vpermq m2, [idxq+ 0], q3120 vpermq m5, [idxq+32], q3120 add idxq, 64 pshufb m1, m3, m2 pshufb m2, m4, m2 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+ 0], m0 mova [dstq+32], m1 pshufb m1, m3, m5 pshufb m2, m4, m5 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+64], m0 mova [dstq+96], m1 add dstq, strideq dec hd jg .w64 RET %endif rav1e-0.7.1/src/x86/ipred16_avx512.asm000064400000000000000000000700711046102023000151520ustar 00000000000000; Copyright © 2022, VideoLAN and dav1d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
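; 16 bpc AVX-512 (Ice Lake) intra prediction: paeth, smooth (v/h/2d),
; palette and filter predictors.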
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 ipred_shuf: db 14, 15, 14, 15, 0, 1, 2, 3, 6, 7, 6, 7, 0, 1, 2, 3 db 10, 11, 10, 11, 8, 9, 10, 11, 2, 3, 2, 3, 8, 9, 10, 11 db 12, 13, 12, 13, 4, 5, 6, 7, 4, 5, 4, 5, 4, 5, 6, 7 db 8, 9, 8, 9, 12, 13, 14, 15, 0, 1, 0, 1, 12, 13, 14, 15 smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5 times 4 db 10, 11, 12, 13, 2, 3, -1, -1 filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7 times 4 db 26, 27, 28, 29, 14, 15, -1, -1 filter_permC: dd 8 ; dq 8, 10, 1, 11, 0, 9 pw_1: times 2 dw 1 dd 10 filter_rnd: dd 32 dd 1 dd 8 dd 11 filter_shift: times 2 dw 6 dd 0 times 2 dw 4 dd 9 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro JMP_TABLE ipred_paeth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern filter_intra_taps SECTION .text %macro PAETH 3 ; top, signed_ldiff, ldiff paddw m0, m%2, m2 psubw m1, m0, m3 ; tldiff psubw m0, m%1 ; tdiff pabsw m1, m1 pabsw m0, m0 pcmpgtw k1, m0, m1 pminsw m0, m1 pcmpgtw k2, m%3, m0 vpblendmw m0{k1}, m%1, m3 vpblendmw m0{k2}, m2, m0 %endmacro INIT_ZMM avx512icl cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl, w, h %define base r6-ipred_paeth_16bpc_avx512icl_table lea r6, [ipred_paeth_16bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastw m3, [tlq] ; topleft add wq, r6 jmp wq .w4: vpbroadcastq m4, [tlq+2] ; top movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w4_loop: sub tlq, 16 vbroadcasti32x4 m2, [tlq] pshufb m2, m7 ; left PAETH 4, 5, 6 vextracti32x4 xm1, m0, 2 vextracti32x4 xm8, ym0, 1 vextracti32x4 xm9, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm8 movq [dstq+r6 ], xm9 sub hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm8 movhps [dstq+r6 ], xm9 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_end: RET .w8: vbroadcasti32x4 m4, [tlq+2] movsldup m7, [base+ipred_shuf] lea r6, [strideq*3] psubw m5, m4, m3 pabsw m6, m5 .w8_loop: sub tlq, 8 vpbroadcastq m2, [tlq] pshufb m2, m7 PAETH 4, 5, 6 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x8 m4, [tlq+2] movsldup m7, [base+ipred_shuf] psubw m5, m4, m3 pabsw m6, m5 .w16_loop: sub tlq, 4 vpbroadcastd m2, [tlq] pshufb m2, m7 PAETH 4, 5, 6 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] 
sub hd, 2 jg .w16_loop RET .w32: movu m4, [tlq+2] psubw m5, m4, m3 pabsw m6, m5 .w32_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET .w64: movu m4, [tlq+ 2] movu m7, [tlq+66] psubw m5, m4, m3 psubw m8, m7, m3 pabsw m6, m5 pabsw m9, m8 .w64_loop: sub tlq, 2 vpbroadcastw m2, [tlq] PAETH 4, 5, 6 mova [dstq+64*0], m0 PAETH 7, 8, 9 mova [dstq+64*1], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq*4] lea weightsq, [base+smooth_weights_1d_16bpc+hq*4] neg hq vpbroadcastw m6, [tlq+hq*2] ; bottom lea wq, [base+ipred_smooth_v_16bpc_avx512icl_table+wq] lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastq m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w4_loop: vbroadcasti32x4 m3, [weightsq+hq*2] pshufb m3, m4 pmulhrsw m3, m5 paddw m3, m6 vextracti32x4 xm0, m3, 3 vextracti32x4 xm1, ym3, 1 vextracti32x4 xm2, m3, 2 movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 add hq, 8 jg .end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jl .w4_loop .end: RET .w8: vbroadcasti32x4 m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w8_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m4 pmulhrsw m0, m5 paddw m0, m6 vextracti32x4 [dstq+strideq*0], m0, 3 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: vbroadcasti32x8 m5, [tlq+2] ; top movsldup m4, [ipred_shuf] psubw m5, m6 ; top - bottom .w16_loop: vpbroadcastd m0, [weightsq+hq*2+0] vpbroadcastd m1, [weightsq+hq*2+4] pshufb m0, m4 pshufb m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 paddw m0, m6 paddw m1, m6 vextracti32x8 [dstq+strideq*0], m0, 1 mova [dstq+strideq*1], ym0 vextracti32x8 [dstq+strideq*2], m1, 1 mova [dstq+stride3q ], ym1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: movu m5, [tlq+2] psubw m5, m6 .w32_loop: vpbroadcastw m0, [weightsq+hq*2+0] vpbroadcastw m1, [weightsq+hq*2+2] vpbroadcastw m2, [weightsq+hq*2+4] vpbroadcastw m3, [weightsq+hq*2+6] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w32_loop RET .w64: movu m4, [tlq+ 2] movu m5, [tlq+66] psubw m4, m6 psubw m5, m6 .w64_loop: vpbroadcastw m1, [weightsq+hq*2+0] vpbroadcastw m3, [weightsq+hq*2+2] pmulhrsw m0, m4, m1 pmulhrsw m1, m5 pmulhrsw m2, m4, m3 pmulhrsw m3, m5 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 mova [dstq+strideq*1+64*0], m2 mova [dstq+strideq*1+64*1], m3 lea dstq, [dstq+strideq*2] add hq, 2 jl .w64_loop RET cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl, w, h, stride3 lea r6, [$$] mov wd, wm movifnidn hd, hm vpbroadcastw m6, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq*4] sub tlq, hq lea stride3q, [strideq*3] lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq] jmp wq .w4: movsldup m4, [base+ipred_shuf] vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2] .w4_loop: vbroadcasti32x4 m0, [tlq+hq-16] ; left pshufb 
m0, m4 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 vextracti32x4 xm1, m0, 2 vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 sub hd, 8*2 jl .end lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w4_loop .end: RET .w8: movsldup m4, [base+ipred_shuf] vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2] .w8_loop: vpbroadcastq m0, [tlq+hq-8] ; left pshufb m0, m4 psubw m0, m6 ; left - right pmulhrsw m0, m5 paddw m0, m6 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w8_loop RET .w16: movsldup m4, [base+ipred_shuf] vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2] .w16_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastd m1, [tlq+hq-8] pshufb m0, m4 pshufb m1, m4 psubw m0, m6 psubw m1, m6 pmulhrsw m0, m5 pmulhrsw m1, m5 paddw m0, m6 paddw m1, m6 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w16_loop RET .w32: movu m5, [base+smooth_weights_1d_16bpc+32*2] .w32_loop: vpbroadcastq m3, [tlq+hq-8] punpcklwd m3, m3 psubw m3, m6 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hq, 4*2 jg .w32_loop RET .w64: movu m4, [base+smooth_weights_1d_16bpc+64*2] movu m5, [base+smooth_weights_1d_16bpc+64*3] .w64_loop: vpbroadcastw m1, [tlq+hq-2] vpbroadcastw m3, [tlq+hq-4] psubw m1, m6 psubw m3, m6 pmulhrsw m0, m4, m1 pmulhrsw m1, m5 pmulhrsw m2, m4, m3 pmulhrsw m3, m5 REPX {paddw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 mova [dstq+strideq*1+64*0], m2 mova [dstq+strideq*1+64*1], m3 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w64_loop RET cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3 lea r6, [$$] mov wd, wm movifnidn hd, hm vpbroadcastw m13, [tlq+wq*2] ; right tzcnt wd, wd add hd, hd movsxd wq, [base+ipred_smooth_16bpc_avx512icl_table+wq*4] mov r5d, 0x55555555 sub tlq, hq mova m14, [base+smooth_perm] kmovd k1, r5d vpbroadcastw m0, [tlq] ; bottom mov r5, 0x3333333333333333 pxor m15, m15 lea wq, [base+ipred_smooth_16bpc_avx512icl_table+wq] kmovq k2, r5 lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2] jmp wq .w4: vpbroadcastq m5, [tlq+hq+2] movshdup m3, [base+ipred_shuf] movsldup m4, [base+ipred_shuf] vbroadcasti32x4 m6, [base+smooth_weights_2d_16bpc+4*4] lea stride3q, [strideq*3] punpcklwd m5, m0 ; top, bottom .w4_loop: vbroadcasti32x4 m0, [v_weightsq] vpbroadcastq m2, [tlq+hq-8] mova m1, m13 pshufb m0, m3 pmaddwd m0, m5 pshufb m1{k2}, m2, m4 ; left, right vpdpwssd m0, m1, m6 vpermb m0, m14, m0 pavgw ym0, ym15 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add v_weightsq, 4*4 sub hd, 4*2 jg .w4_loop RET .w8: vbroadcasti32x4 ym5, [tlq+hq+2] movshdup m6, [base+ipred_shuf] movsldup m7, [base+ipred_shuf] pmovzxwd m5, ym5 vbroadcasti32x8 m8, [base+smooth_weights_2d_16bpc+8*4] lea 
stride3q, [strideq*3] vpblendmw m5{k1}, m0, m5 ; top, bottom .w8_loop: vpbroadcastq m0, [v_weightsq+0] vpbroadcastq m1, [v_weightsq+8] vpbroadcastd m3, [tlq+hq-4] vpbroadcastd m4, [tlq+hq-8] pshufb m0, m6 pmaddwd m0, m5 pshufb m1, m6 pmaddwd m1, m5 mova m2, m13 pshufb m2{k2}, m3, m7 ; left, right mova m3, m13 pshufb m3{k2}, m4, m7 vpdpwssd m0, m2, m8 vpdpwssd m1, m3, m8 add v_weightsq, 4*4 vpermt2b m0, m14, m1 pavgw m0, m15 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w8_loop RET .w16: pmovzxwd m5, [tlq+hq+2] mova m6, [base+smooth_weights_2d_16bpc+16*4] vpblendmw m5{k1}, m0, m5 ; top, bottom .w16_loop: vpbroadcastd m0, [v_weightsq+0] vpbroadcastd m1, [v_weightsq+4] pmaddwd m0, m5 pmaddwd m1, m5 mova m2, m13 vpbroadcastw m2{k1}, [tlq+hq-2] ; left, right mova m3, m13 vpbroadcastw m3{k1}, [tlq+hq-4] vpdpwssd m0, m2, m6 vpdpwssd m1, m3, m6 add v_weightsq, 2*4 vpermt2b m0, m14, m1 pavgw m0, m15 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w16_loop RET .w32: pmovzxwd m5, [tlq+hq+ 2] pmovzxwd m6, [tlq+hq+34] mova m7, [base+smooth_weights_2d_16bpc+32*4] mova m8, [base+smooth_weights_2d_16bpc+32*6] vpblendmw m5{k1}, m0, m5 ; top, bottom vpblendmw m6{k1}, m0, m6 .w32_loop: vpbroadcastd m2, [v_weightsq+0] vpbroadcastd m3, [v_weightsq+4] pmaddwd m0, m5, m2 pmaddwd m2, m6 pmaddwd m1, m5, m3 pmaddwd m3, m6 mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right vpdpwssd m0, m4, m7 vpdpwssd m2, m4, m8 mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-4] vpdpwssd m1, m4, m7 vpdpwssd m3, m4, m8 add v_weightsq, 2*4 vpermt2b m0, m14, m2 vpermt2b m1, m14, m3 pavgw m0, m15 pavgw m1, m15 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hq, 2*2 jg .w32_loop RET .w64: pmovzxwd m5, [tlq+hq+ 2] pmovzxwd m6, [tlq+hq+34] pmovzxwd m7, [tlq+hq+66] pmovzxwd m8, [tlq+hq+98] mova m9, [base+smooth_weights_2d_16bpc+64*4] vpblendmw m5{k1}, m0, m5 ; top, bottom mova m10, [base+smooth_weights_2d_16bpc+64*5] vpblendmw m6{k1}, m0, m6 mova m11, [base+smooth_weights_2d_16bpc+64*6] vpblendmw m7{k1}, m0, m7 mova m12, [base+smooth_weights_2d_16bpc+64*7] vpblendmw m8{k1}, m0, m8 .w64_loop: vpbroadcastd m3, [v_weightsq] mova m4, m13 vpbroadcastw m4{k1}, [tlq+hq-2] ; left, right pmaddwd m0, m5, m3 pmaddwd m2, m6, m3 pmaddwd m1, m7, m3 pmaddwd m3, m8 vpdpwssd m0, m4, m9 vpdpwssd m2, m4, m10 vpdpwssd m1, m4, m11 vpdpwssd m3, m4, m12 add v_weightsq, 1*4 vpermt2b m0, m14, m2 vpermt2b m1, m14, m3 pavgw m0, m15 pavgw m1, m15 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq sub hd, 1*2 jg .w64_loop RET cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_16bpc_avx512icl_table] tzcnt wd, wm mova m2, [pal_pred_perm] movsxd wq, [r6+wq*4] mova xm3, [palq] movifnidn hd, hm add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: pmovzxbw ym0, [idxq] add idxq, 16 vpermw ym0, ym0, ym3 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m0, [idxq] add idxq, 32 vpermw m0, m0, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: vpermb m1, m2, [idxq] add idxq, 64 vpermw m0, m1, m3 psrlw m1, 8 vpermw 
m1, m1, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpermb m1, m2, [idxq] add idxq, 64 vpermw m0, m1, m3 psrlw m1, 8 vpermw m1, m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: vpermb m1, m2, [idxq] add idxq, 64 vpermw m0, m1, m3 psrlw m1, 8 vpermw m1, m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w64 RET ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. ; w4 w8 w16 w32 ; 1 1 2 1 2 5 6 1 2 5 6 9 a d e ; 2 2 3 2 3 6 7 2 3 6 7 a b e f ; 3 3 4 3 4 7 8 3 4 7 8 b c f g ; 4 4 5 4 5 8 9 4 5 8 9 c d g h cglobal ipred_filter_16bpc, 4, 7, 14, dst, stride, tl, w, h, filter, top %define base r6-$$ lea r6, [$$] %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 movifnidn hd, hm movu xm0, [tlq-6] pmovsxbw m7, [base+filter_intra_taps+filterq+32*0] pmovsxbw m8, [base+filter_intra_taps+filterq+32*1] mov r5d, r8m ; bitdepth_max movsldup m9, [base+filter_permA] movshdup m10, [base+filter_permA] shr r5d, 11 ; is_12bpc jnz .12bpc psllw m7, 2 ; upshift multipliers so that packusdw psllw m8, 2 ; will perform clipping for free .12bpc: vpbroadcastd m5, [base+filter_rnd+r5*8] vpbroadcastd m6, [base+filter_shift+r5*8] sub wd, 8 jl .w4 .w8: call .main4 movsldup m11, [filter_permB] lea r5d, [hq*2+2] movshdup m12, [filter_permB] lea topq, [tlq+2] mova m13, [filter_permC] sub hd, 4 vinserti32x4 ym0, [topq], 1 ; a0 b0 t0 t1 sub tlq, r5 %if WIN64 push r7 push r8 %endif mov r7, dstq mov r8d, hd .w8_loop: movlps xm4, xm0, [tlq+hq*2] call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w8_loop test wd, wd jz .end mov r2d, 0x0d kmovb k1, r2d lea r2, [strideq*3] .w16: movd xmm0, [r7+strideq*1+12] vpblendd xmm0, [topq+8], 0x0e ; t1 t2 pinsrw xm4, xmm0, [r7+strideq*0+14], 2 call .main8 add r7, 16 vinserti32x4 ym0, [topq+16], 1 ; a2 b2 t2 t3 mov hd, r8d mov dstq, r7 add topq, 16 .w16_loop: movd xmm1, [dstq+strideq*2-4] punpcklwd xm4, xmm1, xmm0 movd xmm0, [dstq+r2-4] shufps xm4{k1}, xmm0, xm0, q3210 call .main8 lea dstq, [dstq+strideq*2] sub hd, 2 jge .w16_loop sub wd, 8 jg .w16 .end: vpermb m2, m11, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m2, m12, m0 vpdpwssd m1, m2, m8 %if WIN64 pop r8 pop r7 %endif vextracti32x8 ym2, m1, 1 paddd ym1, ym2 packusdw ym1, ym1 vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 RET .w4_loop: movlps xm0, [tlq-10] lea dstq, [dstq+strideq*2] sub tlq, 4 .w4: call .main4 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jg .w4_loop RET ALIGN function_align .main4: vpermb m2, m9, m0 mova ym1, ym5 vpdpwssd m1, m2, m7 vpermb m0, m10, m0 vpdpwssd m1, m0, m8 vextracti32x8 ym0, m1, 1 paddd ym0, ym1 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 ; clip vpsrlvw xm0, xm6 ret ALIGN function_align .main8: vpermb m3, m11, m0 mova ym2, ym5 vpdpwssd m2, m3, m7 vpermb m3, m9, m4 mova ym1, ym5 vpdpwssd m1, m3, m7 vpermb m3, m12, m0 vpdpwssd m2, m3, m8 vpermb m3, m10, m4 vpdpwssd m1, m3, m8 vextracti32x8 ym4, m2, 1 vextracti32x8 ym3, m1, 1 paddd ym2, ym4 paddd ym1, ym3 packusdw ym1, ym2 ; clip vpsrlvw ym1, ym6 vpermt2q m0, m13, m1 ; c0 d0 b0 b1 a0 a1 vextracti32x4 [dstq+strideq*0], m0, 2 vextracti32x4 [dstq+strideq*1], ym0, 1 ret %endif 
rav1e-0.7.1/src/x86/ipred16_sse.asm000064400000000000000000004076041046102023000147240ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA filter_shuf: db 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 2, 3, -1, -1 pal_pred_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z_base_inc_z2: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_upsample: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_upsample_l: db -1, -1, -2, -1, -3, -1, -4, -1, 8, 9, 8, 9, 10, 11, 12, 13 db 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 z2_top_shufA: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 z2_top_shufB: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 z2_left_shufA: db 14, 15, 12, 13, 10, 11, 8, 9, 12, 13, 10, 11, 8, 9, 6, 7 z2_left_shufB: db 14, 15, 10, 11, 6, 7, 2, 3, 12, 13, 8, 9, 4, 5, 0, 1 z_filt_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filt_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filt_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filt_wh4: db 7, 7, 19, 7, z_filt_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 ALIGN 8 pb_2_3: times 4 db 2, 3 z2_dy_offset: dw 96*64, 96*64, 95*64, 95*64 z_filt_k: times 4 dw 8 times 4 dw 6 times 4 dw 4 times 4 dw 5 pw_m3584: times 4 dw -3584 pw_m3072: times 4 dw -3072 pw_m2560: times 4 dw -2560 pw_m2048: times 4 dw -2048 pw_m1536: times 4 dw -1536 pw_m1024: times 4 dw -1024 pw_m512: times 4 dw -512 pw_1: times 4 dw 1 pw_2: times 4 dw 2 pw_3: times 4 dw 3 pw_62: times 4 dw 62 pw_256: times 4 dw 256 pw_512: times 4 dw 512 pw_2048: times 4 dw 2048 %define pw_4 (z_filt_k+8*2) %define pw_8 (z_filt_k+8*0) %define pw_m1to4 z2_upsample_l %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 10*4) %define 
ipred_dc_128_16bpc_ssse3_table (ipred_dc_16bpc_ssse3_table + 15*4) %define ipred_cfl_splat_16bpc_ssse3_table (ipred_cfl_16bpc_ssse3_table + 8*4) JMP_TABLE ipred_dc_left_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_dc_16bpc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4, \ s4-15*4, s8-15*4, s16c-15*4, s32c-15*4, s64-15*4 JMP_TABLE ipred_h_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2_16bpc, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3_16bpc, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl_16bpc, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left_16bpc, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_444_16bpc, ssse3, w4, w8, w16, w32 JMP_TABLE pal_pred_16bpc, ssse3, w4, w8, w16, w32, w64 cextern smooth_weights_1d_16bpc cextern smooth_weights_2d_16bpc cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_XMM ssse3 cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_16bpc_ssse3_table movd m4, wm tzcnt wd, wm add tlq, 2 movifnidn hd, hm pxor m3, m3 pavgw m4, m3 movd m5, wd movu m0, [tlq] movsxd r6, [r5+wq*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_16bpc_ssse3_table mov hd, hm movd m4, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm pxor m3, m3 sub tlq, hq pavgw m4, m3 movd m5, r6d movu m0, [tlq] movsxd r6, [r5+r6*4] add r6, r5 add r5, ipred_dc_128_16bpc_ssse3_table-ipred_dc_left_16bpc_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m2, [tlq+112] movu m1, [tlq+ 96] paddw m0, m2 movu m2, [tlq+ 80] paddw m1, m2 movu m2, [tlq+ 64] paddw m0, m2 paddw m0, m1 .h32: movu m1, [tlq+ 48] movu m2, [tlq+ 32] paddw m1, m2 paddw m0, m1 .h16: movu m1, [tlq+ 16] paddw m0, m1 .h8: movhlps m1, m0 paddw m0, m1 .h4: punpcklwd m0, m3 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 lea stride3q, [strideq*3] pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, ipred_dc_16bpc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pxor m3, m3 psrlw m4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq m1, [tlq+2] paddw m1, m0 punpckhwd m0, m3 punpcklwd m1, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 cmp hd, 4 jg .w4_mul psrlw m0, 3 jmp .w4_end .w4_mul: mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d psrld m0, 2 movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 .s4: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 8 je .w8_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 32 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova 
[dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 16 je .w16_end mov r2d, 0xAAAB mov r3d, 0x6667 test hd, 8|32 cmovz r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16c: mova m1, m0 .s16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*2+16*0], m0 mova [dstq+strideq*2+16*1], m1 mova [dstq+stride3q +16*0], m0 mova [dstq+stride3q +16*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m0, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 32 je .w32_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 8 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32c: mova m1, m0 mova m2, m0 mova m3, m0 .s32: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 mova [dstq+strideq*0+16*2], m2 mova [dstq+strideq*0+16*3], m3 mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*3], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s32 RET .h64: mova m0, [tlq-128] mova m1, [tlq-112] paddw m0, [tlq- 96] paddw m1, [tlq- 80] paddw m0, [tlq- 64] paddw m1, [tlq- 48] paddw m0, [tlq- 32] paddw m1, [tlq- 16] paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 2] movu m2, [tlq+ 18] paddw m1, m2 movu m2, [tlq+ 34] paddw m0, m2 movu m2, [tlq+ 50] paddw m1, m2 movu m2, [tlq+ 66] paddw m0, m2 movu m2, [tlq+ 82] paddw m1, m2 movu m2, [tlq+ 98] paddw m0, m2 movu m2, [tlq+114] paddw m1, m2 paddw m0, m1 punpcklwd m1, m0, m3 punpckhwd m0, m3 paddd m0, m1 paddd m4, m0 punpckhqdq m0, m0 paddd m0, m4 pshuflw m1, m0, q1032 paddd m0, m1 psrld m0, m5 cmp hd, 64 je .w64_end mov r2d, 0xAAAB mov r3d, 0x6667 cmp hd, 16 cmove r2d, r3d movd m1, r2d pmulhuw m0, m1 psrlw m0, 1 .w64_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s64: mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .s64 RET cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 mov r6d, r8m LEA r5, ipred_dc_128_16bpc_ssse3_table tzcnt wd, wm shr r6d, 11 movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_128_16bpc_ssse3_table+pw_512+r6*8] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_16bpc_ssse3_table movifnidn hd, hm movu m0, [tlq+ 2] movu m1, [tlq+ 18] movu m2, [tlq+ 34] movu m3, [tlq+ 50] cmp wd, 64 je .w64 tzcnt wd, wd movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w64: WIN64_SPILL_XMM 8 movu m4, [tlq+ 66] movu m5, [tlq+ 82] movu m6, [tlq+ 98] movu m7, [tlq+114] .w64_loop: mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 mova [dstq+16*4], m4 mova [dstq+16*5], m5 mova [dstq+16*6], m6 mova [dstq+16*7], m7 add dstq, strideq dec 
hd jg .w64_loop RET cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 %define base r5-ipred_h_16bpc_ssse3_table tzcnt wd, wm LEA r5, ipred_h_16bpc_ssse3_table movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m2, [base+pw_256] movddup m3, [base+pb_2_3] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: sub tlq, 8 movq m3, [tlq] pshuflw m0, m3, q3333 pshuflw m1, m3, q2222 pshuflw m2, m3, q1111 pshuflw m3, m3, q0000 movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m1 movq [dstq+strideq*2], m2 movq [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: sub tlq, 8 movq m3, [tlq] punpcklwd m3, m3 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16 RET .w32: sub tlq, 4 movd m1, [tlq] pshufb m0, m1, m3 pshufb m1, m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m0 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*0+16*3], m0 mova [dstq+strideq*1+16*0], m1 mova [dstq+strideq*1+16*1], m1 mova [dstq+strideq*1+16*2], m1 mova [dstq+strideq*1+16*3], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET .w64: sub tlq, 2 movd m0, [tlq] pshufb m0, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m0 mova [dstq+16*2], m0 mova [dstq+16*3], m0 mova [dstq+16*4], m0 mova [dstq+16*5], m0 mova [dstq+16*6], m0 mova [dstq+16*7], m0 add dstq, strideq dec hd jg .w64 RET cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w, h, left %define base r5-ipred_paeth_16bpc_ssse3_table movifnidn hd, hm pshuflw m4, [tlq], q0000 mov leftq, tlq add hd, hd punpcklqdq m4, m4 ; topleft sub leftq, hq and wd, ~7 jnz .w8 movddup m5, [tlq+2] ; top psubw m6, m5, m4 pabsw m7, m6 .w4_loop: movd m1, [leftq+hq-4] punpcklwd m1, m1 punpckldq m1, m1 ; left %macro PAETH 0 paddw m0, m6, m1 psubw m2, m4, m0 ; tldiff psubw m0, m5 ; tdiff pabsw m2, m2 pabsw m0, m0 pminsw m2, m0 pcmpeqw m0, m2 pand m3, m5, m0 pandn m0, m4 por m0, m3 pcmpgtw m3, m7, m2 pand m0, m3 pandn m3, m1 por m0, m3 %endmacro PAETH movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %define r7d hm %assign regs_used 7 %elif WIN64 movaps r4m, m8 PUSH r7 %assign regs_used 8 %endif %if ARCH_X86_64 movddup m8, [pw_256] %endif lea tlq, [tlq+wq*2+2] neg wq mov r7d, hd .w8_loop0: movu m5, [tlq+wq*2] mov r6, dstq add dstq, 16 psubw m6, m5, m4 pabsw m7, m6 .w8_loop: movd m1, [leftq+hq-2] %if ARCH_X86_64 pshufb m1, m8 %else pshuflw m1, m1, q0000 punpcklqdq m1, m1 %endif PAETH mova [r6], m0 add r6, strideq sub hd, 1*2 jg .w8_loop mov hd, r7d add wq, 8 jl .w8_loop0 %if WIN64 movaps m8, r4m %endif RET %if ARCH_X86_64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 4 %endif cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov hd, hm lea weightsq, [weightsq+hq*4] neg hq movd m5, [tlq+hq*2] ; bottom pshuflw m5, m5, q0000 punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [tlq+2] ; top lea r3, [strideq*3] psubw m4, m5 ; top - bottom .w4_loop: movq m1, [weightsq+hq*2] punpcklwd m1, m1 pshufd m0, m1, q1100 punpckhdq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movq [dstq+strideq*0], m0 movhps 
[dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET .w8: %if ARCH_X86_32 PUSH r6 %assign regs_used 7 mov hm, hq %define hq hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0, hq movu m4, [tlq+2] add tlq, 16 mov r6, dstq add dstq, 16 psubw m4, m5 .w8_loop: movq m3, [weightsq+t0*2] punpcklwd m3, m3 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] add t0, 4 jl .w8_loop sub wd, 8 jg .w8_loop0 RET cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl, w, h, weights LEA weightsq, smooth_weights_1d_16bpc mov wd, wm movifnidn hd, hm movd m5, [tlq+wq*2] ; right sub tlq, 8 add hd, hd pshuflw m5, m5, q0000 sub tlq, hq punpcklqdq m5, m5 cmp wd, 4 jne .w8 movddup m4, [weightsq+4*2] lea r3, [strideq*3] .w4_loop: movq m1, [tlq+hq] ; left punpcklwd m1, m1 psubw m1, m5 ; left - right pshufd m0, m1, q3322 punpckldq m1, m1 pmulhrsw m0, m4 pmulhrsw m1, m4 paddw m0, m5 paddw m1, m5 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movhps [dstq+strideq*2], m1 movq [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4*2 jg .w4_loop RET .w8: lea weightsq, [weightsq+wq*4] neg wq %if ARCH_X86_32 PUSH r6 %assign regs_used 7 %define hd hm %elif WIN64 PUSH r7 %assign regs_used 8 %endif .w8_loop0: mov t0d, hd mova m4, [weightsq+wq*2] mov r6, dstq add dstq, 16 .w8_loop: movq m3, [tlq+t0*(1+ARCH_X86_32)] punpcklwd m3, m3 psubw m3, m5 pshufd m0, m3, q3333 pshufd m1, m3, q2222 pshufd m2, m3, q1111 pshufd m3, m3, q0000 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 REPX {paddw x, m5}, m0, m1, m2, m3 mova [r6+strideq*0], m0 mova [r6+strideq*1], m1 lea r6, [r6+strideq*2] mova [r6+strideq*0], m2 mova [r6+strideq*1], m3 lea r6, [r6+strideq*2] sub t0d, 4*(1+ARCH_X86_64) jg .w8_loop add wq, 8 jl .w8_loop0 RET %if ARCH_X86_64 DECLARE_REG_TMP 10 %else DECLARE_REG_TMP 3 %endif cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl, w, h, \ h_weights, v_weights, top LEA h_weightsq, smooth_weights_2d_16bpc mov wd, wm mov hd, hm movd m7, [tlq+wq*2] ; right lea v_weightsq, [h_weightsq+hq*8] neg hq movd m6, [tlq+hq*2] ; bottom pshuflw m7, m7, q0000 pshuflw m6, m6, q0000 cmp wd, 4 jne .w8 movq m4, [tlq+2] ; top mova m5, [h_weightsq+4*4] punpcklwd m4, m6 ; top, bottom pxor m6, m6 .w4_loop: movq m1, [v_weightsq+hq*4] sub tlq, 4 movd m3, [tlq] ; left pshufd m0, m1, q0000 pshufd m1, m1, q1111 pmaddwd m0, m4 punpcklwd m3, m7 ; left, right pmaddwd m1, m4 pshufd m2, m3, q1111 pshufd m3, m3, q0000 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pavgw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] add hq, 2 jl .w4_loop RET .w8: %if ARCH_X86_32 lea h_weightsq, [h_weightsq+wq*4] mov t0, tlq mov r1m, tlq mov r2m, hq %define m8 [h_weightsq+16*0] %define m9 [h_weightsq+16*1] %else %if WIN64 movaps r4m, m8 movaps r6m, m9 PUSH r7 PUSH r8 %endif PUSH r9 PUSH r10 %assign regs_used 11 lea h_weightsq, [h_weightsq+wq*8] lea topq, [tlq+wq*2] neg wq mov r8, tlq mov r9, hq %endif punpcklqdq m6, m6 .w8_loop0: %if ARCH_X86_32 movu m5, [t0+2] add t0, 16 mov r0m, t0 %else movu m5, [topq+wq*2+2] mova m8, [h_weightsq+wq*4+16*0] mova m9, [h_weightsq+wq*4+16*1] %endif mov t0, dstq add dstq, 16 punpcklwd m4, m5, m6 punpckhwd m5, m6 .w8_loop: movd m1, 
[v_weightsq+hq*4] sub tlq, 2 movd m3, [tlq] ; left pshufd m1, m1, q0000 pmaddwd m0, m4, m1 pshuflw m3, m3, q0000 pmaddwd m1, m5 punpcklwd m3, m7 ; left, right pmaddwd m2, m8, m3 pmaddwd m3, m9 paddd m0, m2 paddd m1, m3 psrld m0, 8 psrld m1, 8 packssdw m0, m1 pxor m1, m1 pavgw m0, m1 mova [t0], m0 add t0, strideq inc hq jl .w8_loop %if ARCH_X86_32 mov t0, r0m mov tlq, r1m add h_weightsq, 16*2 mov hq, r2m sub dword wm, 8 jg .w8_loop0 %else mov tlq, r8 mov hq, r9 add wq, 8 jl .w8_loop0 %endif %if WIN64 movaps m8, r4m movaps m9, r6m %endif RET %if ARCH_X86_64 cglobal ipred_z1_16bpc, 3, 8, 8, 16*18, dst, stride, tl, w, h, angle, dx %define base r7-$$ %define bdmaxm r8m lea r7, [$$] %else cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride, tl, w, h, angle, dx %define base r1-$$ %define stridemp [rsp+4*0] %define bdmaxm [rsp+4*1] mov r3, r8m mov stridemp, r1 mov bdmaxm, r3 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm add tlq, 2 movsxd wq, [base+ipred_z1_16bpc_ssse3_table+wq*4] mov dxd, angled movddup m0, [base+pw_256] and dxd, 0x7e movddup m7, [base+pw_62] add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_16bpc_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) movd m3, [tlq+14] movu m2, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movd m1, bdmaxm pshufb m3, m0 palignr m4, m3, m2, 4 ; 3 4 5 6 7 8 8 8 paddw m4, [tlq- 2] ; 0 1 2 3 4 5 6 7 add dxd, dxd mova [rsp+32], m3 palignr m3, m2, 2 ; 2 3 4 5 6 7 8 8 pshufb m1, m0 paddw m3, m2 ; -1 * a + 9 * b + 9 * c + -1 * d psubw m5, m3, m4 ; = (b + c - a - d + (b + c) << 3 + 8) >> 4 movd m4, dxd psraw m5, 3 ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1 paddw m3, m5 pxor m5, m5 pmaxsw m3, m5 mov r3d, dxd pavgw m3, m5 pshufb m4, m0 pminsw m3, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 mova m3, [base+z_upsample] movifnidn strideq, stridemp mova [rsp+ 0], m1 paddw m5, m4, m4 mova [rsp+16], m2 punpcklqdq m4, m5 ; xpos0 xpos1 .w4_upsample_loop: lea r2d, [r3+dxq] shr r3d, 6 ; base0 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base1 movu m2, [rsp+r2*2] pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 ; frac psllw m2, 9 ; (a * (64 - frac) + b * frac + 32) >> 6 psubw m1, m0 ; = a + (((b - a) * frac + 32) >> 6) pmulhrsw m1, m2 ; = a + (((b - a) * (frac << 9) + 16384) >> 15) paddw m4, m5 ; xpos += dx paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m1, r3d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movd m3, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 pshuflw m3, m3, q0000 mova [rsp+16*1], m2 lea r2d, [r3+2] movq [rsp+r3*2+18], m3 cmp hd, 8 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w4_main: lea tlq, [tlq+r3*2] movd m4, dxd movddup m1, [base+z_base_inc] ; base_inc << 6 movd m6, [tlq] ; top[max_base_x] shl r3d, 6 movd m3, r3d pshufb m4, m0 mov r5d, dxd ; xpos pshufb m6, m0 sub r5, r3 pshufb m3, m0 paddw 
m5, m4, m4 psubw m3, m1 ; max_base_x punpcklqdq m4, m5 ; xpos0 xpos1 movifnidn strideq, stridemp .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5*2+0] movq m1, [tlq+r5*2+2] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3*2+0] movhps m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop .w4_end_loop: movq [dstq+strideq*0], m6 movq [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 movu m1, [tlq+ 0] ; 1 2 3 4 5 6 7 8 movu m5, [tlq+ 2] ; 2 3 4 5 6 7 8 9 movu m3, [tlq+ 4] ; 3 4 5 6 7 8 9 a paddw m5, m1 paddw m3, [tlq- 2] ; 0 1 2 3 4 5 6 7 psubw m2, m5, m3 movu m6, [tlq+18] ; a b c d e f g _ psraw m2, 3 movu m3, [tlq+20] ; b c d e f g _ _ paddw m5, m2 movu m2, [tlq+16] ; 9 a b c d e f g paddw m6, m2 add dxd, dxd cmp hd, 4 jne .w8_upsample_h8 ; awkward single-pixel edge case pshuflw m3, m3, q1110 ; b c c _ _ _ _ _ .w8_upsample_h8: paddw m3, [tlq+14] ; 8 9 a b c d e f psubw m4, m6, m3 movd m3, bdmaxm psraw m4, 3 mov r3d, dxd paddw m6, m4 pxor m4, m4 pmaxsw m5, m4 pmaxsw m6, m4 pshufb m3, m0 pavgw m5, m4 pavgw m6, m4 movd m4, dxd pminsw m5, m3 pminsw m6, m3 mova m3, [base+z_upsample] pshufb m4, m0 movifnidn strideq, stridemp punpcklwd m0, m1, m5 mova [rsp+ 0], m0 punpckhwd m1, m5 mova [rsp+16], m1 punpcklwd m0, m2, m6 mova [rsp+32], m0 punpckhwd m2, m6 mova [rsp+48], m2 mova m5, m4 .w8_upsample_loop: mov r2d, r3d shr r2d, 6 movu m1, [rsp+r2*2+ 0] movu m2, [rsp+r2*2+16] add r3d, dxd pshufb m1, m3 pshufb m2, m3 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m1, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .w8_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x55555555 movu m3, [tlq+16*1] movd m4, [tlq+r3*2] shr r5d, 30 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 pshuflw m4, m4, q0000 mova [rsp+16*2], m3 lea r2d, [r3+2] movq [rsp+r3*2+18], m4 cmp hd, 16 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w8_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 movifnidn strideq, stridemp .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+0] movu m1, [tlq+r3*2+2] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 psraw m2, m4, 15 ; xpos < max_base_x paddw m4, m5 ; xpos += dx paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [dstq], m0 dec hd jz .w8_end add dstq, strideq add r5, dxq jl .w8_loop .w8_end_loop: mova [dstq], m6 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: %if ARCH_X86_32 %define strideq r3 %endif lea r3d, [hq+15] movd m1, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 
31) test angled, 0x400 jnz .w16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_main ; filter_strength == 0 pshuflw m1, [tlq-2], q0000 movu m2, [tlq+16*0] imul r5d, 0x24924924 movu m3, [tlq+16*1] movu m4, [tlq+16*2] shr r5d, 30 movu m5, [tlq+16*3] movd m6, [tlq+r3*2] adc r5d, -1 ; filter_strength movd [rsp+12], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 pshuflw m6, m6, q0000 mova [rsp+16*3], m4 mova [rsp+16*4], m5 lea r2d, [r3+2] movq [rsp+r3*2+18], m6 cmp hd, 32 cmovae r3d, r2d lea tlq, [rsp+16*1] call .filter_edge .w16_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*0], m0 por m1, m3 mova [dstq+16*1], m1 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq+16*0], m6 mova [dstq+16*1], m6 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main call .filter_copy lea r5d, [r3+2] cmp hd, 64 cmove r3d, r5d call .filter_edge_s3 .w32_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, [tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*2], m0 por m1, m3 mova [dstq+16*3], m1 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main call .filter_copy call .filter_edge_s3 .w64_main: lea tlq, [tlq+r3*2] movd m5, dxd mova m4, [base+z_base_inc] shl r3d, 6 movd m6, [tlq] ; top[max_base_x] movd m1, r3d pshufb m5, m0 mov r5d, dxd ; xpos pshufb m1, m0 sub r5, r3 psubw m4, m1 ; max_base_x pshufb m6, m0 paddw m4, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3*2+ 0] movu m2, [tlq+r3*2+ 2] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+16] paddw m0, m2 movu m2, 
[tlq+r3*2+18] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 psraw m2, m4, 15 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m512] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*0], m0 por m1, m2 mova [dstq+16*1], m1 movu m0, [tlq+r3*2+32] movu m2, [tlq+r3*2+34] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+48] paddw m0, m2 movu m2, [tlq+r3*2+50] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m1024] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m1536] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*2], m0 por m1, m2 mova [dstq+16*3], m1 movu m0, [tlq+r3*2+64] movu m2, [tlq+r3*2+66] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+80] paddw m0, m2 movu m2, [tlq+r3*2+82] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m2048] pcmpgtw m2, m4 pand m0, m2 pandn m2, m6 por m0, m2 movddup m2, [base+pw_m2560] pcmpgtw m2, m4 pand m1, m2 pandn m2, m6 mova [dstq+16*4], m0 por m1, m2 mova [dstq+16*5], m1 movu m0, [tlq+r3*2+96] movu m2, [tlq+r3*2+98] psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r3*2+112] paddw m0, m2 movu m2, [tlq+r3*2+114] psubw m2, m1 pmulhrsw m2, m3 paddw m1, m2 movddup m2, [base+pw_m3072] movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [dstq+16*6], m0 por m1, m3 mova [dstq+16*7], m1 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: REPX {mova [dstq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_copy: pshuflw m2, [tlq-2], q0000 pshuflw m3, [tlq+r3*2], q0000 xor r5d, r5d movd [rsp+gprsize+12], m2 .filter_copy_loop: movu m1, [tlq+r5*2+16*0] movu m2, [tlq+r5*2+16*1] add r5d, 16 mova [rsp+r5*2+gprsize-16*1], m1 mova [rsp+r5*2+gprsize-16*0], m2 cmp r5d, r3d jle .filter_copy_loop lea tlq, [rsp+gprsize+16*1] movq [tlq+r3*2+2], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-2] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-2] mova [tlq+r5*2-16], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2] movu m3, [tlq+r5*2+2] paddw m2, m3 pmullw m2, m5 add r5d, 8 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r3d jl .filter_edge_loop mova [tlq+r5*2-16], m1 ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-2] movu m3, [tlq-4] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-2] movu m3, [tlq+r5*2-4] mova [tlq+r5*2-16], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2+0] paddw m3, m5 movu m1, [tlq+r5*2+2] movu m4, [tlq+r5*2+4] add r5d, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r3d jl .filter_edge_s3_loop mova [tlq+r5*2-16], m1 ret %if ARCH_X86_64 cglobal ipred_z2_16bpc, 4, 12, 11, 16*24, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m %define bdmaxm r8m lea r7, [$$] mov hd, hm movddup m8, [base+pw_62] lea r9d, [wq-4] shl r9d, 6 mova m9, [base+z2_top_shufA] or r9d, hd mova m10, [base+z2_left_shufA] %else cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w, h, angle, dx %define base r1-$$ %define r9b byte [rsp+16*26+4*0] %define r9d dword [rsp+16*26+4*0] %define r10d dword [rsp+16*26+4*1] %define r11d dword [rsp+16*26+4*2] %define maxwm [rsp+16*2+4*0] %define maxhm [rsp+16*2+4*1] %define bdmaxm [rsp+16*2+4*2] %define stridemp [rsp+16*26+4*3] %define strideq r3 %define dyd r4 
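; NOTE: on x86-32 there are not enough general-purpose registers to keep
; dx/dy, the stride and the maxw/maxh/bdmax clamp values live, so they are
; shadowed in the stack slots defined here; the 64-bit build keeps the
; equivalents in spare GPRs (r9-r11) and register/stack arguments instead.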
%define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov r5d, r8m mov maxwm, r1d mov maxhm, r4d mov bdmaxm, r5d LEA r1, $$ lea hd, [wq-4] mova m0, [base+z2_top_shufA] shl hd, 6 mova m1, [base+z2_left_shufA] or hd, hm mova [rsp+16*24], m0 mov r9d, hd mova [rsp+16*25], m1 %endif tzcnt wd, wd movifnidn angled, anglem mova m0, [tlq-16*8] mova m1, [tlq-16*7] mova m2, [tlq-16*6] mova m3, [tlq-16*5] movsxd wq, [base+ipred_z2_16bpc_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif mova m4, [tlq-16*4] mova m5, [tlq-16*3] mova m6, [tlq-16*2] mova m7, [tlq-16*1] mova [rsp+16* 5], m0 xor angled, 0x400 mova [rsp+16* 6], m1 mov dyd, dxd mova [rsp+16* 7], m2 neg dxq mova [rsp+16* 8], m3 and dyd, ~1 mova [rsp+16* 9], m4 and dxq, ~1 mova [rsp+16*10], m5 lea wq, [base+ipred_z2_16bpc_ssse3_table+wq] mova [rsp+16*11], m6 pxor m3, m3 mova [rsp+16*12], m7 movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle movddup m0, [base+pw_256] ; 4<<6 movd m4, [tlq] movu m5, [tlq+16*0+2] movu m6, [tlq+16*1+2] movsldup m1, [base+z2_dy_offset] pshufb m4, m0 movq m7, [base+z_base_inc+2] mov r11d, (112-4)<<6 mova [rsp+16*13], m4 neg dxd mova [rsp+16*14], m5 or dyd, 4<<16 mova [rsp+16*15], m6 %if ARCH_X86_64 lea r10d, [dxq+(112<<6)] ; xpos %else mov [rsp+8*3], dyd lea r4d, [dxq+(112<<6)] mov r10d, r4d movzx hd, r9b %endif movq [rsp+8*0], m1 movq [rsp+8*1], m0 movq [rsp+8*2], m7 jmp wq .w4: test angled, 0x400 jnz .w4_main lea r3d, [hq+2] add angled, 1022 pshuflw m1, m5, q3333 shl r3d, 6 movq [rsp+16*14+8], m1 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, [base+z_filt_wh4] pand m7, m2 pcmpgtb m7, [base+z_filt_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 paddw m2, m5, [tlq] movu m1, [rsp+gprsize+16*14+2] movu m4, [rsp+gprsize+16*14-4] %if ARCH_X86_64 movd m6, r9m ; bdmax, offset due to call %else movd m6, [rsp+gprsize+16*2+4*2] %endif paddw m4, m1 psubw m1, m2, m4 pshufb m6, m0 psraw m1, 3 paddw m2, m1 add dxd, dxd pmaxsw m2, m3 paddw m7, m7 pavgw m2, m3 pminsw m2, m6 %if ARCH_X86_64 mova m9, [base+z2_top_shufB] lea r10d, [dxq+(113<<6)] mov r11d, (112-7)<<6 %else mova m1, [base+z2_top_shufB] lea r3d, [dxq+(113<<6)] mov dword [rsp+gprsize+16*26+4*2], (112-7)<<6 mov [rsp+gprsize+16*26+4*1], r3d mova [rsp+gprsize+16*24], m1 %endif punpcklwd m1, m2, m5 punpckhwd m2, m5 movq [rsp+gprsize+8*2], m7 mova [rsp+gprsize+16*14], m1 mova [rsp+gprsize+16*15], m2 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp+16*4], angled sub angled, 1112 ; angle - 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 mova m4, [base+z_filt_wh4] movd m7, r3d mova m5, [base+z_filt_t_w48+angleq*8] mov r3d, 4 call .w8_filter_top mov angled, [rsp+16*4] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 mova m2, [tlq-16] lea r3d, [hq-4] movu m3, [tlq-14] movu m4, [rsp+16*12+4] pshufb m1, m2, [base+z2_upsample_l+r3*4] movd m6, bdmaxm pxor m5, m5 paddw m3, m2 paddw m4, m1 psubw m1, m3, m4 movshdup m4, [base+z2_dy_offset] psraw m1, 3 pshufb m6, m0 paddw m3, m1 pmaxsw m3, m5 pavgw m3, m5 pminsw m3, m6 %if ARCH_X86_64 mova m10, [base+z2_left_shufB] add dyd, dyd 
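; Upsampling doubles the resolution of the left edge, so the y step (dy) is
; doubled to match (the 32-bit path below shifts the spilled copy instead).
; The interpolated samples computed above use the same 4-tap filter as the
; top edge: clip((-a + 9*b + 9*c - d + 8) >> 4), evaluated here as
; ((b + c) + ((b + c - a - d) >> 3) + 1) >> 1 to stay within 16-bit words.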
%else mova m1, [base+z2_left_shufB] shl dword [rsp+8*3], 1 mova [rsp+16*25], m1 %endif punpckhwd m1, m2, m3 punpcklwd m2, m3 movq [rsp+8*0], m4 mova [rsp+16*12], m1 mova [rsp+16*11], m2 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+8*3] %endif pshufb m6, m0 movddup m0, [rsp+8*2] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+8*0] movq [rsp+8*3], m3 movq [rsp+8*5], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*4], m6 movq [rsp+8*4], m4 %if ARCH_X86_64 pand m0, m8, m4 %else movq m0, [base+pw_62] pand m0, m4 %endif psraw m4, 6 psllw m0, 9 ; frac_y << 9 movq [rsp+8*7], m0 pabsw m4, m4 movq [rsp+8*6], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu m2, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu m1, [rsp+r3*2] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu m3, [rsp+r2*2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movu m4, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m9}, m2, m1, m3, m4 %else mova m0, [rsp+16*24] REPX {pshufb x, m0}, m2, m1, m3, m4 %endif punpcklqdq m0, m2, m1 punpckhqdq m2, m1 punpcklqdq m1, m3, m4 punpckhqdq m3, m4 %if ARCH_X86_64 pand m5, m8, m6 %else movddup m5, [base+pw_62] pand m5, m6 %endif psllw m5, 9 psubw m2, m0 pmulhrsw m2, m5 paddw m5, m6, m7 psubw m3, m1 paddw m0, m2 %if ARCH_X86_64 pand m2, m8, m5 %else movddup m2, [base+pw_62] pand m2, m5 %endif psllw m2, 9 pmulhrsw m3, m2 paddw m1, m3 cmp r3d, 111 ; topleft jge .w4_toponly mova [rsp+16*22], m0 mova [rsp+16*23], m1 movzx r3d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r3*2] movzx r3d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r3*2] movzx r3d, byte [rsp+8*6+4] ; base_y2 movu m4, [rsp+r3*2] movzx r3d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r3*2] %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m4, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m4, m0 %endif punpcklwd m1, m3, m2 punpckhwd m3, m2 ; 01 punpcklwd m2, m4, m0 punpckhwd m4, m0 ; 23 punpckldq m0, m1, m2 ; y0 d1 punpckhdq m1, m2 ; y2 y3 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 psraw m6, 15 ; base_x < topleft psraw m4, m5, 15 paddw m0, m2 paddw m1, m3 pand m0, m6 pandn m6, [rsp+16*22] pand m1, m4 pandn m4, [rsp+16*23] por m0, m6 por m1, m4 .w4_toponly: movifnidn strideq, stridemp movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jz .w4_end movq m4, [rsp+8*6] paddsw m6, m5, m7 ; xpos += dx movq m5, [rsp+8*3] psubw m4, m5 lea dstq, [dstq+strideq*2] movq [rsp+8*6], m4 cmp r2d, r11d jge .w4_loop .w4_leftonly_loop: movzx r2d, byte [rsp+8*6+0] ; base_y0 movu m3, [rsp+r2*2] movzx r2d, byte [rsp+8*6+2] ; base_y1 movu m2, [rsp+r2*2] movzx r2d, byte [rsp+8*6+4] ; base_y2 movu m6, [rsp+r2*2] movzx r2d, byte [rsp+8*6+6] ; base_y3 movu m0, [rsp+r2*2] psubw m4, m5 %if ARCH_X86_64 REPX {pshufb x, m10}, m3, m2, m6, m0 %else mova m1, [rsp+16*25] REPX {pshufb x, m1}, m3, m2, m6, m0 %endif movq [rsp+8*6], m4 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m6, m0 punpckhwd m6, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m6 punpckhdq m3, m6 movddup m6, [rsp+8*7] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m6 pmulhrsw m3, m6 paddw m0, m2 paddw m1, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, 
[dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*5] add r5, 8 mov dstq, r5 paddw m4, [rsp+8*4] ; base_y += 4*dy movzx r2d, word [rsp+8*1] movddup m6, [rsp+8*1] paddw m6, [rsp+16*4] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET .w8: test angled, 0x400 jnz .w4_main lea r3d, [angleq+126] pshufhw m1, m5, q3333 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movhps [rsp+16*15], m1 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filt_wh8] movd m2, r3d movd m7, angled shr angled, 8 ; is_sm << 1 psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pshufb m2, m3 pshufb m7, m3 pcmpeqb m2, m1 movq m1, [base+pw_512] pand m7, m2 pcmpgtb m7, m4 movq [rsp+8*1], m1 ; 8<<6 jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp+16*4], angled sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m4, [base+z_filt_wh8] movd m7, r3d psrldq m5, [base+z_filt_t_w48+angleq*8], 4 mov r3d, 8 call .w8_filter_top mov r3d, [rsp+16*4] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x55555555 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 ; filter_strength lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w8_filter_top: REPX {pshufb x, m3}, m2, m1, m7 pcmpeqb m2, m4 pand m1, m2 pand m7, m2 pcmpgtb m1, m5 pcmpgtb m7, m5 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 mov [dstq], tlq lea tlq, [rsp+16*14+gprsize] shr r5d, 30 ; filter_strength call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+16*2+4*1] %endif mov tlq, [dstq] cmp r3d, 8 jge .w8_filter_top_end movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14+gprsize], m1 movu [rsp+r3*2+16*15+gprsize], m2 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m2, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m7, r3d REPX {pshufb x, m3}, m2, m1, m7 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m2, [base+z_filt_wh16] pand m1, m2 pand m7, m2 pcmpgtb m1, m4 pcmpgtb m7, m4 pmovmskb r5d, m1 test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufhw m6, m6, q3333 mov [dstq], tlq lea tlq, [rsp+16*14] shr r5d, 30 movhps [tlq+16*2], m6 adc r5d, -1 ; filter_strength mov r3d, 16 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge mov r3d, maxwm mov tlq, [dstq] cmp r3d, 16 jge .w16_filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 .w16_filter_left: pmovmskb r5d, m7 test r5d, r5d jz .w4_main imul r5d, 0x24924924 neg hq mov r3, tlq movd m1, [tlq+hq*2] shr r5d, 30 lea tlq, [rsp+16*13-2] pshuflw m1, m1, q0000 adc r5d, -1 ; filter_strength movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge jmp .filter_left_end .w32: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] mova [rsp+16*16], m1 mova [rsp+16*17], m2 test angled, 0x400 jnz .w4_main mov [dstq], 
tlq lea tlq, [rsp+16*14] pshufhw m2, m2, q3333 mov r3d, 32 movhps [tlq+16*4], m2 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 .filter_left: neg hq mov r3, tlq pshuflw m1, [tlq+hq*2], q0000 lea tlq, [rsp+16*13-2] movq [tlq+hq*2-6], m1 call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge_s3 .filter_left_end: mov r2d, maxhm cmp r2d, hd jge .w4_main neg r2 movu m1, [r3+r2*2-16*1] movu m2, [r3+r2*2-16*2] movu [rsp+r2*2+16*12], m1 movu [rsp+r2*2+16*11], m2 cmp r2d, -48 jle .w4_main movu m1, [r3+r2*2-16*3] movu m2, [r3+r2*2-16*4] movu [rsp+r2*2+16*10], m1 movu [rsp+r2*2+16* 9], m2 cmp r2d, -32 jle .w4_main movu m1, [r3+r2*2-16*5] movu m2, [r3+r2*2-16*6] movu [rsp+r2*2+16* 8], m1 movu [rsp+r2*2+16* 7], m2 cmp r2d, -16 jle .w4_main movu m1, [r3+r2*2-16*7] movu m2, [r3+r2*2-16*8] movu [rsp+r2*2+16* 6], m1 movu [rsp+r2*2+16* 5], m2 jmp .w4_main .w64: movu m1, [tlq+16*2+2] movu m2, [tlq+16*3+2] movu m3, [tlq+16*4+2] movu m4, [tlq+16*5+2] movu m5, [tlq+16*6+2] movu m6, [tlq+16*7+2] mov [dstq], tlq lea tlq, [rsp+16*14] mova [tlq+16*2], m1 mova [tlq+16*3], m2 mova [tlq+16*4], m3 mova [tlq+16*5], m4 mova [tlq+16*6], m5 mova [tlq+16*7], m6 test angled, 0x400 jnz .w4_main pshufhw m6, m6, q3333 mov r3d, 64 movhps [tlq+16*8], m6 call mangle(private_prefix %+ _ipred_z1_16bpc_ssse3).filter_edge_s3 mov r3d, maxwm mov tlq, [dstq] cmp r3d, 64 jge .filter_left movu m1, [tlq+r3*2+16*0+2] movu m2, [tlq+r3*2+16*1+2] movu [rsp+r3*2+16*14], m1 movu [rsp+r3*2+16*15], m2 cmp r3d, 48 jge .filter_left movu m1, [tlq+r3*2+16*2+2] movu m2, [tlq+r3*2+16*3+2] movu [rsp+r3*2+16*16], m1 movu [rsp+r3*2+16*17], m2 cmp r3d, 32 jge .filter_left movu m1, [tlq+r3*2+16*4+2] movu m2, [tlq+r3*2+16*5+2] movu [rsp+r3*2+16*18], m1 movu [rsp+r3*2+16*19], m2 cmp r3d, 16 jge .filter_left movu m1, [tlq+r3*2+16*6+2] movu m2, [tlq+r3*2+16*7+2] movu [rsp+r3*2+16*20], m1 movu [rsp+r3*2+16*21], m2 jmp .filter_left %if ARCH_X86_64 cglobal ipred_z3_16bpc, 4, 9, 8, 16*18, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mov org_wd, wd %else cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define org_wd r5 %define org_wq r5 movd m6, r8m ; pixel_max mov [dstq+4*0], strideq LEA r1, $$ mov [dstq+4*1], wd %endif tzcnt hd, hm movifnidn angled, anglem sub tlq, 2 movsxd hq, [base+ipred_z3_16bpc_ssse3_table+hq*4] sub angled, 180 movddup m0, [base+pw_256] mov dyd, angled neg dyd xor angled, 0x400 movddup m7, [base+pw_62] or dyq, ~0x7e lea hq, [base+ipred_z3_16bpc_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) mova m2, [tlq-14] ; 7 6 5 4 3 2 1 0 movu m3, [tlq-12] ; 8 7 6 5 4 3 2 1 %if ARCH_X86_64 movd m6, r8m %endif pshufb m4, m2, m0 mov tlq, rsp palignr m1, m2, m4, 14 ; 8 8 7 6 5 4 3 2 add dyd, dyd palignr m5, m2, m4, 12 ; 8 8 8 7 6 5 4 3 paddw m1, m2 paddw m3, m5 psubw m5, m1, m3 mova m3, [base+z_upsample] mova [tlq+ 0], m4 movd m4, dyd psraw m5, 3 neg dyd paddw m1, m5 pxor m5, m5 lea r5d, [dyq+(16<<6)+63] ; ypos pmaxsw m1, m5 pshufb m6, m0 shl wd, 3 
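; ipred_z3 is essentially a transposed ipred_z1: the prediction is generated
; along the left edge into a scratch buffer allocated on the stack, and
; .end_transpose at the bottom of the function rotates it into dst.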
pavgw m1, m5 pshufb m4, m0 pminsw m1, m6 sub rsp, wq punpckhwd m0, m1, m2 paddw m5, m4, m4 punpcklwd m1, m2 mova [tlq+32], m0 movsd m4, m5 mova [tlq+16], m1 .h4_upsample_loop: lea r4d, [r5+dyq] sar r5d, 6 movu m2, [tlq+r5*2] lea r5d, [r4+dyq] sar r4d, 6 movu m1, [tlq+r4*2] pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h4_upsample_loop or r3d, 4*2 jmp .end_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m1, r4d movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 pcmpeqb m1, [base+z_filt_wh4] pand m1, m3 pcmpgtb m1, [base+z_filt_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-14] neg r4 movd m3, [tlq+r4*2] shr r5d, 30 movd [rsp+16*17], m1 pshuflw m3, m3, q0000 mova [rsp+16*16], m2 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m3 cmp wd, 8 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h4_main: movd m4, dyd sub tlq, r4 movddup m1, [base+z_base_inc_z2+8] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] ; ypos pshufb m3, m0 shl wd, 3 paddw m5, m4, m4 sub rsp, wq psubw m3, m1 ; max_base_y movsd m4, m5 ; ypos1 ypos0 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movddup m0, [tlq+r5*2-6] movddup m1, [tlq+r5*2-8] lea r5, [r4+dyq] sar r4, 6 movlps m0, [tlq+r4*2-6] movlps m1, [tlq+r4*2-8] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 16 jz .h4_transpose test r5d, r5d jg .h4_loop .h4_end_loop: mova [rsp+wq-16], m6 sub wd, 16 jg .h4_end_loop .h4_transpose: or r3d, 4*2 jmp .end_transpose .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m2, [tlq-30] ; g f e d c b a 9 movu m1, [tlq-32] ; _ g f e d c b a movu m3, [tlq-16] ; 9 8 7 6 5 4 3 2 paddw m3, [tlq-14] ; 8 7 6 5 4 3 2 1 pshufd m4, m2, q2100 ; _ _ g f e d c b paddw m1, m2 movu m5, [tlq-28] ; f e d c b a 9 8 add dyd, dyd cmp wd, 8 je .h8_upsample_w8 pshufhw m4, m2, q1000 ; _ _ _ _ c c c b .h8_upsample_w8: paddw m4, m5 psubw m5, m1, m4 movu m4, [tlq-18] ; a 9 8 7 6 5 4 3 psraw m5, 3 paddw m1, m5 movu m5, [tlq-12] ; 7 6 5 4 3 2 1 0 %if ARCH_X86_64 movd m6, r8m ; pixel_max %endif paddw m4, m5 shl wd, 4 psubw m5, m3, m4 movd m4, dyd psraw m5, 3 neg dyd paddw m3, m5 pshufb m6, m0 mova m5, [tlq-14] pshufb m4, m0 pxor m0, m0 pmaxsw m1, m0 pmaxsw m3, m0 mov tlq, rsp pavgw m1, m0 pavgw m3, m0 sub rsp, wq pminsw m1, m6 pminsw m6, m3 mova m3, [base+z_upsample] lea r5d, [dyq+(16<<6)+63] ; ypos punpcklwd m0, m1, m2 mova [tlq+16*0], m0 punpckhwd m1, m2 mova [tlq+16*1], m1 punpcklwd m0, m6, m5 mova [tlq+16*2], m0 punpckhwd m6, m5 mova [tlq+16*3], m6 mova m5, m4 .h8_upsample_loop: mov r4d, r5d sar r4d, 6 movu m1, [tlq+r4*2+16*0] movu m2, [tlq+r4*2+16*1] add r5d, dyd pshufb m2, m3 pshufb m1, m3 punpckhqdq m0, m1, m2 punpcklqdq m1, m2 pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 paddw m4, m5 paddw m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jg .h8_upsample_loop or r3d, 8*2 jmp .end_transpose .h8_no_upsample: lea r4d, [wq+7] movd m1, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m3, angled shr angled, 8 ; is_sm 
<< 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movu m2, [base+z_filt_wh8] psrldq m4, [base+z_filt_t_w48+angleq*8], 4 pcmpeqb m2, m1 pand m2, m3 pcmpgtb m2, m4 pmovmskb r5d, m2 test r5d, r5d jz .h8_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 imul r5d, 0x55555555 mova m2, [tlq-16*1+2] neg r4 mova m3, [tlq-16*2+2] shr r5d, 30 movd m4, [tlq+r4*2] movd [rsp+16*17], m1 mova [rsp+16*16], m2 pshuflw m4, m4, q0000 mova [rsp+16*15], m3 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m4 cmp wd, 16 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h8_main: sub tlq, r4 movd m4, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m4, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 4 mova m5, m4 sub rsp, wq psubw m3, [base+z_base_inc_z2] .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m1, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m1, m0 pmulhrsw m1, m2 pcmpgtw m2, m3, m4 paddw m4, m5 paddw m0, m1 pand m0, m2 pandn m2, m6 por m0, m2 mova [rsp+wq-16], m0 sub wd, 8*2 jz .h8_transpose add r5, dyq jg .h8_loop .h8_end_loop: mova [rsp+wq-16], m6 sub wd, 8*2 jg .h8_end_loop .h8_transpose: or r3d, 8*2 jmp .end_transpose .h16: lea r4d, [wq+15] movd m1, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m3, angled shr angled, 8 ; is_sm << 1 pxor m2, m2 pshufb m1, m2 pshufb m3, m2 movq m4, [base+z_filt_t_w16+angleq*4] pcmpeqb m1, [base+z_filt_wh16] pand m1, m3 pcmpgtb m1, m4 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 pshuflw m1, [tlq+2], q0000 mova m2, [tlq-16*1+2] imul r5d, 0x24924924 mova m3, [tlq-16*2+2] neg r4 mova m4, [tlq-16*3+2] shr r5d, 30 mova m5, [tlq-16*4+2] movd m6, [tlq+r4*2] adc r5d, -1 ; filter_strength movd [rsp+16*17], m1 mova [rsp+16*16], m2 mova [rsp+16*15], m3 pshuflw m6, m6, q0000 mova [rsp+16*14], m4 mova [rsp+16*13], m5 lea r2, [r4-2] movq [rsp+16*17+r4*2-10], m6 cmp wd, 32 cmovae r4, r2 lea tlq, [rsp+16*17-2] call .filter_edge .h16_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 shl wd, 5 paddw m4, m5, [base+z_base_inc_z2] sub rsp, wq psubw m4, m3 .h16_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m2, [tlq+r4*2-16] pand m3, m7, m4 psllw m3, 9 psubw m2, m0 pmulhrsw m2, m3 movu m1, [tlq+r4*2-30] paddw m0, m2 movu m2, [tlq+r4*2-32] psubw m2, m1 pmulhrsw m2, m3 movddup m3, [base+pw_m512] paddw m1, m2 psraw m2, m4, 15 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+wq-16*1], m0 por m1, m3 mova [rsp+wq-16*2], m1 sub wd, 16*2 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16*1], m6 mova [rsp+wq-16*2], m6 sub wd, 16*2 jg .h16_end_loop .h16_transpose: or r3d, 16*2 jmp .end_transpose .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main call .filter_copy lea r5, [r4-2] cmp wd, 64 cmove r4, r5 call .filter_edge_s3 .h32_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h32_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2-14] movu m3, [tlq+r4*2-16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-30] paddw m0, m3 movu m3, [tlq+r4*2-32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*4 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] 
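; Positions are tracked in 6-bit fixed point, so one 8-word register spans
; 8*64 = 512 units; the pw_m512/pw_m1024/... constants shift the
; "past max_base" test for each successive 8-pixel chunk, and chunks beyond
; the edge are replaced with the final edge sample held in m6.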
pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], m1 movu m0, [tlq+r4*2-46] movu m3, [tlq+r4*2-48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-62] paddw m0, m3 movu m3, [tlq+r4*2-64] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m1024] paddw m1, m3 movddup m3, [base+pw_m1536] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 16*4 REPX {mova [rsp+16*x], m6}, 3, 2, 1, 0 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32*2 jmp .end_transpose .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main call .filter_copy call .filter_edge_s3 .h64_main: sub tlq, r4 movd m5, dyd sub tlq, r4 shl r4d, 6 movd m6, [tlq] movd m3, r4d pshufb m5, m0 neg dyq pshufb m6, m0 lea r5, [dyq+r4+63] pshufb m3, m0 paddw m4, m5, [base+z_base_inc_z2] psubw m4, m3 .h64_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4*2- 14] movu m3, [tlq+r4*2- 16] pand m2, m7, m4 psllw m2, 9 psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 30] paddw m0, m3 movu m3, [tlq+r4*2- 32] psubw m3, m1 pmulhrsw m3, m2 sub rsp, 16*8 paddw m1, m3 psraw m3, m4, 15 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m512] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*7], m0 por m1, m3 mova [rsp+16*6], m1 movu m0, [tlq+r4*2- 46] movu m3, [tlq+r4*2- 48] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 62] paddw m0, m3 movu m3, [tlq+r4*2- 64] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m1024] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m1536] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*5], m0 por m1, m3 mova [rsp+16*4], m1 movu m0, [tlq+r4*2- 78] movu m3, [tlq+r4*2- 80] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2- 94] paddw m0, m3 movu m3, [tlq+r4*2- 96] psubw m3, m1 pmulhrsw m3, m2 paddw m1, m3 movddup m3, [base+pw_m2048] pcmpgtw m3, m4 pand m0, m3 pandn m3, m6 por m0, m3 movddup m3, [base+pw_m2560] pcmpgtw m3, m4 pand m1, m3 pandn m3, m6 mova [rsp+16*3], m0 por m1, m3 mova [rsp+16*2], m1 movu m0, [tlq+r4*2-110] movu m3, [tlq+r4*2-112] psubw m3, m0 pmulhrsw m3, m2 movu m1, [tlq+r4*2-126] paddw m0, m3 movu m3, [tlq+r4*2-128] psubw m3, m1 pmulhrsw m3, m2 movddup m2, [base+pw_m3072] paddw m1, m3 movddup m3, [base+pw_m3584] pcmpgtw m2, m4 pcmpgtw m3, m4 paddw m4, m5 pand m0, m2 pandn m2, m6 pand m1, m3 pandn m3, m6 por m0, m2 mova [rsp+16*1], m0 por m1, m3 mova [rsp+16*0], m1 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 16*8 REPX {mova [rsp+16*x], m6}, 7, 6, 5, 4, 3, 2, 1, 0 dec wd jg .h64_end_loop .h64_transpose: add r3d, 64*2 .end_transpose: %if ARCH_X86_64 lea r7, [strideq*3] %else mov strideq, [dstq+4*0] mov org_wd, [dstq+4*1] %endif lea r4d, [r3*3] .end_transpose_loop: lea r2, [rsp+r3-8] lea r6, [dstq+org_wq*2-8] .end_transpose_loop_y: movq m0, [r2+r4 ] movq m1, [r2+r3*2] movq m2, [r2+r3*1] movq m3, [r2+r3*0] sub r2, 8 punpcklwd m0, m1 punpcklwd m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 movhps [r6+strideq*0], m1 movq [r6+strideq*1], m1 %if ARCH_X86_64 movhps [r6+strideq*2], m0 movq [r6+r7 ], m0 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m0 movq [r6+strideq*1], m0 lea r6, [r6+strideq*2] %endif cmp r2, rsp jae .end_transpose_loop_y lea rsp, [rsp+r3*4] sub org_wd, 4 jg .end_transpose_loop RET .filter_copy: neg r4 pshuflw m2, [tlq+2], q0000 xor r5d, r5d pshuflw m3, [tlq+r4*2], 
q0000 movq [rsp+gprsize+16*17], m2 .filter_copy_loop: mova m1, [tlq+r5*2-16*1+2] mova m2, [tlq+r5*2-16*2+2] sub r5, 16 mova [rsp+r5*2+gprsize+16*18], m1 mova [rsp+r5*2+gprsize+16*17], m2 cmp r5d, r4d jg .filter_copy_loop lea tlq, [rsp+gprsize+16*17-2] movq [tlq+r4*2-8], m3 ret .filter_edge: cmp r5d, 3 je .filter_edge_s3 movddup m4, [base+z_filt_k+r5*8-8] movddup m5, [base+z_filt_k+r5*8+8] xor r5d, r5d movddup m6, [base+pw_8] movu m2, [tlq-12] jmp .filter_edge_start .filter_edge_loop: movu m2, [tlq+r5*2-12] mova [tlq+r5*2+2], m1 .filter_edge_start: pmullw m1, m4, [tlq+r5*2-14] movu m3, [tlq+r5*2-16] sub r5, 8 paddw m2, m3 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 4 cmp r5d, r4d jg .filter_edge_loop mova [tlq+r5*2+2], m1 neg r4d ret .filter_edge_s3: movddup m5, [base+pw_3] xor r5d, r5d movu m2, [tlq-12] movu m3, [tlq-10] jmp .filter_edge_s3_start .filter_edge_s3_loop: movu m2, [tlq+r5*2-12] movu m3, [tlq+r5*2-10] mova [tlq+r5*2+2], m1 .filter_edge_s3_start: paddw m2, [tlq+r5*2-14] paddw m3, m5 movu m1, [tlq+r5*2-16] movu m4, [tlq+r5*2-18] sub r5, 8 paddw m1, m2 pavgw m3, m4 paddw m1, m3 psrlw m1, 2 cmp r5d, r4d jg .filter_edge_s3_loop mova [tlq+r5*2+2], m1 neg r4d ret %if ARCH_X86_64 cglobal ipred_filter_16bpc, 4, 7, 16, dst, stride, tl, w, h, filter %else cglobal ipred_filter_16bpc, 4, 7, 8, -16*8, dst, stride, tl, w, h, filter %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %endif %define base r6-$$ movifnidn hd, hm movd m6, r8m ; bitdepth_max %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif LEA r6, $$ shl filterd, 6 movu m0, [tlq-6] ; __ l1 l0 tl t0 t1 t2 t3 mova m1, [base+filter_intra_taps+filterq+16*0] mova m2, [base+filter_intra_taps+filterq+16*1] mova m3, [base+filter_intra_taps+filterq+16*2] mova m4, [base+filter_intra_taps+filterq+16*3] pxor m5, m5 %if ARCH_X86_64 punpcklbw m8, m5, m1 ; place 8-bit coefficients in the upper punpckhbw m9, m5, m1 ; half of each 16-bit word to avoid punpcklbw m10, m5, m2 ; having to perform sign-extension. 
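; (Keeping each tap in the high byte also scales it by 256, which is why the
; accumulated sums are later shifted right by 11 and rounded with pavgw:
; together that implements the filter's (x + 8) >> 4, clamped to the
; bitdepth range.)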
punpckhbw m11, m5, m2 punpcklbw m12, m5, m3 punpckhbw m13, m5, m3 punpcklbw m14, m5, m4 punpckhbw m15, m5, m4 %else punpcklbw m7, m5, m1 mova m8, m7 punpckhbw m7, m5, m1 mova m9, m7 punpcklbw m7, m5, m2 mova m10, m7 punpckhbw m7, m5, m2 mova m11, m7 punpcklbw m7, m5, m3 mova m12, m7 punpckhbw m7, m5, m3 mova m13, m7 punpcklbw m7, m5, m4 mova m14, m7 punpckhbw m7, m5, m4 mova m15, m7 %endif mova m7, [base+filter_shuf] add hd, hd mov r5, dstq pshuflw m6, m6, q0000 mov r6, tlq punpcklqdq m6, m6 sub tlq, hq .left_loop: pshufb m0, m7 ; tl t0 t1 t2 t3 l0 l1 __ pshufd m1, m0, q0000 pmaddwd m2, m8, m1 pmaddwd m1, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 paddd m2, m3 paddd m1, m4 pshufd m4, m0, q2222 pmaddwd m3, m12, m4 pmaddwd m4, m13 paddd m2, m3 paddd m1, m4 pshufd m3, m0, q3333 pmaddwd m0, m14, m3 pmaddwd m3, m15 paddd m0, m2 paddd m1, m3 psrad m0, 11 ; x >> 3 psrad m1, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 ; (x + 8) >> 4 pminsw m0, m6 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movlps m0, [tlq+hq-10] lea dstq, [dstq+strideq*2] sub hd, 2*2 jg .left_loop sub wd, 4 jz .end sub tld, r6d ; -h*2 sub r6, r5 ; tl-dst .right_loop0: add r5, 8 mov hd, tld movu m0, [r5+r6] ; tl t0 t1 t2 t3 __ __ __ mov dstq, r5 .right_loop: pshufd m2, m0, q0000 pmaddwd m1, m8, m2 pmaddwd m2, m9 pshufd m4, m0, q1111 pmaddwd m3, m10, m4 pmaddwd m4, m11 pinsrw m0, [dstq+strideq*0-2], 5 paddd m1, m3 paddd m2, m4 pshufd m0, m0, q2222 movddup m4, [dstq+strideq*1-8] pmaddwd m3, m12, m0 pmaddwd m0, m13 paddd m1, m3 paddd m0, m2 pshuflw m2, m4, q3333 punpcklwd m2, m5 pmaddwd m3, m14, m2 pmaddwd m2, m15 paddd m1, m3 paddd m0, m2 psrad m1, 11 psrad m0, 11 packssdw m0, m1 pmaxsw m0, m5 pavgw m0, m5 pminsw m0, m6 movhps [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 palignr m0, m4, 14 lea dstq, [dstq+strideq*2] add hd, 2*2 jl .right_loop sub wd, 4 jg .right_loop0 .end: RET %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal ipred_cfl_top_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac LEA t0, ipred_cfl_left_16bpc_ssse3_table movd m4, wd tzcnt wd, wd movifnidn hd, hm add tlq, 2 movsxd r6, [t0+wq*4] movd m5, wd jmp mangle(private_prefix %+ _ipred_cfl_left_16bpc_ssse3.start) cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm LEA t0, ipred_cfl_left_16bpc_ssse3_table tzcnt wd, wm lea r6d, [hq*2] movd m4, hd sub tlq, r6 tzcnt r6d, hd movd m5, r6d movsxd r6, [t0+r6*4] .start: movd m7, r7m movu m0, [tlq] add r6, t0 add t0, ipred_cfl_splat_16bpc_ssse3_table-ipred_cfl_left_16bpc_ssse3_table movsxd wq, [t0+wq*4] pxor m6, m6 pshuflw m7, m7, q0000 pcmpeqw m3, m3 add wq, t0 movifnidn acq, acmp pavgw m4, m6 punpcklqdq m7, m7 jmp r6 .h32: movu m1, [tlq+48] movu m2, [tlq+32] paddw m0, m1 paddw m0, m2 .h16: movu m1, [tlq+16] paddw m0, m1 .h8: pshufd m1, m0, q1032 paddw m0, m1 .h4: pmaddwd m0, m3 psubd m4, m0 pshuflw m0, m4, q1032 paddd m0, m4 psrld m0, m5 pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq %macro IPRED_CFL 2 ; dst, src pabsw m%1, m%2 pmulhrsw m%1, m2 psignw m%2, m1 psignw m%1, m%2 paddw m%1, m0 pmaxsw m%1, m6 pminsw m%1, m7 %endmacro cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm tzcnt r6d, hd lea t0d, [wq+hq] movd m4, t0d tzcnt t0d, t0d movd m5, t0d LEA t0, ipred_cfl_16bpc_ssse3_table tzcnt wd, wd movd m7, r7m movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] psrlw m4, 1 pxor m6, m6 pshuflw m7, m7, q0000 add r6, t0 add wq, t0 movifnidn acq, acmp pcmpeqw m3, m3 punpcklqdq m7, m7 jmp r6 .h4: movq m0, [tlq-8] jmp wq .w4: movq 
m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 cmp hd, 4 jg .w4_mul psrld m0, 3 jmp .w4_end .w4_mul: mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 16 cmove r6d, r2d movd m1, r6d psrld m0, 2 pmulhuw m0, m1 psrlw m0, 1 .w4_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s4: movd m1, alpham lea r6, [strideq*3] pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 movq [dstq+strideq*0], m3 movhps [dstq+strideq*1], m3 movq [dstq+strideq*2], m4 movhps [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4_loop RET .h8: mova m0, [tlq-16] jmp wq .w8: movu m1, [tlq+2] paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 8 je .w8_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 32 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+strideq*0], m3 mova [dstq+strideq*1], m4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .s8_loop RET .h16: mova m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w16: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 16 je .w16_end mov r6d, 0xAAAB mov r2d, 0x6667 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] add acq, 16*2 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 add dstq, strideq dec hd jg .s16_loop RET .h32: mova m0, [tlq-64] paddw m0, [tlq-48] paddw m0, [tlq-32] paddw m0, [tlq-16] jmp wq .w32: movu m1, [tlq+ 2] movu m2, [tlq+18] paddw m1, m2 movu m2, [tlq+34] paddw m1, m2 movu m2, [tlq+50] paddw m1, m2 paddw m0, m1 pmaddwd m0, m3 psubd m4, m0 pshufd m0, m4, q1032 paddd m0, m4 pshuflw m4, m0, q1032 paddd m0, m4 psrld m0, m5 cmp hd, 32 je .w32_end mov r6d, 0xAAAB mov r2d, 0x6667 cmp hd, 8 cmove r6d, r2d movd m1, r6d pmulhuw m0, m1 psrlw m0, 1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq+16*0] mova m5, [acq+16*1] IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*0], m3 mova [dstq+16*1], m4 mova m4, [acq+16*2] mova m5, [acq+16*3] add acq, 16*4 IPRED_CFL 3, 4 IPRED_CFL 4, 5 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, strideq dec hd jg .s32_loop RET cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac tzcnt wd, wm LEA t0, ipred_cfl_splat_16bpc_ssse3_table mov r6d, r7m movifnidn hd, hm shr r6d, 11 movd m7, r7m movsxd wq, [t0+wq*4] movddup m0, [t0-ipred_cfl_splat_16bpc_ssse3_table+pw_512+r6*8] pshuflw m7, m7, q0000 pxor m6, m6 add wq, t0 movifnidn acq, acmp punpcklqdq m7, m7 jmp wq cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 paddw m5, m5 %else movddup m5, [pw_2] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, 
[ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] pmaddwd m2, m5, [ypxq+strideq*2] pmaddwd m3, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m0, m1 paddd m2, m3 paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq], m0 add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .dc punpckhqdq m0, m0 pslld m2, 2 .w4_hpad: mova [acq+16*0], m0 paddd m4, m2 mova [acq+16*1], m0 add acq, 16*2 sub hpadd, 4 jg .w4_hpad jmp .dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*1+16*0] pmaddwd m1, m5, [ypxq+strideq*0+16*1] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m0, m2 paddd m1, m3 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_loop .w8_hpad: test hpadd, hpadd jz .dc pslld m2, 2 mova m1, m0 jmp .hpad .w8_wpad1: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] paddd m0, m1 pshufd m1, m0, q3333 paddd m2, m0, m1 packssdw m0, m1 paddd m4, m2 mova [acq], m0 add acq, 16 dec hd jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+strideq*0+16*0] pmaddwd m6, m5, [ypxq+strideq*1+16*0] paddd m0, m6 cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+strideq*0+16*1] pmaddwd m6, m5, [ypxq+strideq*1+16*1] paddd m3, m6 je .w16_wpad2 pmaddwd m1, m5, [ypxq+strideq*0+16*2] pmaddwd m6, m5, [ypxq+strideq*1+16*2] paddd m1, m6 jp .w16_wpad1 pmaddwd m2, m5, [ypxq+strideq*0+16*3] pmaddwd m6, m5, [ypxq+strideq*1+16*3] paddd m2, m6 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] paddd m6, m0, m3 packssdw m0, m3 paddd m6, m1 mova [acq+16*0], m0 packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz .dc paddd m2, m2 .hpad: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m0 mova [acq+16*3], m1 add acq, 16*4 sub hpadd, 4 jg .hpad .dc: sub r5, acq ; -w*h*2 pshufd m2, m4, q1032 tzcnt r1d, r5d paddd m2, m4 sub r1d, 2 pshufd m4, m2, q2301 movd m0, r1d paddd m2, m4 psrld m2, m0 pxor m0, m0 pavgw m2, m0 packssdw m2, m2 .dc_loop: mova m0, [acq+r5+16*0] mova m1, [acq+r5+16*1] psubw m0, m2 psubw m1, m2 mova [acq+r5+16*0], m0 mova [acq+r5+16*1], m1 add r5, 16*2 jl .dc_loop RET cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h movifnidn hpadd, hpadm %if ARCH_X86_32 && PIC pcmpeqw m5, m5 pabsw m5, m5 psllw m5, 2 %else movddup m5, [pw_4] %endif mov hd, hm shl hpadd, 2 pxor m4, m4 sub hd, hpadd cmp dword wm, 8 mov r5, acq jg .w16 je .w8 lea r3, [strideq*3] .w4_loop: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m3, m5, [ypxq+strideq*1] pmaddwd m1, m5, [ypxq+strideq*2] pmaddwd m2, m5, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] paddd m4, m0 packssdw m0, m3 paddd m3, m1 packssdw m1, m2 paddd m4, m2 paddd m4, m3 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 pslld m2, 3 mova [acq+16*0], m1 mova [acq+16*1], m1 paddd m4, m2 mova [acq+16*2], m1 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: %if ARCH_X86_32 cmp dword wpadm, 0 %else test wpadd, wpadd %endif jnz .w8_wpad1 .w8_loop: pmaddwd m0, m5, 
[ypxq+strideq*0+16*0] pmaddwd m2, m5, [ypxq+strideq*0+16*1] pmaddwd m1, m5, [ypxq+strideq*1+16*0] pmaddwd m3, m5, [ypxq+strideq*1+16*1] lea ypxq, [ypxq+strideq*2] paddd m4, m0 packssdw m0, m2 paddd m4, m2 mova [acq+16*0], m0 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w8_wpad1: pmaddwd m0, m5, [ypxq+strideq*0] pmaddwd m1, m5, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] pshufd m2, m0, q3333 pshufd m3, m1, q3333 paddd m4, m0 packssdw m0, m2 paddd m4, m2 paddd m2, m1, m3 packssdw m1, m3 paddd m4, m2 mova [acq+16*0], m0 mova [acq+16*1], m1 add acq, 16*2 sub hd, 2 jg .w8_wpad1 jmp .w8_hpad .w16_wpad3: pshufd m3, m0, q3333 mova m1, m3 mova m2, m3 jmp .w16_wpad_end .w16_wpad2: pshufd m1, m3, q3333 mova m2, m1 jmp .w16_wpad_end .w16_wpad1: pshufd m2, m1, q3333 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm WIN64_SPILL_XMM 7 .w16_loop: pmaddwd m0, m5, [ypxq+16*0] cmp wpadd, 2 jg .w16_wpad3 pmaddwd m3, m5, [ypxq+16*1] je .w16_wpad2 pmaddwd m1, m5, [ypxq+16*2] jp .w16_wpad1 pmaddwd m2, m5, [ypxq+16*3] .w16_wpad_end: add ypxq, strideq paddd m6, m0, m3 packssdw m0, m3 mova [acq+16*0], m0 paddd m6, m1 packssdw m1, m2 paddd m2, m6 mova [acq+16*1], m1 add acq, 16*2 paddd m4, m2 dec hd jg .w16_loop WIN64_RESTORE_XMM add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h %define base r6-ipred_cfl_ac_444_16bpc_ssse3_table LEA r6, ipred_cfl_ac_444_16bpc_ssse3_table tzcnt wd, wm movifnidn hpadd, hpadm pxor m4, m4 movsxd wq, [r6+wq*4] movddup m5, [base+pw_1] add wq, r6 mov hd, hm shl hpadd, 2 sub hd, hpadd jmp wq .w4: lea r3, [strideq*3] mov r5, acq .w4_loop: movq m0, [ypxq+strideq*0] movhps m0, [ypxq+strideq*1] movq m1, [ypxq+strideq*2] movhps m1, [ypxq+r3 ] lea ypxq, [ypxq+strideq*4] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 4 jg .w4_loop test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc punpckhqdq m1, m1 mova [acq+16*0], m1 pslld m2, 2 mova [acq+16*1], m1 punpckhqdq m2, m2 mova [acq+16*2], m1 paddd m4, m2 mova [acq+16*3], m1 add acq, 16*4 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w8: mov r5, acq .w8_loop: mova m0, [ypxq+strideq*0] mova m1, [ypxq+strideq*1] lea ypxq, [ypxq+strideq*2] psllw m0, 3 psllw m1, 3 mova [acq+16*0], m0 pmaddwd m0, m5 mova [acq+16*1], m1 pmaddwd m2, m5, m1 add acq, 16*2 paddd m4, m0 paddd m4, m2 sub hd, 2 jg .w8_loop .w8_hpad: test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc pslld m2, 2 mova m0, m1 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w16_wpad2: pshufhw m3, m2, q3333 pshufhw m1, m0, q3333 punpckhqdq m3, m3 punpckhqdq m1, m1 jmp .w16_wpad_end .w16: movifnidn wpadd, wpadm mov r5, acq .w16_loop: mova m2, [ypxq+strideq*0+16*0] mova m0, [ypxq+strideq*1+16*0] psllw m2, 3 psllw m0, 3 test wpadd, wpadd jnz .w16_wpad2 mova m3, [ypxq+strideq*0+16*1] mova m1, [ypxq+strideq*1+16*1] psllw m3, 3 psllw m1, 3 .w16_wpad_end: lea ypxq, [ypxq+strideq*2] mova [acq+16*0], m2 pmaddwd m2, m5 mova [acq+16*1], m3 pmaddwd m3, m5 paddd m4, m2 pmaddwd m2, m5, m0 mova [acq+16*2], m0 paddd m4, m3 
pmaddwd m3, m5, m1 mova [acq+16*3], m1 add acq, 16*4 paddd m2, m3 paddd m4, m2 sub hd, 2 jg .w16_loop add hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc paddd m2, m2 jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).hpad .w32_wpad6: pshufhw m1, m0, q3333 punpckhqdq m1, m1 mova m2, m1 mova m3, m1 jmp .w32_wpad_end .w32_wpad4: pshufhw m2, m1, q3333 punpckhqdq m2, m2 mova m3, m2 jmp .w32_wpad_end .w32_wpad2: pshufhw m3, m2, q3333 punpckhqdq m3, m3 jmp .w32_wpad_end .w32: movifnidn wpadd, wpadm mov r5, acq WIN64_SPILL_XMM 8 .w32_loop: mova m0, [ypxq+16*0] psllw m0, 3 cmp wpadd, 4 jg .w32_wpad6 mova m1, [ypxq+16*1] psllw m1, 3 je .w32_wpad4 mova m2, [ypxq+16*2] psllw m2, 3 jnp .w32_wpad2 mova m3, [ypxq+16*3] psllw m3, 3 .w32_wpad_end: add ypxq, strideq pmaddwd m6, m5, m0 mova [acq+16*0], m0 pmaddwd m7, m5, m1 mova [acq+16*1], m1 paddd m6, m7 pmaddwd m7, m5, m2 mova [acq+16*2], m2 paddd m6, m7 pmaddwd m7, m5, m3 mova [acq+16*3], m3 add acq, 16*4 paddd m6, m7 paddd m4, m6 dec hd jg .w32_loop %if WIN64 mova m5, m6 WIN64_RESTORE_XMM SWAP 5, 6 %endif test hpadd, hpadd jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc .w32_hpad_loop: mova [acq+16*0], m0 mova [acq+16*1], m1 paddd m4, m6 mova [acq+16*2], m2 mova [acq+16*3], m3 add acq, 16*4 dec hpadd jg .w32_hpad_loop jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h %define base r2-pal_pred_16bpc_ssse3_table %if ARCH_X86_32 %define hd r2d %endif mova m3, [palq] LEA r2, pal_pred_16bpc_ssse3_table tzcnt wd, wm pshufb m3, [base+pal_pred_shuf] movsxd wq, [r2+wq*4] pshufd m4, m3, q1032 add wq, r2 movifnidn hd, hm jmp wq .w4: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4 RET .w8: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8 RET .w16: mova m0, [idxq] add idxq, 16 pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16 RET .w32: mova m0, [idxq+16*0] pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova m2, [idxq+16*1] add idxq, 16*2 mova [dstq+16*0], m0 pshufb m0, m3, m2 mova [dstq+16*1], m1 pshufb m1, m4, m2 punpcklbw m2, m0, m1 punpckhbw m0, m1 mova [dstq+16*2], m2 mova [dstq+16*3], m0 add dstq, strideq dec hd jg .w32 RET .w64: mova m0, [idxq+16*0] pshufb m1, m3, m0 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova m2, [idxq+16*1] mova [dstq+16*0], m0 pshufb m0, m3, m2 mova [dstq+16*1], m1 pshufb m1, m4, m2 punpcklbw m2, m0, m1 punpckhbw m0, m1 mova m1, [idxq+16*2] mova [dstq+16*2], m2 pshufb m2, m3, m1 mova [dstq+16*3], m0 pshufb m0, m4, m1 punpcklbw m1, m2, m0 punpckhbw m2, m0 mova m0, [idxq+16*3] add idxq, 16*4 mova [dstq+16*4], m1 pshufb m1, m3, m0 mova [dstq+16*5], m2 pshufb m2, m4, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 mova [dstq+16*6], m0 mova [dstq+16*7], m1 add dstq, strideq dec hd jg .w64 RET rav1e-0.7.1/src/x86/ipred_avx2.asm000064400000000000000000005565671046102023000146600ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro ; sm_weights[], but modified to precalculate x and 256-x with offsets to ; enable efficient use of pmaddubsw (which requires signed values) smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 pb_1to32: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 db 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 pb_32to1: db 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 pb_16to1: db 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15 db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line pb_128: times 4 db 128 ; those are just placed here for alignment. 
pb_36_m4: times 2 db 36, -4 z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 z2_shuf_h2: db 3, 2, 7, 6, 11, 10, 15, 14, 2, 1, 6, 5, 10, 9, 14, 13 z2_shuf_h4: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 z2_base_inc: dw 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64, 8*64 dw 9*64, 10*64, 11*64, 12*64, 13*64, 14*64, 15*64, 16*64 z2_ymul: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 z2_y_shuf_h4: db 90, 90, 90, 90, 14, 14, 14, 14, 27, 27, 27, 27, 31, 31, 31, 31 ; 2, 6, 3, 7 db 32, 32, 32, 32, 12, 12, 12, 12, 1, 0, 1, 0, 5, -1, -1, -1 ; 0, 4, 1, 5 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5 filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1 db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11; 15, -1, 15, -1 pb_127_m127: times 2 db 127, -127 ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15 ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 pw_64: times 2 dw 64 cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 times 9 db 7, -1 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; w=8, w_pad=1 as well as second half of previous one cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5 times 5 db 6, 7 ; w=16,w_pad=2 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 times 8 db 14, 15 ; w=16,w_pad=3 db 0, 1, 2, 3, 4, 5 times 13 db 6, 7 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 %define pb_0to15 cfl_ac_w16_pad_shuffle %define pb_1 (ipred_h_shuf+12) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+ 4) %define pb_4 (ipred_h_shuf+24) %define pb_5 (ipred_h_shuf+ 8) %define pb_7 (ipred_h_shuf+ 0) %define pb_8 (z_upsample2 +12) %define pb_12 (z2_y_shuf_h4+20) %define pb_14 (z2_y_shuf_h4+ 4) %define pb_15 (z_filter_s +32) %define pb_27 (z2_y_shuf_h4+ 8) %define pb_31 (z2_y_shuf_h4+12) %define pb_32 (z2_y_shuf_h4+16) %define pb_90 (z2_y_shuf_h4+ 0) %define pw_1 (z2_y_shuf_h4+24) %define pw_8 (z_filter_k +32) pw_62: times 2 dw 62 pw_128: times 2 dw 128 pw_255: times 2 dw 255 pw_512: times 2 dw 512 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4) %define ipred_cfl_splat_avx2_table (ipred_cfl_avx2_table + 8*4) JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64 
JMP_TABLE ipred_filter, avx2, w4, w8, w16, w32 JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, avx2, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64 JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text INIT_YMM avx2 cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h lea r5, [ipred_dc_left_avx2_table] tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 shrx r6d, r6d, wd movd xm3, r6d movsxd r6, [r5+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov r5d, 0x8000 shrx r5d, r5d, r6d movd xm3, r5d lea r5, [ipred_dc_left_avx2_table] movsxd r6, [r5+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_avx2_table-ipred_dc_left_avx2_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m1, [tlq+32] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h32: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 .h8: psrlq xm1, xm0, 32 paddw xm0, xm1 .h4: pmaddwd xm0, xm2 pmulhrsw xm0, xm3 lea stride3q, [strideq*3] vpbroadcastb m0, xm0 mova m1, m0 jmp wq cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd xm4, r5d tzcnt r5d, r5d movd xm5, r5d lea r5, [ipred_dc_avx2_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] pcmpeqd m3, m3 psrlw xm4, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd xm0, [tlq-4] pmaddubsw xm0, xm3 jmp wq .w4: movd xm1, [tlq+1] pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm1 pmaddwd xm0, xm3 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddw xm0, xm1 shrx r6d, r6d, r2d psrlq xm1, xm0, 32 paddw xm0, xm1 movd xm1, r6d psrlw xm0, 2 pmulhuw xm0, xm1 .w4_end: vpbroadcastb xm0, xm0 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: movq xm0, [tlq-8] pmaddubsw xm0, xm3 jmp wq .w8: movq xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 punpckhqdq xm2, xm0, xm0 paddw xm0, xm2 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w8_end: vpbroadcastb xm0, xm0 .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova xm0, [tlq-16] pmaddubsw xm0, xm3 jmp wq .w16: movu 
xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w16_end: vpbroadcastb xm0, xm0 .s16: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w32_end: vpbroadcastb m0, xm0 .s32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-64] mova m1, [tlq-32] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+33] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m0, m1 paddw m0, m2 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 64 je .w64_end mov r6d, 0x33345556 shrx r6d, r6d, hd movd xm1, r6d pmulhuw xm0, xm1 .w64_end: vpbroadcastb m0, xm0 mova m1, m0 .s64: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m0 mova [dstq+stride3q +32*1], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_avx2_table+pb_128] mova m1, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_avx2_table] tzcnt wd, wm movu m0, [tlq+ 1] movu m1, [tlq+33] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq %macro IPRED_H 2 ; w, store_type vpbroadcastb m0, [tlq-1] vpbroadcastb m1, [tlq-2] vpbroadcastb m2, [tlq-3] sub tlq, 4 vpbroadcastb m3, [tlq+0] mov%2 [dstq+strideq*0], m0 mov%2 [dstq+strideq*1], m1 mov%2 [dstq+strideq*2], m2 mov%2 [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET ALIGN function_align %endmacro INIT_XMM avx2 cglobal ipred_h_8bpc, 3, 6, 4, dst, stride, tl, w, h, stride3 lea r5, [ipred_h_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: IPRED_H 4, d .w8: IPRED_H 8, q .w16: IPRED_H 16, a INIT_YMM avx2 .w32: IPRED_H 32, a .w64: vpbroadcastb m0, [tlq-1] vpbroadcastb m1, [tlq-2] vpbroadcastb m2, [tlq-3] sub tlq, 4 vpbroadcastb m3, [tlq+0] mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m0 mova [dstq+strideq*1+32*0], m1 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*2+32*0], m2 mova [dstq+strideq*2+32*1], m2 mova [dstq+stride3q +32*0], m3 mova [dstq+stride3q +32*1], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64 RET %macro PAETH 2 ; top, ldiff pavgb m1, m%1, m3 ; Calculating tldiff normally requires pxor m0, m%1, m3 ; 
10-bit intermediates, but we can do it pand m0, m4 ; in 8-bit with some tricks which avoids psubusb m2, m5, m1 ; having to unpack everything to 16-bit. psubb m1, m0 psubusb m1, m5 por m1, m2 paddusb m1, m1 por m1, m0 ; min(tldiff, 255) psubusb m2, m5, m3 psubusb m0, m3, m5 por m2, m0 ; tdiff pminub m2, m%2 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff vpblendvb m0, m%1, m3, m0 pminub m1, m2 pcmpeqb m1, m2 ; ldiff <= tldiff || tdiff <= tldiff vpblendvb m0, m5, m0, m1 %endmacro cglobal ipred_paeth_8bpc, 3, 6, 9, dst, stride, tl, w, h %define base r5-ipred_paeth_avx2_table lea r5, [ipred_paeth_avx2_table] tzcnt wd, wm vpbroadcastb m5, [tlq] ; topleft movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m4, [base+pb_1] add wq, r5 jmp wq .w4: vpbroadcastd m6, [tlq+1] ; top mova m8, [base+ipred_h_shuf] lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: sub tlq, 8 vpbroadcastq m3, [tlq] pshufb m3, m8 ; left PAETH 6, 7 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: vpbroadcastq m6, [tlq+1] mova m8, [base+ipred_h_shuf] lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: sub tlq, 4 vpbroadcastd m3, [tlq] pshufb m3, m8 PAETH 6, 7 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m6, [tlq+1] mova xm8, xm4 ; lower half = 1, upper half = 0 psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: sub tlq, 2 vpbroadcastd m3, [tlq] pshufb m3, m8 PAETH 6, 7 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w32_loop: dec tlq vpbroadcastb m3, [tlq] PAETH 6, 7 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: movu m6, [tlq+ 1] movu m7, [tlq+33] %if WIN64 movaps r4m, xmm9 %endif psubusb m8, m5, m6 psubusb m0, m6, m5 psubusb m9, m5, m7 psubusb m1, m7, m5 por m8, m0 por m9, m1 .w64_loop: dec tlq vpbroadcastb m3, [tlq] PAETH 6, 8 mova [dstq+32*0], m0 PAETH 7, 9 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop %if WIN64 movaps xmm9, r4m %endif RET %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ; w * a = (w - 128) * a + 128 * a ; (256 - w) * b = (127 - w) * b + 129 * b pmaddubsw m0, m%3, m%1 pmaddubsw m1, m%4, m%2 paddw m0, m%5 paddw m1, m%6 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro cglobal ipred_smooth_v_8bpc, 3, 7, 0, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_avx2_table lea r6, [ipred_smooth_v_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m0, [base+pb_127_m127] vpbroadcastd m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq vpbroadcastb m5, [tlq+hq] ; bottom add wq, r6 jmp wq .w4: vpbroadcastd m2, [tlq+1] punpcklbw m2, m5 ; top, bottom mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] punpckldq m4, m5, m5 punpckhdq m5, m5 pmaddubsw m3, m2, m0 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; 128 * top + 129 * bottom + 128 .w4_loop: vbroadcasti128 m1, 
[weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 1 pextrd [dstq+r3 ], xm1, 1 cmp hd, -4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm1, 2 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] add hq, 8 jl .w4_loop .ret: RET ALIGN function_align .w8: vpbroadcastq m2, [tlq+1] punpcklbw m2, m5 mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] pshufd m4, m5, q0000 pshufd m5, m5, q1111 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 .w8_loop: vpbroadcastq m1, [weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET ALIGN function_align .w16: WIN64_SPILL_XMM 7 vbroadcasti128 m3, [tlq+1] mova m6, [base+ipred_v_shuf] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w16_loop: vpbroadcastd m1, [weightsq+hq*2] pshufb m1, m6 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w16_loop RET ALIGN function_align .w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 6 movu m3, [tlq+1] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w32_loop: vpbroadcastw m1, [weightsq+hq*2] SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m0 add dstq, strideq inc hq jl .w32_loop RET ALIGN function_align .w64: WIN64_SPILL_XMM 11 movu m4, [tlq+ 1] movu m8, [tlq+33] punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m7, m8, m5 punpckhbw m8, m5 pmaddubsw m5, m3, m0 pmaddubsw m6, m4, m0 pmaddubsw m9, m7, m0 pmaddubsw m10, m8, m0 paddw m2, m1, m3 paddw m5, m2 paddw m2, m1, m4 paddw m6, m2 paddw m0, m1, m7 paddw m9, m0 paddw m1, m8 paddw m10, m1 .w64_loop: vpbroadcastw m2, [weightsq+hq*2] SMOOTH 2, 2, 3, 4, 5, 6 mova [dstq+32*0], m0 SMOOTH 2, 2, 7, 8, 9, 10 mova [dstq+32*1], m0 add dstq, strideq inc hq jl .w64_loop RET %macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used %assign stack_offset 0 %assign stack_size_padded 0 %assign regs_used %2 %xdefine rstk rsp SETUP_STACK_POINTER %1 %if regs_used != %2 && WIN64 PUSH r%2 %endif ALLOC_STACK %1, %3 %endmacro cglobal ipred_smooth_h_8bpc, 3, 7, 0, dst, stride, tl, w, h %define base r6-ipred_smooth_h_avx2_table lea r6, [ipred_smooth_h_avx2_table] mov wd, wm vpbroadcastb m3, [tlq+wq] ; right tzcnt wd, wd mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m4, [base+pb_127_m127] vpbroadcastd m5, [base+pw_128] add wq, r6 jmp wq .w4: WIN64_SPILL_XMM 8 vpbroadcastq m6, [base+smooth_weights+4*2] mova m7, [base+ipred_h_shuf] sub tlq, 8 sub tlq, hq lea r3, [strideq*3] .w4_loop: vpbroadcastq m2, [tlq+hq] pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd 
[dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 vbroadcasti128 m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 lea r3, [strideq*3] sub tlq, hq .w8_loop: vpbroadcastd m2, [tlq+hq] pshufb m2, m7 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m0, m1, m4 paddw m0, m1 pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: SETUP_STACK_FRAME 32*4, 7, 8 lea r3, [rsp+64*2-4] call .prep ; only worthwhile for for w16 and above sub tlq, 2 vpbroadcastd xm6, [base+pb_1] mova xm7, [base+ipred_v_shuf+16] vinserti128 m7, [base+ipred_v_shuf+ 0], 1 vbroadcasti128 m4, [base+smooth_weights+16*2] vbroadcasti128 m5, [base+smooth_weights+16*3] .w16_loop: vpbroadcastd m1, [tlq+hq] vpbroadcastd m2, [r3+hq*2] pshufb m1, m6 punpcklbw m1, m3 pshufb m2, m7 SMOOTH 4, 5, 1, 1, 2, 2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: SETUP_STACK_FRAME 32*4, 7, 6 lea r3, [rsp+64*2-2] call .prep dec tlq mova xm4, [base+smooth_weights+16*4] vinserti128 m4, [base+smooth_weights+16*6], 1 mova xm5, [base+smooth_weights+16*5] vinserti128 m5, [base+smooth_weights+16*7], 1 .w32_loop: vpbroadcastb m1, [tlq+hq] punpcklbw m1, m3 vpbroadcastw m2, [r3+hq*2] SMOOTH 4, 5, 1, 1, 2, 2 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: SETUP_STACK_FRAME 32*4, 7, 9 lea r3, [rsp+64*2-2] call .prep add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table dec tlq mova xm5, [r6-16*7] vinserti128 m5, [r6-16*5], 1 mova xm6, [r6-16*6] vinserti128 m6, [r6-16*4], 1 mova xm7, [r6-16*3] vinserti128 m7, [r6-16*1], 1 mova xm8, [r6-16*2] vinserti128 m8, [r6-16*0], 1 .w64_loop: vpbroadcastb m2, [tlq+hq] punpcklbw m2, m3 vpbroadcastw m4, [r3+hq*2] SMOOTH 5, 6, 2, 2, 4, 4 mova [dstq+32*0], m0 SMOOTH 7, 8, 2, 2, 4, 4 mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop RET ALIGN function_align .prep: vpermq m2, [tlq-32*1], q3120 punpckhbw m1, m2, m3 punpcklbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m1, m5 ; 1 * left + 256 * right + 128 paddw m0, m1 ; 128 * left + 129 * right + 128 pmaddubsw m1, m2, m4 paddw m2, m5 paddw m1, m2 vpermq m2, [tlq-32*2], q3120 mova [rsp+gprsize+32*3], m0 mova [rsp+gprsize+32*2], m1 punpckhbw m1, m2, m3 punpcklbw m2, m3 pmaddubsw m0, m1, m4 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m2, m5 paddw m1, m2 mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*0], m1 sub r3, hq sub tlq, hq sub r3, hq ret %macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2] pmaddubsw m0, m%3, m%1 pmaddubsw m1, m%4, m%2 %ifnum %5 paddw m0, m%5 %else paddw m0, %5 %endif %ifnum %6 paddw m1, m%6 %else paddw m1, %6 %endif pavgw m0, m2 pavgw m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro cglobal ipred_smooth_8bpc, 3, 7, 0, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_avx2_table lea r6, [ipred_smooth_avx2_table] mov wd, wm vpbroadcastb m4, [tlq+wq] ; right tzcnt wd, wd mov hd, hm mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+pb_127_m127] 
vpbroadcastb m0, [r5] ; bottom vpbroadcastd m3, [base+pw_255] add wq, r6 lea v_weightsq, [base+smooth_weights+hq*2] jmp wq .w4: WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vpbroadcastq m11, [base+smooth_weights+4*2] mova m7, [base+ipred_v_shuf] vpbroadcastd m8, [tlq+1] sub tlq, 8 lea r3, [strideq*3] sub tlq, hq punpcklbw m8, m0 ; top, bottom pshufd m6, m7, q2200 pshufd m7, m7, q3311 pmaddubsw m9, m8, m5 paddw m3, m8 ; 1 * top + 255 * bottom + 255 paddw m9, m3 ; 128 * top + 129 * bottom + 255 .w4_loop: vpbroadcastq m1, [tlq+hq] pshufb m1, m10 punpcklbw m0, m1, m4 ; left, right punpckhbw m1, m4 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right pmaddubsw m3, m1, m5 paddw m2, m0 ; 128 * left + 129 * right paddw m3, m1 pmaddubsw m0, m11 pmaddubsw m1, m11 paddw m2, m0 paddw m3, m1 vbroadcasti128 m1, [v_weightsq] add v_weightsq, 16 pshufb m0, m1, m6 pshufb m1, m7 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r3 ], xm1, 2 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+r3 ], xm1, 3 lea dstq, [dstq+strideq*4] sub hd, 8 jg .w4_loop .ret: RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 mova m10, [base+ipred_h_shuf] vbroadcasti128 m11, [base+smooth_weights+8*2] mova m7, [base+ipred_v_shuf] vpbroadcastq m8, [tlq+1] sub tlq, 4 lea r3, [strideq*3] sub tlq, hq punpcklbw m8, m0 pshufd m6, m7, q0000 pshufd m7, m7, q1111 pmaddubsw m9, m8, m5 paddw m3, m8 paddw m9, m3 .w8_loop: vpbroadcastd m1, [tlq+hq] pshufb m1, m10 punpcklbw m0, m1, m4 punpckhbw m1, m4 pmaddubsw m2, m0, m5 pmaddubsw m3, m1, m5 paddw m2, m0 paddw m3, m1 pmaddubsw m0, m11 pmaddubsw m1, m11 paddw m2, m0 paddw m3, m1 vpbroadcastq m1, [v_weightsq] add v_weightsq, 8 pshufb m0, m1, m6 pshufb m1, m7 SMOOTH_2D_END 0, 1, 8, 8, 9, 9 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+r3 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET ALIGN function_align .w16: SETUP_STACK_FRAME 32*4, 7, 14 vbroadcasti128 m11, [tlq+1] lea r3, [rsp+64*2-4] punpcklbw m10, m11, m0 ; top, bottom punpckhbw m11, m0 call .prep_v sub tlq, 2 pmaddubsw m12, m10, m5 pmaddubsw m13, m11, m5 vpbroadcastd xm5, [base+pb_1] mova m9, [base+ipred_v_shuf] vbroadcasti128 m6, [base+smooth_weights+16*2] vbroadcasti128 m7, [base+smooth_weights+16*3] vperm2i128 m8, m9, m9, 0x01 paddw m0, m10, m3 paddw m3, m11 paddw m12, m0 paddw m13, m3 .w16_loop: vpbroadcastd m3, [tlq+hq] vpbroadcastd m0, [r3+hq*2] vpbroadcastd m1, [v_weightsq] add v_weightsq, 4 pshufb m3, m5 punpcklbw m3, m4 ; left, right pmaddubsw m2, m3, m6 pmaddubsw m3, m7 pshufb m0, m8 pshufb m1, m9 paddw m2, m0 paddw m3, m0 SMOOTH_2D_END 1, 1, 10, 11, 12, 13 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: SETUP_STACK_FRAME 32*4, 7, 11 movu m8, [tlq+1] lea r3, [rsp+64*2-2] punpcklbw m7, m8, m0 punpckhbw m8, m0 call .prep_v dec tlq pmaddubsw m9, m7, m5 pmaddubsw m10, m8, m5 mova xm5, [base+smooth_weights+16*4] vinserti128 m5, [base+smooth_weights+16*6], 1 mova xm6, [base+smooth_weights+16*5] vinserti128 m6, [base+smooth_weights+16*7], 1 paddw m0, m7, m3 paddw m3, m8 paddw m9, m0 paddw m10, m3 .w32_loop: vpbroadcastb m3, [tlq+hq] punpcklbw m3, m4 vpbroadcastw m0, [r3+hq*2] vpbroadcastw m1, 
[v_weightsq] add v_weightsq, 2 pmaddubsw m2, m3, m5 pmaddubsw m3, m6 paddw m2, m0 paddw m3, m0 SMOOTH_2D_END 1, 1, 7, 8, 9, 10 mova [dstq], m0 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: SETUP_STACK_FRAME 32*8, 7, 16 movu m13, [tlq+1 ] movu m15, [tlq+33] add r6, smooth_weights+16*15-ipred_smooth_avx2_table lea r3, [rsp+64*2-2] punpcklbw m12, m13, m0 punpckhbw m13, m0 punpcklbw m14, m15, m0 punpckhbw m15, m0 call .prep_v dec tlq pmaddubsw m0, m12, m5 pmaddubsw m1, m13, m5 pmaddubsw m2, m14, m5 pmaddubsw m5, m15, m5 mova xm8, [r6-16*7] vinserti128 m8, [r6-16*5], 1 mova xm9, [r6-16*6] vinserti128 m9, [r6-16*4], 1 mova xm10, [r6-16*3] vinserti128 m10, [r6-16*1], 1 mova xm11, [r6-16*2] vinserti128 m11, [r6-16*0], 1 lea r6, [rsp+32*4] paddw m0, m3 paddw m1, m3 paddw m2, m3 paddw m3, m5 paddw m0, m12 paddw m1, m13 paddw m2, m14 paddw m3, m15 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 .w64_loop: vpbroadcastb m5, [tlq+hq] punpcklbw m5, m4 vpbroadcastw m6, [r3+hq*2] vpbroadcastw m7, [v_weightsq] add v_weightsq, 2 pmaddubsw m2, m5, m8 pmaddubsw m3, m5, m9 paddw m2, m6 paddw m3, m6 SMOOTH_2D_END 7, 7, 12, 13, [r6+32*0], [r6+32*1] mova [dstq+32*0], m0 pmaddubsw m2, m5, m10 pmaddubsw m3, m5, m11 paddw m2, m6 paddw m3, m6 SMOOTH_2D_END 7, 7, 14, 15, [r6+32*2], [r6+32*3] mova [dstq+32*1], m0 add dstq, strideq dec hd jg .w64_loop RET ALIGN function_align .prep_v: vpermq m2, [tlq-32*1], q3120 punpckhbw m1, m2, m4 punpcklbw m2, m4 pmaddubsw m0, m1, m5 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m2, m5 paddw m1, m2 vpermq m2, [tlq-32*2], q3120 mova [rsp+gprsize+32*3], m0 mova [rsp+gprsize+32*2], m1 punpckhbw m1, m2, m4 punpcklbw m2, m4 pmaddubsw m0, m1, m5 paddw m0, m1 pmaddubsw m1, m2, m5 paddw m1, m2 mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*0], m1 sub r3, hq sub tlq, hq sub r3, hq ret cglobal ipred_z1_8bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z1_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea r7, [dr_intra_derivative] inc tlq movsxd wq, [r6+wq*4] add wq, r6 mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 movzx dxd, word [r7+dxq] xor angled, 0x4ff ; d = 90 - angle vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] jmp wq .w4: cmp angleb, 40 jae .w4_no_upsample lea r3d, [angleq-1024] sar r3d, 7 add r3d, hd jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm) ALLOC_STACK -32, 8 mova xm1, [tlq-1] pshufb xm0, xm1, [z_upsample1] pshufb xm1, [z_upsample2] vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse add dxd, dxd ; pw_512 (which is already in m3) pmaddubsw xm0, xm2 ; for rounding instead of pw_2048 pextrd [rsp+16], xm1, 3 ; top[max_base_x] pmaddubsw xm1, xm2 movd xm7, dxd mov r3d, dxd ; xpos vpbroadcastw m7, xm7 paddw xm1, xm0 movq xm0, [tlq] pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 lea r2, [strideq*3] paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1 punpcklbw xm0, xm1 psllw m7, 2 mova [rsp], xm0 .w4_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [rsp+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [rsp+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos 
+= dx pmulhrsw m0, m3 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r2 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_upsample_loop RET ALIGN function_align .filter_strength: ; w4/w8/w16 ; The C version uses a lot of branches, but we can do all the comparisons ; in parallel and use popcnt to get the final filter strength value. %define base r3-z_filter_t0 lea r3, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases pcmpgtb m1, m2 pmovmskb r5d, m1 ret .w4_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -16, 11 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_8] vbroadcasti128 m2, [tlq-1] pminub m1, m7, [base+z_filter_s] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pminub m7, [base+z_filter_s+8] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r3d, 9 mov tlq, rsp cmp hd, 4 cmovne maxbased, r3d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq], xm0 .w4_main: movd xm6, dxd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 mov r3d, dxd ; xpos movd xm9, maxbased vpbroadcastw m9, xm9 vbroadcasti128 m8, [z1_shuf_w4] psrlw m7, 8 ; top[max_base_x] paddw m10, m6, m6 psubw m9, m0 ; max_base_x vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1 paddw m10, m10 .w4_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 vpbroadcastq m1, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5d, [r3+dxq] shr r3d, 6 ; base2 movq xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_x pmulhrsw m0, m3 paddw m6, m10 ; xpos += dx lea r5, [dstq+strideq*2] vpblendvb m0, m7, m0, m1 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [r5 +strideq*0], xm0 pextrd [r5 +strideq*1], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r3d, maxbased jb .w4_loop packuswb xm7, xm7 lea r6, [strideq*3] .w4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r6 ], xm7 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_end_loop .w4_end: RET ALIGN function_align .w8: lea r3d, [angleq+216] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 movu xm2, [z_filter_s+6] mova xm0, [tlq-1] movd xm6, hd vinserti128 m0, [tlq+7], 1 vpbroadcastb xm6, xm6 vbroadcasti128 m1, [z_upsample1] pminub xm6, xm2 vpbroadcastd m7, [pb_36_m4] vinserti128 m2, xm6, 1 add dxd, dxd pshufb m1, m0, m1 pshufb m2, m0, m2 movd xm6, dxd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r3d, dxd psrldq m0, 1 lea r2, [strideq*3] paddw m7, m6, m6 paddw m1, m2 vpblendd 
m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 packuswb m1, m1 punpcklbw m0, m1 mova [rsp], m0 .w8_upsample_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 movu xm0, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [rsp+r5], 1 lea r5d, [r3+dxq] shr r3d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base3 vinserti128 m1, [rsp+r5], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r2 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_upsample_loop RET .w8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(h+7, 15) jmp .w8_main .w8_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [hq+7] test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w8_main ; filter_strength == 0 popcnt r5d, r5d movu xm2, [tlq] pminub xm1, xm0, [base+z_filter_s+14] vinserti128 m2, [tlq-1], 1 vinserti128 m1, [base+z_filter_s+ 0], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pminub xm0, [base+z_filter_s+22] vinserti128 m0, [base+z_filter_s+ 8], 1 pshufb m6, m2, m1 pmaddubsw m6, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+15] shufps m1, m0, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m1, m6 sub r5d, 3 jnz .w8_3tap ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one, ; which also results in an awkward edge case where out[w*2] is ; slightly different from out[max_base_x] when h > w. vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq+14] pshufb m2, m0 pmaddubsw m2, m7 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3 mov [rsp+16], r2b paddw m1, m2 .w8_3tap: pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 17 ; w*2 + (filter_strength == 3) cmp hd, 16 cmovns maxbased, r5d mov [tlq+r5], r3b vextracti128 xm0, m1, 1 packuswb xm0, xm1 mova [tlq], xm0 .w8_main: movd xm2, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastw m2, xm2 vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 mov r3d, dxd paddw m6, m2, m2 vpblendd m2, m6, 0xf0 .w8_loop: lea r5d, [r3+dxq] shr r3d, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 movu xm0, [tlq+r3] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [tlq+r5], 1 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 sub hd, 2 jz .w8_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w8_loop packuswb xm7, xm7 .w8_end_loop: movq [dstq+strideq*0], xm7 movq [dstq+strideq*1], xm7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_end_loop .w8_end: RET .w16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(h+15, 31) jmp .w16_main ALIGN function_align .w16: %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [hq+15] test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .w16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m1, [base+pb_12] vbroadcasti128 m6, [base+z_filter_s+8] vinserti128 m2, m6, [base+z_filter_s], 0 vinserti128 m6, [base+z_filter_s+16], 1 mova xm10, [tlq-1] vinserti128 m10, 
[tlq+3], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+14] vinserti128 m8, m7, [base+z_filter_s+6], 0 vinserti128 m7, [base+z_filter_s+22], 1 psubw m0, m1 movu xm11, [tlq+12] vinserti128 m11, [tlq+16], 1 pminub m8, m0 pminub m7, m0 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r3d, byte [tlq+31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .w16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq+30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r3d lea r2d, [r2+r3*8+4] shr r2d, 3 mov [rsp+32], r2b paddw m0, m10 paddw m1, m11 .w16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 mov tlq, rsp add r5d, 33 cmp hd, 32 cmovns maxbased, r5d mov [tlq+r5], r3b packuswb m0, m1 vpermq m0, m0, q3120 mova [tlq], m0 .w16_main: movd xm6, dxd vbroadcasti128 m0, [z_base_inc] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r3d, dxd psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .w16_loop: lea r5d, [r3+dxq] shr r3d, 6 ; base0 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r3+0] movu xm1, [tlq+r3+8] lea r3d, [r5+dxq] shr r5d, 6 ; base1 vinserti128 m0, [tlq+r5+0], 1 vinserti128 m1, [tlq+r5+8], 1 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r3d, maxbased jb .w16_loop .w16_end_loop: mova [dstq+strideq*0], xm7 mova [dstq+strideq*1], xm7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_end_loop .w16_end: RET ALIGN function_align .w32: %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea r3d, [hq+31] mov maxbased, 63 cmp hd, 32 cmovs maxbased, r3d test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main vbroadcasti128 m0, [pb_0to15] sub r3d, 29 ; h+2 movu xm13, [tlq+29] ; 32-39 movd xm1, r3d movu xm14, [tlq+37] ; 40-47 sub r3d, 8 ; h-6 vinserti128 m14, [tlq+51], 1 ; 56-63 vpbroadcastb xm1, xm1 mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movd xm2, r3d movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 pminub xm1, xm0 ; clip 32x8 mova m7, [z_filter_s+0] pshufb xm13, xm1 vpbroadcastd m1, [pb_12] vpbroadcastb xm2, xm2 vinserti128 m13, [tlq+43], 1 ; 48-55 vinserti128 m8, m7, [z_filter_s+4], 1 vpblendd m2, m1, 0xf0 vinserti128 m7, [z_filter_s+12], 0 pminub m2, m0 ; clip 32x16 and 32x(32|64) vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m14, m2 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m7 pmaddubsw m12, m9 movzx r3d, byte [tlq+63] movzx r2d, byte [tlq+62] paddw m0, m11 paddw m2, m12 pshufb m13, m7 pmaddubsw m13, m9 pshufb m14, m7 
pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r3d lea r2d, [r2+r3*8+4] ; edge case for 32x64 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+64], r2b mov tlq, rsp mov [tlq+65], r3b mov r3d, 65 cmp hd, 64 cmove maxbased, r3d packuswb m0, m2 packuswb m1, m6 mova [tlq+ 0], m0 mova [tlq+32], m1 .w32_main: movd xm6, dxd vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m6, xm6 movd xm9, maxbased vbroadcasti128 m8, [z_filter_s+2] vpbroadcastw m9, xm9 mov r5d, dxd psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 64*8 .w32_loop: mov r3d, r5d shr r3d, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu m0, [tlq+r3+0] movu m1, [tlq+r3+8] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [dstq], m0 dec hd jz .w32_end add dstq, strideq cmp r5d, maxbased jb .w32_loop test hb, 1 jz .w32_end_loop mova [dstq], m7 add dstq, strideq dec hd jz .w32_end .w32_end_loop: mova [dstq+strideq*0], m7 mova [dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_end_loop .w32_end: RET ALIGN function_align .w64: %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main mova xm11, [tlq- 1] ; 0- 7 vinserti128 m11, [tlq+13], 1 ; 16-23 movu xm12, [tlq+ 5] ; 8-15 vinserti128 m12, [tlq+19], 1 ; 24-31 mova m7, [z_filter_s+0] vinserti128 m8, m7, [z_filter_s+4], 1 vinserti128 m7, [z_filter_s+12], 0 vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm13, [tlq+29] ; 32-39 vinserti128 m13, [tlq+43], 1 ; 48-55 movu xm14, [tlq+37] ; 40-47 vinserti128 m14, [tlq+51], 1 ; 56-63 pshufb m0, m11, m8 shufps m8, m7, q1021 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 shufps m15, m8, m7, q2121 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m10, [z_filter_k+4*2+12*2] pshufb m11, m15 pmaddubsw m11, m10 pshufb m12, m7 pmaddubsw m12, m10 pshufb m13, m7 pmaddubsw m13, m10 pshufb m14, m7 pmaddubsw m14, m10 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq+ 61] ; 64- 71 vinserti128 m11, [tlq+ 75], 1 ; 80- 87 movu xm12, [tlq+ 69] ; 72- 79 vinserti128 m12, [tlq+ 83], 1 ; 88- 95 movu xm13, [tlq+ 93] ; 96-103 vinserti128 m13, [tlq+107], 1 ; 112-119 movu xm14, [tlq+101] ; 104-111 vinserti128 m14, [tlq+115], 1 ; 120-127 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea r3d, [hq-20] mov tlq, rsp packuswb m0, m2 packuswb m1, m6 vpbroadcastd xm2, [pb_14] vbroadcasti128 m6, [pb_0to15] mova [tlq+32*0], m0 mova [tlq+32*1], m1 movd xm0, r3d vpbroadcastd m1, [pb_12] vpbroadcastb m0, xm0 paddb m0, m2 pminub m0, m6 ; clip 64x16 and 64x32 pshufb m12, m0 pminub m1, m6 ; clip 64x64 pshufb m14, m1 pshufb m0, m11, m7 pmaddubsw m0, m10 pshufb m2, m12, m7 pmaddubsw m2, m10 pshufb m1, m13, m7 pmaddubsw m1, m10 pshufb m6, m14, m7 pmaddubsw m6, m10 pshufb m7, m11, m15 pmaddubsw m7, m9 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m0, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m2, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m1, m7 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, 
m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m8 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq+32*2], m0 mova [tlq+32*3], m1 .w64_main: movd xm12, dxd vpbroadcastb m7, [tlq+maxbaseq] lea r3d, [dxq-64] shl maxbased, 6 vpbroadcastw m12, xm12 sub r3d, maxbased vbroadcasti128 m8, [z_filter_s+2] movd xm6, r3d mov r5d, dxd mova m10, [pb_1to32] vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .w64_loop: mov r3d, r5d shr r3d, 6 movu m0, [tlq+r3+ 0] movu m1, [tlq+r3+ 8] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+ 0], m0 movu m0, [tlq+r3+32] movu m1, [tlq+r3+40] add r5d, dxd pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [dstq+32], m0 dec hd jz .w64_end add dstq, strideq cmp r5d, maxbased jb .w64_loop .w64_end_loop: mova [dstq+ 0], m7 mova [dstq+32], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET cglobal ipred_z2_8bpc, 3, 10, 16, 224, dst, stride, tl, w, h, angle, dx, dy %define base r9-z_filter_t0 lea r9, [ipred_z2_avx2_table] tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm lea dxq, [dr_intra_derivative-90] movsxd wq, [r9+wq*4] movzx dyd, angleb xor angled, 0x400 mov r8, dxq sub dxq, dyq add wq, r9 add r9, z_filter_t0-ipred_z2_avx2_table mova m2, [tlq-64] mova m0, [tlq-32] mova m1, [tlq] and dyd, ~1 and dxq, ~1 movzx dyd, word [r8+dyq] ; angle - 90 movzx dxd, word [dxq+270] ; 180 - angle vpbroadcastd m13, [base+pw_512] vpbroadcastd m14, [base+pw_62] vpbroadcastd m15, [base+pw_64] mova [rsp+ 0], m2 mova [rsp+32], m0 mova [rsp+64], m1 neg dxd neg dyd jmp wq .w4: vpbroadcastq m6, [base+z2_base_inc] ; base_inc << 6 vbroadcasti128 m10, [base+z1_shuf_w4] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos movd xm5, dyd mov r8d, (63-4)<<6 mov dyq, -4 pshuflw xm5, xm5, q0000 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w4_main ; !enable_intra_edge_filter lea r3d, [hq+2] add angled, 1022 shl r3d, 6 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) vpbroadcastd xm3, [base+pb_4] call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w4_filter_left ALIGN function_align .filter_strength: movd xm8, r3d mov r3d, angled movd xm7, angled vpbroadcastb m8, xm8 shr r3d, 8 ; is_sm << 1 vpbroadcastb m7, xm7 pcmpeqb m8, [base+z_filter_wh] mova xm9, [r9+r3*8] pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 ret ALIGN function_align .upsample_above: ; w4/w8 pshufb xm2, xm1, [base+z_upsample1-2] pminub xm3, [base+z_filter_s+4] vpbroadcastd xm4, [base+pb_36_m4] vbroadcasti128 m10, [base+pb_0to15] pshufb xm3, xm1, xm3 pmaddubsw xm2, xm4 pmaddubsw xm3, xm4 lea r2d, [r2+dxq+(1<<6)] add dxd, dxd paddw xm2, xm3 pmulhrsw xm2, xm13 sub r8d, 3<<6 paddw m6, m6 packuswb xm2, xm2 punpcklbw xm1, xm2 mova [rsp+gprsize+64], xm1 ret ALIGN function_align .upsample_left: ; h4/h8 mov r3d, hd and r3d, 4 movd xm2, [rsp+gprsize+64] movddup xm0, [rsp+gprsize+56] movd xm1, r3d palignr xm2, xm0, 1 vpbroadcastb xm1, xm1 pshufb xm2, [base+z_filter_s+18] vpbroadcastd xm3, [base+pb_36_m4] pmaxub xm1, [base+z_upsample1-2] 
pshufb xm1, xm0, xm1 pmaddubsw xm2, xm3 pmaddubsw xm1, xm3 paddw xm5, xm5 add dyq, dyq paddw xm1, xm2 pmulhrsw xm1, xm13 vbroadcasti128 m11, [base+z2_upsample] paddw xm5, xm15 packuswb xm1, xm1 punpcklbw xm0, xm1 mova [rsp+gprsize+48], xm0 ret .w4_no_upsample_above: lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength test r3d, r3d jz .w4_no_filter_above popcnt r3d, r3d vpbroadcastd xm2, [base+pb_4] pminub xm2, [base+z_filter_s] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm3, xm1, xm2 ; 00 01 12 23 pshufd xm2, xm2, q0321 pmaddubsw xm0, xm3, xm0 pshufb xm2, xm1, xm2 ; 12 23 34 44 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] punpckhqdq xm3, xm3 ; 34 44 44 44 pmaddubsw xm3, xm4 movd xm4, r6m ; max_width pminsw xm4, xm15 vpbroadcastb xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 psubb xm4, [base+pb_1to32] psrlq xm1, 8 packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movd [rsp+65], xm0 .w4_no_filter_above: lea r3d, [hq+2] add angled, 973 ; angle + 883 shl r3d, 6 test r3d, angled jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8) vpbroadcastd xm0, [base+pb_90] psubb xm0, xm7 ; 180 - angle pand xm0, xm8 ; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 .w4_filter_left: test r3d, r3d jz .w4_main popcnt r3d, r3d mov r5d, 10 cmp hd, 16 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 5 : 15 - h movd xm0, r5d vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m3, m1, [z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef vpbroadcastb m0, xm0 pmaxub m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*0] pshufb m0, m2, m0 pmaddubsw m0, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*1] pshufb m1, m2, m1 pmaddubsw m1, m3 vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m4 pmaddubsw m2, m3 movd xm4, r7m ; max_height pminsw xm4, xm15 vpbroadcastb xm4, xm4 psubb xm4, [base+pb_16to1] paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 vextracti128 xm0, m1, 1 packuswb xm0, xm1 vpblendvb xm0, [rsp+48], xm4 mova [rsp+48], xm0 jmp .w4_main .w4_upsample_left: call .upsample_left .w4_main: movd xm0, dxd mova m12, [base+z2_y_shuf_h4] lea r5, [rsp+56] ; left-7 vpbroadcastw m0, xm0 lea r9, [strideq*3] psraw xm1, xm5, 6 pand xm5, xm14 ; frac_y pxor xm2, xm2 paddw m7, m0, m0 psubw xm4, xm2, xm1 ; base_y vpblendd m0, m7, 0xcc mova xm1, xm7 punpcklwd xm4, xm2 paddw m0, m1 ; xpos2 xpos3 xpos0 xpos1 psubw xm1, xm15, xm5 ; 64-frac_y psllw xm5, 8 paddw m7, m7 paddw m6, m0 por xm5, xm1 ; 64-frac_y, frac_y vpbroadcastq m5, xm5 .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 vpbroadcastq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vpbroadcastq m2, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps xm0, [rsp+r3] vpblendd m1, m2, 0xc0 pand m2, m14, m6 ; frac_x vpblendd m0, m1, 0xf0 psubw m1, m15, m2 ; 64-frac_x psllw m2, 8 pshufb m0, m10 por m1, m2 ; 64-frac_x, frac_x pmaddubsw m0, m1 cmp r3d, 64 jge .w4_toponly mova m1, m7 ; arbitrary negative value vpgatherdq m3, [r5+xm4], m1 pshufb m1, m3, m11 vpermd m1, m12, m1 pmaddubsw m1, m5 psraw m2, m6, 15 ; base_x < topleft vpblendvb m0, m1, m2 .w4_toponly: pmulhrsw m0, m13 paddw m6, m7 ; xpos += dx add r5, dyq packuswb m0, m0 
vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 sub hd, 4 jz .w4_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w4_loop .w4_leftonly_loop: mova m1, m7 vpgatherdq m2, [r5+xm4], m1 add r5, dyq pshufb m0, m2, m11 vpermd m0, m12, m0 pmaddubsw m0, m5 pmulhrsw m0, m13 packuswb m0, m0 vextracti128 xm1, m0, 1 movd [dstq+strideq*2], xm0 pextrd [dstq+r9 ], xm0, 1 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_leftonly_loop .w4_end: RET .w8: vbroadcasti128 m6, [base+z2_base_inc] ; base_inc << 6 movd xm5, dyd vbroadcasti128 m10, [base+z_filter_s+2] vbroadcasti128 m11, [base+z2_shuf_h4] lea r2d, [dxq+(65<<6)] ; xpos vpbroadcastw xm5, xm5 mov r8d, (63-8)<<6 mov dyq, -4 pmullw xm5, [base+z2_ymul] test angled, 0x400 jnz .w8_main lea r3d, [angleq+126] mov r3b, hb cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm vpbroadcastd xm3, [base+pb_8] movhps [rsp+80], xm1 call .upsample_above sub angled, 53 ; angle - 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle call .filter_strength jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength test r3d, r3d jz .w8_no_filter_above popcnt r3d, r3d vpbroadcastd xm3, [base+pb_8] pminub xm3, [base+z_filter_s+8] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*1] pshufb xm2, xm1, [base+z_filter_s] ; 00 01 12 23 34 45 56 67 pmaddubsw xm0, xm2, xm0 pshufb xm3, xm1, xm3 ; 34 45 56 67 78 88 88 88 shufps xm2, xm3, q2121 ; 12 23 34 45 56 67 78 88 pmaddubsw xm2, xm4 vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2] pmaddubsw xm3, xm4 movd xm4, r6m ; max_width pminuw xm4, xm15 vpbroadcastb xm4, xm4 paddw xm0, xm2 paddw xm0, xm3 pmulhrsw xm0, xm13 psubb xm4, [base+pb_1to32] psrldq xm1, 1 packuswb xm0, xm0 vpblendvb xm0, xm1, xm4 movq [rsp+65], xm0 .w8_no_filter_above: lea r3d, [angleq-51] mov r3b, hb cmp r3d, 8 jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 .w8_filter_left: test r3d, r3d jz .w8_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] cmp hd, 32 jne .w8_filter_left_h16 movu xm2, [rsp+27] vinserti128 m2, [rsp+35], 1 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m3, [base+z_filter_s+ 8] vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] pmaxub m3, m0 pshufb m3, m2, m3 pmaddubsw m3, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w8_filter_left_top16 .w8_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w8_filter_left_top16: vbroadcasti128 m1, [base+z_filter_s+12] vinserti128 m2, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 55 55 56 67 78 89 9a ab vbroadcasti128 m4, [base+z_filter_s+16] vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m2 movu xm2, [rsp+49] vinserti128 m2, [rsp+43], 1 pshufb m0, m2, m0 pmaddubsw m0, m7 movd xm7, r7m ; max_height pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 pminsw xm7, xm15 paddw m1, m0 vpbroadcastb m7, xm7 paddw m1, m2 pmulhrsw m1, m13 psubb m7, [base+pb_32to1] packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [rsp+32], m7 mova [rsp+32], m3 jmp .w8_main .w8_upsample_left: call .upsample_left .w8_main: movd xm3, dxd lea r5, [rsp+56] ; left-7 pshufd xm1, xm5, q3120 pand xm5, xm14 vpbroadcastw m3, xm3 pxor xm0, xm0 psubw xm2, xm15, xm5 psraw xm1, 6 lea r9, [strideq*3] paddw m7, m3, m3 psubw xm9, xm0, xm1 ; base_y psllw xm5, 8 punpcklwd xm8, xm9, xm0 ; base_y 0, 1, 4, 5 vpblendd m3, m7, 0xf0 ; xpos0 xpos1 por xm5, xm2 ; 64-frac_y, frac_y punpckhwd xm9, xm0 ; base_y 2, 3, 6, 7 paddw m6, m3 vinserti128 m12, m5, xm5, 1 .w8_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 vinserti128 m0, [rsp+r3], 1 lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movu xm1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 vinserti128 m1, [rsp+r3], 1 pand m2, m14, m6 paddsw m4, m6, m7 psubw m5, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m5 pmaddubsw m0, m2 pand m2, m14, m4 psubw m5, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m5 pmaddubsw m1, m2 cmp r3d, 64 jge .w8_toponly mova m5, m7 vpgatherdq m3, [r5+xm9], m7 mova m7, m5 vpgatherdq m2, [r5+xm8], m5 pshufb m3, m11 pshufb m2, m11 punpckldq m5, m2, m3 ; a0 b0 c0 d0 a1 b1 c1 d1 e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m2, m3 ; a2 b2 c2 d2 a3 b3 c3 d3 e2 f2 g2 h2 e3 f3 g3 h3 vpermq m5, m5, q3120 ; y0 y1 vpermq m2, m2, q3120 ; y2 y3 pmaddubsw m5, m12 pmaddubsw m2, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m5, m6 psraw m3, m4, 15 vpblendvb m1, m2, m3 .w8_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m4, m7 ; xpos += dx add r5, dyq packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 sub hd, 4 jz .w8_end lea dstq, [dstq+strideq*4] cmp r2d, r8d jge .w8_loop .w8_leftonly_loop: mova m0, m7 vpgatherdq m5, [r5+xm9], m7 mova m7, m0 vpgatherdq m3, [r5+xm8], m0 add r5, dyq pshufb m2, m5, m11 pshufb m1, m3, m11 punpckldq m0, m1, m2 punpckhdq m1, m2 vpermq m0, m0, q3120 vpermq m1, m1, q3120 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*2], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+r9 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_leftonly_loop .w8_end: RET .w16: mov r8d, hd test angled, 0x400 jnz .w16_main lea r3d, [hq+15] sub angled, 90 call .filter_strength test r3d, r3d jz .w16_no_filter_above popcnt r3d, r3d vbroadcasti128 m6, [tlq+1] mova xm2, [base+z_filter_s] vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de movu xm3, [base+z_filter_s+8] vinserti128 m3, [base+z_filter_s+22], 1 ; 34 45 56 67 78 89 9a ab ab bc cd de ef ff ff ff vpblendd m1, m6, 0xf0 vpbroadcastd m0, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m4, [base+z_filter_k-4+r3*4+12*1] 
vpbroadcastd m5, [base+z_filter_k-4+r3*4+12*2] pshufb m2, m1, m2 pshufb m1, m3 pmaddubsw m0, m2, m0 shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff pmaddubsw m2, m4 pmaddubsw m1, m5 movd xm4, r6m ; max_width pminsw xm4, xm15 vpbroadcastb xm4, xm4 paddw m0, m2 paddw m0, m1 pmulhrsw m0, m13 psubb xm4, [base+pb_1to32] vextracti128 xm2, m0, 1 packuswb xm0, xm2 vpblendvb xm0, xm6, xm4 movu [rsp+65], xm0 .w16_no_filter_above: vpbroadcastd m0, [base+pb_90] psubb m0, m7 pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 test r3d, r3d jz .w16_main popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] .w16_filter_left: movd xm6, r7m ; max_height pminsw xm6, xm15 vpbroadcastb m6, xm6 cmp hd, 32 jl .w16_filter_left_h16 vpbroadcastd xm0, [base+pb_5] vbroadcasti128 m10, [base+z_filter_s+ 8] vbroadcasti128 m11, [base+z_filter_s+12] vbroadcasti128 m12, [base+z_filter_s+16] je .w16_filter_left_h32 movu m3, [tlq-69] movu m5, [tlq-61] pmaxub m1, m10, m0 pshufb m1, m3, m1 pmaddubsw m1, m7 pshufb m2, m3, m11 pmaddubsw m2, m8 pshufb m3, m12 pmaddubsw m3, m9 paddw m1, m2 pshufb m2, m5, m10 pmaddubsw m2, m7 pshufb m4, m5, m11 pmaddubsw m4, m8 pshufb m5, m12 pmaddubsw m5, m9 paddw m1, m3 vpbroadcastd m3, [base+pb_32] paddb m3, [base+pb_32to1] paddw m2, m4 paddw m2, m5 pmulhrsw m1, m13 pmulhrsw m2, m13 psubb m3, m6, m3 packuswb m1, m2 vpblendvb m1, [tlq-64], m3 mova [rsp], m1 jmp .w16_filter_left_top32 .w16_filter_left_h32: pmaxub m10, m0 .w16_filter_left_top32: movu xm2, [tlq-37] vinserti128 m2, [tlq-29], 1 pshufb m3, m2, m10 pshufb m1, m2, m11 pshufb m2, m12 pmaddubsw m3, m7 pmaddubsw m1, m8 pmaddubsw m2, m9 paddw m3, m1 paddw m3, m2 pmulhrsw m3, m13 jmp .w16_filter_left_top16 .w16_filter_left_h16: mov r5d, 10 cmp hd, 16 cmovs r5d, hd xor r5d, 15 ; h == 16 ? 
5 : 15 - h movd xm0, r5d vpbroadcastb m0, xm0 .w16_filter_left_top16: movu xm2, [tlq-15] vinserti128 m2, [tlq-21], 1 vbroadcasti128 m1, [base+z_filter_s+12] vbroadcasti128 m4, [base+z_filter_s+16] vinserti128 m5, m1, [base+z_filter_s+8], 1 ; 56 67 78 89 9a ab bc cd 34 45 56 67 78 89 9a ab vpblendd m1, m4, 0x0f ; 78 89 9a ab bc cd de ef 56 67 78 89 9a ab bc cd vinserti128 m4, [base+z_filter_s+20], 0 ; 9a ab bc cd de ef ff ff 78 89 9a ab bc cd de ef pmaxub m0, m5 pshufb m0, m2, m0 pmaddubsw m0, m7 pshufb m1, m2, m1 pmaddubsw m1, m8 pshufb m2, m4 pmaddubsw m2, m9 psubb m6, [base+pb_32to1] paddw m1, m0 paddw m1, m2 pmulhrsw m1, m13 packuswb m3, m1 vpermq m3, m3, q1320 vpblendvb m3, [tlq-32], m6 mova [rsp+32], m3 .w16_main: movd xm1, dyd vbroadcasti128 m10, [base+z_filter_s+2] movd xm7, dxd vbroadcasti128 m11, [base+z2_shuf_h2] vpbroadcastw m1, xm1 vpbroadcastw m7, xm7 mov r7, dstq pmullw m0, m1, [base+z2_ymul] psllw xm1, 4 paddw m6, m7, [base+z2_base_inc] lea r9d, [dxq+(65<<6)] ; xpos movd [rsp+156], xm1 .w16_loop0: mov r2d, r9d mova [rsp+160], m0 lea r5, [rsp+60] ; left-3 mova [rsp+192], m6 pxor m1, m1 psraw m2, m0, 6 pand m0, m14 psubw m9, m1, m2 ; base_y psubw m12, m15, m0 punpcklwd m8, m9, m1 ; base_y 0, 1, 2, 3, 8, 9, 10, 11 psllw m0, 8 punpckhwd m9, m1 ; base_y 4, 5, 6, 7, 12, 13, 14, 15 por m12, m0 ; 64-frac_y, frac_y .w16_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movu xm0, [rsp+r2] vinserti128 m0, [rsp+r2+8], 1 lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movu xm1, [rsp+r3] vinserti128 m1, [rsp+r3+8], 1 pand m2, m14, m6 paddsw m5, m6, m7 psubw m3, m15, m2 psllw m2, 8 pshufb m0, m10 por m2, m3 pmaddubsw m0, m2 pand m2, m14, m5 psubw m3, m15, m2 psllw m2, 8 pshufb m1, m10 por m2, m3 pmaddubsw m1, m2 cmp r3d, 64 jge .w16_toponly punpckhwd m2, m5, m5 ; mask out unnecessary loads vpgatherdd m4, [r5+m9], m2 punpcklwd m2, m5, m5 vpgatherdd m3, [r5+m8], m2 pshufb m4, m11 ; e0 f0 g0 h0 e1 f1 g1 h1 m0 n0 o0 p0 m1 n1 o1 p1 pshufb m3, m11 ; a0 b0 c0 d0 a1 b1 c1 d1 i0 j0 k0 l0 i1 j1 k1 l1 punpcklqdq m2, m3, m4 ; y0 punpckhqdq m3, m4 ; y1 pmaddubsw m2, m12 pmaddubsw m3, m12 psraw m6, 15 ; base_x < topleft vpblendvb m0, m2, m6 psraw m6, m5, 15 vpblendvb m1, m3, m6 .w16_toponly: pmulhrsw m0, m13 pmulhrsw m1, m13 paddw m6, m5, m7 ; xpos += dx sub r5, 2 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 sub hd, 2 jz .w16_end lea dstq, [dstq+strideq*2] cmp r2d, (63-16)<<6 jge .w16_loop .w16_leftonly_loop: mova m0, m7 vpgatherdd m4, [r5+m9], m7 mova m7, m0 vpgatherdd m3, [r5+m8], m0 sub r5, 2 pshufb m2, m4, m11 pshufb m1, m3, m11 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pmaddubsw m0, m12 pmaddubsw m1, m12 pmulhrsw m0, m13 pmulhrsw m1, m13 packuswb m0, m1 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_leftonly_loop .w16_end: sub r8d, 1<<8 jl .w16_ret vpbroadcastd m0, [rsp+156] paddw m0, [rsp+160] ; base_y += 16*dy paddw m6, m13, [rsp+192] add r7, 16 add r9d, 16<<6 movzx hd, r8b mov dstq, r7 paddw m6, m13 ; base_x += 16*64 jmp .w16_loop0 .w16_ret: RET .w32: mova m2, [tlq+32] lea r8d, [hq+(1<<8)] mova [rsp+96], m2 test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+10], 1 ; 00 01 12 23 34 45 56 67 45 56 67 78 89 9a ab bc vinserti128 m1, [tlq+11], 1 movu xm6, [base+z_filter_s+12] vinserti128 m6, 
[base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff movu xm3, [tlq+ 6] vinserti128 m3, [tlq+17], 1 movd xm0, r6m ; max_width pminsw xm0, xm15 vpbroadcastb m10, xm0 .w32_filter_above: pshufb m0, m1, m5 shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m1, m4 shufps m5, m6, q2132 ; 34 45 56 67 78 89 9a ab 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m1, m5 pmaddubsw m1, m9 paddw m0, m2 paddw m0, m1 pshufb m1, m3, m4 pmaddubsw m1, m7 pshufb m2, m3, m5 pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m1, m2 paddw m1, m3 pmulhrsw m0, m13 pmulhrsw m1, m13 psubb m10, [base+pb_1to32] packuswb m0, m1 vpblendvb m0, [tlq+1], m10 movu [rsp+65], m0 jmp .w16_filter_left .w64: mova m2, [tlq+32] mov r3d, [tlq+64] lea r8d, [hq+(3<<8)] mova [rsp+ 96], m2 mov [rsp+128], r3d test angled, 0x400 jnz .w16_main vpbroadcastd m7, [base+z_filter_k+4*2+12*0] vpbroadcastd m8, [base+z_filter_k+4*2+12*1] vpbroadcastd m9, [base+z_filter_k+4*2+12*2] movu xm6, [base+z_filter_s+ 4] vinserti128 m6, [base+z_filter_s+10], 1 ; 12 23 34 45 56 67 78 89 45 56 67 78 89 9a ab bc movu xm3, [tlq+30] vinserti128 m3, [tlq+43], 1 movu xm5, [base+z_filter_s+16] vinserti128 m5, [base+z_filter_s+22], 1 ; 78 89 9a ab bc cd de ef ab bc cd de ef ff ff ff pshufb m0, m3, m6 shufps m4, m6, m5, q1021 ; 34 45 56 67 78 89 9a ab 67 78 89 9a ab bc cd de pmaddubsw m0, m7 pshufb m2, m3, m4 shufps m6, m5, q2132 ; 56 67 78 89 9a ab bc cd 89 9a ab bc cd de ef ff pmaddubsw m2, m8 pshufb m3, m6 pmaddubsw m3, m9 paddw m0, m2 paddw m0, m3 movu xm2, [tlq+36] vinserti128 m2, [tlq+49], 1 pshufb m4, m2, m4 pmaddubsw m4, m7 pshufb m3, m2, m6 pmaddubsw m3, m8 pshufb m2, m5 pmaddubsw m2, m9 movd xm5, r6m ; max_width pminsw xm5, xm15 vpbroadcastb m10, xm5 paddw m3, m4 paddw m2, m3 vpbroadcastd m3, [base+pb_32] pmulhrsw m0, m13 pmulhrsw m2, m13 mova xm5, [base+z_filter_s] vinserti128 m5, [base+z_filter_s+6], 1 psubb m3, m10, m3 psubb m3, [base+pb_1to32] vinserti128 m1, [tlq+13], 1 packuswb m0, m2 vpblendvb m0, [tlq+33], m3 movu xm3, [tlq+ 6] vinserti128 m3, [tlq+19], 1 movu [rsp+97], m0 jmp .w32_filter_above cglobal ipred_z3_8bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase %assign org_stack_offset stack_offset lea r6, [ipred_z3_avx2_table] tzcnt hd, hm movifnidn angled, anglem lea r7, [dr_intra_derivative+45*2-1] dec tlq movsxd hq, [r6+hq*4] sub angled, 180 add hq, r6 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e movzx dyd, word [r7+dyq] vpbroadcastd m3, [pw_512] vpbroadcastd m4, [pw_62] vpbroadcastd m5, [pw_64] mov org_wd, wd jmp hq .h4: lea r7, [strideq*3] cmp angleb, 40 jae .h4_no_upsample lea r4d, [angleq-1024] sar r4d, 7 add r4d, wd jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm) ALLOC_STACK -32, 9 movu xm8, [tlq-7] pshufb xm0, xm8, [z_upsample1-4] vpbroadcastb xm2, xm8 pshufb xm1, xm8, [z_filter_s+2] mova [rsp+16], xm2 ; top[max_base_y] vpbroadcastd xm2, [pb_36_m4] add dyd, dyd pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 movd xm7, dyd mov r2d, dyd vpbroadcastw m7, xm7 paddw xm1, xm0 pmulhrsw xm1, xm3 pslldq m6, m7, 8 paddw xm2, xm7, xm7 paddw m6, m7 packuswb xm1, xm1 paddw m6, m2 punpcklbw xm1, xm8 mova xm8, [z_transpose4] psllw m7, 2 pshufb xm1, [pb_15to0] mova [rsp], xm1 .h4_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 vpbroadcastq m1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 vpbroadcastq m2, [rsp+r4] lea r4d, [r2+dyq] shr r2d, 6 movq xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 movhps xm0, [rsp+r4] vpblendd m1, m2, 0xc0 pand m2, m4, m6 
vpblendd m0, m1, 0xf0 psubw m1, m5, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m6, m7 pmulhrsw m0, m3 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm8 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 add dstq, 4 sub wd, 4 jg .h4_upsample_loop RET ALIGN function_align .filter_strength: ; h4/h8/h16 %define base r4-z_filter_t0 lea r4, [z_filter_t0] movd xm0, maxbased movd xm2, angled shr angled, 8 ; is_sm << 1 vpbroadcastb m0, xm0 vpbroadcastb m2, xm2 pcmpeqb m1, m0, [base+z_filter_wh] pand m1, m2 mova xm2, [r4+angleq*8] pcmpgtb m1, m2 pmovmskb r5d, m1 ret .h4_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -16, 12 mov maxbased, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m7, [base+pb_7] vbroadcasti128 m2, [tlq-14] pmaxub m1, m7, [base+z_filter_s-4] vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0] pmaxub m7, [base+z_filter_s+4] vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2] pshufb m0, m2, m1 shufps m1, m7, q2121 pmaddubsw m0, m8 pshufb m1, m2, m1 pmaddubsw m1, m9 pshufb m2, m7 pmaddubsw m2, m10 paddw m0, m1 paddw m0, m2 pmulhrsw m0, m3 mov r4d, 9 lea tlq, [rsp+15] cmp wd, 4 cmovne maxbased, r4d vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [rsp], xm0 .h4_main: movd xm6, dyd vpbroadcastq m0, [z_base_inc] ; base_inc << 6 mov r4, tlq sub tlq, 4 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] ; ypos movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf_w4] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 ; top[max_base_y] paddw m10, m6, m6 psubw m9, m0 ; max_base_y vpblendd m6, m10, 0xcc mova xm0, xm10 paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1 paddw m10, m10 mova xm11, [z_transpose4] .h4_loop: lea r5, [r4+dyq] sar r4, 6 ; base0 vpbroadcastq m1, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base1 vpbroadcastq m2, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 ; base2 movq xm0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 ; base3 movhps xm0, [tlq+r5] vpblendd m1, m2, 0xc0 pand m2, m4, m6 ; frac vpblendd m0, m1, 0xf0 psubw m1, m5, m2 ; 64-frac psllw m2, 8 pshufb m0, m8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 pcmpgtw m1, m9, m6 ; base < max_base_y pmulhrsw m0, m3 paddw m6, m10 ; ypos += dy vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 packuswb xm1, xm0 pshufb xm1, xm11 ; transpose movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r7 ], xm1, 3 sub wd, 4 jz .h4_end add dstq, 4 cmp r4d, maxbased jg .h4_loop packuswb xm7, xm7 .h4_end_loop: movd [dstq+strideq*0], xm7 movd [dstq+strideq*1], xm7 movd [dstq+strideq*2], xm7 movd [dstq+r7 ], xm7 add dstq, 4 sub wd, 4 jg .h4_end_loop .h4_end: RET ALIGN function_align .h8: lea r4d, [angleq+216] mov r4b, wb cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 %assign stack_offset org_stack_offset ALLOC_STACK -32, 8 and r4d, 4 mova xm0, [tlq-15] vinserti128 m0, [tlq- 9], 1 movd xm1, r4d movu xm2, [z_filter_s+2] vinserti128 m2, [z_filter_s+6], 1 vpbroadcastb xm1, xm1 ; w & 4 vpbroadcastd m7, [pb_36_m4] pmaxub xm1, [z_upsample1-4] ; clip 4x8 vinserti128 m1, [z_upsample1], 1 add dyd, dyd pshufb m1, m0, m1 pshufb m2, m0, m2 vinserti128 m0, [tlq-7], 1 movd xm6, dyd pmaddubsw m1, m7 pmaddubsw m2, m7 vpbroadcastw m6, xm6 mov r2d, dyd lea r5, [strideq*3] 
paddw m7, m6, m6 paddw m1, m2 vpblendd m6, m7, 0xf0 pmulhrsw m1, m3 pslldq m2, m7, 8 paddw m7, m7 paddw m6, m2 vbroadcasti128 m2, [pb_15to0] packuswb m1, m1 punpcklbw m1, m0 pshufb m1, m2 vextracti128 [rsp+ 0], m1, 1 mova [rsp+16], xm1 .h8_upsample_loop: lea r4d, [r2+dyq] shr r2d, 6 ; base0 movu xm0, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base1 vinserti128 m0, [rsp+r4], 1 lea r4d, [r2+dyq] shr r2d, 6 ; base2 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 punpcklqdq m1, m2, m2 ; frac0 frac1 pmaddubsw m0, m1 movu xm1, [rsp+r2] lea r2d, [r4+dyq] shr r4d, 6 ; base3 vinserti128 m1, [rsp+r4], 1 punpckhqdq m2, m2 ; frac2 frac3 pmaddubsw m1, m2 pmulhrsw m0, m3 paddw m6, m7 pmulhrsw m1, m3 lea r4, [dstq+strideq*4] psllw m1, 8 por m0, m1 vextracti128 xm1, m0, 1 punpcklbw xm2, xm0, xm1 punpckhbw xm0, xm1 movd [dstq+strideq*0], xm2 pextrd [dstq+strideq*1], xm2, 1 pextrd [dstq+strideq*2], xm2, 2 pextrd [dstq+r5 ], xm2, 3 movd [r4 +strideq*0], xm0 pextrd [r4 +strideq*1], xm0, 1 pextrd [r4 +strideq*2], xm0, 2 pextrd [r4 +r5 ], xm0, 3 add dstq, 4 sub wd, 4 jg .h8_upsample_loop RET .h8_no_intra_edge_filter: and maxbased, 7 or maxbased, 8 ; imin(w+7, 15) jmp .h8_main .h8_no_upsample: %assign stack_offset org_stack_offset ALLOC_STACK -32, 10 lea maxbased, [wq+7] test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h8_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd xm6, [base+pb_15] pcmpeqb xm1, xm1 psubusb xm6, xm0 psubb xm6, xm1 ; w == 4 ? 5 : 1 movu xm2, [tlq-16] pmaxub xm1, xm6, [base+z_filter_s] vinserti128 m2, [tlq-14], 1 vinserti128 m1, [base+z_filter_s+12], 1 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0] pmaxub xm6, [base+z_filter_s+ 8] vinserti128 m6, [base+z_filter_s+20], 1 pshufb m0, m2, m1 pmaddubsw m0, m7 vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-15] shufps m1, m6, q2121 pshufb m1, m2, m1 pmaddubsw m1, m7 paddw m0, m1 sub r5d, 3 jnz .h8_3tap vpbroadcastd m7, [z_filter_k+4*8] movzx r2d, byte [tlq-14] pshufb m2, m6 pmaddubsw m2, m7 sub r2d, r4d lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+15], r2b paddw m0, m2 .h8_3tap: pmulhrsw m0, m3 sar r5d, 1 lea tlq, [rsp+31] add r5d, 17 cmp wd, 16 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b vextracti128 xm1, m0, 1 packuswb xm0, xm1 mova [tlq-15], xm0 .h8_main: movd xm2, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m2, xm2 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psrlw m7, 8 psubw m9, m0 paddw m6, m2, m2 vpblendd m2, m6, 0x0f .h8_loop: lea r5, [r4+dyq] sar r4, 6 pand m0, m4, m2 psubw m1, m5, m0 psllw m0, 8 por m1, m0 vbroadcasti128 m0, [tlq+r4] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5], 0 sub rsp, 8*2 pshufb m0, m8 pmaddubsw m0, m1 pcmpgtw m1, m9, m2 paddw m2, m6 pmulhrsw m0, m3 vpblendvb m0, m7, m0, m1 vextracti128 xm1, m0, 1 psllw xm0, 8 por xm0, xm1 ; interleave rows (partial transpose) mova [rsp], xm0 sub wd, 2 jz .h8_transpose cmp r4d, maxbased jg .h8_loop packuswb xm0, xm7, xm7 .h8_end_loop: sub rsp, 8*2 mova [rsp], xm0 sub wd, 2 jg .h8_end_loop .h8_transpose: mova xm2, [rsp+16*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 lea r6, [dstq+strideq*4] jge .h8_w8 add rsp, 16*2 movd [dstq+strideq*0], xm1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+r2 ], xm1, 3 movd [r6 +strideq*0], xm2 pextrd [r6 
+strideq*1], xm2, 1 pextrd [r6 +strideq*2], xm2, 2 pextrd [r6 +r2 ], xm2, 3 jmp .h8_end .h8_w8_loop: mova xm0, [rsp+16*0] mova xm2, [rsp+16*1] punpcklwd xm1, xm2, xm0 punpckhwd xm2, xm0 .h8_w8: ; w8/w16/w32 mova xm0, [rsp+16*2] mova xm4, [rsp+16*3] add rsp, 16*4 punpcklwd xm3, xm4, xm0 punpckhwd xm4, xm0 punpckldq xm0, xm3, xm1 punpckhdq xm3, xm1 punpckldq xm1, xm4, xm2 punpckhdq xm4, xm2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm3 movhps [dstq+r2 ], xm3 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 movq [r6 +strideq*2], xm4 movhps [r6 +r2 ], xm4 sub dstq, 8 sub r6, 8 sub org_wd, 8 jge .h8_w8_loop .h8_end: RET .h16_no_intra_edge_filter: and maxbased, 15 or maxbased, 16 ; imin(w+15, 31) jmp .h16_main ALIGN function_align .h16: %assign stack_offset org_stack_offset ALLOC_STACK -64, 12 lea maxbased, [wq+15] test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength test r5d, r5d jz .h16_main ; filter_strength == 0 popcnt r5d, r5d vpbroadcastd m11, [base+pb_27] vpbroadcastd m1, [base+pb_1] vbroadcasti128 m6, [base+z_filter_s+12] vinserti128 m2, m6, [base+z_filter_s+4], 0 vinserti128 m6, [base+z_filter_s+20], 1 movu xm10, [tlq-18] vinserti128 m10, [tlq-14], 1 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0] vbroadcasti128 m7, [base+z_filter_s+8] vinserti128 m8, m7, [base+z_filter_s+0], 0 vinserti128 m7, [base+z_filter_s+16], 1 psubusb m11, m0 por m1, m11 movu xm11, [tlq-32] vinserti128 m11, [tlq-28], 1 pmaxub m8, m1 pmaxub m7, m1 pshufb m0, m10, m2 shufps m2, m6, q2121 pmaddubsw m0, m9 pshufb m1, m11, m8 shufps m8, m7, q2121 pmaddubsw m1, m9 vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1] movzx r4d, byte [tlq-31] pshufb m2, m10, m2 pmaddubsw m2, m9 pshufb m8, m11, m8 pmaddubsw m8, m9 paddw m0, m2 paddw m1, m8 sub r5d, 3 jnz .h16_3tap vpbroadcastd m9, [z_filter_k+4*8] movzx r2d, byte [tlq-30] pshufb m10, m6 pmaddubsw m10, m9 pshufb m11, m7 pmaddubsw m11, m9 sub r2d, r4d lea r2d, [r2+r4*8+4] shr r2d, 3 mov [rsp+31], r2b paddw m0, m10 paddw m1, m11 .h16_3tap: pmulhrsw m0, m3 pmulhrsw m1, m3 sar r5d, 1 lea tlq, [rsp+63] add r5d, 33 cmp wd, 32 cmovns maxbased, r5d neg r5 mov [tlq+r5], r4b packuswb m0, m1 vpermq m0, m0, q2031 mova [tlq-31], m0 .h16_main: movd xm6, dyd vbroadcasti128 m0, [z_base_inc] mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, m0 paddw m11, m6, m6 psubw m10, m9, m3 ; 64*8 vpblendd m6, m11, 0xf0 .h16_loop: lea r5, [r4+dyq] sar r4, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r4-0] movu xm1, [tlq+r4-8] lea r4, [r5+dyq] sar r5, 6 vinserti128 m0, [tlq+r5-0], 1 vinserti128 m1, [tlq+r5-8], 1 sub rsp, 32 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 vpermq m0, m0, q3120 mova [rsp], m0 sub wd, 2 jz .h16_transpose cmp r4d, maxbased jg .h16_loop mova m0, m7 .h16_end_loop: sub rsp, 32 mova [rsp], m7 sub wd, 2 jg .h16_end_loop .h16_transpose: mova m2, [rsp+32*1] sub org_wd, 8 lea r2, [strideq*3] lea r6, [dstq+org_wq] cmovns dstq, r6 punpcklbw m1, m2, m0 punpckhbw m2, m0 lea r3, [strideq*5] punpcklbw m0, m1, m2 punpckhbw m1, m2 lea r4, [strideq+r2*2] ; stride*7 jge .h16_w8 add rsp, 32*2 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 
2 pextrd [dstq+r2 ], xm0, 3 vextracti128 xm0, m0, 1 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd [dstq+r4 ], xm1, 3 lea dstq, [dstq+strideq*8] vextracti128 xm1, m1, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 movd [dstq+strideq*4], xm1 pextrd [dstq+r3 ], xm1, 1 pextrd [dstq+r2*2 ], xm1, 2 pextrd [dstq+r4 ], xm1, 3 jmp .h16_end .h16_w8_loop: mova m0, [rsp+32*0] mova m2, [rsp+32*1] punpcklbw m1, m2, m0 punpckhbw m2, m0 punpcklbw m0, m1, m2 punpckhbw m1, m2 .h16_w8: mova m2, [rsp+32*2] mova m4, [rsp+32*3] lea r6, [dstq+strideq*8] add rsp, 32*4 punpcklbw m3, m4, m2 punpckhbw m4, m2 punpcklbw m2, m3, m4 punpckhbw m3, m4 punpckldq m4, m2, m0 punpckhdq m2, m0 punpckldq m0, m3, m1 punpckhdq m3, m1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 vextracti128 xm4, m4, 1 movq [dstq+strideq*2], xm2 movhps [dstq+r2 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*4], xm0 movhps [dstq+r3 ], xm0 vextracti128 xm0, m0, 1 movq [dstq+r2*2 ], xm3 movhps [dstq+r4 ], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*0], xm4 movhps [r6+strideq*1], xm4 movq [r6+strideq*2], xm2 movhps [r6+r2 ], xm2 movq [r6+strideq*4], xm0 movhps [r6+r3 ], xm0 movq [r6+r2*2 ], xm3 movhps [r6+r4 ], xm3 sub dstq, 8 sub org_wd, 8 jge .h16_w8_loop .h16_end: RET ALIGN function_align .h32: %assign stack_offset org_stack_offset ALLOC_STACK -96, 15 lea maxbased, [wq+31] and maxbased, 31 or maxbased, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main vbroadcasti128 m0, [pb_0to15] mov r4d, 21 mov r5d, 3 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 sub r4d, wd ; 21-w cmovns r5d, r4d movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 sub r4d, 8 ; 13-w movd xm1, r5d movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movd xm2, r4d vpbroadcastb m1, xm1 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 vpbroadcastb m2, xm2 pmaxsb m1, m0 ; clip 16x32 and (32|64)x32 movu m7, [z_filter_s+4] pshufb m11, m1 vinserti128 m8, m7, [z_filter_s+8], 1 vinserti128 m7, [z_filter_s+16], 0 pmaxsb m2, m0 ; clip 8x32 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m12, m2 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pshufb m1, m13, m8 pmaddubsw m1, m9 shufps m8, m7, q1021 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m10, m11, m8 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m8 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m8 pmaddubsw m10, m9 shufps m8, m7, q2121 paddw m1, m10 pshufb m10, m14, m8 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 movzx r4d, byte [tlq-63] movzx r2d, byte [tlq-62] paddw m0, m11 paddw m2, m12 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m7 pmaddubsw m14, m9 paddw m1, m13 paddw m6, m14 sub r2d, r4d lea r2d, [r2+r4*8+4] ; edge case for 64x32 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 shr r2d, 3 mov [rsp+31], r2b lea tlq, [rsp+95] mov [tlq-65], r4b mov r4d, 65 cmp wd, 64 cmove maxbased, r4d packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h32_main: movd xm6, dyd mov r4, tlq sub tlq, 8 neg dyq vpbroadcastw m6, xm6 sub r4, maxbaseq shl maxbased, 6 vpbroadcastb m7, [r4] lea r4, [dyq+63] movd xm9, maxbased not maxbased vbroadcasti128 m8, [z3_shuf] add maxbased, 64 vpbroadcastw m9, xm9 psubw m9, [z_base_inc] mova m11, m6 psubw m10, m9, m3 ; 
64*8 .h32_loop: mov r5, r4 sar r5, 6 pand m1, m4, m6 psubw m2, m5, m1 psllw m1, 8 por m2, m1 movu xm0, [tlq+r5- 0] vinserti128 m0, [tlq+r5-16], 1 movu xm1, [tlq+r5- 8] vinserti128 m1, [tlq+r5-24], 1 sub rsp, 32 add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 pcmpgtw m1, m9, m6 pcmpgtw m2, m10, m6 packsswb m1, m2 paddw m6, m11 vpblendvb m0, m7, m0, m1 mova [rsp], m0 dec wd jz .h32_transpose cmp r4d, maxbased jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp], m7 dec wd jg .h32_end_loop .h32_transpose: lea dstq, [dstq+org_wq-8] lea r2, [strideq*3] lea r3, [strideq*5] lea r4, [strideq+r2*2] ; stride*7 .h32_w8_loop: mova m7, [rsp+32*0] mova m6, [rsp+32*1] mova m5, [rsp+32*2] mova m4, [rsp+32*3] mova m3, [rsp+32*4] mova m2, [rsp+32*5] mova m1, [rsp+32*6] mova m0, [rsp+32*7] lea r6, [dstq+strideq*8] add rsp, 32*8 punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpckldq m6, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m3 punpckhdq m8, m3 punpckldq m3, m1, m5 punpckhdq m1, m5 punpckldq m5, m0, m4 punpckhdq m0, m4 movq [dstq+strideq*0], xm6 movhps [dstq+strideq*1], xm6 vextracti128 xm6, m6, 1 movq [dstq+strideq*2], xm7 movhps [dstq+r2 ], xm7 vextracti128 xm7, m7, 1 movq [dstq+strideq*4], xm2 movhps [dstq+r3 ], xm2 vextracti128 xm2, m2, 1 movq [dstq+r2*2 ], xm8 movhps [dstq+r4 ], xm8 vextracti128 xm8, m8, 1 movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 vextracti128 xm1, m1, 1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 vextracti128 xm5, m5, 1 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 lea r6, [r6+strideq*8] vextracti128 xm0, m0, 1 movq [r6+strideq*0], xm6 movhps [r6+strideq*1], xm6 movq [r6+strideq*2], xm7 movhps [r6+r2 ], xm7 movq [r6+strideq*4], xm2 movhps [r6+r3 ], xm2 movq [r6+r2*2 ], xm8 movhps [r6+r4 ], xm8 lea r6, [r6+strideq*8] movq [r6+strideq*0], xm3 movhps [r6+strideq*1], xm3 movq [r6+strideq*2], xm1 movhps [r6+r2 ], xm1 movq [r6+strideq*4], xm5 movhps [r6+r3 ], xm5 movq [r6+r2*2 ], xm0 movhps [r6+r4 ], xm0 sub dstq, 8 sub org_wd, 8 jg .h32_w8_loop RET ALIGN function_align .h64: %assign stack_offset org_stack_offset ALLOC_STACK -128, 16 lea maxbased, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mov r4d, 21 vpbroadcastb xm11, [tlq-127] vpblendd xm11, [tlq-130], 0x0e ; 120-127 sub r4d, wd ; 21-w mov r5d, 3 vinserti128 m11, [tlq-116], 1 ; 104-111 movu m7, [z_filter_s+4] cmp wd, 32 cmove r4d, r5d vinserti128 m8, m7, [z_filter_s+8], 1 vbroadcasti128 m6, [pb_0to15] movd xm1, r4d vpbroadcastd m9, [z_filter_k+4*2+12*0] movu xm12, [tlq-122] ; 112-119 vinserti128 m12, [tlq-108], 1 ; 96-103 vpbroadcastb m1, xm1 movu xm13, [tlq- 98] ; 88- 95 vinserti128 m13, [tlq- 84], 1 ; 72- 79 movu xm14, [tlq- 90] ; 80- 87 vinserti128 m14, [tlq- 76], 1 ; 64- 71 vinserti128 m7, [z_filter_s+16], 0 pshufb m0, m11, m8 pmaddubsw m0, m9 pshufb m2, m12, m8 pmaddubsw m2, m9 pmaxsb m1, m6 ; clip (16|32)x64 pshufb m13, m1 pshufb m1, m13, m8 pmaddubsw m1, m9 pshufb m6, m14, m8 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] shufps m15, m8, m7, q1021 pshufb m10, m11, m15 pmaddubsw m10, m9 paddw m0, m10 pshufb m10, m12, m15 pmaddubsw m10, m9 paddw m2, m10 pshufb m10, m13, m15 pmaddubsw m10, m9 
paddw m1, m10 pshufb m10, m14, m15 pmaddubsw m10, m9 paddw m6, m10 vpbroadcastd m9, [z_filter_k+4*2+12*2] shufps m10, m8, m7, q2132 pshufb m11, m10 pmaddubsw m11, m9 pshufb m12, m10 pmaddubsw m12, m9 pshufb m13, m10 pmaddubsw m13, m9 pshufb m14, m10 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 movu xm11, [tlq-66] ; 56-63 vinserti128 m11, [tlq-52], 1 ; 40-47 movu xm12, [tlq-58] ; 48-55 vinserti128 m12, [tlq-44], 1 ; 32-39 movu xm13, [tlq-34] ; 24-31 vinserti128 m13, [tlq-20], 1 ; 8-15 movu xm14, [tlq-28] ; 16-23 vinserti128 m14, [tlq-14], 1 ; 0- 7 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 lea tlq, [rsp+127] packuswb m0, m2 packuswb m1, m6 mova [tlq-127], m0 mova [tlq- 95], m1 pshufb m0, m11, m10 pmaddubsw m0, m9 pshufb m2, m12, m10 pmaddubsw m2, m9 pshufb m1, m13, m10 pmaddubsw m1, m9 pshufb m6, m14, m7 pmaddubsw m6, m9 vpbroadcastd m9, [z_filter_k+4*2+12*1] pshufb m7, m11, m15 pmaddubsw m7, m9 paddw m0, m7 pshufb m7, m12, m15 pmaddubsw m7, m9 paddw m2, m7 pshufb m7, m13, m15 pmaddubsw m7, m9 paddw m1, m7 pshufb m7, m14, m10 pmaddubsw m7, m9 paddw m6, m7 vpbroadcastd m9, [z_filter_k+4*2+12*0] pshufb m11, m8 pmaddubsw m11, m9 pshufb m12, m8 pmaddubsw m12, m9 pshufb m13, m8 pmaddubsw m13, m9 pshufb m14, m15 pmaddubsw m14, m9 paddw m0, m11 paddw m2, m12 paddw m1, m13 paddw m6, m14 pmulhrsw m0, m3 pmulhrsw m2, m3 pmulhrsw m1, m3 pmulhrsw m6, m3 packuswb m0, m2 packuswb m1, m6 mova [tlq-63], m0 mova [tlq-31], m1 .h64_main: movd xm12, dyd neg maxbaseq vbroadcasti128 m8, [z3_shuf] vpbroadcastb m7, [tlq+maxbaseq] shl maxbased, 6 vpbroadcastw m12, xm12 lea r5d, [dyq+maxbaseq-64] neg dyq or maxbased, 63 lea r4, [dyq+63] movd xm6, r5d mova xm10, [pb_1to32+16] vinserti128 m10, [pb_1to32], 1 vpbroadcastd m11, [pb_32] vpbroadcastw m6, xm6 .h64_loop: mov r5, r4 sar r5, 6 movu m0, [tlq+r5-24] movu m1, [tlq+r5-32] pand m2, m4, m6 psubw m9, m5, m2 psllw m2, 8 por m9, m2 pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 psraw m2, m6, 6 sub rsp, 64 pmulhrsw m0, m3 pmulhrsw m1, m3 packsswb m2, m2 paddb m2, m10 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp+32], m0 movu m0, [tlq+r5-56] movu m1, [tlq+r5-64] add r4, dyq pshufb m0, m8 pshufb m1, m8 pmaddubsw m0, m9 pmaddubsw m1, m9 paddb m2, m11 pmulhrsw m0, m3 pmulhrsw m1, m3 paddw m6, m12 packuswb m0, m1 vpblendvb m0, m7, m0, m2 mova [rsp], m0 dec wd jz .h64_transpose cmp r4d, maxbased jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+32], m7 mova [rsp+ 0], m7 dec wd jg .h64_end_loop .h64_transpose: lea r2, [strideq*3] lea r3, [strideq*5] imul r5, strideq, -8 lea dstq, [dstq+org_wq-16] lea r4, [strideq+r2*2] ; stride*7 .h64_transpose_loop0: lea r6, [rsp+16*3] .h64_transpose_loop: mova xm0, [r6+64*15] vinserti128 m0, [r6+64* 7], 1 mova xm1, [r6+64*14] vinserti128 m1, [r6+64* 6], 1 mova xm2, [r6+64*13] vinserti128 m2, [r6+64* 5], 1 mova xm3, [r6+64*12] vinserti128 m3, [r6+64* 4], 1 mova xm4, [r6+64*11] vinserti128 m4, [r6+64* 3], 1 mova xm5, [r6+64*10] vinserti128 m5, [r6+64* 2], 1 mova xm6, [r6+64* 9] vinserti128 m6, [r6+64* 1], 1 mova xm7, [r6+64* 8] vinserti128 m7, [r6+64* 0], 1 sub r6, 16 punpcklbw m8, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklwd m7, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpckldq m6, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m3 punpckhdq m8, m3 punpckldq m3, m1, m5 punpckhdq m1, 
m5 punpckldq m5, m0, m4 punpckhdq m0, m4 vpermq m6, m6, q3120 vpermq m7, m7, q3120 vpermq m2, m2, q3120 vpermq m8, m8, q3120 vpermq m3, m3, q3120 vpermq m1, m1, q3120 vpermq m5, m5, q3120 vpermq m0, m0, q3120 mova [dstq+strideq*0], xm6 vextracti128 [dstq+strideq*1], m6, 1 mova [dstq+strideq*2], xm7 vextracti128 [dstq+r2 ], m7, 1 mova [dstq+strideq*4], xm2 vextracti128 [dstq+r3 ], m2, 1 mova [dstq+r2*2 ], xm8 vextracti128 [dstq+r4 ], m8, 1 sub dstq, r5 mova [dstq+strideq*0], xm3 vextracti128 [dstq+strideq*1], m3, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+r2 ], m1, 1 mova [dstq+strideq*4], xm5 vextracti128 [dstq+r3 ], m5, 1 mova [dstq+r2*2 ], xm0 vextracti128 [dstq+r4 ], m0, 1 sub dstq, r5 cmp r6, rsp jae .h64_transpose_loop add rsp, 64*16 lea dstq, [dstq+r5*8-16] sub org_wd, 16 jg .h64_transpose_loop0 .h64_end: RET %macro FILTER_XMM 4 ; dst, src, tmp, shuf %ifnum %4 pshufb xm%2, xm%4 %else pshufb xm%2, %4 %endif pshufd xm%1, xm%2, q0000 ; p0 p1 pmaddubsw xm%1, xm2 pshufd xm%3, xm%2, q1111 ; p2 p3 pmaddubsw xm%3, xm3 paddw xm%1, xm1 paddw xm%1, xm%3 pshufd xm%3, xm%2, q2222 ; p4 p5 pmaddubsw xm%3, xm4 paddw xm%1, xm%3 pshufd xm%3, xm%2, q3333 ; p6 __ pmaddubsw xm%3, xm5 paddw xm%1, xm%3 psraw xm%1, 4 packuswb xm%1, xm%1 %endmacro %macro FILTER_YMM 4 ; dst, src, tmp, shuf pshufb m%2, m%4 pshufd m%1, m%2, q0000 pmaddubsw m%1, m2 pshufd m%3, m%2, q1111 pmaddubsw m%3, m3 paddw m%1, m1 paddw m%1, m%3 pshufd m%3, m%2, q2222 pmaddubsw m%3, m4 paddw m%1, m%3 pshufd m%3, m%2, q3333 pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 vperm2i128 m%3, m%1, m%1, 0x01 packuswb m%1, m%3 %endmacro ; The ipred_filter SIMD processes 4x2 blocks in the following order which ; increases parallelism compared to doing things row by row. One redundant ; block is calculated for w8 and w16, two for w32. 
; w4 w8 w16 w32 ; 1 1 2 1 2 3 5 1 2 3 5 b c d f ; 2 2 3 2 4 5 7 2 4 5 7 c e f h ; 3 3 4 4 6 7 9 4 6 7 9 e g h j ; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___ ; 5 8 8 i cglobal ipred_filter_8bpc, 3, 7, 0, dst, stride, tl, w, h, filter %define base r6-ipred_filter_avx2_table lea r6, [filter_intra_taps] tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 add filterq, r6 lea r6, [ipred_filter_avx2_table] movq xm0, [tlq-3] ; _ 6 5 0 1 2 3 4 movsxd wq, [r6+wq*4] vpbroadcastd m1, [base+pw_8] vbroadcasti128 m2, [filterq+16*0] vbroadcasti128 m3, [filterq+16*1] vbroadcasti128 m4, [filterq+16*2] vbroadcasti128 m5, [filterq+16*3] add wq, r6 mov hd, hm jmp wq .w4: WIN64_SPILL_XMM 9 mova xm8, [base+filter_shuf2] sub tlq, 3 sub tlq, hq jmp .w4_loop_start .w4_loop: pinsrd xm0, xm6, [tlq+hq], 0 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER_XMM 6, 0, 7, 8 movd [dstq+strideq*0], xm6 pextrd [dstq+strideq*1], xm6, 1 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 10 mova m8, [base+filter_shuf1] FILTER_XMM 7, 0, 6, [base+filter_shuf2] vpbroadcastd m0, [tlq+4] vpbroadcastd m6, [tlq+5] sub tlq, 4 sub tlq, hq vpbroadcastq m7, xm7 vpblendd m7, m6, 0x20 .w8_loop: vpbroadcastd xm6, [tlq+hq] palignr m6, m0, 12 vpblendd m0, m6, m7, 0xeb ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm6, xm7 call .main vpblendd xm6, xm7, 0x0c pshufd xm6, xm6, q3120 movq [dstq+strideq*0], xm6 movhps [dstq+strideq*1], xm6 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: %if WIN64 %assign stack_offset stack_offset - stack_size_padded %assign xmm_regs_used 15 %assign stack_size_padded 0x98 SUB rsp, stack_size_padded %endif sub hd, 2 TAIL_CALL .w16_main, 0 .w16_main: %if WIN64 movaps [rsp+0xa8], xmm6 movaps [rsp+0xb8], xmm7 movaps [rsp+0x28], xmm8 movaps [rsp+0x38], xmm9 movaps [rsp+0x48], xmm10 movaps [rsp+0x58], xmm11 movaps [rsp+0x68], xmm12 movaps [rsp+0x78], xmm13 movaps [rsp+0x88], xmm14 %endif FILTER_XMM 12, 0, 7, [base+filter_shuf2] vpbroadcastd m0, [tlq+5] vpblendd m0, [tlq-12], 0x14 mova m8, [base+filter_shuf1] vpbroadcastq m7, xm12 vpblendd m0, m7, 0xc2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 call .main ; c0 d0 a1 b1 a1 b1 c0 d0 movlps xm9, xm7, [tlq+5] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 vinserti128 m14, m8, [base+filter_shuf3], 0 vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 FILTER_XMM 6, 9, 10, 14 vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 vpbroadcastd m9, [tlq+13] vpbroadcastd m10, [tlq+12] psrld m11, m8, 4 vpblendd m6, m9, 0x20 ; top sub tlq, 6 sub tlq, hq .w16_loop: vpbroadcastd xm9, [tlq+hq] palignr m9, m0, 12 vpblendd m0, m9, m7, 0xe2 ; _ _ _ _ 1 2 3 4 6 5 0 _ _ _ _ _ ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm13, xm7 call .main ; e0 f0 c1 d1 c1 d1 e0 f0 vpblendd m9, m12, m10, 0xf0 vpblendd m12, m6, 0xc0 pshufd m9, m9, q3333 vpblendd m9, m6, 0xee vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 mova [dstq+strideq*0], xm9 vextracti128 [dstq+strideq*1], m9, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] 
vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 ret ALIGN function_align .w32: sub rsp, stack_size_padded sub hd, 2 lea r3, [dstq+16] lea r5d, [hq-2] call .w16_main add tlq, r5 mov dstq, r3 lea r3, [strideq-4] lea r4, [r3+strideq*2] movq xm0, [tlq+21] pinsrd xm0, [dstq-4], 2 pinsrd xm0, [dstq+r3*1], 3 FILTER_XMM 12, 0, 7, 14 ; a0 b0 a0 b0 movq xm7, [dstq+r3*2] pinsrd xm7, [dstq+r4], 2 palignr xm7, xm0, 12 ; 0 _ _ _ _ _ _ _ _ _ _ 5 _ _ _ 6 vpbroadcastd m0, [tlq+28] vpbroadcastd m9, [tlq+29] vbroadcasti128 m8, [base+filter_shuf1+16] vpblendd m0, m9, 0x20 vpblendd m0, m7, 0x0f vpbroadcastq m7, xm12 vpblendd m0, m7, 0xc2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 call .main ; c0 d0 a1 b1 a1 b1 c0 d0 add r3, 2 lea r4, [r4+strideq*2] movlps xm9, xm7, [tlq+29] ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 vpblendd xm12, xm7, 0x0c ; a0 b0 a1 b1 FILTER_XMM 6, 9, 10, 14 vpbroadcastq m6, xm6 ; a2 b2 __ __ __ __ a2 b2 vpbroadcastd m9, [tlq+37] vpbroadcastd m10, [tlq+36] vpblendd m6, m9, 0x20 ; top .w32_loop: movq xm9, [dstq+r3*4] pinsrd xm9, [dstq+r4], 2 .w32_loop_last: palignr m9, m0, 12 vpblendd m0, m9, m7, 0xe2 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 mova xm13, xm7 ; c0 d0 call .main ; e0 f0 c1 d1 c1 d1 e0 f0 vpblendd m9, m12, m10, 0xf0 vpblendd m12, m6, 0xc0 pshufd m9, m9, q3333 vpblendd m9, m6, 0xee vpblendd m10, m9, m7, 0x0c ; _ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_YMM 6, 10, 9, 14 ; c2 d2 a3 b3 a3 b3 c2 d2 vpblendd m12, m6, 0x30 ; a0 b0 a1 b1 a3 b3 a2 b2 vpermd m9, m11, m12 ; a0 a1 a2 a3 b0 b1 b2 b3 vpblendd xm12, xm13, xm7, 0x0c ; c0 d0 c1 d1 mova [dstq+strideq*0], xm9 vextracti128 [dstq+strideq*1], m9, 1 lea dstq, [dstq+strideq*2] sub r5d, 2 jg .w32_loop jz .w32_loop_last vpblendd xm7, xm6, xm10, 0x04 ; _ _ _ 5 _ _ _ 6 0 _ _ _ 1 2 3 4 pshufd xm7, xm7, q1032 ; 0 _ _ _ 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER_XMM 0, 7, 9, [base+filter_shuf1+16] vpblendd xm6, xm0, 0x0c ; c2 d2 c3 d3 shufps xm0, xm12, xm6, q2020 ; c0 c1 c2 c3 shufps xm6, xm12, xm6, q3131 ; d0 d1 d2 d3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm6 RET ALIGN function_align .main: FILTER_YMM 7, 0, 9, 8 ret %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_left_avx2_table] tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 shrx r6d, r6d, wd movd xm3, r6d movsxd r6, [t0+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov t0d, 0x8000 shrx t0d, t0d, r6d movd xm3, t0d lea t0, [ipred_cfl_left_avx2_table] movsxd r6, [t0+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_avx2_table-ipred_cfl_left_avx2_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: vextracti128 xm1, m0, 1 paddw xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 .h8: psrlq xm1, xm0, 32 paddw xm0, xm1 .h4: pmaddwd xm0, xm2 pmulhrsw xm0, xm3 vpbroadcastw m0, xm0 jmp wq cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn hd, hm movifnidn wd, wm tzcnt r6d, 
hd lea t0d, [wq+hq] movd xm4, t0d tzcnt t0d, t0d movd xm5, t0d lea t0, [ipred_cfl_avx2_table] tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+4*4] pcmpeqd m3, m3 psrlw xm4, 1 add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movd xm0, [tlq-4] pmaddubsw xm0, xm3 jmp wq .w4: movd xm1, [tlq+1] pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm1 pmaddwd xm0, xm3 cmp hd, 4 jg .w4_mul psrlw xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddw xm0, xm1 shrx r6d, r6d, r2d psrlq xm1, xm0, 32 paddw xm0, xm1 movd xm1, r6d psrlw xm0, 2 pmulhuw xm0, xm1 .w4_end: vpbroadcastw m0, xm0 .s4: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] IPRED_CFL 4 packuswb m4, m4 vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 movd [dstq+strideq*2], xm5 pextrd [dstq+r6 ], xm5, 1 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: movq xm0, [tlq-8] pmaddubsw xm0, xm3 jmp wq .w8: movq xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 punpckhqdq xm2, xm0, xm0 paddw xm0, xm2 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w8_end: vpbroadcastw m0, xm0 .s8: vpbroadcastw m1, alpham lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 movhps [dstq+strideq*2], xm4 movhps [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova xm0, [tlq-16] pmaddubsw xm0, xm3 jmp wq .w16: movu xm1, [tlq+1] vextracti128 xm2, m0, 1 pmaddubsw xm1, xm3 psubw xm0, xm4 paddw xm0, xm2 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w16_end: vpbroadcastw m0, xm0 .s16: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vpermq m4, m4, q3120 mova [dstq+strideq*0], xm4 vextracti128 [dstq+strideq*1], m4, 1 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 psubw xm0, xm4 paddw xm0, xm1 punpckhqdq xm1, xm0, xm0 paddw xm0, xm1 psrlq xm1, xm0, 32 paddw xm0, xm1 pmaddwd xm0, xm3 psrlw xm0, xm5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xm1, r6d pmulhuw xm0, xm1 .w32_end: vpbroadcastw m0, xm0 .s32: vpbroadcastw m1, alpham pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+32] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 vpermq m4, m4, q3120 mova [dstq], m4 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha lea t0, [ipred_cfl_splat_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [t0+wq*4] vpbroadcastd m0, [t0-ipred_cfl_splat_avx2_table+pw_128] add wq, t0 movifnidn acq, acmp jmp wq cglobal ipred_cfl_ac_420_8bpc, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd mov ac_bakq, acq imul szd, hd shl hpadd, 2 
sub hd, hpadd vpbroadcastd m2, [pb_2] pxor m4, m4 cmp wd, 8 jg .w16 je .w8 ; fall-through DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak .w4: lea stride3q, [strideq*3] .w4_loop: movq xm0, [yq] movq xm1, [yq+strideq] movhps xm0, [yq+strideq*2] movhps xm1, [yq+stride3q] pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 paddw xm0, xm1 mova [acq], xm0 paddw xm4, xm0 lea yq, [yq+strideq*4] add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .calc_avg vpermq m0, m0, q1111 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova xm0, [yq] mova xm1, [yq+strideq] vinserti128 m0, [yq+strideq*2], 1 vinserti128 m1, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_loop test hpadd, hpadd jz .calc_avg jmp .w8_hpad .w8_wpad: vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] .w8_wpad_loop: movq xm0, [yq] movq xm1, [yq+strideq] vinserti128 m0, [yq+strideq*2], 1 vinserti128 m1, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufb m0, m3 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_wpad_loop test hpadd, hpadd jz .calc_avg .w8_hpad: vpermq m0, m0, q3232 .w8_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad_loop jmp .calc_avg .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad_loop .w16_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_420_avx2_table] shl wpadd, 2 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ ipred_cfl_ac_420_avx2_table+wpadq*8-32] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w16_pad3: vpbroadcastq m0, [yq] vpbroadcastq m1, [yq+strideq] jmp .w16_wpad_end .w16_pad2: vbroadcasti128 m0, [yq] vbroadcasti128 m1, [yq+strideq] jmp .w16_wpad_end .w16_pad1: mova m0, [yq] mova m1, [yq+strideq] ; fall-through .w16_wpad_end: pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufb m0, m3 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 dec hd jz .w16_wpad_done jmp iptrq .w16_wpad_done: test hpadd, hpadd jz .calc_avg .w16_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 dec hpadd jg .w16_hpad_loop ; fall-through .calc_avg: vpbroadcastd m2, [pw_1] pmaddwd m0, m4, m2 vextracti128 xm1, m0, 1 tzcnt r1d, szd paddd xm0, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal ipred_cfl_ac_422_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd mov ac_bakq, acq imul szd, hd shl hpadd, 2 sub hd, hpadd vpbroadcastd m2, [pb_4] pxor m4, m4 pxor m5, m5 cmp wd, 8 jg .w16 je .w8 ; fall-through DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak .w4: lea stride3q, [strideq*3] .w4_loop: movq xm1, [yq] movhps xm1, [yq+strideq] movq xm0, [yq+strideq*2] movhps xm0, [yq+stride3q] pmaddubsw xm0, xm2 pmaddubsw xm1, xm2 mova [acq], xm1 mova [acq+16], xm0 paddw xm4, xm0 paddw xm5, xm1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg 
vpermq m0, m0, q1111 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova xm1, [yq] vinserti128 m1, [yq+strideq], 1 mova xm0, [yq+strideq*2] vinserti128 m0, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg jmp .w8_hpad .w8_wpad: vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle] .w8_wpad_loop: movq xm1, [yq] vinserti128 m1, [yq+strideq], 1 movq xm0, [yq+strideq*2] vinserti128 m0, [yq+stride3q], 1 pmaddubsw m0, m2 pmaddubsw m1, m2 pshufb m0, m3 pshufb m1, m3 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_wpad_loop test hpadd, hpadd jz .calc_avg .w8_hpad: vpermq m0, m0, q3232 .w8_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad_loop jmp .calc_avg .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m1, [yq] mova m0, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad_loop .w16_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_422_avx2_table] shl wpadd, 2 mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \ ipred_cfl_ac_422_avx2_table+wpadq*8-32] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w16_pad3: vpbroadcastq m1, [yq] vpbroadcastq m0, [yq+strideq] jmp .w16_wpad_end .w16_pad2: vbroadcasti128 m1, [yq] vbroadcasti128 m0, [yq+strideq] jmp .w16_wpad_end .w16_pad1: mova m1, [yq] mova m0, [yq+strideq] ; fall-through .w16_wpad_end: pmaddubsw m0, m2 pmaddubsw m1, m2 pshufb m0, m3 pshufb m1, m3 mova [acq], m1 mova [acq+32], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jz .w16_wpad_done jmp iptrq .w16_wpad_done: test hpadd, hpadd jz .calc_avg .w16_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddw m4, m0 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop ; fall-through .calc_avg: vpbroadcastd m2, [pw_1] pmaddwd m5, m5, m2 pmaddwd m0, m4, m2 paddd m0, m5 vextracti128 xm1, m0, 1 tzcnt r1d, szd paddd xm0, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak movifnidn hpadd, hpadm movifnidn wd, wm mov hd, hm mov szd, wd imul szd, hd shl hpadd, 2 sub hd, hpadd pxor m4, m4 vpbroadcastd m5, [pw_1] tzcnt r8d, wd lea r5, [ipred_cfl_ac_444_avx2_table] movsxd r8, [r5+r8*4+12] add r5, r8 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak mov ac_bakq, acq jmp r5 .w4: lea stride3q, [strideq*3] pxor xm2, xm2 .w4_loop: movd xm1, [yq] movd xm0, [yq+strideq*2] pinsrd xm1, [yq+strideq], 1 pinsrd xm0, [yq+stride3q], 1 punpcklbw xm1, xm2 punpcklbw xm0, xm2 psllw xm1, 3 psllw xm0, 3 mova [acq], xm1 mova [acq+16], xm0 paddw xm1, xm0 paddw xm4, xm1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_mul pshufd xm0, xm0, q3232 paddw xm1, xm0, xm0 .w4_hpad_loop: mova [acq], xm0 mova [acq+16], xm0 paddw xm4, xm1 add acq, 32 sub hpadd, 4 jg .w4_hpad_loop jmp .calc_avg_mul .w8: lea stride3q, 
[strideq*3] pxor m2, m2 .w8_loop: movq xm1, [yq] movq xm0, [yq+strideq*2] vinserti128 m1, [yq+strideq], 1 vinserti128 m0, [yq+stride3q], 1 punpcklbw m1, m2 punpcklbw m0, m2 psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 paddw m4, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_mul vpermq m0, m0, q3232 paddw m1, m0, m0 .w8_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddw m4, m1 add acq, 64 sub hpadd, 4 jg .w8_hpad_loop jmp .calc_avg_mul .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: pmovzxbw m1, [yq] pmovzxbw m0, [yq+strideq] psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 pmaddwd m1, m5 paddd m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg jmp .w16_hpad .w16_wpad: mova m3, [cfl_ac_444_w16_pad1_shuffle] .w16_wpad_loop: vpbroadcastq m1, [yq] vpbroadcastq m0, [yq+strideq] pshufb m1, m3 pshufb m0, m3 psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m1, m0 pmaddwd m1, m5 paddd m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_wpad_loop test hpadd, hpadd jz .calc_avg .w16_hpad: paddw m1, m0, m0 pmaddwd m1, m5 .w16_hpad_loop: mova [acq], m0 mova [acq+32], m0 paddd m4, m1 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop jmp .calc_avg .w32: test wpadd, wpadd jnz .w32_wpad .w32_loop: pmovzxbw m1, [yq] pmovzxbw m0, [yq+16] psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m2, m1, m0 pmaddwd m2, m5 paddd m4, m2 add yq, strideq add acq, 64 dec hd jg .w32_loop test hpadd, hpadd jz .calc_avg jmp .w32_hpad_loop .w32_wpad: DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak lea iptrq, [ipred_cfl_ac_444_avx2_table] add wpadd, wpadd mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] movsxd wpadq, [iptrq+wpadq+4] add iptrq, wpadq jmp iptrq .w32_pad3: vpbroadcastq m1, [yq] pshufb m1, m3 vpermq m0, m1, q3232 jmp .w32_wpad_end .w32_pad2: pmovzxbw m1, [yq] pshufhw m0, m1, q3333 vpermq m0, m0, q3333 jmp .w32_wpad_end .w32_pad1: pmovzxbw m1, [yq] vpbroadcastq m0, [yq+16] pshufb m0, m3 ; fall-through .w32_wpad_end: psllw m1, 3 psllw m0, 3 mova [acq], m1 mova [acq+32], m0 paddw m2, m1, m0 pmaddwd m2, m5 paddd m4, m2 add yq, strideq add acq, 64 dec hd jz .w32_wpad_done jmp iptrq .w32_wpad_done: test hpadd, hpadd jz .calc_avg .w32_hpad_loop: mova [acq], m1 mova [acq+32], m0 paddd m4, m2 add acq, 64 dec hpadd jg .w32_hpad_loop jmp .calc_avg .calc_avg_mul: pmaddwd m4, m5 .calc_avg: vextracti128 xm1, m4, 1 tzcnt r1d, szd paddd xm0, xm4, xm1 movd xm2, r1d movd xm3, szd punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 psrad xm3, 1 psrlq xm1, xm0, 32 paddd xm0, xm3 paddd xm0, xm1 psrad xm0, xm2 vpbroadcastw m0, xm0 .sub_loop: mova m1, [ac_bakq] psubw m1, m0 mova [ac_bakq], m1 add ac_bakq, 32 sub szd, 16 jg .sub_loop RET cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: pshufb xm0, xm4, [idxq] add idxq, 16 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+r2 ], xm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: pshufb xm0, xm4, [idxq+16*0] pshufb xm1, xm4, [idxq+16*1] add idxq, 16*2 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r2 ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: pshufb m0, m4, 
[idxq+32*0] pshufb m1, m4, [idxq+32*1] add idxq, 32*2 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+r2 ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET ALIGN function_align .w32: pshufb m0, m4, [idxq+32*0] pshufb m1, m4, [idxq+32*1] pshufb m2, m4, [idxq+32*2] pshufb m3, m4, [idxq+32*3] add idxq, 32*4 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r2 ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET ALIGN function_align .w64: pshufb m0, m4, [idxq+32*0] pshufb m1, m4, [idxq+32*1] pshufb m2, m4, [idxq+32*2] pshufb m3, m4, [idxq+32*3] add idxq, 32*4 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64 RET %endif rav1e-0.7.1/src/x86/ipred_avx512.asm000064400000000000000000001467561046102023000150210ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
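; A short worked sketch of the fixed-point reciprocals used by the dc paths
; below (the 0x5556 / 0x3334 immediates): pmulhuw returns (a*b)>>16, and
; 0x5556 = ceil(2^16/3), 0x3334 = ceil(2^16/5), so once the power-of-two part
; of width+height has been removed with a shift, a single pmulhuw finishes the
; division by 3 or 5. For example, dividing by 12 (width+height of a 4x8 block):
;   1236 >> 2 = 309, then (309 * 0x5556) >> 16 = 103 = 1236 / 12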
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ; dav1d_filter_intra_taps[], reordered for VNNI: p1 p2 p3 p4, p6 p5 p0 __ filter_taps: db 10, 0, 0, 0, 2, 10, 0, 0, 1, 1, 10, 0, 1, 1, 2, 10 db 6, 0, 0, 0, 2, 6, 0, 0, 2, 2, 6, 0, 1, 2, 2, 6 db 0, 12, -6, 0, 0, 9, -5, 0, 0, 7, -3, 0, 0, 5, -3, 0 db 12, 2, -4, 0, 9, 2, -3, 0, 7, 2, -3, 0, 5, 3, -3, 0 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16, 0, 0, 0, 0, 16 db 0, 10,-10, 0, 0, 6, -6, 0, 0, 4, -4, 0, 0, 2, -2, 0 db 10, 0,-10, 0, 6, 0, -6, 0, 4, 0, -4, 0, 2, 0, -2, 0 db 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8, 0, 0, 0, 0, 8 db 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 4 db 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0, 0, 16, -8, 0 db 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0, 16, 0, -4, 0 db 8, 0, 0, 0, 3, 8, 0, 0, 2, 3, 8, 0, 1, 2, 3, 8 db 4, 0, 0, 0, 3, 4, 0, 0, 2, 3, 4, 0, 2, 2, 3, 4 db 0, 10, -2, 0, 0, 6, -1, 0, 0, 4, -1, 0, 0, 2, 0, 0 db 10, 3, -1, 0, 6, 4, -1, 0, 4, 4, -1, 0, 3, 3, -1, 0 db 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14, 0, 0, 0, 0, 14 db 12, 0, 0, 0, 1, 12, 0, 0, 0, 0, 12, 0, 0, 0, 1, 12 db 0, 14,-12, 0, 0, 12,-10, 0, 0, 11, -9, 0, 0, 10, -8, 0 db 14, 0,-10, 0, 12, 0, -9, 0, 11, 1, -8, 0, 9, 1, -7, 0 filter_perm: db 0, 1, 2, 3, 24, 25, 26, 27, 4, 5, 6, 7, 28, 29, 30, 31 db 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7, 3, 15, 11, 7,131 db 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23, 19, 31, 27, 23,147 db 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39, 35, 47, 43, 39,163 filter_end: dd 2, 3, 16, 17, -1, -1, 20, 21, 0, 6, 24, 30, 1, 7, 25, 31 smooth_shuf: db 7, 7, 7, 7, 0, 1, 0, 1, 3, 3, 3, 3, 8, 9, 8, 9 db 5, 5, 5, 5, 4, 5, 4, 5, 1, 1, 1, 1, 12, 13, 12, 13 db 6, 6, 6, 6, 2, 3, 2, 3, 2, 2, 2, 2, 10, 11, 10, 11 db 4, 4, 4, 4, 6, 7, 6, 7, 0, 0, 0, 0, 14, 15, 14, 15 smooth_endA: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95 db 97, 99,101,103,105,107,109,111,113,115,117,119,121,123,125,127 smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79 db 17, 19, 21, 23, 25, 27, 29, 31, 81, 83, 85, 87, 89, 91, 93, 95 db 33, 35, 37, 39, 41, 43, 45, 47, 97, 99,101,103,105,107,109,111 db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127 ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4 db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 pb_127_m127: times 2 db 127, -127 pb_128: times 4 db 128 pw_128: times 2 dw 128 pw_255: times 2 dw 255 %define pb_1 (ipred_h_shuf+24) %define pb_2 (ipred_h_shuf+20) %define pb_3 (ipred_h_shuf+16) %define pd_8 (filter_taps+128) %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table 
- 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_8bpc_avx512icl_table (ipred_dc_8bpc_avx512icl_table + 10*4) JMP_TABLE ipred_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64 JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64 SECTION .text INIT_ZMM avx512icl cglobal ipred_dc_top_8bpc, 3, 7, 5, dst, stride, tl, w, h lea r5, [ipred_dc_left_8bpc_avx512icl_table] movd xm0, wm tzcnt wd, wm inc tlq movifnidn hd, hm movu ym1, [tlq] movd xmm3, wd movsxd r6, [r5+wq*4] vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_left_8bpc_avx512icl_table] mov hd, hm tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movd xm0, hm movu ym1, [tlq] movd xmm3, r6d movsxd r6, [r5+r6*4] vpbroadcastd ym2, [r5-ipred_dc_left_8bpc_avx512icl_table+pb_1] psrld xm0, 1 vpdpbusd ym0, ym1, ym2 add r6, r5 add r5, ipred_dc_splat_8bpc_avx512icl_table-ipred_dc_left_8bpc_avx512icl_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu ym1, [tlq+32] ; unaligned when jumping here from dc_top vpdpbusd ym0, ym1, ym2 .h32: vextracti32x4 xm1, ym0, 1 paddd xm0, xm1 .h16: punpckhqdq xm1, xm0, xm0 paddd xm0, xm1 .h8: psrlq xm1, xm0, 32 paddd xm0, xm1 .h4: vpsrlvd xm0, xmm3 lea stride3q, [strideq*3] vpbroadcastb m0, xm0 jmp wq cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd xm0, r5d tzcnt r5d, r5d movd xmm4, r5d lea r5, [ipred_dc_8bpc_avx512icl_table] tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+5*4] vpbroadcastd ym3, [r5-ipred_dc_8bpc_avx512icl_table+pb_1] psrld xm0, 1 add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd xmm1, [tlq-4] vpdpbusd xm0, xmm1, xm3 jmp wq .w4: movd xmm1, [tlq+1] vpdpbusd xm0, xmm1, xm3 cmp hd, 4 jg .w4_mul psrlw xmm0, xm0, 3 jmp .w4_end .w4_mul: punpckhqdq xmm1, xm0, xm0 lea r2d, [hq*2] mov r6d, 0x55563334 paddd xmm1, xm0 shrx r6d, r6d, r2d psrlq xmm0, xmm1, 32 paddd xmm0, xmm1 movd xmm1, r6d psrld xmm0, 2 pmulhuw xmm0, xmm1 .w4_end: vpbroadcastb xm0, xmm0 .s4: movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm0 movd [dstq+strideq*2], xm0 movd [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET .h8: movq xmm1, [tlq-8] vpdpbusd xm0, xmm1, xm3 jmp wq .w8: movq xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, xm0 punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmove r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w8_end: vpbroadcastb xm0, xmm0 .s8: movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm0 movq [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET .h16: mova xmm1, [tlq-16] vpdpbusd xm0, xmm1, xm3 jmp wq .w16: movu xmm1, [tlq+1] vextracti32x4 xm2, ym0, 1 vpdpbusd xm0, xmm1, xm3 paddd xmm2, xm2, xm0 
punpckhqdq xmm0, xmm2, xmm2 paddd xmm0, xmm2 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hb, 8|32 cmovz r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w16_end: vpbroadcastb xm0, xmm0 .s16: mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm0 mova [dstq+strideq*2], xm0 mova [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET .h32: mova ym1, [tlq-32] vpdpbusd ym0, ym1, ym3 jmp wq .w32: movu ym1, [tlq+1] vpdpbusd ym0, ym1, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x33345556 shrx r6d, r6d, r2d movd xmm1, r6d pmulhuw xmm0, xmm1 .w32_end: vpbroadcastb ym0, xmm0 .s32: mova [dstq+strideq*0], ym0 mova [dstq+strideq*1], ym0 mova [dstq+strideq*2], ym0 mova [dstq+stride3q ], ym0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET .h64: mova ym1, [tlq-64] mova ym2, [tlq-32] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 jmp wq .w64: movu ym1, [tlq+ 1] movu ym2, [tlq+33] vpdpbusd ym0, ym1, ym3 vpdpbusd ym0, ym2, ym3 vextracti32x4 xm1, ym0, 1 paddd xmm1, xm1, xm0 punpckhqdq xmm0, xmm1, xmm1 paddd xmm0, xmm1 psrlq xmm1, xmm0, 32 paddd xmm0, xmm1 vpsrlvd xmm0, xmm4 cmp hd, 64 je .w64_end mov r6d, 0x33345556 shrx r6d, r6d, hd movd xmm1, r6d pmulhuw xmm0, xmm1 .w64_end: vpbroadcastb m0, xmm0 .s64: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s64 RET cglobal ipred_dc_128_8bpc, 2, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m0, [r5-ipred_dc_splat_8bpc_avx512icl_table+pb_128] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_v_8bpc, 3, 7, 5, dst, stride, tl, w, h, stride3 lea r5, [ipred_dc_splat_8bpc_avx512icl_table] tzcnt wd, wm movu m0, [tlq+1] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h, stride3 %define base r6-ipred_h_8bpc_avx512icl_table lea r6, [ipred_h_8bpc_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea stride3q, [strideq*3] sub tlq, hq add wq, r6 jmp wq .w4: mova xmm1, [base+ipred_h_shuf+16] .w4_loop: movd xmm0, [tlq+hq-4] pshufb xmm0, xmm1 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 pextrd [dstq+stride3q ], xmm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET .w8: movsldup xmm2, [base+ipred_h_shuf+16] movshdup xmm3, [base+ipred_h_shuf+16] .w8_loop: movd xmm1, [tlq+hq-4] pshufb xmm0, xmm1, xmm2 pshufb xmm1, xmm3 movq [dstq+strideq*0], xmm0 movq [dstq+strideq*1], xmm1 movhps [dstq+strideq*2], xmm0 movhps [dstq+stride3q ], xmm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: movsldup m1, [base+smooth_shuf] .w16_loop: vpbroadcastd m0, [tlq+hq-4] pshufb m0, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: vpbroadcastd ym3, [base+pb_1] vpord m2, m3, [base+pb_2] {1to16} .w32_loop: vpbroadcastd m1, [tlq+hq-4] pshufb m0, m1, m2 pshufb m1, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32_loop RET .w64: 
vpbroadcastd m4, [base+pb_3] vpbroadcastd m5, [base+pb_2] vpbroadcastd m6, [base+pb_1] pxor m7, m7 .w64_loop: vpbroadcastd m3, [tlq+hq-4] pshufb m0, m3, m4 pshufb m1, m3, m5 pshufb m2, m3, m6 pshufb m3, m7 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64_loop RET %macro PAETH 0 psubusb m1, m5, m4 psubusb m0, m4, m5 por m1, m0 ; tdiff pavgb m2, m6, m4 vpcmpub k1, m1, m7, 1 ; tdiff < ldiff vpblendmb m0{k1}, m4, m6 vpternlogd m4, m6, m8, 0x28 ; (m4 ^ m6) & m8 psubusb m3, m5, m2 psubb m2, m4 psubusb m2, m5 por m2, m3 pminub m1, m7 paddusb m2, m2 por m2, m4 ; min(tldiff, 255) vpcmpub k1, m2, m1, 1 ; tldiff < ldiff && tldiff < tdiff vmovdqu8 m0{k1}, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w, h, top, stride3 lea r6, [ipred_paeth_8bpc_avx512icl_table] tzcnt wd, wm vpbroadcastb m5, [tlq] ; topleft mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m8, [r6-ipred_paeth_8bpc_avx512icl_table+pb_1] lea topq, [tlq+1] sub tlq, hq add wq, r6 lea stride3q, [strideq*3] jmp wq INIT_YMM avx512icl .w4: vpbroadcastd m6, [topq] mova m9, [ipred_h_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 ; left PAETH movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 sub hd, 8 jl .w4_ret vextracti32x4 xm0, m0, 1 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm0, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .w4_ret: RET INIT_ZMM avx512icl .w8: vpbroadcastq m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: vpbroadcastq m4, [tlq+hq-8] pshufb m4, m9 PAETH vextracti32x4 xm1, m0, 2 vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 sub hd, 8 jl .w8_ret lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] jg .w8_loop .w8_ret: RET .w16: vbroadcasti32x4 m6, [topq] movsldup m9, [smooth_shuf] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: vpbroadcastd m4, [tlq+hq-4] pshufb m4, m9 PAETH mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m6, [topq] mova ym9, ym8 psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w32_loop: vpbroadcastd m4, [tlq+hq-2] pshufb m4, m9 PAETH mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m6, [topq] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w64_loop: vpbroadcastb m4, [tlq+hq-1] PAETH mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights, stride3 %define base r6-ipred_smooth_v_8bpc_avx512icl_table lea r6, [ipred_smooth_v_8bpc_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m0, [base+pb_127_m127] vpbroadcastd m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq vpbroadcastb m4, [tlq+hq] ; bottom add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m2, [tlq+1] movshdup m5, [smooth_shuf] mova ym6, [smooth_endA] 
punpcklbw m2, m4 ; top, bottom pmaddubsw m3, m2, m0 paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; 128 * top + 129 * bottom + 128 .w4_loop: vbroadcasti32x4 m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 add hq, 8 jg .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jl .w4_loop .ret: RET .w8: vpbroadcastq m2, [tlq+1] movshdup m5, [smooth_shuf] mova ym6, [smooth_endA] punpcklbw m2, m4 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 .w8_loop: vpbroadcastq m0, [weightsq+hq*2] pshufb m0, m5 pmaddubsw m0, m2, m0 paddw m0, m3 vpermb m0, m6, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] add hq, 4 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w16_loop: vpbroadcastq m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] add hq, 4 jl .w16_loop RET .w32: vbroadcasti32x8 m3, [tlq+1] movshdup m6, [smooth_shuf] mova m7, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w32_loop: vpbroadcastd m1, [weightsq+hq*2] pshufb m1, m6 pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m7, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] add hq, 2 jl .w32_loop RET .w64: movu m3, [tlq+1] mova m6, [smooth_endB] punpcklbw m2, m3, m4 punpckhbw m3, m4 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 .w64_loop: vpbroadcastw m1, [weightsq+hq*2] pmaddubsw m0, m2, m1 pmaddubsw m1, m3, m1 paddw m0, m4 paddw m1, m5 vpermt2b m0, m6, m1 mova [dstq], m0 add dstq, strideq inc hq jl .w64_loop RET cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl, w, h, stride3 %define base r5-ipred_smooth_h_8bpc_avx512icl_table lea r5, [ipred_smooth_h_8bpc_avx512icl_table] mov r6d, wd tzcnt wd, wd vpbroadcastb m4, [tlq+r6] ; right mov hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m5, [base+pb_127_m127] vpbroadcastd m6, [base+pw_128] sub tlq, hq add wq, r5 vpmovb2m k1, m6 lea stride3q, [strideq*3] jmp wq .w4: movsldup m3, [smooth_shuf] vpbroadcastq m7, [smooth_weights+4*2] mova ym8, [smooth_endA] .w4_loop: vpbroadcastq m0, [tlq+hq-8] mova m2, m4 vpshufb m2{k1}, m0, m3 ; left, right pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: 
movsldup m3, [smooth_shuf] vbroadcasti32x4 m7, [smooth_weights+8*2] mova ym8, [smooth_endA] .w8_loop: vpbroadcastd m0, [tlq+hq-4] mova m2, m4 vpshufb m2{k1}, m0, m3 pmaddubsw m0, m2, m5 pmaddubsw m1, m2, m7 paddw m2, m6 paddw m0, m2 paddw m0, m1 vpermb m0, m8, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: movsldup m7, [smooth_shuf] vbroadcasti32x4 m8, [smooth_weights+16*2] vbroadcasti32x4 m9, [smooth_weights+16*3] mova m10, [smooth_endB] .w16_loop: vpbroadcastd m0, [tlq+hq-4] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: mova m10, [smooth_endA] vpbroadcastd ym7, [pb_1] vbroadcasti32x8 m8, [smooth_weights+32*2] vbroadcasti32x8 m9, [smooth_weights+32*3] vshufi32x4 m10, m10, q3120 .w32_loop: vpbroadcastd m0, [tlq+hq-2] mova m3, m4 vpshufb m3{k1}, m0, m7 pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m8 pmaddubsw m1, m3, m9 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m10, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: mova m7, [smooth_weights+64*2] mova m8, [smooth_weights+64*3] mova m9, [smooth_endA] .w64_loop: mova m3, m4 vpbroadcastb m3{k1}, [tlq+hq-1] pmaddubsw m2, m3, m5 pmaddubsw m0, m3, m7 pmaddubsw m1, m3, m8 paddw m3, m6 paddw m2, m3 paddw m0, m2 paddw m1, m2 vpermt2b m0, m9, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3 %define base r5-ipred_smooth_8bpc_avx512icl_table lea r5, [ipred_smooth_8bpc_avx512icl_table] mov r6d, wd tzcnt wd, wd mov hd, hm vpbroadcastb m6, [tlq+r6] ; right sub tlq, hq movsxd wq, [r5+wq*4] vpbroadcastd m7, [base+pb_127_m127] vpbroadcastb m0, [tlq] ; bottom vpbroadcastd m1, [base+pw_255] add wq, r5 lea v_weightsq, [base+smooth_weights+hq*2] vpmovb2m k1, m1 lea stride3q, [strideq*3] jmp wq .w4: vpbroadcastd m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vpbroadcastq m9, [smooth_weights+4*2] mova ym11, [smooth_endA] punpcklbw m8, m0 ; top, bottom pmaddubsw m10, m8, m7 paddw m1, m8 ; 1 * top + 256 * bottom + 255 paddw m10, m1 ; 128 * top + 129 * bottom + 255 .w4_loop: vpbroadcastq m1, [tlq+hq-8] vbroadcasti32x4 m0, [v_weightsq] add v_weightsq, 16 mova m2, m6 vpshufb m2{k1}, m1, m4 ; left, right pmaddubsw m1, m2, m7 ; 127 * left - 127 * right pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 ; 128 * left + 129 * right pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movd [dstq+strideq*0], xm0 movd [dstq+strideq*1], xm1 pextrd [dstq+strideq*2], xm0, 2 pextrd [dstq+stride3q ], xm1, 2 sub hd, 8 jl .ret lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 1 pextrd [dstq+strideq*1], xm1, 1 pextrd [dstq+strideq*2], xm0, 3 pextrd [dstq+stride3q ], xm1, 3 lea dstq, [dstq+strideq*4] jg .w4_loop .ret: RET .w8: vpbroadcastq m8, [tlq+hq+1] movsldup m4, [smooth_shuf] movshdup m5, [smooth_shuf] vbroadcasti32x4 m9, [smooth_weights+8*2] mova ym11, [smooth_endA] punpcklbw m8, m0 pmaddubsw m10, m8, m7 paddw m1, m8 paddw m10, m1 .w8_loop: vpbroadcastd 
m1, [tlq+hq-4] vpbroadcastq m0, [v_weightsq] add v_weightsq, 8 mova m2, m6 vpshufb m2{k1}, m1, m4 pmaddubsw m1, m2, m7 pshufb m0, m5 pmaddubsw m0, m8, m0 paddw m1, m2 pmaddubsw m2, m9 paddw m0, m10 paddw m1, m2 pavgw m0, m1 vpermb m0, m11, m0 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8_loop RET .w16: vbroadcasti32x4 m9, [tlq+hq+1] movsldup m5, [smooth_shuf] movshdup m10, [smooth_shuf] vbroadcasti32x4 m11, [smooth_weights+16*2] vbroadcasti32x4 m12, [smooth_weights+16*3] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w16_loop: vpbroadcastd m0, [tlq+hq-4] vpbroadcastq m1, [v_weightsq] add v_weightsq, 8 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: vbroadcasti32x8 m9, [tlq+hq+1] movshdup m10, [smooth_shuf] mova m12, [smooth_weights+32*2] vpbroadcastd ym5, [pb_1] mova m15, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m13, m8, m7 pmaddubsw m14, m9, m7 vshufi32x4 m11, m12, m12, q2020 vshufi32x4 m12, m12, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m13, m0 paddw m14, m1 .w32_loop: vpbroadcastd m0, [tlq+hq-2] vpbroadcastd m1, [v_weightsq] add v_weightsq, 4 mova m4, m6 vpshufb m4{k1}, m0, m5 pmaddubsw m2, m4, m7 pshufb m1, m10 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m11 pmaddubsw m4, m12 paddw m0, m13 paddw m1, m14 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m15, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: movu m9, [tlq+hq+1] mova m11, [smooth_weights+64*2] mova m2, [smooth_weights+64*3] mova m14, [smooth_endB] punpcklbw m8, m9, m0 punpckhbw m9, m0 pmaddubsw m12, m8, m7 pmaddubsw m13, m9, m7 vshufi32x4 m10, m11, m2, q2020 vshufi32x4 m11, m2, q3131 paddw m0, m1, m8 paddw m1, m9 paddw m12, m0 paddw m13, m1 .w64_loop: mova m4, m6 vpbroadcastb m4{k1}, [tlq+hq-1] vpbroadcastw m1, [v_weightsq] add v_weightsq, 2 pmaddubsw m2, m4, m7 pmaddubsw m0, m8, m1 pmaddubsw m1, m9, m1 paddw m2, m4 pmaddubsw m3, m4, m10 pmaddubsw m4, m11 paddw m0, m12 paddw m1, m13 paddw m3, m2 paddw m4, m2 pavgw m0, m3 pavgw m1, m4 vpermt2b m0, m14, m1 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3 lea r6, [pal_pred_8bpc_avx512icl_table] tzcnt wd, wm vbroadcasti32x4 m4, [palq] movifnidn hd, hm movsxd wq, [r6+wq*4] packuswb m4, m4 add wq, r6 lea stride3q, [strideq*3] jmp wq .w4: pshufb xmm0, xm4, [idxq] add idxq, 16 movd [dstq+strideq*0], xmm0 pextrd [dstq+strideq*1], xmm0, 1 pextrd [dstq+strideq*2], xmm0, 2 pextrd [dstq+stride3q ], xmm0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: pshufb xmm0, xm4, [idxq+16*0] pshufb xmm1, xm4, [idxq+16*1] add idxq, 16*2 movq [dstq+strideq*0], xmm0 movhps [dstq+strideq*1], xmm0 movq [dstq+strideq*2], xmm1 movhps [dstq+stride3q ], xmm1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET .w16: pshufb 
m0, m4, [idxq] add idxq, 64 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET .w32: pshufb m0, m4, [idxq+64*0] pshufb m1, m4, [idxq+64*1] add idxq, 64*2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w32 RET .w64: pshufb m0, m4, [idxq+64*0] pshufb m1, m4, [idxq+64*1] pshufb m2, m4, [idxq+64*2] pshufb m3, m4, [idxq+64*3] add idxq, 64*4 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w64 RET ; The ipred_filter code processes 4x2 blocks in the following order ; which increases parallelism compared to doing things row by row. ; Some redundant blocks are calculated for w > 4. ; w4 w8 w16 w32 ; 1 1 2 1 2 3 4 1 2 3 4 9 a b c ; 2 2 3 2 3 4 5 2 3 4 5 a b c d ; 3 3 4 3 4 5 6 3 4 5 6 b c d e ; 4 4 5 4 5 6 7 4 5 6 7 c d e f ; 5 5 6 5 6 7 8 5 6 7 8 d e f g ; 6 6 7 6 7 8 9 6 7 8 9 e f g h ; 7 7 8 7 8 9 a 7 8 9 a f g h i ; ___ 8 ___ 8 9 ___ 8 9 a b ___ 8 9 a b g h i j ___ ; 9 9 a b h i j ; a b i j ; b j cglobal ipred_filter_8bpc, 4, 7, 14, dst, stride, tl, w, h, flt %define base r6-filter_taps lea r6, [filter_taps] %ifidn fltd, fltm movzx fltd, fltb %else movzx fltd, byte fltm %endif vpbroadcastd xmm2, [tlq+1] ; t0 t0 t0 t0 movifnidn hd, hm shl fltd, 6 vpbroadcastd m6, [base+pd_8] vpbroadcastd xmm3, [tlq-2] ; l1 l0 tl __ vbroadcasti32x4 m7, [r6+fltq+16*0] ; p1 p2 p3 p4 vbroadcasti32x4 m8, [r6+fltq+16*1] vbroadcasti32x4 m9, [r6+fltq+16*2] ; p6 p5 p0 __ vbroadcasti32x4 m10, [r6+fltq+16*3] mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 cmp wd, 8 jb .w4 vpbroadcastd ym2, [tlq+5] mova m11, [base+filter_perm] mov r5, 0xffffffffffff000f psrldq xmm2, 1 ; __ t0 kmovq k1, r5 ; 0x000f psraw xm5, xmm0, 4 packuswb xmm2, xm5 ; __ t0 a0 b0 pshufd ym2{k1}, ymm2, q3333 ; b0 b0 b0 b0 t1 t1 t1 t1 je .w8 kxnorb k3, k3, k3 ; 0x00ff vpbroadcastd xm3, [tlq-4] kandnq k2, k3, k1 ; 0xffffffffffff0000 vpermb ym3{k2}, ym11, ymm2 ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 pshufb ym5{k2}, ym2, ym11 ; a0 b0 __ t0 vpbroadcastd m2, [tlq+9] vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 vpbroadcastd xm3, [tlq-6] ; l5 l4 l3 __ kunpckbw k4, k1, k3 ; 0x0fff packssdw ym0, ym1 psraw ym0, 4 ; a0 d0 a1 b1 packuswb ym5, ym0 ; a0 b0 c0 d0 __ t1 a1 b1 pshufd m2{k3}, m5, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 t2 t2 t2 t2 vpermb m3{k2}, m11, m5 ; l5 l4 l3 __ d3 c3 b3 __ b7 a7 t7 __ mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 psrldq m0, m2, 1 ; __ d0 __ b0 __ t0 vpbroadcastd m2, [tlq+13] vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 mova m12, [base+filter_end] lea r5d, [hq-6] mov r6, dstq cmovp hd, r5d ; w == 16 ? 
h : h - 6 packssdw m4, m1 psraw m4, 4 ; e0 f0 c1 d1 a2 b2 packuswb m0, m4 ; __ d0 e0 f0 __ b1 c1 d1 __ t2 a2 b2 pshufd m2{k4}, m0, q3333 ; f0 f0 f0 f0 d1 d1 d1 d1 b2 b2 b2 b2 t3 t3 t3 t3 .w16_loop: vpbroadcastd xm3, [tlq-8] vpermb m3{k2}, m11, m0 ; l7 l6 l5 __ f3 e3 d3 __ d7 c7 b7 __ bb ab tb __ mova m1, m6 vpdpbusd m1, m2, m7 mova m0, m6 vpdpbusd m0, m2, m8 sub tlq, 2 vpdpbusd m1, m3, m9 vpdpbusd m0, m3, m10 packssdw m1, m0 mova m0, m4 psraw m4, m1, 4 ; g0 h0 e1 f1 c2 d2 a3 b3 packuswb m0, m4 ; e0 f0 g0 h0 c1 d1 e1 f1 a2 b2 c2 d2 __ __ a3 b3 pshufd m2, m0, q3333 ; h0 h0 h0 h0 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 vpermt2d m5, m12, m0 ; c0 d0 e0 f0 __ __ c1 d1 a0 a1 a2 a3 b0 b1 b2 b3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop cmp wd, 16 je .ret mova xm13, [filter_perm+16] mova xmm3, [r6+strideq*0] punpckhdq xmm3, [r6+strideq*1] vpbroadcastd m2{k1}, [tlq+r5+17] ; t4 t4 t4 t4 f1 f1 f1 f1 d2 d2 d2 d2 b3 b3 b3 b3 pinsrb xm3, xmm3, [tlq+r5+16], 7 pshufb xm3, xm13 vpermb m3{k2}, m11, m0 ; bf af tf __ h3 g3 f3 __ f7 e7 d7 __ db cb bb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckbw k5, k3, k1 ; 0xff0f lea r3, [strideq*3] vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; a4 b4 g1 h1 e2 f2 c3 d3 packuswb m4, m0 ; g0 h0 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpblendmb m1{k3}, m4, m2 ; __ t4 a4 b4 e1 f1 g1 h1 c2 d2 e2 f2 __ __ c3 d3 vpbroadcastd ym2, [tlq+r5+21] pshufd m2{k5}, m4, q3333 ; b4 b4 b4 b4 t5 t5 t5 t5 f2 f2 f2 f2 d3 d3 d3 d3 vpermt2d m5, m12, m4 ; e0 f0 g0 h0 __ __ e1 f1 c0 c1 c2 c3 d0 d1 d2 d3 vextracti32x4 [dstq+strideq*0], m5, 2 vextracti32x4 [dstq+strideq*1], m5, 3 punpckhqdq xmm3, [r6+r3] pinsrb xmm3, [r6+strideq*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; df cf bf __ bj aj tj __ h7 g7 f7 __ fb eb db __ mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kxnord k3, k3, k4 ; 0xfffff0ff lea r4, [strideq*5] vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 psraw m4, 4 ; c4 d4 a5 b5 g2 h2 e3 f3 packuswb m0, m4 ; a4 b4 c4 d4 g1 h1 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpblendmw m1{k3}, m2, m0 ; a4 b4 c4 d4 __ t5 a5 b5 e2 f2 g2 h2 __ __ e3 f3 vpbroadcastd m2, [tlq+r5+25] pshufd m2{k3}, m0, q3333 ; d4 d4 d4 d4 b5 b5 b5 b5 t6 t6 t6 t6 f3 f3 f3 f3 vpermt2d m5, m12, m0 ; g0 h0 a4 b4 __ __ g1 h1 e0 e1 e2 e3 f0 f1 f2 f3 vextracti32x4 [dstq+strideq*2], m5, 2 vextracti32x4 [dstq+r3 ], m5, 3 punpckhqdq xmm3, [r6+r4] pinsrb xmm3, [r6+strideq*4+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; ff ef df __ dj cj bj __ bn an tn __ hb hb fb __ mova m0, m6 vpdpbusd m0, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 kunpckwd k1, k1, k2 ; 0x000f0000 vpdpbusd m0, m3, m9 vpdpbusd m1, m3, m10 packssdw m0, m1 psraw m0, 4 ; e4 f4 c5 d5 a6 b6 g3 h3 packuswb m4, m0 ; c4 d4 e4 f4 a5 b5 c5 d5 g2 h2 a6 b6 __ __ g3 h3 vpblendmw m1{k1}, m4, m2 ; c4 d4 e4 f4 a5 b5 c5 d5 __ t6 a6 b6 __ __ g3 h3 vpbroadcastd m2, [tlq+r5+29] pshufd m2{k4}, m4, q3333 ; f4 f4 f4 f4 d5 d5 d5 d5 b6 b6 b6 b6 t7 t7 t7 t7 vpermt2d m5, m12, m4 ; a4 b4 c4 d4 __ __ a5 b5 g0 g1 g2 g3 h0 h1 h2 h3 vextracti32x4 [dstq+strideq*4], m5, 2 vextracti32x4 [dstq+r4 ], m5, 3 lea r0, [strideq+r3*2] .w32_loop: punpckhqdq xmm3, [r6+r0] pinsrb xmm3, [r6+r3*2+15], 11 pshufb xm3, xmm3, xm13 vpermb m3{k2}, m11, m1 ; hf gf ff __ fj ej dj __ dn cn bn __ br ar tr __ .w32_loop_tail: mova m4, m6 vpdpbusd m4, m2, m7 mova m1, m6 vpdpbusd m1, m2, m8 vpdpbusd m4, m3, m9 vpdpbusd m1, m3, m10 packssdw m4, m1 mova m1, m0 psraw 
m0, m4, 4 ; g4 h4 e5 f5 c6 d6 a7 b7 packuswb m1, m0 ; e4 f4 g4 h4 c5 d5 e5 f5 a6 b6 c6 d6 __ __ a7 b7 pshufd m2, m1, q3333 ; h4 h4 h4 h4 f5 f5 f5 f5 d6 d6 d6 d6 b7 b7 b7 b7 vpermt2d m5, m12, m1 ; c4 d4 e4 f4 __ __ c5 d5 a4 a5 a6 a7 b4 b5 b6 b7 vextracti32x4 [r6+strideq*0+16], m5, 2 vextracti32x4 [r6+strideq*1+16], m5, 3 lea r6, [r6+strideq*2] sub r5d, 2 jg .w32_loop vpermb m3, m11, m1 cmp r5d, -6 jg .w32_loop_tail .ret: RET .w8: vpermb ym3, ym11, ymm2 .w8_loop: vpbroadcastd ym3{k1}, [tlq-4] ; l3 l2 l1 __ b3 a3 t3 __ mova ym0, ym6 vpdpbusd ym0, ym2, ym7 mova ym1, ym6 vpdpbusd ym1, ym2, ym8 sub tlq, 2 vpdpbusd ym0, ym3, ym9 vpdpbusd ym1, ym3, ym10 mova ym3, ym5 packssdw ym0, ym1 psraw ym5, ym0, 4 ; c0 d0 a1 b1 packuswb ym3, ym5 ; a0 b0 c0 d0 __ __ a1 b1 pshufd ym2, ym3, q3333 ; d0 d0 d0 d0 b1 b1 b1 b1 vpermb ym3, ym11, ym3 ; a0 a1 b0 b1 movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET .w4_loop: vpbroadcastd xmm3, [tlq-4] ; l3 l2 l1 __ mova xmm0, xm6 vpdpbusd xmm0, xmm2, xm7 mova xmm1, xm6 vpdpbusd xmm1, xmm2, xm8 sub tlq, 2 vpdpbusd xmm0, xmm3, xm9 vpdpbusd xmm1, xmm3, xm10 packssdw xmm0, xmm1 .w4: psraw xmm0, 4 ; a0 b0 packuswb xmm0, xmm0 movd [dstq+strideq*0], xmm0 pshufd xmm2, xmm0, q1111 ; b0 b0 b0 b0 movd [dstq+strideq*1], xmm2 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/ipred_sse.asm000064400000000000000000005567661046102023000145730ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
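; A small numeric sketch of the smooth_weights layout documented alongside the
; table and the SMOOTH macro below: each weight w is stored as the signed byte
; pair (w-128, 127-w), so pmaddubsw against the unsigned pixel pair (a, b)
; produces (w-128)*a + (127-w)*b, and adding the precomputed 128*a + 129*b bias
; (plus 128 for rounding before the final >>8) reconstructs w*a + (256-w)*b.
; With w=149, a=200, b=100 (arbitrary example values):
;   21*200 + (-22)*100 = 2000,  128*200 + 129*100 = 38500,  sum = 40500
;   = 149*200 + 107*100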
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 %macro SMOOTH_WEIGHT_TABLE 1-* %rep %0 db %1-128, 127-%1 %rotate 1 %endrep %endmacro ; sm_weights[], but modified to precalculate x and 256-x with offsets to ; enable efficient use of pmaddubsw (which requires signed values) smooth_weights: SMOOTH_WEIGHT_TABLE \ 0, 0, 255, 128, 255, 149, 85, 64, \ 255, 197, 146, 105, 73, 50, 37, 32, \ 255, 225, 196, 170, 145, 123, 102, 84, \ 68, 54, 43, 33, 26, 20, 17, 16, \ 255, 240, 225, 210, 196, 182, 169, 157, \ 145, 133, 122, 111, 101, 92, 83, 74, \ 66, 59, 52, 45, 39, 34, 29, 25, \ 21, 17, 14, 12, 10, 9, 8, 8, \ 255, 248, 240, 233, 225, 218, 210, 203, \ 196, 189, 182, 176, 169, 163, 156, 150, \ 144, 138, 133, 127, 121, 116, 111, 106, \ 101, 96, 91, 86, 82, 77, 73, 69, \ 65, 61, 57, 54, 50, 47, 44, 41, \ 38, 35, 32, 29, 27, 25, 22, 20, \ 18, 16, 15, 13, 12, 10, 9, 8, \ 7, 6, 6, 5, 5, 4, 4, 4 ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0 ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8 z_transpose4: db 8, 12, 0, 4, 9, 13, 1, 5, 10, 14, 2, 6, 11, 15, 3, 7 z3_shuf: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 z3_shuf_h4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8 filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1 filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1 z_filter_wh4: db 7, 7, 19, 7, z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39 pd_32768: dd 32768 z3_filter_k_tail: db 64, 0, 64, 0, 64, 0, 56, 8 z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 z3_base_inc: dw 7*64, 6*64, 5*64, 4*64, 3*64, 2*64, 1*64, 0*64 z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1 z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15 db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3 z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0 z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11 z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 pw_m1to4: dw -1, -2, -3, -4 z_filter_k: times 4 db 0, 16 times 4 db 0, 20 times 4 db 8, 16 times 4 db 32, 16 times 4 db 24, 20 times 4 db 16, 16 times 4 db 0, 0 times 4 db 0, 0 pw_8: times 8 db 8, 0 pb_3: times 16 db 3 pb_16: times 16 db 16 pw_62: times 8 dw 62 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_512: times 8 dw 512 pw_m256: times 8 dw -256 pb_2: times 8 db 2 pb_4: times 8 db 4 pb_8: times 8 db 8 pb_128: times 8 db 128 pb_m16: times 8 db -16 pw_128: times 4 dw 128 pw_255: times 4 dw 255 pb_36_m4: times 4 db 36, -4 pb_127_m127: times 4 db 127, -127 %macro JMP_TABLE 3-* %xdefine %1_%2_table (%%table - 2*4) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%base %+ .%3 - (%%table - 2*4) %rotate 1 %endrep %endmacro %define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4) %define ipred_cfl_splat_ssse3_table 
(ipred_cfl_ssse3_table + 8*4) JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4 JMP_TABLE ipred_dc_left, ssse3, h4, h8, h16, h32, h64 JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ s4-8*4, s8-8*4, s16-8*4, s32-8*4 JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32 JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32 cextern dr_intra_derivative cextern filter_intra_taps SECTION .text ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8 pshuflw m1, m0, %3 ; extend 8 byte for 2 pos punpcklqdq m1, m1 mova [dstq + %2], m1 %if %1 > 16 mova [dstq + 16 + %2], m1 %endif %if %1 > 32 mova [dstq + 32 + %2], m1 mova [dstq + 48 + %2], m1 %endif %endmacro %macro IPRED_H 1 ; width sub tlq, 4 movd m0, [tlq] ; get 4 bytes of topleft data punpcklbw m0, m0 ; extend 2 byte %if %1 == 4 pshuflw m1, m0, q2233 movd [dstq+strideq*0], m1 psrlq m1, 32 movd [dstq+strideq*1], m1 pshuflw m0, m0, q0011 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+stride3q ], m0 %elif %1 == 8 punpcklwd m0, m0 punpckhdq m1, m0, m0 punpckldq m0, m0 movq [dstq+strideq*1], m1 movhps [dstq+strideq*0], m1 movq [dstq+stride3q ], m0 movhps [dstq+strideq*2], m0 %else IPRED_SET %1, 0, q3333 IPRED_SET %1, strideq, q2222 IPRED_SET %1, strideq*2, q1111 IPRED_SET %1, stride3q, q0000 %endif lea dstq, [dstq+strideq*4] sub hd, 4 jg .w%1 RET %endmacro INIT_XMM ssse3 cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h, stride3 LEA r5, ipred_h_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq .w4: IPRED_H 4 .w8: IPRED_H 8 .w16: IPRED_H 16 .w32: IPRED_H 32 .w64: IPRED_H 64 ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_v_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movu m0, [tlq+ 1] movu m1, [tlq+17] movu m2, [tlq+33] movu m3, [tlq+49] movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 movifnidn hd, hm movifnidn wd, wm tzcnt r6d, hd lea r5d, [wq+hq] movd m4, r5d tzcnt r5d, r5d movd m5, r5d LEA r5, 
ipred_dc_ssse3_table tzcnt wd, wd movsxd r6, [r5+r6*4] movsxd wq, [r5+wq*4+20] pcmpeqd m3, m3 psrlw m4, 1 ; dc = (width + height) >> 1; add r6, r5 add wq, r5 lea stride3q, [strideq*3] jmp r6 .h4: movd m0, [tlq-4] pmaddubsw m0, m3 jmp wq .w4: movd m1, [tlq+1] pmaddubsw m1, m3 psubw m0, m4 paddw m0, m1 pmaddwd m0, m3 cmp hd, 4 jg .w4_mul psrlw m0, 3 ; dc >>= ctz(width + height); jmp .w4_end .w4_mul: punpckhqdq m1, m0, m0 paddw m0, m1 psrlq m1, m0, 32 paddw m0, m1 psrlw m0, 2 mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8 cmovz r6d, r2d movd m5, r6d pmulhuw m0, m5 .w4_end: pxor m1, m1 pshufb m0, m1 .s4: movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m0 movd [dstq+strideq*2], m0 movd [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s4 RET ALIGN function_align .h8: movq m0, [tlq-8] pmaddubsw m0, m3 jmp wq .w8: movq m1, [tlq+1] pmaddubsw m1, m3 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w8_end: pxor m1, m1 pshufb m0, m1 .s8: movq [dstq+strideq*0], m0 movq [dstq+strideq*1], m0 movq [dstq+strideq*2], m0 movq [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s8 RET ALIGN function_align .h16: mova m0, [tlq-16] pmaddubsw m0, m3 jmp wq .w16: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w16_end: pxor m1, m1 pshufb m0, m1 .s16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m0 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s16 RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 mova m2, [tlq-16] pmaddubsw m2, m3 paddw m0, m2 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 movu m2, [tlq+17] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x5556 mov r2d, 0x3334 test hd, 64|16 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w32_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 .s32: mova [dstq], m0 mova [dstq+16], m1 mova [dstq+strideq], m0 mova [dstq+strideq+16], m1 mova [dstq+strideq*2], m0 mova [dstq+strideq*2+16], m1 mova [dstq+stride3q], m0 mova [dstq+stride3q+16], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .s32 RET ALIGN function_align .h64: mova m0, [tlq-64] mova m1, [tlq-48] pmaddubsw m0, m3 pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-32] pmaddubsw m1, m3 paddw m0, m1 mova m1, [tlq-16] pmaddubsw m1, m3 paddw m0, m1 jmp wq .w64: movu m1, [tlq+ 1] movu m2, [tlq+17] pmaddubsw m1, m3 pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+33] pmaddubsw m2, m3 paddw m1, m2 movu m2, [tlq+49] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 64 je .w64_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w64_end: pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 .s64: mova [dstq], m0 mova [dstq+16], m1 mova [dstq+32], m2 mova [dstq+48], m3 mova [dstq+strideq], m0 mova [dstq+strideq+16], m1 mova [dstq+strideq+32], m2 mova [dstq+strideq+48], m3 lea dstq, [dstq+strideq*2] 
sub hd, 2 jg .s64 RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_left_ssse3_table mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, r6d psrld m3, m2 movsxd r6, [r5+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 .h64: movu m1, [tlq+48] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 movu m1, [tlq+32] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h32: movu m1, [tlq+16] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h16: pshufd m1, m0, q3232 ; psrlq m1, m0, 16 paddw m0, m1 .h8: pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 .h4: pmaddwd m0, m2 pmulhrsw m0, m3 lea stride3q, [strideq*3] pxor m1, m1 pshufb m0, m1 mova m1, m0 mova m2, m0 mova m3, m0 jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_128_8bpc, 2, 7, 6, dst, stride, tl, w, h, stride3 LEA r5, ipred_dc_splat_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] movddup m0, [r5-ipred_dc_splat_ssse3_table+pb_128] mova m1, m0 mova m2, m0 mova m3, m0 add wq, r5 lea stride3q, [strideq*3] jmp wq ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_dc_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_dc_top_8bpc, 3, 7, 6, dst, stride, tl, w, h LEA r5, ipred_dc_left_ssse3_table tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm movd m3, [r5-ipred_dc_left_ssse3_table+pd_32768] movd m2, wd psrld m3, m2 movsxd r6, [r5+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, r5 add r5, ipred_dc_splat_ssse3_table-ipred_dc_left_ssse3_table movsxd wq, [r5+wq*4] add wq, r5 jmp r6 ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2] ; w * a = (w - 128) * a + 128 * a ; (256 - w) * b = (127 - w) * b + 129 * b ; => w * a + (256 - w) * b = [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b] pmaddubsw m6, m%3, m%1 pmaddubsw m0, m%4, m%2 ; (w - 128) * a + (127 - w) * b paddw m6, m%5 paddw m0, m%6 ; [(w - 128) * a + (127 - w) * b] + [128 * a + 129 * b + 128] psrlw m6, 8 psrlw m0, 8 packuswb m6, m0 %endmacro cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl, w, h, weights %define base r6-ipred_smooth_v_ssse3_table LEA r6, ipred_smooth_v_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] 
movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] lea weightsq, [base+smooth_weights+hq*4] neg hq movd m5, [tlq+hq] pxor m2, m2 pshufb m5, m2 add wq, r6 jmp wq .w4: movd m2, [tlq+1] punpckldq m2, m2 punpcklbw m2, m5 ; top, bottom lea r3, [strideq*3] mova m4, [base+ipred_v_shuf] mova m5, m4 punpckldq m4, m4 punpckhdq m5, m5 pmaddubsw m3, m2, m0 ; m3: 127 * top - 127 * bottom paddw m1, m2 ; m1: 1 * top + 256 * bottom + 128, overflow is ok paddw m3, m1 ; m3: 128 * top + 129 * bottom + 128 .w4_loop: movu m1, [weightsq+hq*2] pshufb m0, m1, m4 ;m2, m3, m4 and m5 should be stable in loop pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movd [dstq+strideq*0], m6 pshuflw m1, m6, q1032 movd [dstq+strideq*1], m1 punpckhqdq m6, m6 movd [dstq+strideq*2], m6 psrlq m6, 32 movd [dstq+r3 ], m6 lea dstq, [dstq+strideq*4] add hq, 4 jl .w4_loop RET ALIGN function_align .w8: movq m2, [tlq+1] punpcklbw m2, m5 mova m5, [base+ipred_v_shuf] lea r3, [strideq*3] pshufd m4, m5, q0000 pshufd m5, m5, q1111 pmaddubsw m3, m2, m0 paddw m1, m2 paddw m3, m1 ; m3 is output for loop .w8_loop: movq m1, [weightsq+hq*2] pshufb m0, m1, m4 pshufb m1, m5 SMOOTH 0, 1, 2, 2, 3, 3 movq [dstq+strideq*0], m6 movhps [dstq+strideq*1], m6 lea dstq, [dstq+strideq*2] add hq, 2 jl .w8_loop RET ALIGN function_align .w16: movu m3, [tlq+1] punpcklbw m2, m3, m5 punpckhbw m3, m5 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 ; m4 and m5 is output for loop .w16_loop: movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add dstq, strideq add hq, 1 jl .w16_loop RET ALIGN function_align .w32: %if WIN64 movaps [rsp+24], xmm7 %define xmm_regs_used 8 %endif mova m7, m5 .w32_loop_init: mov r3d, 2 .w32_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w32_loop lea dstq, [dstq-32+strideq] sub tlq, 32 add hq, 1 jl .w32_loop_init RET ALIGN function_align .w64: %if WIN64 movaps [rsp+24], xmm7 %define xmm_regs_used 8 %endif mova m7, m5 .w64_loop_init: mov r3d, 4 .w64_loop: movddup m0, [base+pb_127_m127] movddup m1, [base+pw_128] movu m3, [tlq+1] punpcklbw m2, m3, m7 punpckhbw m3, m7 pmaddubsw m4, m2, m0 pmaddubsw m5, m3, m0 paddw m0, m1, m2 paddw m1, m3 paddw m4, m0 paddw m5, m1 movd m1, [weightsq+hq*2] pshuflw m1, m1, q0000 punpcklqdq m1, m1 SMOOTH 1, 1, 2, 3, 4, 5 mova [dstq], m6 add tlq, 16 add dstq, 16 dec r3d jg .w64_loop lea dstq, [dstq-64+strideq] sub tlq, 64 add hq, 1 jl .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl, w, h %define base r6-ipred_smooth_h_ssse3_table LEA r6, ipred_smooth_h_ssse3_table mov wd, wm movd m3, [tlq+wq] pxor m1, m1 pshufb m3, m1 ; right tzcnt wd, wd mov hd, hm movsxd wq, [r6+wq*4] movddup m4, [base+pb_127_m127] movddup m5, [base+pw_128] add wq, r6 jmp wq .w4: movddup m6, [base+smooth_weights+4*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq lea r3, [strideq*3] .w4_loop: movd m2, 
[tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m6, [base+smooth_weights+8*2] mova m7, [base+ipred_h_shuf] sub tlq, 4 sub tlq, hq punpckldq m7, m7 .w8_loop: movd m2, [tlq+hq] ; left pshufb m2, m7 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m6 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m6, [base+smooth_weights+16*2] mova m7, [base+smooth_weights+16*3] sub tlq, 1 sub tlq, hq .w16_loop: pxor m1, m1 movd m2, [tlq+hq] ; left pshufb m2, m1 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m6 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: sub tlq, 1 sub tlq, hq pxor m6, m6 .w32_loop_init: mov r5, 2 lea r3, [base+smooth_weights+16*4] .w32_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w32_loop lea dstq, [dstq-32+strideq] sub hd, 1 jg .w32_loop_init RET ALIGN function_align .w64: sub tlq, 1 sub tlq, hq pxor m6, m6 .w64_loop_init: mov r5, 4 lea r3, [base+smooth_weights+16*8] .w64_loop: mova m7, [r3] add r3, 16 movd m2, [tlq+hq] ; left pshufb m2, m6 punpcklbw m1, m2, m3 ; left, right punpckhbw m2, m3 pmaddubsw m0, m1, m4 ; 127 * left - 127 * right paddw m0, m1 ; 128 * left + 129 * right pmaddubsw m1, m7 paddw m1, m5 paddw m0, m1 pmaddubsw m1, m2, m4 paddw m1, m2 mova m7, [r3] add r3, 16 pmaddubsw m2, m7 paddw m2, m5 paddw m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq], m0 add dstq, 16 dec r5 jg .w64_loop lea dstq, [dstq-64+strideq] sub hd, 1 jg .w64_loop_init RET ;--------------------------------------------------------------------------------------- ;int dav1d_ipred_smooth_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int a); ;--------------------------------------------------------------------------------------- %macro SMOOTH_2D_END 7 ; src[1-2], mul[1-2], add[1-2], m3 pmaddubsw m6, m%3, m%1 mova m0, m6 pmaddubsw m6, m%4, m%2 mova m1, m6 %ifnum %5 paddw m0, m%5 %else paddw m0, %5 %endif %ifnum %6 paddw m1, m%6 %else paddw m1, %6 %endif %ifnum %7 %else mova m3, %7 %endif pavgw m0, m2 pavgw m1, m3 psrlw 
m0, 8 psrlw m1, 8 packuswb m0, m1 %endmacro %macro SMOOTH_OUTPUT_16B 12 ; m1, [buffer1, buffer2, buffer3, buffer4,] [w1, w2,] m3, m7, [m0, m4, m5] mova m1, [rsp+16*%1] ; top punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pmaddubsw m2, m1, m5 mova [rsp+16*%2], m1 paddw m1, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m1 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%3], m2 pmaddubsw m2, m6, m5 mova [rsp+16*%4], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*%5], m2 movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m3, m2 pmaddubsw m0, m1, %6 ; weights_hor = &dav1d_sm_weights[width]; pmaddubsw m1, %7 paddw m2, m3, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; mova m7, [rsp+16*%9] pshufb m1, m7 mova [rsp+16*%8], m3 mova m4, [rsp+16*%2] mova m5, [rsp+16*%3] mova m3, [rsp+16*%4] mova m7, [rsp+16*%5] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*%8] mova [dstq], m0 movddup m3, [base+pw_255] ; recovery mova m0, [rsp+16*%10] ; recovery mova m4, [rsp+16*%11] ; recovery mova m5, [rsp+16*%12] ; recovery %endmacro cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, stride, tl, w, h, v_weights %define base r6-ipred_smooth_ssse3_table mov wd, wm mov hd, hm LEA r6, ipred_smooth_ssse3_table movd m4, [tlq+wq] ; right pxor m2, m2 pshufb m4, m2 tzcnt wd, wd mov r5, tlq sub r5, hq movsxd wq, [r6+wq*4] movddup m5, [base+pb_127_m127] movd m0, [r5] pshufb m0, m2 ; bottom movddup m3, [base+pw_255] add wq, r6 lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height] jmp wq .w4: mova m7, [base+ipred_v_shuf] movd m1, [tlq+1] ; left pshufd m1, m1, q0000 sub tlq, 4 lea r3, [strideq*3] sub tlq, hq punpcklbw m1, m0 ; top, bottom pshufd m6, m7, q1100 pshufd m7, m7, q3322 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*0], m1 mova [rsp+16*1], m2 movq m1, [base+smooth_weights+4*2] ; weights_hor = &dav1d_sm_weights[width]; punpcklqdq m1, m1 mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w4_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] punpcklbw m0, m1, m4 ; left, right punpckhbw m1, m4 pmaddubsw m2, m0, m5 ; 127 * left - 127 * right pmaddubsw m3, m1, m5 paddw m2, m0 ; 128 * left + 129 * right paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movq m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 8 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r3 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: mova m7, [base+ipred_v_shuf] movq m1, [tlq+1] ; left punpcklqdq m1, m1 sub tlq, 4 sub tlq, hq punpcklbw m1, m0 pshufd m6, m7, q0000 pshufd m7, m7, q1111 pmaddubsw m2, m1, m5 paddw m3, m1 paddw m2, m3 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova m1, [base+smooth_weights+8*2] ; weights_hor = &dav1d_sm_weights[width]; mova [rsp+16*2], m1 mova [rsp+16*3], m4 mova [rsp+16*4], m6 mova [rsp+16*5], m5 .w8_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+ipred_h_shuf] pshufd m1, m1, q1100 
punpcklbw m0, m1, m4 punpckhbw m1, m4 pmaddubsw m2, m0, m5 pmaddubsw m3, m1, m5 paddw m2, m0 paddw m3, m1 mova m4, [rsp+16*2] pmaddubsw m0, m4 pmaddubsw m1, m4 paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 4 pshufb m0, m1, m6 pshufb m1, m7 mova m4, [rsp+16*0] mova m5, [rsp+16*1] SMOOTH_2D_END 0, 1, 4, 4, 5, 5, 3 mova m4, [rsp+16*3] mova m6, [rsp+16*4] mova m5, [rsp+16*5] movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: mova m7, [base+ipred_v_shuf] movu m1, [tlq+1] ; left sub tlq, 4 sub tlq, hq punpckhbw m6, m1, m0 ; top, bottom punpcklbw m1, m0 ; top, bottom pshufd m7, m7, q0000 mova [rsp+16*2], m7 pmaddubsw m2, m6, m5 mova [rsp+16*5], m6 paddw m6, m3 ; 1 * top + 255 * bottom + 255 paddw m2, m6 ; 128 * top + 129 * bottom + 255 mova [rsp+16*6], m2 pmaddubsw m2, m1, m5 paddw m3, m1 ; 1 * top + 255 * bottom + 255 mova [rsp+16*0], m1 paddw m2, m3 ; 128 * top + 129 * bottom + 255 mova [rsp+16*1], m2 mova [rsp+16*3], m4 mova [rsp+16*4], m5 .w16_loop: movd m1, [tlq+hq] ; left pshufb m1, [base+pb_3] ; topleft[-(1 + y)] punpcklbw m1, m4 ; left, right pmaddubsw m2, m1, m5 ; 127 * left - 127 * right paddw m2, m1 ; 128 * left + 129 * right mova m0, m1 mova m3, m2 pmaddubsw m0, [base+smooth_weights+16*2] ; weights_hor = &dav1d_sm_weights[width]; pmaddubsw m1, [base+smooth_weights+16*3] paddw m2, m0 paddw m3, m1 movd m1, [v_weightsq] ; weights_ver = &dav1d_sm_weights[height]; add v_weightsq, 2 mova m7, [rsp+16*2] pshufb m1, m7 mova [rsp+16*7], m3 mova m4, [rsp+16*0] mova m5, [rsp+16*1] mova m3, [rsp+16*5] mova m7, [rsp+16*6] SMOOTH_2D_END 1, 1, 4, 3, 5, 7, [rsp+16*7] mova m4, [rsp+16*3] mova m5, [rsp+16*4] mova [dstq], m0 lea dstq, [dstq+strideq] sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w32_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*4], [base+smooth_weights+16*5], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*6], [base+smooth_weights+16*7], 10, 2, 3, 4, 5 lea dstq, [dstq-16+strideq] add v_weightsq, 2 sub hd, 1 jg .w32_loop RET ALIGN function_align .w64: movu m1, [tlq+1] ; top topleft[1 + x] movu m2, [tlq+17] ; top mova [rsp+16*0], m1 mova [rsp+16*1], m2 movu m1, [tlq+33] ; top movu m2, [tlq+49] ; top mova [rsp+16*11], m1 mova [rsp+16*12], m2 sub tlq, 4 sub tlq, hq mova m7, [base+ipred_v_shuf] pshufd m7, m7, q0000 mova [rsp+16*2], m7 mova [rsp+16*3], m0 mova [rsp+16*4], m4 mova [rsp+16*5], m5 .w64_loop: SMOOTH_OUTPUT_16B 0, 6, 7, 8, 9, [base+smooth_weights+16*8], [base+smooth_weights+16*9], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 1, 6, 7, 8, 9, [base+smooth_weights+16*10], [base+smooth_weights+16*11], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 11, 6, 7, 8, 9, [base+smooth_weights+16*12], [base+smooth_weights+16*13], 10, 2, 3, 4, 5 add dstq, 16 SMOOTH_OUTPUT_16B 12, 6, 7, 8, 9, [base+smooth_weights+16*14], [base+smooth_weights+16*15], 10, 2, 3, 4, 5 lea dstq, [dstq-48+strideq] add v_weightsq, 2 sub hd, 1 jg .w64_loop RET %if ARCH_X86_64 cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] %else cglobal 
ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define strideq r3 %define stridemp dword [rsp+16*12] mov stridemp, r1 LEA r1, $$ %endif tzcnt wd, wm movifnidn angled, anglem movifnidn hd, hm inc tlq movsxd wq, [base+ipred_z1_ssse3_table+wq*4] mov dxd, angled and dxd, 0x7e add angled, 165 ; ~90 lea wq, [base+wq+ipred_z1_ssse3_table] movzx dxd, word [base+dr_intra_derivative+dxq] xor angled, 0x4ff ; d = 90 - angle jmp wq .w4: lea r3d, [angleq+88] test r3d, 0x480 jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r3d, 9 add r3d, hd cmp r3d, 8 jg .w4_no_upsample ; h > 8 || (w == h && is_sm) mova m1, [tlq-1] pshufb m0, m1, [base+z_upsample1] pshufb m1, [base+z_upsample2] movddup m2, [base+pb_36_m4] add dxd, dxd pmaddubsw m0, m2 pshufd m7, m1, q3333 movd [rsp+16], m7 ; top[max_base_x] pmaddubsw m1, m2 movd m6, dxd mov r5d, dxd ; xpos pshufb m6, [base+pw_256] paddw m1, m0 movq m0, [tlq] pmulhrsw m1, m10 paddw m7, m6, m6 punpcklqdq m6, m7 ; xpos0 xpos1 packuswb m1, m1 punpcklbw m0, m1 movifnidn strideq, stridemp mova [rsp], m0 .w4_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movq m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movhps m0, [rsp+r2] pand m2, m8, m6 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 paddw m6, m7 ; xpos += dx pmulhrsw m0, m10 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_upsample_loop RET .w4_no_upsample: mov r3d, 7 ; max_base test angled, 0x400 ; !enable_intra_edge_filter jnz .w4_main lea r3d, [hq+3] movd m0, r3d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r3d, 7 test r5d, r5d jz .w4_main ; filter_strength == 0 mova m3, [tlq-1] imul r5d, 0x55555555 movu m7, [base+z_filter_s+8] shr r5d, 30 ; filter_strength movddup m0, [base+pb_8] pminub m7, m0 pshufb m0, m3, [base+z_filter_s] movddup m4, [base+z_filter_k-8+r5*8+24*0] pshufb m3, m7 movddup m5, [base+z_filter_k-8+r5*8+24*1] shufps m2, m0, m3, q2121 movddup m6, [base+z_filter_k-8+r5*8+24*2] pmaddubsw m0, m4 pmaddubsw m1, m2, m4 pmaddubsw m2, m5 paddd m5, m6 pmaddubsw m4, m3, m5 pmaddubsw m3, m6 paddw m0, m2 paddw m1, m4 paddw m0, m3 pshufd m1, m1, q3333 pmulhrsw m0, m10 pmulhrsw m1, m10 mov r5d, 9 mov tlq, rsp cmp hd, 4 cmovne r3d, r5d packuswb m0, m1 mova [tlq], m0 .w4_main: add tlq, r3 movd m5, dxd movddup m0, [base+z_base_inc] ; base_inc << 6 movd m7, [tlq] ; top[max_base_x] shl r3d, 6 movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd ; xpos pshufb m7, [base+pw_m256] sub r5, r3 pshufb m4, [base+pw_256] mova m3, [base+z1_shuf_w4] paddw m6, m5, m5 psubw m4, m0 ; max_base_x punpcklqdq m5, m6 ; xpos0 xpos1 .w4_loop: lea r3, [r5+dxq] sar r5, 6 ; base0 movq m0, [tlq+r5] lea r5, [r3+dxq] sar r3, 6 ; base1 movhps m0, [tlq+r3] pand m2, m8, m5 ; frac psubw m1, m9, m2 ; 64-frac psllw m2, 8 pshufb m0, m3 por m1, m2 ; 64-frac, frac pmaddubsw m0, m1 movifnidn strideq, stridemp pcmpgtw m1, m4, m5 ; base < max_base_x pmulhrsw m0, m10 paddw m5, m6 ; xpos += dx pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movd [dstq+strideq*0], m0 pshuflw m0, m0, q1032 movd [dstq+strideq*1], m0 sub hd, 2 jz .w4_end lea dstq, [dstq+strideq*2] test r5d, r5d jl .w4_loop packuswb m7, m7 .w4_end_loop: movd [dstq+strideq*0], m7 movd 
[dstq+strideq*1], m7 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_end_loop .w4_end: RET .w8: lea r3d, [angleq+88] and r3d, ~0x7f or r3d, hd cmp r3d, 8 ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8 mova m5, [base+z_upsample1] movu m3, [base+z_filter_s+6] movd m4, hd mova m0, [tlq-1] movu m1, [tlq+7] pxor m7, m7 pshufb m4, m7 movddup m7, [base+pb_36_m4] pminub m4, m3 add dxd, dxd pshufb m2, m0, m5 pmaddubsw m2, m7 pshufb m0, m3 pmaddubsw m0, m7 movd m6, dxd pshufb m3, m1, m5 pmaddubsw m3, m7 pshufb m1, m4 pmaddubsw m1, m7 pshufb m6, [base+pw_256] mov r5d, dxd paddw m2, m0 paddw m7, m6, m6 paddw m3, m1 punpcklqdq m6, m7 ; xpos0 xpos1 movu m1, [tlq] pmulhrsw m2, m10 pmulhrsw m3, m10 packuswb m2, m3 punpcklbw m0, m1, m2 punpckhbw m1, m2 movifnidn strideq, stridemp mova [rsp+16*0], m0 mova [rsp+16*1], m1 .w8_upsample_loop: lea r2d, [r5+dxq] shr r5d, 6 ; base0 movu m0, [rsp+r5] lea r5d, [r2+dxq] shr r2d, 6 ; base1 movu m1, [rsp+r2] pand m2, m8, m6 psubw m3, m9, m2 psllw m2, 8 por m3, m2 punpcklqdq m2, m3, m3 ; frac0 pmaddubsw m0, m2 punpckhqdq m3, m3 ; frac1 pmaddubsw m1, m3 paddw m6, m7 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_upsample_loop RET .w8_no_upsample: lea r3d, [hq+7] movd m0, r3d and r3d, 7 or r3d, 8 ; imin(h+7, 15) test angled, 0x400 jnz .w8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .w8_main ; filter_strength == 0 movd m3, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x55555555 movu m1, [tlq+16*1] shr r5d, 30 ; filter_strength movd m2, [tlq+r3] lea tlq, [rsp+16*4] sub r5, 3 mova [tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m3, m7 pshufb m2, m7 mova [tlq-16*2], m3 movq [tlq+r3-15], m2 call .filter_edge sar r5d, 1 add r5d, 17 cmp hd, 8 cmova r3d, r5d .w8_main: add tlq, r3 movd m5, dxd movd m7, [tlq] shl r3d, 6 movu m3, [base+z_filter_s+2] movd m4, r3d pshufb m5, [base+pw_256] mov r5d, dxd pshufb m7, [base+pw_m256] sub r5, r3 pshufb m4, [base+pw_256] psubw m4, [base+z_base_inc] mova m6, m5 .w8_loop: mov r3, r5 sar r3, 6 movu m0, [tlq+r3] pand m1, m8, m5 psubw m2, m9, m1 psllw m1, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [dstq], m0 dec hd jz .w8_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w8_loop packuswb m7, m7 .w8_end_loop: movq [dstq], m7 add dstq, strideq dec hd jg .w8_end_loop .w8_end: RET .w16: lea r3d, [hq+15] movd m0, r3d and r3d, 15 or r3d, 16 ; imin(h+15, 31) test angled, 0x400 jnz .w16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m0, m2 pcmpgtb m0, m3 pmovmskb r5d, m0 test r5d, r5d jz .w16_main ; filter_strength == 0 movd m4, [tlq-1] movu m0, [tlq+16*0] imul r5d, 0x24924924 movu m1, [tlq+16*1] shr r5d, 30 movd m2, [tlq+30] adc r5, -4 ; filter_strength-3 movd m3, [tlq+r3] lea tlq, [rsp+16*4] mova [tlq-16*1], m0 pxor m7, m7 mova [tlq+16*0], m1 pshufb m4, m7 movd [rsp], m2 pshufb m3, m7 mova [tlq-16*2], m4 movd [tlq+r3-16], m3 call .filter_edge cmp hd, 16 jle .w16_main pshuflw m0, [rsp], q0000 sar r5, 1 movd m1, [base+z_filter_k_tail+4+r5*4] lea r3d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, 
m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+32], m0 .w16_main: add tlq, r3 movd m5, dxd movd m7, [tlq] movd m4, r3d shl r3d, 6 pshufb m5, [base+pw_256] pxor m6, m6 pshufb m7, m6 mov r5d, dxd pshufb m4, m6 sub r5, r3 psubb m4, [base+pb_0to15] mova m6, m5 .w16_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+0] pand m0, m8, m5 movu m2, [tlq+r3+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m3, m5, 6 packsswb m3, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 paddw m5, m6 pcmpgtb m2, m4, m3 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq], m0 dec hd jz .w16_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w16_loop .w16_end_loop: mova [dstq], m7 add dstq, strideq dec hd jg .w16_end_loop .w16_end: RET .w32: lea r3d, [hq+31] and r3d, 31 or r3d, 32 ; imin(h+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .w32_main movd m6, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] movd m4, [tlq+62] movd m5, [tlq+r3] lea tlq, [rsp+16*6] mova [tlq-16*3], m0 pxor m7, m7 mova [tlq-16*2], m1 pshufb m6, m7 mova [tlq-16*1], m2 xor r5d, r5d ; filter_strength = 3 mova [tlq+16*0], m3 movd [rsp], m4 pshufb m5, m7 mova [tlq-16*4], m6 movd [tlq+r3-48], m5 call .filter_edge sub tlq, 16*2 call .filter_edge cmp hd, 32 jle .w32_main pshuflw m0, [rsp], q0000 movd m1, [base+z_filter_k_tail+4] add r3d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq+64], m0 .w32_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, m6 mov r5d, dxd pshufb m0, m6 pshufb m5, [base+pw_256] sub r5, r3 pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, [base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 mova m6, m5 .w32_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*1], m0 dec hd jz .w32_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w32_loop .w32_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 add dstq, strideq dec hd jg .w32_end_loop .w32_end: RET .w64: lea r3d, [hq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .w64_main movd m4, [tlq-1] movu m0, [tlq+16*0] movu m1, [tlq+16*1] movu m2, [tlq+16*2] movu m3, [tlq+16*3] mova [rsp+16*3], m0 pxor m7, m7 mova [rsp+16*4], m1 pshufb m4, m7 mova [rsp+16*5], m2 mova [rsp+16*6], m3 mova [rsp+16*2], m4 movu m0, [tlq+16*4] movu m1, [tlq+16*5] movu m2, [tlq+16*6] movu m3, [tlq+16*7] movd m4, [tlq+r3] lea tlq, [rsp+16*10] mova [tlq-16*3], m0 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*2], m1 pshufb m4, m7 mova [tlq-16*1], m2 mova [tlq+16*0], m3 movd [tlq+r3-16*7], m4 cmp hd, 64 jl .w64_filter96 ; skip one call if the last 32 bytes aren't used call .filter_edge .w64_filter96: sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge sub tlq, 16*2 call .filter_edge .w64_main: add tlq, r3 movd m0, r3d movd m7, [tlq] shl r3d, 6 movd m5, dxd pxor m6, 
m6 mov r5d, dxd pshufb m0, m6 sub r5, r3 pshufb m5, [base+pw_256] pshufb m7, m6 psubb m0, [base+pb_0to15] movddup m1, [base+pb_m16] mova [rsp+16*0], m0 paddb m0, m1 mova [rsp+16*1], m0 paddb m0, m1 mova [rsp+16*2], m0 paddb m0, m1 mova [rsp+16*3], m0 mova m6, m5 .w64_loop: mov r3, r5 sar r3, 6 movu m1, [tlq+r3+16*0+0] pand m0, m8, m5 movu m2, [tlq+r3+16*0+1] psubw m3, m9, m0 psllw m0, 8 por m3, m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 psrlw m4, m5, 6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m4, m4 pcmpgtb m2, [rsp+16*0], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*1+0] movu m2, [tlq+r3+16*1+1] mova [dstq+16*0], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*1], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*2+0] movu m2, [tlq+r3+16*2+1] mova [dstq+16*1], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*2], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 movu m1, [tlq+r3+16*3+0] movu m2, [tlq+r3+16*3+1] mova [dstq+16*2], m0 punpcklbw m0, m1, m2 pmaddubsw m0, m3 punpckhbw m1, m2 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 pcmpgtb m2, [rsp+16*3], m4 packuswb m0, m1 pand m0, m2 pandn m2, m7 por m0, m2 mova [dstq+16*3], m0 dec hd jz .w64_end movifnidn strideq, stridemp add dstq, strideq add r5, dxq jl .w64_loop .w64_end_loop: mova [dstq+16*0], m7 mova [dstq+16*1], m7 mova [dstq+16*2], m7 mova [dstq+16*3], m7 add dstq, strideq dec hd jg .w64_end_loop .w64_end: RET ALIGN function_align .filter_edge: ; 32 pixels/iteration movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m2, [tlq-18] movu m1, [tlq-17] movu m3, [tlq- 2] movu m4, [tlq- 1] punpcklbw m0, m2, m1 pmaddubsw m0, m7 punpckhbw m2, m1 pmaddubsw m2, m7 punpcklbw m1, m3, m4 pmaddubsw m1, m7 punpckhbw m3, m4 pmaddubsw m3, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m5, [tlq-16] movu m6, [tlq-15] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 mova m5, [tlq+ 0] movu m6, [tlq+ 1] punpcklbw m4, m5, m6 pmaddubsw m4, m7 punpckhbw m5, m6 pmaddubsw m5, m7 paddw m1, m4 paddw m3, m5 test r5d, r5d jnz .filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m5, [tlq-14] movu m6, [tlq+ 2] punpcklbw m4, m5, m5 pmaddubsw m4, m7 punpckhbw m5, m5 pmaddubsw m5, m7 paddw m0, m4 paddw m2, m5 punpcklbw m5, m6, m6 pmaddubsw m5, m7 punpckhbw m6, m6 pmaddubsw m6, m7 paddw m1, m5 paddw m3, m6 .filter_end: %if ARCH_X86_64 REPX {pmulhrsw x, m10}, m0, m2, m1, m3 %else mova m4, m10 REPX {pmulhrsw x, m4 }, m0, m2, m1, m3 %endif packuswb m0, m2 packuswb m1, m3 mova [tlq+16*0], m0 mova [tlq+16*1], m1 ret %if ARCH_X86_64 cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy %define base r7-$$ %define maxwm r6m %define maxhm r7m lea r7, [$$] mov hd, hm mova m8, [base+pw_62] mova m9, [base+pw_64] lea r9d, [wq-4] mova m10, [base+pw_512] shl r9d, 6 mova m11, [base+z1_shuf_w4] or r9d, hd mova m12, [base+z2_h_shuf] %else cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define m11 [rsp+16*16] %define m12 [rsp+16*17] %define r9b byte [rsp+16*18+4*0] %define r9d dword [rsp+16*18+4*0] %define r10d dword [rsp+16*18+4*1] %define r11d dword [rsp+16*18+4*2] %define maxwm [rsp+16*18+4*3] %define maxhm 
[rsp+16*19+4*0] %define stridemp [rsp+16*19+4*1] %define strideq r3 %define dyd r4 %define dyq r4 mov stridemp, r1 mov r1d, r6m mov r4d, r7m mov maxwm, r1d mov maxhm, r4d LEA r1, $$ lea hd, [wq-4] mova m0, [base+z1_shuf_w4] shl hd, 6 mova m1, [base+z2_h_shuf] or hd, hm mova m11, m0 mov r9d, hd mova m12, m1 %endif tzcnt wd, wd movifnidn angled, anglem movsxd wq, [base+ipred_z2_ssse3_table+wq*4] %if ARCH_X86_64 movzx dxd, angleb %else movzx dxd, byte anglem %endif xor angled, 0x400 mova m0, [tlq-16*4] mov dyd, dxd mova m1, [tlq-16*3] neg dxq mova m2, [tlq-16*2] and dyd, ~1 mova m3, [tlq-16*1] and dxq, ~1 movd m4, [tlq] movu m5, [tlq+16*0+1] movu m6, [tlq+16*1+1] movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle mova [rsp+16*2], m0 pxor m7, m7 mova [rsp+16*3], m1 pshufb m4, m7 mova [rsp+16*4], m2 lea wq, [base+ipred_z2_ssse3_table+wq] mova [rsp+16*5], m3 neg dxd mova [rsp+16*6], m4 or dyd, 4<<16 mova [rsp+16*7], m4 mova [rsp+16*8], m5 mova [rsp+16*9], m6 movq m0, [base+z_base_inc+2] movsldup m1, [base+z2_dy_offset] movq m2, [base+pw_256] ; 4<<6 movq [rsp+16*14+8*0], m0 movq [rsp+16*15+8*0], m1 movq [rsp+16*15+8*1], m2 %if ARCH_X86_64 lea r10d, [dxq+(128<<6)] ; xpos %else mov [rsp+16*7+4*1], dyd lea r4d, [dxq+(128<<6)] mov r10d, r4d movzx hd, r9b %endif mov r11d, (128-4)<<6 jmp wq .w4: test angled, 0x400 jnz .w4_main movd m5, [tlq+4] lea r3d, [hq+2] add angled, 1022 pshufb m5, m7 shl r3d, 6 movd [rsp+16*8+4], m5 test r3d, angled jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) call .upsample_above sub angled, 1075 ; angle - 53 lea r3d, [hq+3] xor angled, 0x7f ; 180 - angle movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, [base+z_filter_wh4] pand m6, m0 pcmpgtb m6, [base+z_filter_t_w48+angleq*8] jmp .w8_filter_left .upsample_above: ; w4/w8 movq m3, [rsp+gprsize+16*8-2] movq m1, [rsp+gprsize+16*8-1] movq m0, [rsp+gprsize+16*8+0] movq m4, [rsp+gprsize+16*8+1] movddup m5, [base+pb_36_m4] punpcklbw m1, m3 punpcklbw m2, m0, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 %if ARCH_X86_64 mova m11, [base+pb_0to15] lea r10d, [r10+dxq+(1<<6)] mov r11d, (128-7)<<6 %else mova m3, [base+pb_0to15] mov r3d, [rsp+gprsize+16*18+4*1] mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 lea r3d, [r3+dxq+(1<<6)] mov [rsp+gprsize+16*18+4*1], r3d mova [rsp+gprsize+16*16], m3 %endif add dxd, dxd paddw m1, m2 pmulhrsw m1, m10 movq m2, [rsp+gprsize+16*14] paddw m2, m2 movq [rsp+gprsize+16*14], m2 packuswb m1, m1 punpcklbw m1, m0 mova [rsp+gprsize+16*8], m1 ret .w4_no_upsample_above: lea r3d, [hq+3] mov [rsp], angled sub angled, 1112 ; angle - 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh4] mova m4, [base+z_filter_t_w48+angleq*8] call .w8_filter_top mov angled, [rsp] lea r3d, [hq+2] sub angled, 139 shl r3d, 6 test r3d, angled jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) .upsample_left: ; w4/w8 neg hq movd m0, [tlq+hq] pshufb m0, m7 movd [rsp+16*6+hq-4], m0 movq m3, [rsp+16*5+7] movq m0, [rsp+16*5+8] movq m2, [rsp+16*5+9] movq m4, [rsp+16*5+10] movddup m5, [base+pb_36_m4] punpcklbw m1, m0, m3 punpcklbw m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 movshdup m3, [base+z2_dy_offset] %if ARCH_X86_64 mova m12, [base+z2_upsample] add dyd, dyd %else mova m4, [base+z2_upsample] shl dword [rsp+16*7+4*1], 1 mova m12, m4 %endif paddw m1, m2 pmulhrsw m1, m10 movq [rsp+16*15], m3 packuswb m1, m1 punpcklbw m0, 
m1 mova [rsp+16*5], m0 .w4_main: movd m6, dxd %if ARCH_X86_64 movd m3, dyd %else movd m3, [rsp+16*7+4*1] %endif movddup m0, [rsp+16*14+8*0] pshufb m6, [base+pw_256] paddw m7, m6, m6 movq m5, [base+pw_m1to4] pshuflw m4, m3, q0000 punpcklqdq m6, m7 pmullw m4, m5 pshuflw m3, m3, q1111 paddw m6, m0 mov r2d, r10d pshuflw m0, m4, q3333 psubw m4, [rsp+16*15] movq [rsp+16*6+8*1], m3 movq [rsp+8*1], m0 ; dy*4 mov r5, dstq .w4_loop0: mova [rsp+16*12], m6 movq [rsp+8*0], m4 pand m0, m4, m8 psraw m4, 6 psubw m1, m9, m0 psllw m0, 8 por m0, m1 ; 64-frac_y, frac_y movq [rsp+8*3], m0 pabsw m4, m4 movq [rsp+8*2], m4 movzx hd, r9b .w4_loop: lea r3d, [r2+dxq] shr r2d, 6 ; base_x0 movq m0, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x1 movhps m0, [rsp+r3] lea r3d, [r2+dxq] shr r2d, 6 ; base_x2 movq m1, [rsp+r2] lea r2d, [r3+dxq] shr r3d, 6 ; base_x3 movhps m1, [rsp+r3] pand m2, m8, m6 paddsw m5, m6, m7 psubw m3, m9, m2 psllw m2, 8 pshufb m0, m11 por m2, m3 pmaddubsw m0, m2 pand m2, m8, m5 psubw m3, m9, m2 psllw m2, 8 pshufb m1, m11 por m2, m3 pmaddubsw m1, m2 cmp r3d, 127 ; topleft jge .w4_toponly movzx r3d, byte [rsp+8*2+0] ; base_y0 movq m3, [rsp+r3] movzx r3d, byte [rsp+8*2+2] ; base_y1 movhps m3, [rsp+r3] movzx r3d, byte [rsp+8*2+4] ; base_y2 movq m4, [rsp+r3] movzx r3d, byte [rsp+8*2+6] ; base_y3 movhps m4, [rsp+r3] pshufb m3, m12 pshufb m4, m12 punpckldq m2, m3, m4 punpckhdq m3, m4 movddup m4, [rsp+8*3] pmaddubsw m2, m4 pmaddubsw m3, m4 psraw m6, 15 ; base_x < topleft pand m2, m6 pandn m6, m0 por m0, m2, m6 psraw m6, m5, 15 pand m3, m6 pandn m6, m1 por m1, m3, m6 .w4_toponly: pmulhrsw m0, m10 pmulhrsw m1, m10 movifnidn strideq, stridemp packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 sub hd, 4 jz .w4_end movq m4, [rsp+8*2] movq m3, [rsp+16*6+8*1] paddw m6, m5, m7 ; xpos += dx psubw m4, m3 movq [rsp+8*2], m4 lea dstq, [dstq+strideq*2] cmp r2d, r11d jge .w4_loop movddup m5, [rsp+8*3] .w4_leftonly_loop: movzx r2d, byte [rsp+8*2+0] ; base_y0 movq m1, [rsp+r2] movzx r2d, byte [rsp+8*2+2] ; base_y1 movhps m1, [rsp+r2] movzx r2d, byte [rsp+8*2+4] ; base_y2 movq m2, [rsp+r2] movzx r2d, byte [rsp+8*2+6] ; base_y3 movhps m2, [rsp+r2] psubw m4, m3 pshufb m1, m12 pshufb m2, m12 movq [rsp+8*2], m4 punpckldq m0, m1, m2 punpckhdq m1, m2 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] punpckhqdq m0, m0 movd [dstq+strideq*0], m0 psrlq m0, 32 movd [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 4 jg .w4_leftonly_loop .w4_end: sub r9d, 1<<8 jl .w4_ret movq m4, [rsp+8*1] add r5, 4 mov dstq, r5 paddw m4, [rsp+8*0] ; base_y += 4*dy movzx r2d, word [rsp+16*15+8*1] movddup m6, [rsp+16*15+8*1] paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) add r2d, r10d mov r10d, r2d jmp .w4_loop0 .w4_ret: RET .w8: test angled, 0x400 jnz .w4_main movd m5, [tlq+8] lea r3d, [angleq+126] pshufb m5, m7 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif movd [rsp+16*8+8], m5 cmp r3d, 8 ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm call .upsample_above sub angled, 53 lea r3d, [hq+7] xor angled, 0x7f ; 180 - angle movu m1, [base+z_filter_wh8] movd m0, r3d movd m6, angled shr angled, 8 ; is_sm << 1 psrldq m2, [base+z_filter_t_w48+angleq*8], 4 pshufb m0, m7 pshufb m6, m7 pcmpeqb m0, m1 pand m6, m0 pcmpgtb m6, m2 %if 
ARCH_X86_64 movq [rsp+16*15+8*1], m10 ; 8<<6 %else movq m0, m10 movq [rsp+16*15+8*1], m0 %endif jmp .w8_filter_left .w8_no_upsample_above: lea r3d, [hq+7] mov [rsp], angled sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movu m3, [base+z_filter_wh8] psrldq m4, [base+z_filter_t_w48+angleq*8], 4 call .w8_filter_top mov r3d, [rsp] sub r3d, 141 %if ARCH_X86_64 mov r3b, hb %else xor r3b, r3b or r3d, hd %endif cmp r3d, 8 jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm .w8_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x55555555 mov r3, tlq shr r5d, 30 sub r5, 3 ; filter_strength-3 jmp .filter_left .w8_filter_top: movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 pcmpeqb m0, m3 pand m1, m0 pand m6, m0 pcmpgtb m1, m4 pcmpgtb m6, m4 pmovmskb r5d, m1 test r5d, r5d jz .w8_filter_top_end ; filter_strength == 0 imul r5d, 0x55555555 movq m0, [rsp+gprsize+16*8-2] shr r5d, 30 movq m1, [rsp+gprsize+16*8-1] sub r5, 3 ; filter_strength-3 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] punpcklbw m0, m1 pmaddubsw m0, m7 movq m1, [rsp+gprsize+16*8+0] movq m2, [rsp+gprsize+16*8+1] movddup m7, [base+z_filter_k+8*2+r5*8+24*1] punpcklbw m1, m2 pmaddubsw m1, m7 movq m2, [rsp+gprsize+16*8+2] movddup m7, [base+z_filter_k+8*2+r5*8+24*2] punpcklbw m2, m2 pmaddubsw m2, m7 paddw m0, m1 paddw m0, m2 %if ARCH_X86_64 mov r3d, r7m ; maxw, offset due to call %else mov r3d, [rsp+gprsize+16*18+4*3] %endif pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 movq [rsp+gprsize+16*8], m0 cmp r3d, 8 jge .w8_filter_top_end movq m0, [tlq+r3+1] movq [rsp+gprsize+r3+16*8], m0 .w8_filter_top_end: ret .w16: test angled, 0x400 jnz .w4_main lea r3d, [hq+15] sub angled, 90 movd m0, r3d mov r3d, 90 movd m1, angled sub r3d, angled ; 180 - angle shr angled, 8 ; is_sm << 1 movd m6, r3d REPX {pshufb x, m7}, m0, m1, m6 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m0, [base+z_filter_wh16] pand m1, m0 pand m6, m0 pcmpgtb m1, m3 pcmpgtb m6, m3 pmovmskb r5d, m1 mov r3, tlq test r5d, r5d jz .w16_filter_left ; filter_strength == 0 imul r5d, 0x24924924 pshufb m5, [base+z_filter_t_w16] ; tlq[16] shr r5d, 30 adc r5, -4 ; filter_strength-3 movd [rsp+16*9], m5 movddup m7, [base+z_filter_k+8*2+r5*8+24*0] movu m1, [rsp+16*8-2] movu m2, [rsp+16*8-1] punpcklbw m0, m1, m2 pmaddubsw m0, m7 punpckhbw m1, m2 pmaddubsw m1, m7 movddup m7, [base+z_filter_k+8*2+r5*8+24*1] mova m3, [rsp+16*8+0] movu m4, [rsp+16*8+1] punpcklbw m2, m3, m4 pmaddubsw m2, m7 punpckhbw m3, m4 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 test r5d, r5d jnz .w16_filter_end ; 3-tap movddup m7, [base+z_filter_k+8*8] movu m3, [rsp+16*8+2] punpcklbw m2, m3, m3 pmaddubsw m2, m7 punpckhbw m3, m3 pmaddubsw m3, m7 paddw m0, m2 paddw m1, m3 .w16_filter_end: mov r2d, maxwm pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*8], m0 cmp r2d, 16 jge .w16_filter_left movu m0, [r3+r2+1] movu [rsp+r2+16*8], m0 .w16_filter_left: pmovmskb r5d, m6 test r5d, r5d jz .w4_main imul r5d, 0x24924924 shr r5d, 30 adc r5, -4 ; filter_strength-3 jmp .filter_left .w32: test angled, 0x400 jnz .w4_main pshufb m6, [base+z_filter_t_w16] ; tlq[32] mov r3, tlq lea tlq, [rsp+16*9] movd [tlq+16*1], m6 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mov r2d, maxwm mova [rsp+16*8], m0 mova [rsp+16*9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, [r3+r2+16*1+1] movu [rsp+r2+16*8], m0 movu [rsp+r2+16*9], m1 jmp .filter_left .w64: movu m0, 
[tlq+16*2+1] movu m1, [tlq+16*3+1] mova [rsp+16*10], m0 mova [rsp+16*11], m1 test angled, 0x400 jnz .w4_main pshufb m1, [base+z_filter_t_w16] ; tlq[64] mov r3, tlq lea tlq, [rsp+16*11] movd [tlq+16*1], m1 xor r5d, r5d ; filter_strength = 3 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova m2, [tlq+16*2] mova m3, [tlq+16*3] mov r2d, maxwm mova [rsp+16* 8], m0 mova [rsp+16* 9], m1 mova [rsp+16*10], m2 mova [rsp+16*11], m3 cmp r2d, 64 jge .filter_left movu m0, [r3+r2+16*0+1] movu m1, [r3+r2+16*1+1] movu [rsp+r2+16* 8], m0 movu [rsp+r2+16* 9], m1 cmp r2d, 32 jge .filter_left movu m0, [r3+r2+16*2+1] movu m1, [r3+r2+16*3+1] movu [rsp+r2+16*10], m0 movu [rsp+r2+16*11], m1 .filter_left: neg hq movd m0, [r3+hq] pxor m1, m1 pshufb m0, m1 movd [rsp+16*6+hq-4], m0 lea tlq, [rsp+16*5] call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge cmp hd, -32 jge .filter_left_end sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge mova m0, [tlq+16*0] mova m1, [tlq+16*1] mova [rsp+16*2], m0 mova [rsp+16*3], m1 .filter_left_end: mov r2d, maxhm mova m0, [rsp+16*5] mova m1, [rsp+16*6] mova m2, [rsp+16*7] neg r2 mova [rsp+16*4], m0 mova [rsp+16*5], m1 mova [rsp+16*6], m2 cmp r2d, hd jle .w4_main movu m0, [r3+r2-16*2] movu m1, [r3+r2-16*1] movu [rsp+r2+16*4], m0 movu [rsp+r2+16*5], m1 cmp r2d, -32 jle .w4_main movu m0, [r3+r2-16*4] movu m1, [r3+r2-16*3] movu [rsp+r2+16*2], m0 movu [rsp+r2+16*3], m1 jmp .w4_main %if ARCH_X86_64 cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w %define base r7-$$ lea r7, [$$] mova m8, [base+pw_62] mova m9, [base+pw_64] mova m10, [base+pw_512] mov org_wd, wd %else cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy %define base r1-$$ %define m8 [base+pw_62] %define m9 [base+pw_64] %define m10 [base+pw_512] %define org_wd r5 %define org_wq r5 mov [dstq+strideq*0], strideq mov [dstq+strideq*1], wd LEA r1, $$ %endif tzcnt hd, hm movifnidn angled, anglem dec tlq movsxd hq, [base+ipred_z3_ssse3_table+hq*4] sub angled, 180 mov dyd, angled neg dyd xor angled, 0x400 or dyq, ~0x7e lea hq, [base+ipred_z3_ssse3_table+hq] movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq] jmp hq .h4: lea r4d, [angleq+88] test r4d, 0x480 jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40 sar r4d, 9 add r4d, wd cmp r4d, 8 jg .h4_no_upsample ; w > 8 || (w == 8 && is_sm) movu m3, [tlq-7] movu m1, [base+z_upsample1-4] movu m4, [base+z_filter_s+2] pshufb m0, m3, m1 pxor m1, m1 pshufb m2, m3, m1 pshufb m1, m3, m4 mova [rsp+16], m2 ; top[max_base_y] movddup m2, [base+pb_36_m4] add dyd, dyd pmaddubsw m0, m2 pmaddubsw m1, m2 movd m5, dyd mov r5d, dyd pshufb m5, [base+pw_256] paddw m0, m1 pmulhrsw m0, m10 shl wd, 2 mov tlq, rsp sub rsp, wq packuswb m0, m0 punpcklbw m0, m3 paddw m6, m5, m5 punpcklqdq m5, m6 pshufb m0, [base+pb_15to0] mova [tlq], m0 .h4_upsample_loop: lea r4d, [r5+dyq] shr r5d, 6 movq m0, [tlq+r5] lea r5d, [r4+dyq] shr r4d, 6 movhps m0, [tlq+r4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 por m1, m2 pmaddubsw m0, m1 paddw m5, m6 pmulhrsw m0, m10 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jg .h4_upsample_loop jmp .h4_transpose .h4_no_upsample: mov r4d, 7 test angled, 0x400 ; !enable_intra_edge_filter jnz .h4_main lea r4d, [wq+3] movd m0, r4d movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 pcmpeqb m1, m0, [base+z_filter_wh4] pand m1, m2 
pcmpgtb m1, [base+z_filter_t_w48+angleq*8] pmovmskb r5d, m1 mov r4d, 7 test r5d, r5d jz .h4_main ; filter_strength == 0 movu m2, [tlq-7] imul r5d, 0x55555555 movu m3, [base+z_filter_s-2] shr r5d, 30 ; filter_strength mova m4, [base+z_upsample2] movddup m5, [base+z_filter_k-8+r5*8+24*0] movddup m6, [base+z_filter_k-8+r5*8+24*1] movddup m7, [base+z_filter_k-8+r5*8+24*2] pshufb m0, m2, m3 shufps m3, m4, q2121 pmaddubsw m1, m0, m5 pmaddubsw m0, m6 pshufb m5, m2, m3 pmaddubsw m3, m5, m6 pmaddubsw m5, m7 pshufb m2, m4 pmaddubsw m2, m7 paddw m0, m1 paddw m1, m3 paddw m0, m5 paddw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 lea r2d, [r4+2] cmp wd, 4 cmovne r4d, r2d pshufd m0, m0, q0000 lea tlq, [rsp+15] packuswb m0, m1 mova [rsp], m0 .h4_main: movd m5, dyd movddup m0, [base+z_base_inc] ; base_inc << 6 sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf_h4] lea r5, [dyq+r4+63] ; ypos pshufb m4, [base+pw_256] psubw m4, m0 ; max_base_y shl wd, 2 paddw m6, m5, m5 sub rsp, wq punpcklqdq m5, m6 .h4_loop: lea r4, [r5+dyq] sar r5, 6 movq m0, [tlq+r5-4] lea r5, [r4+dyq] sar r4, 6 movhps m0, [tlq+r4-4] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h4_transpose test r5d, r5d jg .h4_loop packuswb m7, m7 .h4_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h4_end_loop .h4_transpose: mova m1, [base+z_transpose4] %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif lea r2, [strideq*3] lea dstq, [dstq+org_wq-4] .h4_transpose_loop: mova m0, [rsp] add rsp, 16 pshufb m0, m1 movd [dstq+strideq*0], m0 pshuflw m2, m0, q1032 movd [dstq+strideq*1], m2 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd [dstq+r2 ], m0 sub dstq, 4 sub org_wd, 4 jg .h4_transpose_loop RET .h8: lea r4d, [angleq+88] and r4d, ~0x7f or r4d, wd cmp r4d, 8 ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8 mova m4, [tlq-15] and r4d, 4 movu m3, [tlq- 9] movd m1, r4d movu m2, [base+z_filter_s+2] pxor m0, m0 movu m5, [base+z_filter_s+6] movddup m7, [base+pb_36_m4] pshufb m1, m0 ; w & 4 movu m0, [base+z_upsample1-4] pmaxub m1, m0 ; clip 4x8 add dyd, dyd pshufb m0, m4, m1 pmaddubsw m0, m7 pshufb m1, m4, m2 pmaddubsw m1, m7 pshufb m2, m3, [base+z_upsample1] pmaddubsw m2, m7 pshufb m3, m5 pmaddubsw m3, m7 movd m5, dyd neg dyq paddw m1, m0 paddw m2, m3 pmulhrsw m1, m10 pmulhrsw m2, m10 shl wd, 3 lea tlq, [rsp+16] pshufb m5, [base+pw_256] sub rsp, wq packuswb m1, m2 lea r5, [dyq+63] punpcklbw m0, m1, m4 punpckhbw m1, m4 mova [tlq-16*1], m0 mova [tlq-16*0], m1 paddw m6, m5, m5 punpcklqdq m5, m6 .h8_upsample_loop: lea r4, [r5+dyq] sar r5, 6 movu m0, [tlq+r5] lea r5, [r4+dyq] sar r4, 6 movu m1, [tlq+r4] pand m3, m8, m5 psubw m2, m9, m3 psllw m2, 8 por m3, m2 pshufd m2, m3, q1010 pmaddubsw m0, m2 punpckhqdq m3, m3 pmaddubsw m1, m3 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m1, m0 mova [rsp+wq-16], m1 sub wd, 16 jg .h8_upsample_loop jmp .h8_transpose .h8_no_upsample: lea r4d, [wq+7] movd m0, r4d and r4d, 7 or r4d, 8 ; imin(w+7, 15) test angled, 0x400 jnz .h8_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movu m1, [base+z_filter_wh8] psrldq m3, [base+z_filter_t_w48+angleq*8], 4 pcmpeqb m1, m0 pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h8_main ; filter_strength == 0 mova m0, [tlq-15] imul r5d, 
0x55555555 movd m1, [tlq+1] neg r4 movd m2, [tlq+r4] shr r5d, 30 pxor m7, m7 lea tlq, [rsp+16*2] sub r5, 3 ; filter_strength-3 mova [tlq+16*0], m0 pshufb m1, m7 mova [tlq+16*1], m1 pshufb m2, m7 movq [tlq+r4+8], m2 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sar r5d, 1 add tlq, 31 add r5d, 17 cmp wd, 8 cmova r4d, r5d .h8_main: movd m5, dyd sub tlq, r4 shl r4d, 6 movd m7, [tlq] movd m4, r4d pshufb m5, [base+pw_256] neg dyq pshufb m7, [base+pw_m256] mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, [base+pw_256] psubw m4, [base+z3_base_inc] shl wd, 3 mova m6, m5 sub rsp, wq .h8_loop: mov r4, r5 sar r4, 6 movu m0, [tlq+r4-8] pand m2, m8, m5 psubw m1, m9, m2 psllw m2, 8 pshufb m0, m3 por m1, m2 pmaddubsw m0, m1 pcmpgtw m1, m4, m5 paddw m5, m6 pmulhrsw m0, m10 pand m0, m1 pandn m1, m7 por m0, m1 packuswb m0, m0 movq [rsp+wq-8], m0 sub wd, 8 jz .h8_transpose add r5, dyq jg .h8_loop packuswb m7, m7 .h8_end_loop: movq [rsp+wq-8], m7 sub wd, 8 jg .h8_end_loop .h8_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 8 cmp org_wd, 4 %if ARCH_X86_64 jne .end_transpose_main %else jne .end_transpose_loop %endif mova m1, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*2 punpcklbw m2, m1, m0 punpckhbw m1, m0 punpckhbw m0, m1, m2 punpcklbw m1, m2 .write_4x8_end: call .write_4x8 RET .write_4x8: movd [dstq+r2 ], m0 pshuflw m4, m0, q1032 movd [dstq+strideq*2], m4 punpckhqdq m0, m0 movd [dstq+strideq*1], m0 psrlq m0, 32 movd [dstq+strideq*0], m0 lea dstq, [dstq+strideq*4] movd [dstq+r2 ], m1 pshuflw m4, m1, q1032 movd [dstq+strideq*2], m4 punpckhqdq m1, m1 movd [dstq+strideq*1], m1 psrlq m1, 32 movd [dstq+strideq*0], m1 ret .h16: lea r4d, [wq+15] movd m0, r4d and r4d, 15 or r4d, 16 ; imin(w+15, 31) test angled, 0x400 jnz .h16_main movd m2, angled shr angled, 8 ; is_sm << 1 pxor m1, m1 pshufb m0, m1 pshufb m2, m1 movq m3, [base+z_filter_t_w16+angleq*4] pcmpeqb m1, m0, [base+z_filter_wh16] pand m1, m2 pcmpgtb m1, m3 pmovmskb r5d, m1 test r5d, r5d jz .h16_main ; filter_strength == 0 mova m0, [tlq-16*2+1] imul r5d, 0x24924924 mova m1, [tlq-16*1+1] neg r4 movd m2, [tlq-16*0+1] shr r5d, 30 movd m3, [tlq+r4] adc r5, -4 ; filter_strength-3 pxor m7, m7 lea tlq, [rsp+16*2] mova [tlq-16*1], m0 pshufb m2, m7 mova [tlq+16*0], m1 pshufb m3, m7 mova [tlq+16*1], m2 movq [tlq+r4+8], m3 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 31 cmp wd, 16 jle .h16_main pshuflw m0, [tlq-47], q0000 sar r5, 1 movq m1, [base+z3_filter_k_tail+r5*4] lea r4d, [r5+33] pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq-35], m0 .h16_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] shl wd, 4 mova m6, m5 sub rsp, wq .h16_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*2] por m2, m1 movu m1, [tlq+r4-8*1] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 psrlw m2, m5, 6 paddw m5, m6 pmulhrsw m0, m10 pmulhrsw m1, m10 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 pand m0, m1 pandn m1, m7 por m0, m1 mova [rsp+wq-16], m0 sub wd, 16 jz .h16_transpose add r5, dyq jg .h16_loop .h16_end_loop: mova [rsp+wq-16], m7 sub wd, 16 jg .h16_end_loop .h16_transpose: %if ARCH_X86_32 mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif or r3d, 16 cmp org_wd, 4 %if ARCH_X86_64 jne 
.end_transpose_main %else jne .end_transpose_loop %endif .h16_transpose_w4: mova m2, [rsp+16*3] mova m4, [rsp+16*2] mova m3, [rsp+16*1] mova m0, [rsp+16*0] lea r2, [strideq*3] add rsp, 16*4 punpckhbw m1, m2, m4 punpcklbw m2, m4 punpckhbw m4, m3, m0 punpcklbw m3, m0 punpckhwd m0, m1, m4 punpcklwd m1, m4 call .write_4x8 lea dstq, [dstq+strideq*4] punpckhwd m0, m2, m3 punpcklwd m1, m2, m3 jmp .write_4x8_end .h32: lea r4d, [wq+31] and r4d, 31 or r4d, 32 ; imin(w+31, 63) test angled, 0x400 ; !enable_intra_edge_filter jnz .h32_main mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*4] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge add tlq, 63 cmp wd, 32 jle .h32_main pshuflw m0, [tlq-79], q0000 movq m1, [base+z3_filter_k_tail] add r4d, 2 pmaddubsw m0, m1 %if ARCH_X86_64 pmulhrsw m0, m10 %else pmulhrsw m0, m4 %endif packuswb m0, m0 movd [tlq-67], m0 .h32_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, [base+pb_15to0] mova m6, m5 .h32_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*4] por m2, m1 movu m1, [tlq+r4-8*3] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 32 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h32_transpose add r5, dyq jg .h32_loop .h32_end_loop: sub rsp, 32 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h32_end_loop .h32_transpose: or r3d, 32 jmp .end_transpose_main .h64: lea r4d, [wq+63] test angled, 0x400 ; !enable_intra_edge_filter jnz .h64_main mova m0, [tlq-16*8+1] mova m1, [tlq-16*7+1] mova m2, [tlq-16*6+1] mova m3, [tlq-16*5+1] mova [rsp+16*1], m0 mova [rsp+16*2], m1 mova [rsp+16*3], m2 mova [rsp+16*4], m3 mova m0, [tlq-16*4+1] mova m1, [tlq-16*3+1] mova m2, [tlq-16*2+1] mova m3, [tlq-16*1+1] movd m4, [tlq-16*0+1] neg r4 movd m5, [tlq+r4] pxor m7, m7 lea tlq, [rsp+16*8] mova [tlq-16*3], m0 mova [tlq-16*2], m1 xor r5d, r5d ; filter_strength = 3 mova [tlq-16*1], m2 pshufb m4, m7 mova [tlq+16*0], m3 pshufb m5, m7 mova [tlq+16*1], m4 movq [tlq+r4+8], m5 neg r4d call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge sub tlq, 16*2 cmp wd, 64 jl .h64_filter96 ; skip one call if the last 32 bytes aren't used call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge .h64_filter96: add tlq, 127 .h64_main: movd m5, dyd sub tlq, r4 movd m4, r4d shl r4d, 6 movd m7, [tlq] pxor m6, m6 pshufb m5, [base+pw_256] neg dyq pshufb m7, m6 mova m3, [base+z3_shuf] lea r5, [dyq+r4+63] pshufb m4, m6 psubb m4, 
[base+pb_15to0] mova m6, m5 .h64_loop: mov r4, r5 pand m2, m8, m5 sar r4, 6 psubw m1, m9, m2 psllw m2, 8 movu m0, [tlq+r4-8*8] por m2, m1 movu m1, [tlq+r4-8*7] pshufb m0, m3 pmaddubsw m0, m2 pshufb m1, m3 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 sub rsp, 64 packuswb m0, m1 mova [rsp+16*0], m0 movu m0, [tlq+r4-8*6] movu m1, [tlq+r4-8*5] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*1], m0 movu m0, [tlq+r4-8*4] movu m1, [tlq+r4-8*3] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 packuswb m0, m1 mova [rsp+16*2], m0 movu m0, [tlq+r4-8*2] movu m1, [tlq+r4-8*1] pshufb m0, m3 pshufb m1, m3 pmaddubsw m0, m2 pmaddubsw m1, m2 pmulhrsw m0, m10 pmulhrsw m1, m10 psrlw m2, m5, 6 paddw m5, m6 packsswb m2, m2 packuswb m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] pand m0, m1 pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*3], m0 pand m0, m1, [rsp+16*2] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 paddsb m2, [base+pb_16] mova [rsp+16*2], m0 pand m0, m1, [rsp+16*1] pandn m1, m7 por m0, m1 pcmpgtb m1, m4, m2 mova [rsp+16*1], m0 pand m0, m1, [rsp+16*0] pandn m1, m7 por m0, m1 mova [rsp+16*0], m0 dec wd jz .h64_transpose add r5, dyq jg .h64_loop .h64_end_loop: sub rsp, 64 mova [rsp+16*3], m7 mova [rsp+16*2], m7 mova [rsp+16*1], m7 mova [rsp+16*0], m7 dec wd jg .h64_end_loop .h64_transpose: or r3d, 64 .end_transpose_main: %if ARCH_X86_64 lea r5, [r3*3] lea r7, [strideq*3] %else mov strideq, [dstq] mov org_wd, [dstq+strideq] %endif .end_transpose_loop: lea r4, [rsp+r3-8] lea r6, [dstq+org_wq-8] .end_transpose_loop_y: movq m0, [r4+r3*1] movq m4, [r4+r3*0] %if ARCH_X86_64 movq m1, [r4+r5 ] movq m5, [r4+r3*2] lea r2, [r4+r3*4] %else lea r2, [r4+r3*2] movq m1, [r2+r3*1] movq m5, [r2+r3*0] lea r2, [r2+r3*2] %endif movq m2, [r2+r3*1] movq m6, [r2+r3*0] %if ARCH_X86_64 movq m3, [r2+r5 ] movq m7, [r2+r3*2] %else lea r2, [r2+r3*2] movq m3, [r2+r3*1] movq m7, [r2+r3*0] %endif sub r4, 8 punpcklbw m0, m4 punpcklbw m1, m5 punpcklbw m2, m6 punpcklbw m3, m7 punpckhwd m4, m1, m0 punpcklwd m1, m0 punpckhwd m0, m3, m2 punpcklwd m3, m2 punpckhdq m2, m3, m1 punpckldq m3, m1 punpckldq m1, m0, m4 punpckhdq m0, m4 movhps [r6+strideq*0], m0 movq [r6+strideq*1], m0 %if ARCH_X86_64 movhps [r6+strideq*2], m1 movq [r6+r7 ], m1 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m1 movq [r6+strideq*1], m1 lea r6, [r6+strideq*2] %endif movhps [r6+strideq*0], m2 movq [r6+strideq*1], m2 %if ARCH_X86_64 movhps [r6+strideq*2], m3 movq [r6+r7 ], m3 lea r6, [r6+strideq*4] %else lea r6, [r6+strideq*2] movhps [r6+strideq*0], m3 movq [r6+strideq*1], m3 lea r6, [r6+strideq*2] %endif cmp r4, rsp jae .end_transpose_loop_y lea rsp, [rsp+r3*8] sub org_wd, 8 jg .end_transpose_loop RET ;--------------------------------------------------------------------------------------- ;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal, ; const uint8_t *idx, const int w, const int h); ;--------------------------------------------------------------------------------------- cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h mova m4, [palq] LEA r2, pal_pred_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r2+wq*4] packuswb m4, m4 add wq, r2 lea r2, [strideq*3] jmp wq .w4: pshufb m0, m4, [idxq] add idxq, 16 movd [dstq ], m0 pshuflw m1, m0, q1032 movd [dstq+strideq ], m1 punpckhqdq m0, m0 movd [dstq+strideq*2], m0 psrlq m0, 32 movd 
[dstq+r2 ], m0 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] add idxq, 32 movq [dstq ], m0 movhps [dstq+strideq ], m0 movq [dstq+strideq*2], m1 movhps [dstq+r2 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+strideq ], m1 mova [dstq+strideq*2], m2 mova [dstq+r2 ], m3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16 RET ALIGN function_align .w32: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+16 ], m1 mova [dstq+strideq ], m2 mova [dstq+strideq+16], m3 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32 RET ALIGN function_align .w64: pshufb m0, m4, [idxq] pshufb m1, m4, [idxq+16] pshufb m2, m4, [idxq+32] pshufb m3, m4, [idxq+48] add idxq, 64 mova [dstq ], m0 mova [dstq+16], m1 mova [dstq+32], m2 mova [dstq+48], m3 add dstq, strideq sub hd, 1 jg .w64 RET ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- %macro IPRED_CFL 1 ; ac in, unpacked pixels out psignw m3, m%1, m1 pabsw m%1, m%1 pmulhrsw m%1, m2 psignw m%1, m3 paddw m%1, m0 %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha movifnidn wd, wm movifnidn hd, hm tzcnt r6d, hd lea t0d, [wq+hq] movd m4, t0d tzcnt t0d, t0d movd m5, t0d LEA t0, ipred_cfl_ssse3_table tzcnt wd, wd movsxd r6, [t0+r6*4] movsxd wq, [t0+wq*4+16] pcmpeqd m3, m3 psrlw m4, 1 add r6, t0 add wq, t0 movifnidn acq, acmp jmp r6 .h4: movd m0, [tlq-4] pmaddubsw m0, m3 jmp wq .w4: movd m1, [tlq+1] pmaddubsw m1, m3 psubw m0, m4 paddw m0, m1 pmaddwd m0, m3 cmp hd, 4 jg .w4_mul psrlw m0, 3 ; dc >>= ctz(width + height); jmp .w4_end .w4_mul: punpckhqdq m1, m0, m0 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 psrlw m0, 2 mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8 cmovz r6d, r2d movd m5, r6d pmulhuw m0, m5 .w4_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s4: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s4_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movd [dstq+strideq*0], m4 pshuflw m4, m4, q1032 movd [dstq+strideq*1], m4 punpckhqdq m4, m4 movd [dstq+strideq*2], m4 psrlq m4, 32 movd [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] add acq, 32 sub hd, 4 jg .s4_loop RET ALIGN function_align .h8: movq m0, [tlq-8] pmaddubsw m0, m3 jmp wq .w8: movq m1, [tlq+1] pmaddubsw m1, m3 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 paddw m0, m1 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 8 je .w8_end mov r6d, 0x5556 mov r2d, 0x3334 cmp hd, 32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w8_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s8: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 lea r6, [strideq*3] pabsw m2, m1 psllw m2, 9 .s8_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movq [dstq ], m4 movhps [dstq+strideq ], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 movq [dstq+strideq*2], m4 
movhps [dstq+r6 ], m4 lea dstq, [dstq+strideq*4] add acq, 64 sub hd, 4 jg .s8_loop RET ALIGN function_align .h16: mova m0, [tlq-16] pmaddubsw m0, m3 jmp wq .w16: movu m1, [tlq+1] pmaddubsw m1, m3 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 16 je .w16_end mov r6d, 0x5556 mov r2d, 0x3334 test hd, 8|32 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w16_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s16: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s16_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq+strideq], m4 lea dstq, [dstq+strideq*2] add acq, 64 sub hd, 2 jg .s16_loop RET ALIGN function_align .h32: mova m0, [tlq-32] pmaddubsw m0, m3 mova m2, [tlq-16] pmaddubsw m2, m3 paddw m0, m2 jmp wq .w32: movu m1, [tlq+1] pmaddubsw m1, m3 movu m2, [tlq+17] pmaddubsw m2, m3 paddw m1, m2 paddw m0, m1 psubw m4, m0 punpckhqdq m0, m0 psubw m0, m4 pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 pmaddwd m0, m3 psrlw m0, m5 cmp hd, 32 je .w32_end lea r2d, [hq*2] mov r6d, 0x5556 mov r2d, 0x3334 test hd, 64|16 cmovz r6d, r2d movd m1, r6d pmulhuw m0, m1 .w32_end: pshuflw m0, m0, q0000 punpcklqdq m0, m0 .s32: movd m1, alpham pshuflw m1, m1, q0000 punpcklqdq m1, m1 pabsw m2, m1 psllw m2, 9 .s32_loop: mova m4, [acq] mova m5, [acq+16] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq], m4 mova m4, [acq+32] mova m5, [acq+48] IPRED_CFL 4 IPRED_CFL 5 packuswb m4, m5 mova [dstq+16], m4 add dstq, strideq add acq, 64 dec hd jg .s32_loop RET ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_left_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha mov hd, hm ; zero upper half tzcnt r6d, hd sub tlq, hq tzcnt wd, wm movu m0, [tlq] mov t0d, 0x8000 movd m3, t0d movd m2, r6d psrld m3, m2 LEA t0, ipred_cfl_left_ssse3_table movsxd r6, [t0+r6*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 .h32: movu m1, [tlq+16] ; unaligned when jumping here from dc_top pmaddubsw m1, m2 paddw m0, m1 .h16: pshufd m1, m0, q3232 ; psrlq m1, m0, 16 paddw m0, m1 .h8: pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 paddw m0, m1 .h4: pmaddwd m0, m2 pmulhrsw m0, m3 pshuflw m0, m0, q0000 punpcklqdq m0, m0 jmp wq ;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_top_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_top_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha LEA t0, ipred_cfl_left_ssse3_table tzcnt wd, wm inc tlq movu m0, [tlq] movifnidn hd, hm mov r6d, 0x8000 movd m3, r6d movd m2, wd psrld m3, m2 movsxd r6, [t0+wq*4] pcmpeqd m2, m2 pmaddubsw m0, m2 add r6, t0 add t0, ipred_cfl_splat_ssse3_table-ipred_cfl_left_ssse3_table movsxd wq, [t0+wq*4] add wq, t0 movifnidn acq, acmp jmp r6 
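; Note on the cfl family: ipred_cfl_left, ipred_cfl_top and ipred_cfl_128 only
; differ in how the splatted DC value in m0 is derived; they all tail-jump into
; the shared alpha*ac store loops (.s4/.s8/.s16/.s32 above) through
; ipred_cfl_splat_ssse3_table. In those loops m1 holds the signed alpha,
; m2 = |alpha| << 9, and IPRED_CFL produces per AC coefficient
;   dst[x] = clip_pixel(dc + sign(alpha * ac[x]) * ((|alpha * ac[x]| + 32) >> 6))
; where pmulhrsw against (|alpha| << 9) performs the rounded >> 6 and the final
; packuswb clamps the 8bpc result to [0, 255].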
;--------------------------------------------------------------------------------------- ;void dav1d_ipred_cfl_128_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft, ; const int width, const int height, const int16_t *ac, const int alpha); ;--------------------------------------------------------------------------------------- cglobal ipred_cfl_128_8bpc, 3, 7, 6, dst, stride, tl, w, h, ac, alpha tzcnt wd, wm movifnidn hd, hm LEA r6, ipred_cfl_splat_ssse3_table movsxd wq, [r6+wq*4] movddup m0, [r6-ipred_cfl_splat_ssse3_table+pw_128] add wq, r6 movifnidn acq, acmp jmp wq %macro RELOAD_ACQ_32 1 mov acq, ac_bakq ; restore acq %endmacro %if ARCH_X86_64 cglobal ipred_cfl_ac_420_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak DECLARE_REG_TMP 7 movddup m2, [pb_2] %else cglobal ipred_cfl_ac_420_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h DECLARE_REG_TMP 4 %define ac_bakq acmp mov t0d, 0x02020202 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m5, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m0, [yq] movq m1, [yq+strideq] movhps m0, [yq+strideq*2] movhps m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 16 sub hd, 2 jg .w4_loop test hpadd, hpadd jz .calc_avg_4_8 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4_8 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m0, [yq+strideq*2] mova m1, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*4] add acq, 32 sub hd, 2 jg .w8_loop test hpadd, hpadd jz .calc_avg_4_8 jmp .w8_hpad .w8_wpad: ; wpadd=1 movddup m0, [yq] movddup m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 16 sub hd, 1 jg .w8_wpad test hpadd, hpadd jz .calc_avg_4_8 .w8_hpad: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 1 jg .w8_hpad jmp .calc_avg_4_8 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 mova m6, [yq+16] mova m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_loop test hpadd, hpadd jz .calc_avg16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m0, [yq] movddup m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 pshufhw m0, m0, q3333 mova [acq], m0 paddw m4, m0 mova m6, m0 punpckhqdq m6, m0, m0 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 pshufhw m6, m0, q3333 punpckhqdq m6, m6 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, [yq+strideq] pmaddubsw m0, m2 
pmaddubsw m1, m2 paddw m0, m1 mova [acq], m0 paddw m4, m0 movddup m6, [yq+16] movddup m1, [yq+strideq+16] pmaddubsw m6, m2 pmaddubsw m1, m2 paddw m6, m1 pshufhw m6, m6, q3333 mova [acq+16], m6 paddw m4, m6 lea yq, [yq+strideq*2] add acq, 32 dec hd jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg16 .w16_hpad_loop: mova [acq], m0 paddw m4, m0 mova [acq+16], m6 paddw m4, m6 add acq, 32 dec hpadd jg .w16_hpad_loop jmp .calc_avg16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4_8: psrlw m2, 9 pmaddwd m4, m2 jmp .calc_avg .calc_avg16: psrld m0, m4, 16 pslld m4, 16 psrld m4, 16 paddd m4, m0 .calc_avg: movd szd, m5 psrad m5, 1 tzcnt r1d, szd paddd m4, m5 movd m1, r1d pshufd m0, m4, q2301 paddd m0, m4 pshufd m4, m0, q1032 paddd m0, m4 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq .sub_loop: mova m1, [acq] psubw m1, m0 ; ac[x] -= sum; mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_422_8bpc, 4, 8, 7, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride, wpad, hpad, w, h mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movifnidn hpadd, hpadm %if ARCH_X86_64 mov ac_bakq, acq %endif shl hpadd, 2 sub hd, hpadd pxor m4, m4 pxor m5, m5 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movq m1, [yq] movhps m1, [yq+strideq] movq m0, [yq+strideq*2] movhps m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m4, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop jmp .calc_avg_4 .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: mova m1, [yq] mova m0, [yq+strideq] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m4, m0 paddw m5, m1 mova m1, [yq+strideq*2] mova m0, [yq+stride3q] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movddup m1, [yq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movddup m0, [yq+strideq] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m4, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m1, [yq] mova m0, [yq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 mova m1, [yq+strideq] mova m0, [yq+strideq+16] pmaddubsw m0, m2 pmaddubsw m1, m2 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m0 paddw m4, m1 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movddup m1, [yq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movddup m1, 
[yq+strideq] pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m1, [yq] pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movddup m0, [yq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m5, m0 mova m1, [yq+strideq] pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movddup m0, [yq+strideq+16] pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop jmp .calc_avg_8_16 %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 pmaddwd m0, m4, m2 jmp .calc_avg .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 .calc_avg: paddd m5, m0 movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET %if ARCH_X86_64 cglobal ipred_cfl_ac_444_8bpc, 4, 8, 7, -4*16, ac, y, stride, wpad, hpad, w, h, ac_bak movddup m2, [pb_4] %else cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y, stride, wpad, hpad, w, h %define ac_bakq [rsp+16*4] mov t0d, 0x04040404 movd m2, t0d pshufd m2, m2, q0000 %endif movifnidn wd, wm movifnidn hpadd, hpadm movd m0, hpadd mov t0d, hm mov hd, t0d imul t0d, wd movd m6, t0d movd hpadd, m0 mov ac_bakq, acq shl hpadd, 2 sub hd, hpadd pxor m5, m5 pxor m4, m4 cmp wd, 16 jg .w32 cmp wd, 8 jg .w16 je .w8 ; fall-through %if ARCH_X86_64 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, ac_bak %else DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h %endif .w4: lea stride3q, [strideq*3] .w4_loop: movd m1, [yq] movd m3, [yq+strideq] punpckldq m1, m3 punpcklbw m1, m1 movd m0, [yq+strideq*2] movd m3, [yq+stride3q] punpckldq m0, m3 punpcklbw m0, m0 pmaddubsw m1, m2 pmaddubsw m0, m2 mova [acq], m1 mova [acq+16], m0 paddw m5, m0 paddw m5, m1 lea yq, [yq+strideq*4] add acq, 32 sub hd, 4 jg .w4_loop test hpadd, hpadd jz .calc_avg_4 punpckhqdq m0, m0 .w4_hpad_loop: mova [acq], m0 paddw m5, m0 add acq, 16 sub hpadd, 2 jg .w4_hpad_loop .calc_avg_4: psrlw m2, 10 pmaddwd m5, m2 jmp .calc_avg .w8: lea stride3q, [strideq*3] test wpadd, wpadd jnz .w8_wpad .w8_loop: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 movq m0, [yq+strideq] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 movq m1, [yq+strideq*2] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 movq m0, [yq+stride3q] punpcklbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 
paddw m4, m0 lea yq, [yq+strideq*4] add acq, 64 sub hd, 4 jg .w8_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w8_hpad .w8_wpad: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pmaddubsw m1, m2 pshufhw m1, m1, q3333 mova [acq], m1 paddw m5, m1 movd m0, [yq+strideq] punpcklbw m0, m0 punpcklqdq m0, m0 pmaddubsw m0, m2 pshufhw m0, m0, q3333 mova [acq+16], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 32 sub hd, 2 jg .w8_wpad test hpadd, hpadd jz .calc_avg_8_16 .w8_hpad: mova [acq], m0 paddw m5, m0 mova [acq+16], m0 paddw m4, m0 add acq, 32 sub hpadd, 2 jg .w8_hpad jmp .calc_avg_8_16 .w16: test wpadd, wpadd jnz .w16_wpad .w16_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_loop test hpadd, hpadd jz .calc_avg_8_16 jmp .w16_hpad_loop .w16_wpad: cmp wpadd, 2 jl .w16_pad1 je .w16_pad2 .w16_pad3: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movd m1, [yq+strideq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhqdq m0, m1, m1 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad3 jmp .w16_wpad_done .w16_pad2: movq m1, [yq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 pshufhw m1, m1, q3333 punpckhqdq m1, m1 mova [acq+16], m1 paddw m5, m1 movq m1, [yq+strideq] punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 mova m0, m1 pshufhw m0, m0, q3333 punpckhqdq m0, m0 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad2 jmp .w16_wpad_done .w16_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0 mova m0, [yq+strideq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq+32], m1 paddw m4, m1 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+48], m0 paddw m4, m0 lea yq, [yq+strideq*2] add acq, 64 sub hd, 2 jg .w16_pad1 .w16_wpad_done: test hpadd, hpadd jz .calc_avg_8_16 .w16_hpad_loop: mova [acq], m1 mova [acq+16], m0 paddw m4, m1 paddw m5, m0 mova [acq+32], m1 mova [acq+48], m0 paddw m4, m1 paddw m5, m0 add acq, 64 sub hpadd, 2 jg .w16_hpad_loop .calc_avg_8_16: mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, m4 psrld m0, 16 pslld m4, 16 psrld m4, 16 paddd m0, m4 paddd m5, m0 jmp .calc_avg .w32: pxor m0, m0 mova [rsp ], m0 mova [rsp+16], m0 mova [rsp+32], m0 mova [rsp+48], m0 test wpadd, wpadd jnz .w32_wpad .w32_loop: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_loop test hpadd, hpadd jz .calc_avg_32 jmp .w32_hpad_loop .w32_wpad: cmp wpadd, 2 jl .w32_pad1 je .w32_pad2 cmp wpadd, 4 jl 
.w32_pad3 je .w32_pad4 cmp wpadd, 6 jl .w32_pad5 je .w32_pad6 .w32_pad7: movd m1, [yq] punpcklbw m1, m1 punpcklqdq m1, m1 pshufhw m1, m1, q3333 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 mova m0, m1 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad7 jmp .w32_wpad_done .w32_pad6: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 pshufhw m0, m1, q3333 punpckhqdq m0, m0 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad6 jmp .w32_wpad_done .w32_pad5: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 mova m5, [rsp] paddw m5, m1 mova [rsp ], m5 punpckhbw m0, m0 punpcklqdq m0, m0 pshufhw m0, m0, q3333 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad5 jmp .w32_wpad_done .w32_pad4: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, m0 pshufhw m3, m3, q3333 punpckhqdq m3, m3 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad4 jmp .w32_wpad_done .w32_pad3: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 movd m3, [yq+16] punpcklbw m3, m3 punpcklqdq m3, m3 pshufhw m3, m3, q3333 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 mova m4, m3 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad3 jmp .w32_wpad_done .w32_pad2: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m3, [yq+16] punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 pshufhw m4, m3, q3333 punpckhqdq m4, m4 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad2 jmp .w32_wpad_done .w32_pad1: mova m0, [yq] mova m1, m0 punpcklbw m1, m1 pmaddubsw m1, m2 mova [acq], m1 paddw m5, m1, [rsp] mova [rsp ], m5 punpckhbw m0, m0 pmaddubsw m0, m2 mova [acq+16], m0 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova m4, [yq+16] mova m3, m4 punpcklbw m3, m3 pmaddubsw m3, m2 mova [acq+32], m3 paddw m5, m3, [rsp+32] mova [rsp+32], m5 punpckhbw m4, m4 punpcklqdq m4, m4 pshufhw m4, m4, q3333 pmaddubsw m4, m2 mova [acq+48], m4 paddw m5, m4, [rsp+48] mova [rsp+48], m5 lea yq, [yq+strideq] add acq, 64 sub hd, 1 jg .w32_pad1 .w32_wpad_done: test hpadd, hpadd jz .calc_avg_32 .w32_hpad_loop: mova 
[acq], m1 mova [acq+16], m0 paddw m5, m1, [rsp] mova [rsp ], m5 paddw m5, m0, [rsp+16] mova [rsp+16], m5 mova [acq+32], m3 mova [acq+48], m4 paddw m5, m3, [rsp+32] mova [rsp+32], m5 paddw m5, m4, [rsp+48] mova [rsp+48], m5 add acq, 64 sub hpadd, 1 jg .w32_hpad_loop %if ARCH_X86_64 DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h, ac_bak %else DEFINE_ARGS ac, y, iptr, wpad, hpad, sz, h %endif .calc_avg_32: mova m5, [rsp] mova m0, m5 psrld m5, 16 pslld m0, 16 psrld m0, 16 paddd m5, m0 mova m0, [rsp+16] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 paddd m5, m0 mova m0, [rsp+32] mova m3, m0 psrld m0, 16 pslld m3, 16 psrld m3, 16 paddd m0, m3 mova m1, [rsp+48] mova m3, m1 psrld m1, 16 pslld m3, 16 psrld m3, 16 paddd m1, m3 paddd m1, m0 paddd m5, m1 .calc_avg: movd szd, m6 psrad m6, 1 tzcnt r1d, szd ; const int log2sz = ctz(width) + ctz(height); paddd m5, m6 movd m1, r1d pshufd m0, m5, q2301 paddd m0, m5 pshufd m5, m0, q1032 paddd m0, m5 psrad m0, m1 ; sum >>= log2sz; packssdw m0, m0 RELOAD_ACQ_32 acq ; ac = ac_orig .sub_loop: mova m1, [acq] psubw m1, m0 mova [acq], m1 add acq, 16 sub szd, 8 jg .sub_loop RET ; %1 simd register that hold the mask and will hold the result ; %2 simd register that holds the "true" values ; %3 location of the "false" values (simd register/memory) %macro BLEND 3 ; mask, true, false pand %2, %1 pandn %1, %3 por %1, %2 %endmacro %macro PAETH 2 ; top, ldiff pavgb m1, m%1, m3 pxor m0, m%1, m3 pand m0, m4 psubusb m2, m5, m1 psubb m1, m0 psubusb m1, m5 por m1, m2 paddusb m1, m1 por m1, m0 ; min(tldiff, 255) psubusb m2, m5, m3 psubusb m0, m3, m5 por m2, m0 ; tdiff %ifnum %2 pminub m2, m%2 pcmpeqb m0, m%2, m2 ; ldiff <= tdiff %else mova m0, %2 pminub m2, m0 pcmpeqb m0, m2 %endif pminub m1, m2 pcmpeqb m1, m2 ; ldiff <= tldiff && tdiff <= tldiff mova m2, m3 BLEND m0, m2, m%1 BLEND m1, m0, m5 %endmacro cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride, tl, w, h %define base r5-ipred_paeth_ssse3_table tzcnt wd, wm movifnidn hd, hm pxor m0, m0 movd m5, [tlq] pshufb m5, m0 LEA r5, ipred_paeth_ssse3_table movsxd wq, [r5+wq*4] movddup m4, [base+ipred_paeth_shuf] add wq, r5 jmp wq .w4: movd m6, [tlq+1] ; top pshufd m6, m6, q0000 lea r3, [strideq*3] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 ; ldiff .w4_loop: sub tlq, 4 movd m3, [tlq] mova m1, [base+ipred_h_shuf] pshufb m3, m1 ; left PAETH 6, 7 movd [dstq ], m1 pshuflw m0, m1, q1032 movd [dstq+strideq ], m0 punpckhqdq m1, m1 movd [dstq+strideq*2], m1 psrlq m1, 32 movd [dstq+r3 ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4_loop RET ALIGN function_align .w8: movddup m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w8_loop: sub tlq, 2 movd m3, [tlq] pshufb m3, [base+ipred_paeth_shuf] PAETH 6, 7 movq [dstq ], m1 movhps [dstq+strideq], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 .w16_loop: sub tlq, 1 movd m3, [tlq] pxor m1, m1 pshufb m3, m1 PAETH 6, 7 mova [dstq], m1 add dstq, strideq sub hd, 1 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 .w32_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] mova [dstq ], m1 mova m6, [rsp+32] PAETH 6, 7 mova [dstq+16], m1 add dstq, strideq dec hd jg .w32_loop RET ALIGN function_align .w64: movu m6, [tlq+1] psubusb m7, m5, m6 psubusb m0, m6, m5 
por m7, m0 mova [rsp ], m6 mova [rsp+16], m7 movu m6, [tlq+17] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+32], m6 mova [rsp+48], m7 movu m6, [tlq+33] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+64], m6 mova [rsp+80], m7 movu m6, [tlq+49] psubusb m7, m5, m6 psubusb m0, m6, m5 por m7, m0 mova [rsp+96], m6 .w64_loop: dec tlq movd m3, [tlq] pxor m1, m1 pshufb m3, m1 mova m6, [rsp] PAETH 6, [rsp+16] mova [dstq ], m1 mova m6, [rsp+32] PAETH 6, [rsp+48] mova [dstq+16], m1 mova m6, [rsp+64] PAETH 6, [rsp+80] mova [dstq+32], m1 mova m6, [rsp+96] PAETH 6, 7 mova [dstq+48], m1 add dstq, strideq dec hd jg .w64_loop RET %macro FILTER 4 ;dst, src, tmp, shuf %ifnum %4 pshufb m%2, m%4 %else pshufb m%2, %4 %endif pshufd m%1, m%2, q0000 ;p0 p1 pmaddubsw m%1, m2 pshufd m%3, m%2, q1111 ;p2 p3 pmaddubsw m%3, m3 paddw m%1, [base+pw_8] paddw m%1, m%3 pshufd m%3, m%2, q2222 ;p4 p5 pmaddubsw m%3, m4 paddw m%1, m%3 pshufd m%3, m%2, q3333 ;p6 __ pmaddubsw m%3, m5 paddw m%1, m%3 psraw m%1, 4 packuswb m%1, m%1 %endmacro cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w, h, filter %define base r6-$$ LEA r6, $$ tzcnt wd, wm %ifidn filterd, filterm movzx filterd, filterb %else movzx filterd, byte filterm %endif shl filterd, 6 lea filterq, [base+filter_intra_taps+filterq] movq m0, [tlq-3] ;_ 6 5 0 1 2 3 4 movsxd wq, [base+ipred_filter_ssse3_table+wq*4] mova m2, [filterq+16*0] mova m3, [filterq+16*1] mova m4, [filterq+16*2] mova m5, [filterq+16*3] lea wq, [base+ipred_filter_ssse3_table+wq] mov hd, hm jmp wq .w4: mova m1, [base+filter_shuf1] sub tlq, 3 sub tlq, hq jmp .w4_loop_start .w4_loop: movd m0, [tlq+hq] punpckldq m0, m6 lea dstq, [dstq+strideq*2] .w4_loop_start: FILTER 6, 0, 7, 1 movd [dstq+strideq*0], m6 pshuflw m6, m6, q1032 movd [dstq+strideq*1], m6 sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4 sub tlq, 5 sub tlq, hq .w8_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 FILTER 0, 6, 1, [base+filter_shuf2] punpckldq m6, m7, m0 movq [dstq+strideq*0], m6 punpckhqdq m6, m6 movq [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: movu m6, [tlq+1] ;top row sub tlq, 5 sub tlq, hq .w16_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: movu m6, [tlq+1] ;top row lea filterq, [tlq+17] sub tlq, 5 sub tlq, hq .w32_loop: FILTER 7, 0, 1, [base+filter_shuf1] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+4+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+8+strideq*0], m7 psrlq m7, 32 
palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movu m1, [filterq] punpckldq m0, m7, m1 ;_ _ _ 0 1 2 3 4 _ _ _ _ _ _ _ _ punpcklqdq m0, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+12+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+strideq*1], m6 mova m6, m1 FILTER 7, 0, 6, [base+filter_shuf2] punpcklqdq m0, m1, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+16+strideq*0], m7 psrlq m7, 32 palignr m7, m1, 4 FILTER 6, 0, 1, [base+filter_shuf2] punpcklqdq m0, m7, m6 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+20+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 FILTER 7, 0, 1, [base+filter_shuf2] punpcklqdq m0, m6, m7 ;_ _ _ 0 1 2 3 4 _ _ _ 5 _ _ _ 6 movd [dstq+24+strideq*0], m7 psrlq m7, 32 palignr m7, m6, 4 FILTER 6, 0, 1, [base+filter_shuf2] movd [dstq+28+strideq*0], m6 psrlq m6, 32 palignr m6, m7, 4 mova [dstq+16+strideq*1], m6 mova m6, [dstq+strideq*1] movd m0, [tlq+hq] ;_ 6 5 0 punpckldq m0, m6 ;_ 6 5 0 1 2 3 4 lea filterq, [dstq+16+strideq*1] lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET rav1e-0.7.1/src/x86/itx16_avx2.asm000064400000000000000000011564541046102023000145200ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; Copyright © 2021, Matthias Dressel ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
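; High bit-depth (10 and 12 bpc) inverse transforms, AVX2. Intermediates are kept in
; 32-bit lanes with 12-bit fixed-point coefficients; most butterfly stages round with
; +2048 and shift right by 12, as in the ITX_MULSUB_2D helper defined further down.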
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 %macro COEF_PAIR 2-3 0 pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) %if %3 dd -%2, -%2 %define pd_%2_m%2 pd_%2 %endif %endmacro COEF_PAIR 201, 995 COEF_PAIR 401, 1931 COEF_PAIR 799, 3406 COEF_PAIR 1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2751, 2106 COEF_PAIR 2896, 1567, 1 COEF_PAIR 2896, 3784, 1 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 COEF_PAIR 3857, 4052 COEF_PAIR 4017, 2276 COEF_PAIR 4076, 3612 COEF_PAIR 4091, 3973 pd_8: dd 8 pd_m601: dd -601 pd_m1189: dd -1189 pd_m1380: dd -1380 pd_m2106: dd -2106 pd_m2598: dd -2598 pd_m2751: dd -2751 pd_m3344: dd -3344 pd_1024: dd 1024 pd_1321: dd 1321 pd_1448: dd 1448 pd_1697: dd 1697 pd_2482: dd 2482 pd_3072: dd 3072 ; 1024 + 2048 pd_3803: dd 3803 pd_5119: dd 5119 ; 1024 + 4096 - 1 pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 pd_6144: dd 6144 ; 2048 + 4096 pd_17408: dd 17408 ; 1024 + 16384 pixel_10bpc_max: times 2 dw 0x03ff pixel_12bpc_max: times 2 dw 0x0fff dconly_10bpc: times 2 dw 0x7c00 dconly_12bpc: times 2 dw 0x7000 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff clip_20b_min: dd -0x80000 clip_20b_max: dd 0x7ffff const idct64_mul_16bpc dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 cextern deint_shuf cextern idct64_mul cextern pw_1697x8 cextern pw_1697x16 cextern pw_1567_3784 cextern pw_m1567_m3784 cextern pw_m3784_1567 cextern pw_2896_2896 cextern pw_m2896_2896 cextern pw_5 cextern pw_2048 cextern pw_4096 cextern pw_8192 cextern pw_16384 cextern pw_2896x8 cextern pd_2048 cextern idct_4x8_internal_8bpc_avx2.main cextern idct_4x16_internal_8bpc_avx2.main cextern idct_8x8_internal_8bpc_avx2.main cextern idct_8x16_internal_8bpc_avx2.main cextern idct_16x4_internal_8bpc_avx2.main cextern idct_16x8_internal_8bpc_avx2.main cextern idct_16x16_internal_8bpc_avx2.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal cextern iadst_4x4_internal_8bpc_avx2.main cextern iadst_4x8_internal_8bpc_avx2.main_pass2 cextern iadst_4x16_internal_8bpc_avx2.main2 cextern iadst_8x4_internal_8bpc_avx2.main cextern iadst_8x8_internal_8bpc_avx2.main_pass2 cextern iadst_8x16_internal_8bpc_avx2.main cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end cextern iadst_16x4_internal_8bpc_avx2.main cextern iadst_16x8_internal_8bpc_avx2.main cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end cextern iadst_16x16_internal_8bpc_avx2.main cextern 
iadst_16x16_internal_8bpc_avx2.main_pass2_end SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %macro WRAP_XMM 1+ INIT_XMM cpuname %1 INIT_YMM cpuname %endmacro %macro IWHT4_1D_PACKED 0 ; m0 = in0 in2, m1 = in1 in3 psubd m2, m0, m1 ; t2 paddd xm0, xm1 ; t0 vpermq m2, m2, q3322 vpermq m0, m0, q1100 vpermq m1, m1, q3120 psubd m3, m0, m2 psrad m3, 1 psubd m3, m1 ; t1 t3 psubd m0, m3 ; ____ out0 paddd m2, m3 ; out3 ____ %endmacro INIT_YMM avx2 cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax mova xm0, [cq+16*0] vinserti128 m0, [cq+16*2], 1 mova xm1, [cq+16*1] vinserti128 m1, [cq+16*3], 1 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 lea r6, [dstq+strideq*2] psrad m0, 2 psrad m1, 2 IWHT4_1D_PACKED punpckhdq m0, m3 punpckldq m3, m2 punpckhqdq m1, m0, m3 punpcklqdq m0, m3 IWHT4_1D_PACKED vpblendd m0, m2, 0x33 packssdw m0, m3 vextracti128 xm2, m0, 1 punpckhdq xm1, xm0, xm2 ; out2 out1 punpckldq xm0, xm2 ; out3 out0 movq xm2, [r6 +strideq*1] movhps xm2, [dstq+strideq*0] movq xm3, [r6 +strideq*0] movhps xm3, [dstq+strideq*1] %ifidn bdmaxd, bdmaxm movd xm5, bdmaxd vpbroadcastw xm5, xm5 %else ; win64: load from stack vpbroadcastw xm5, bdmaxm %endif paddsw xm0, xm2 paddsw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movq [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm0 RET ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 ; flags: 1 = packed, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else %if %9 & 1 vbroadcasti128 m%3, [pd_%8] %else vpbroadcastd m%3, [pd_%8] %endif pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif %if %7 < 32 pmulld m%1, m%7 pmulld m%2, m%7 %else %if %9 & 1 vbroadcasti128 m%5, [pd_%7] %else vpbroadcastd m%5, [pd_%7] %endif pmulld m%1, m%5 pmulld m%2, m%5 %endif %if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else %ifnum %6 paddd m%4, m%6 %endif paddd m%2, m%4 %endif %ifnum %6 paddd m%1, m%6 %endif psubd m%1, m%3 %ifnum %6 psrad m%2, 12 psrad m%1, 12 %endif %endmacro %macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_%5bpc) ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
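; tx2q holds the address of the 2nd pass; the 1st-pass routines finish with "jmp tx2q".
; For dct_dct, a zero eob means only the DC coefficient can be nonzero, so execution
; falls through to the DC-only shortcut emitted by the size-specific wrapper macro.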
lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else %if %3 add eobd, %3 %endif ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 .dconly2: add r6d, 128 sar r6d, 8 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm2 vpbroadcastw xm0, xm0 .dconly_loop: movq xm1, [dstq+strideq*0] movhps xm1, [dstq+strideq*1] paddsw xm1, xm0 psubusw xm1, xm2 movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop WRAP_XMM RET %else jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly %endif %endif %endmacro %macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 punpckhqdq m%3, m%2, m%1 ; t3 t2 punpcklqdq m%2, m%1 ; t0 t1 paddd m%1, m%2, m%3 ; out0 out1 psubd m%2, m%3 ; out3 out2 %endmacro %macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd vpbroadcastd m%5, [pw_m3784_1567] punpckhwd m%3, m%2, m%1 vpbroadcastd m%4, [pw_1567_3784] punpcklwd m%2, m%1 vpbroadcastd m%1, [pw_m2896_2896] pmaddwd m%5, m%3 pmaddwd m%3, m%4 vpbroadcastd m%4, [pw_2896_2896] pmaddwd m%1, m%2 pmaddwd m%2, m%4 REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 packssdw m%3, m%5 ; t3 t2 packssdw m%2, m%1 ; t0 t1 paddsw m%1, m%2, m%3 ; out0 out1 psubsw m%2, m%3 ; out3 out2 %endmacro INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, identity INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main vbroadcasti128 m2, [idct4_shuf] packssdw m0, m1 pshufb m0, m2 jmp tx2q .pass2: vextracti128 xm1, m0, 1 WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 packssdw xm5, xm5 ; pw_2048 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, [dstq+strideq*2] movq xm3, [r6 +strideq*1] movhps xm3, [r6 +strideq*0] vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movhps [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm1 RET ALIGN function_align .main: vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m5, [pd_2048] .main2: IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 ret INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity %macro IADST4_1D 0 vpbroadcastd m5, [pd_1321] vpbroadcastd m7, [pd_2482] pmulld m4, m0, m5 ; 1321*in0 pmulld m6, m3, m7 ; 2482*in3 paddd m4, m6 ; 1321*in0 + 2482*in3 pmulld m6, m0, m7 ; 2482*in0 paddd m0, m3 ; in0 + in3 paddd m7, m5 ; pd_3803 pmulld m5, m2 ; 1321*in2 pmulld m3, m7 ; 3803*in3 pmulld m7, m2 ; 3803*in2 psubd m2, m0 ; in2 - in0 - in3 vpbroadcastd m0, [pd_m3344] pmulld m1, m0 ; -t3 pmulld m2, m0 ; out2 (unrounded) psubd m6, m5 ; 2482*in0 - 1321*in2 paddd m4, m7 ; t0 psubd m6, m3 ; t1 paddd m3, m4, m6 psubd m4, m1 ; out0 (unrounded) psubd m6, m1 ; out1 (unrounded) paddd m3, m1 ; out3 (unrounded) %endmacro cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call .main vinserti128 m0, m4, xm6, 1 vinserti128 m1, m2, xm3, 1 
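; round the 32-bit pass-1 results (+2048, >>12), pack to 16-bit and permute via
; itx4_shuf into the layout expected by the shared 8bpc pass-2 code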
.pass1_end: vpbroadcastd m5, [pd_2048] mova m2, [itx4_shuf] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 packssdw m0, m1 vpermd m0, m2, m0 psrld m2, 4 pshufb m0, m2 %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif jmp tx2q .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main .end: vpbroadcastd xm4, [pw_2048] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, [dstq+strideq*2] movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 RET ALIGN function_align .main: mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] mova xm3, [cq+16*3] %if WIN64 movaps [rsp+16], xmm6 movaps [rsp+32], xmm7 %endif .main2: WRAP_XMM IADST4_1D ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m0, m3, xm2, 1 vinserti128 m1, m6, xm4, 1 jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main vpbroadcastd xm4, [pw_2048] movq xm3, [dstq+strideq*1] movhps xm3, [dstq+strideq*0] lea r6, [dstq+strideq*2] movq xm2, [r6 +strideq*1] movhps xm2, [r6 +strideq*0] vpbroadcastd xm5, [pixel_10bpc_max] pmulhrsw xm0, xm4 pmulhrsw xm1, xm4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm4 pmaxsw xm1, xm4 pminsw xm0, xm5 pminsw xm1, xm5 movhps [dstq+strideq*0], xm1 movq [dstq+strideq*1], xm1 movhps [r6 +strideq*0], xm0 movq [r6 +strideq*1], xm0 RET INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 vpbroadcastd m1, [pd_5793] pmulld m0, m1, [cq+32*0] pmulld m1, [cq+32*1] vpbroadcastd m5, [pd_2048] mova m3, [itx4_shuf] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 packssdw m0, m1 vpermd m0, m3, m0 psrld m3, 4 pshufb m0, m3 jmp tx2q .pass2: vpbroadcastd m1, [pw_1697x8] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] lea r6, [dstq+strideq*2] pmulhrsw m1, m0 paddsw m0, m1 movq xm3, [r6 +strideq*0] movhps xm3, [r6 +strideq*1] vpbroadcastd xm4, [pixel_10bpc_max] packssdw m5, m5 ; pw_2048 pmulhrsw m0, m5 pxor m5, m5 mova [cq+32*0], m5 mova [cq+32*1], m5 vextracti128 xm1, m0, 1 paddw xm0, xm2 paddw xm1, xm3 pmaxsw xm0, xm5 pmaxsw xm1, xm5 pminsw xm0, xm4 pminsw xm1, xm4 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [r6 +strideq*0], xm1 movhps [r6 +strideq*1], xm1 RET INV_TXFM_4X4_FN dct, dct, 12 INV_TXFM_4X4_FN dct, identity, 12 INV_TXFM_4X4_FN dct, adst, 12 INV_TXFM_4X4_FN dct, flipadst, 12 cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(idct_4x4_internal_10bpc).main mova m3, [idct4_12_shuf] mova m4, [idct4_12_shuf2] vpermd m2, m4, m1 vpermd m1, m3, m0 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_4x4_internal_10bpc).main2 vpermq m0, m0, q3120 vpermq m1, m1, q2031 jmp m(iadst_4x4_internal_12bpc).end INV_TXFM_4X4_FN adst, dct, 12 
INV_TXFM_4X4_FN adst, adst, 12 INV_TXFM_4X4_FN adst, flipadst, 12 INV_TXFM_4X4_FN adst, identity, 12 cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m1, m4, xm6, 1 vinserti128 m2, xm3, 1 .pass1_end: mova m3, [itx4_shuf] vpbroadcastd m5, [pd_1024] psrad m1, 1 psrad m2, 1 vpermd m1, m3, m1 vpermd m2, m3, m2 paddd m1, m5 paddd m2, m5 psrad m1, 11 psrad m2, 11 .pass1_end2: vpbroadcastd m3, [clip_18b_min] vpbroadcastd m4, [clip_18b_max] punpcklqdq m0, m1, m2 punpckhqdq m1, m2 pmaxsd m0, m3 pmaxsd m1, m3 pminsd m0, m4 pminsd m1, m4 jmp tx2q .pass2: call .main_pass2 vinserti128 m0, m4, xm6, 1 vinserti128 m1, m2, xm3, 1 .pass2_end: vpbroadcastd m5, [pd_2048] paddd m0, m5 paddd m1, m5 psrad m0, 12 psrad m1, 12 .end: %if WIN64 WIN64_RESTORE_XMM_INTERNAL %assign xmm_regs_used 6 %endif .end2: vpbroadcastd m4, [pw_16384] movq xm2, [dstq+strideq*0] movq xm3, [dstq+strideq*1] lea r6, [dstq+strideq*2] movhps xm2, [r6 +strideq*0] ; dst0 dst2 movhps xm3, [r6 +strideq*1] ; dst1 dst3 vpbroadcastd m5, [pixel_12bpc_max] vinserti128 m2, xm3, 1 psrad m0, 3 psrad m1, 3 packssdw m0, m1 ; t0 t2 t1 t3 pmulhrsw m0, m4 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 paddw m0, m2 ; out0 out2 out1 out3 pmaxsw m0, m4 pminsw m0, m5 vextracti128 xm1, m0, 1 ; out1 out3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [r6 +strideq*0], xm0 movhps [r6 +strideq*1], xm1 RET .main_pass2: vextracti128 xm3, m1, 1 mova xm2, xm1 vextracti128 xm1, m0, 1 jmp m(iadst_4x4_internal_10bpc).main2 INV_TXFM_4X4_FN flipadst, dct, 12 INV_TXFM_4X4_FN flipadst, adst, 12 INV_TXFM_4X4_FN flipadst, flipadst, 12 INV_TXFM_4X4_FN flipadst, identity, 12 cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_10bpc).main vinserti128 m1, m3, xm2, 1 vinserti128 m2, m6, xm4, 1 jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: call m(iadst_4x4_internal_12bpc).main_pass2 vinserti128 m0, m3, xm2, 1 vinserti128 m1, m6, xm4, 1 jmp m(iadst_4x4_internal_12bpc).pass2_end INV_TXFM_4X4_FN identity, dct, 12 INV_TXFM_4X4_FN identity, adst, 12 INV_TXFM_4X4_FN identity, flipadst, 12 INV_TXFM_4X4_FN identity, identity, 12 cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 mova m2, [itx4_shuf] vpbroadcastd m3, [pd_1697] vpermd m0, m2, [cq+32*0] vpermd m2, m2, [cq+32*1] vpbroadcastd m5, [pd_2048] pmulld m1, m3, m0 pmulld m3, m2 paddd m1, m5 paddd m3, m5 psrad m1, 12 psrad m3, 12 paddd m1, m0 paddd m2, m3 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] vpbroadcastd m5, [pd_2048] pmulld m0, m3 pmulld m1, m3 paddd m0, m5 ; 2048 paddd m1, m5 psrad m0, 12 psrad m1, 12 jmp m(iadst_4x4_internal_12bpc).end %macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x8, %3 %ifidn %1_%2, dct_dct vpbroadcastd xm2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 %else jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly %endif %endif %endmacro %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 vpbroadcastd m%5, [pd_2896] pmulld m%1, m%5 pmulld m%3, m%5 paddd m%1, m%8 paddd m%5, m%1, m%3 psubd m%1, m%3 psrad m%5, 12 ; t0 psrad m%1, 12 ; t1 psubd m%3, m%1, m%2 paddd m%2, m%1 paddd m%1, m%5, m%4 psubd m%4, m%5, m%4 %endmacro INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, identity INV_TXFM_4X8_FN dct, adst 
INV_TXFM_4X8_FN dct, flipadst cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] pmulld m2, m3, [cq+32*2] pmulld m3, m3, [cq+32*3] vpbroadcastd m7, [pd_2048] REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 jmp tx2q .pass2: packssdw m0, m2 packssdw m1, m3 lea r6, [deint_shuf+128] punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ; 2 3 punpckldq m0, m2 ; 0 1 vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 call m(idct_4x8_internal_8bpc).main vpbroadcastd xm4, [pw_2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+r3 ] movhps xm5, [dstq+strideq*2] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] movq xm7, [r6 +r3 ] movhps xm7, [r6 +strideq*2] paddw xm0, xm4 ; 0 1 paddw xm1, xm5 ; 3 2 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 7 6 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movhps [dstq+strideq*2], xm1 movq [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movhps [r6 +strideq*2], xm3 movq [r6 +r3 ], xm3 RET INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main vpbroadcastd m5, [pd_2048] paddd m0, m5, m4 paddd m1, m5, m6 paddd m2, m5 paddd m3, m5 .pass1_end: REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: call .pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 .end: lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+strideq*2] movhps xm5, [dstq+r3 ] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] movq xm7, [r6 +strideq*2] movhps xm7, [r6 +r3 ] paddw xm0, xm4 ; 0 1 paddw xm1, xm5 ; 2 3 paddw xm2, xm6 ; 4 5 paddw xm3, xm7 ; 6 7 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 RET ALIGN function_align .pass2_main: packssdw m0, m2 packssdw m1, m3 lea r6, [deint_shuf+128] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpckhdq m5, m4, m0 punpckldq m4, m0 vextracti128 xm2, m4, 1 ; 4 5 vextracti128 xm3, m5, 1 ; 6 7 pshufd xm4, xm4, q1032 ; 1 0 pshufd xm5, xm5, q1032 ; 3 2 jmp m(iadst_4x8_internal_8bpc).main_pass2 ALIGN function_align .main: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .main2: vbroadcasti128 m0, [cq+16*0] vbroadcasti128 m2, [cq+16*2] vbroadcasti128 m3, [cq+16*5] vbroadcasti128 m1, [cq+16*7] vpbroadcastd m6, [pd_2896] shufpd m0, m2, 0x0c ; 0 2 shufpd m1, m3, 0x0c ; 7 5 vbroadcasti128 m2, [cq+16*4] vbroadcasti128 m4, [cq+16*6] vbroadcasti128 m5, [cq+16*1] vbroadcasti128 m3, [cq+16*3] vpbroadcastd m7, [pd_2048] shufpd m2, m4, 0x0c ; 4 6 shufpd m3, m5, 0x0c ; 3 1 REPX {pmulld x, m6}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 .main3: ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 
401_1931, 4076_3612, 1 ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 psubd m4, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 REPX {pmaxsd x, m8}, m4, m2, m0, m1 REPX {pminsd x, m9}, m4, m2, m0, m1 pxor m5, m5 psubd m5, m4 vpblendd m4, m2, 0xcc ; t4 t7 vpblendd m2, m5, 0xcc ; t5 -t6 ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 vpbroadcastd m5, [pd_2896] vbroadcasti128 m6, [pw_2048_m2048] ; + + - - punpckhqdq m3, m0, m1 punpcklqdq m0, m1 psubd m1, m0, m3 ; t2 t3 paddd m0, m3 ; out0 -out7 punpckhqdq m3, m4, m2 ; t7a t6a punpcklqdq m4, m2 ; t5a t4a psubd m2, m4, m3 ; t7 t6 paddd m4, m3 ; out6 -out1 REPX {pmaxsd x, m8}, m1, m2 REPX {pminsd x, m9}, m1, m2 vpblendd m3, m1, m2, 0xcc shufpd m1, m2, 0x05 pmulld m3, m5 pmulld m5, m1 psignd m0, m6 ; out0 out7 psignd m4, m6 ; out6 out1 paddd m3, m7 psubd m2, m3, m5 paddd m5, m3 psrad m2, 12 ; out4 -out5 psrad m5, 12 ; -out3 out2 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main vpbroadcastd m5, [pd_2048] paddd m0, m5, m3 paddd m1, m5, m2 paddd m2, m5, m6 paddd m3, m5, m4 jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*1] movhps xm4, [dstq+strideq*0] movq xm5, [dstq+r3 ] movhps xm5, [dstq+strideq*2] movq xm6, [r6 +strideq*1] movhps xm6, [r6 +strideq*0] movq xm7, [r6 +r3 ] movhps xm7, [r6 +strideq*2] paddw xm3, xm4 ; 1 0 paddw xm2, xm5 ; 3 2 paddw xm1, xm6 ; 5 4 paddw xm0, xm7 ; 7 6 vpbroadcastd xm5, [pixel_10bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 movhps [dstq+strideq*0], xm3 movq [dstq+strideq*1], xm3 movhps [dstq+strideq*2], xm2 movq [dstq+r3 ], xm2 movhps [r6 +strideq*0], xm1 movq [r6 +strideq*1], xm1 movhps [r6 +strideq*2], xm0 movq [r6 +r3 ], xm0 RET INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m3, [pd_2896] pmulld m0, m3, [cq+32*0] pmulld m1, m3, [cq+32*1] pmulld m2, m3, [cq+32*2] pmulld m3, [cq+32*3] vpbroadcastd m5, [pd_2048] vpbroadcastd m4, [pd_5793] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: vpbroadcastd m6, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass2_end: vpbroadcastd m4, [pw_4096] packssdw m0, m2 packssdw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 pmulhrsw m2, m4 pmulhrsw m0, m4 punpckhdq m1, m0, m2 ; 2 3 6 7 punpckldq m0, m2 ; 0 1 4 5 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] vpbroadcastq m4, [r6 +strideq*0] vpbroadcastq m5, [r6 +strideq*1] movq xm3, [dstq+strideq*2] movhps xm3, [dstq+r3 ] vpblendd m2, m4, 0x30 vpblendd m2, m5, 0xc0 vpbroadcastq m4, [r6 +strideq*2] vpbroadcastq m5, [r6 +r3 ] vpblendd m3, m4, 0x30 vpblendd m3, m5, 0xc0 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 ; out0 out1 out4 out5 paddw m1, m3 ; out2 out3 out6 out7 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m6 pminsw m1, m6 vextracti128 
xm2, m0, 1 ; out4 out5 vextracti128 xm3, m1, 1 ; out6 out7 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 ret INV_TXFM_4X8_FN dct, dct, 12 INV_TXFM_4X8_FN dct, identity, 12 INV_TXFM_4X8_FN dct, adst, 12 INV_TXFM_4X8_FN dct, flipadst, 12 cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(idct_4x8_internal_10bpc).pass1 .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 ; transpose & interleave pshufd m0, m0, q1320 pshufd m1, m1, q1320 pshufd m2, m2, q1320 pshufd m3, m3, q1320 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 vpermq m0, m0, q3102 vpermq m2, m2, q3102 vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) vpbroadcastd m7, [pd_2048] call m(idct_8x4_internal_10bpc).main psubd m3, m0, m4 ; out7 out6 paddd m0, m4 ; out0 out1 paddd m1, m2, m5 ; out3 out2 psubd m2, m5 ; out4 out5 pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp m(iadst_4x8_internal_12bpc).end INV_TXFM_4X8_FN adst, dct, 12 INV_TXFM_4X8_FN adst, adst, 12 INV_TXFM_4X8_FN adst, flipadst, 12 INV_TXFM_4X8_FN adst, identity, 12 cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main psrad m0, m4, 1 psrad m1, m6, 1 psrad m2, 1 psrad m3, 1 .pass1_end: vpbroadcastd m5, [pd_1024] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 11}, m0, m1, m2, m3 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 packssdw m0, m2 ; 0 1 4 5 (interleaved) packssdw m1, m3 ; 2 3 6 7 (interleaved) mova m2, [iadst8_12_shuf] vpermd m0, m2, m0 ; 0 1 4 5 vpermd m1, m2, m1 ; 2 3 6 7 pmulhrsw m0, m4 pmulhrsw m1, m4 lea r3, [strideq*3] lea r6, [dstq+strideq*4] movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] movq xm5, [dstq+strideq*2] movhps xm5, [dstq+r3 ] movq xm6, [r6 +strideq*0] movhps xm6, [r6 +strideq*1] vinserti128 m4, xm6, 1 movq xm7, [r6 +strideq*2] movhps xm7, [r6 +r3 ] vinserti128 m5, xm7, 1 paddw m0, m4 ; 0 1 4 5 paddw m1, m5 ; 2 3 6 7 vpbroadcastd m5, [pixel_12bpc_max] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 REPX {pmaxsw x, m4}, m0, m1 REPX {pminsw x, m5}, m0, m1 vextracti128 xm2, m0, 1 ; out4 out5 vextracti128 xm3, m1, 1 ; out6 out7 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 movq [dstq+strideq*2], xm1 movhps [dstq+r3 ], xm1 movq [r6 +strideq*0], xm2 movhps [r6 +strideq*1], xm2 movq [r6 +strideq*2], xm3 movhps [r6 +r3 ], xm3 RET ALIGN function_align .pass2_main: ; transpose & interleave pshufd m0, m0, q1320 pshufd m1, m1, q1320 pshufd m2, m2, q1320 pshufd m3, m3, q1320 punpckldq m4, m0, m1 punpckhdq m0, m1 punpckldq m5, m2, m3 punpckhdq m2, m3 vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) vpbroadcastd m7, [pd_2048] jmp m(iadst_4x8_internal_10bpc).main3 
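; note: the 12 bpc 4x8 dct/adst/flipadst paths clamp pass-2 intermediates to 18 bits
; (clip_18b_min/clip_18b_max) before reusing the shared 10 bpc kernels; the identity
; transform omits the clamp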
INV_TXFM_4X8_FN flipadst, dct, 12 INV_TXFM_4X8_FN flipadst, adst, 12 INV_TXFM_4X8_FN flipadst, flipadst, 12 INV_TXFM_4X8_FN flipadst, identity, 12 cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_8x4_internal_10bpc).main psrad m0, m3, 1 psrad m1, m2, 1 psrad m2, m6, 1 psrad m3, m4, 1 jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_4x8_internal_12bpc).pass2_main shufpd m3, m4, m0, 0x05 ; out1 out0 shufpd m0, m4, 0x05 ; out7 out6 psignd m2, m6 pshufd m6, m6, q1032 pshufd m1, m2, q1032 ; out5 out4 psignd m2, m5, m6 ; out3 out2 jmp m(iadst_4x8_internal_12bpc).end INV_TXFM_4X8_FN identity, dct, 12 INV_TXFM_4X8_FN identity, adst, 12 INV_TXFM_4X8_FN identity, flipadst, 12 INV_TXFM_4X8_FN identity, identity, 12 cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_4x8_internal_10bpc).pass1 .pass2: ; m0 = in0 in1 ; m1 = in2 in3 ; m2 = in4 in5 ; m3 = in6 in7 vpbroadcastd m6, [pixel_12bpc_max] call m(iidentity_4x8_internal_10bpc).pass2_end RET %macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 4x16, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd xm2, [dconly_%3bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, identity INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m10, [pd_3072] mova m1, [cq+32*2] mova m3, [cq+32*6] mova m5, [cq+32*3] mova m7, [cq+32*7] call .pass1_main pmulld m0, m6, [cq+32*0] pmulld m2, m6, [cq+32*4] pmulld m4, m6, [cq+32*1] pmulld m6, [cq+32*5] call .pass1_main2 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m4 ; 2 3 punpckldq m0, m4 ; 0 1 punpckldq m4, m5, m2 ; 8 9 punpckhdq m5, m2 ; a b vextracti128 xm2, m0, 1 ; 4 5 vextracti128 xm3, m1, 1 ; 6 7 vextracti128 xm6, m4, 1 ; c d vextracti128 xm7, m5, 1 ; e f call m(idct_4x16_internal_8bpc).main vpbroadcastd m9, [pw_2048] vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 vinserti128 m2, m4, xm5, 1 ; 8 9 b a vinserti128 m3, m6, xm7, 1 ; c d f e vpbroadcastd m8, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass1_main: vpbroadcastd m4, [pd_3784] vpbroadcastd m8, [pd_1567] vpbroadcastd m9, [pd_2048] vpbroadcastd m6, [pd_1448] ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h ret ALIGN function_align .pass1_main2: paddd m0, m10 paddd m4, m10 paddd m8, m0, m2 psubd m0, m2 paddd m9, m4, m6 psubd m4, m6 REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h psubd m2, m0, m1 paddd m1, m0 psubd m6, m4, m5 paddd m5, m4 paddd m0, m8, m3 psubd m3, m8, m3 paddd m4, m9, m7 psubd m7, m9, m7 ret ALIGN function_align .pass2_end: lea r6, [strideq*3] pxor m7, m7 pmulhrsw m0, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw m0, m2, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 ret ALIGN function_align .write_4x4: movq xm4, [dstq+strideq*0] movhps xm4, [dstq+strideq*1] vpbroadcastq m5, [dstq+strideq*2] vpbroadcastq m6, [dstq+r6 ] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 
32*2 vpblendd m4, m5, 0xc0 vpblendd m4, m6, 0x30 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm4 movhps [dstq+strideq*2], xm5 movq [dstq+r6 ], xm5 lea dstq, [dstq+strideq*4] ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m4, 13 psrad m1, m5, 13 psrad m2, 13 psrad m3, 13 psrad m4, m8, 13 psrad m5, m9, 13 psrad m6, 13 psrad m7, 13 jmp tx2q .pass2: call .pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, [pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 pxor m7, m7 psubw m9, m7, m5 vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 pmulhrsw m0, m4, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw m0, m2, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 RET ALIGN function_align .write_4x4: movq xm4, [dstq+r6 ] movhps xm4, [dstq+strideq*0] vpbroadcastq m5, [dstq+strideq*1] vpbroadcastq m6, [dstq+strideq*2] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 32*2 vpblendd m4, m5, 0xc0 vpblendd m4, m6, 0x30 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movhps [dstq+strideq*0], xm4 movhps [dstq+strideq*1], xm5 movq [dstq+strideq*2], xm5 movq [dstq+r6 ], xm4 lea dstq, [dstq+strideq*4] ret ALIGN function_align .pass2_main: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 lea r6, [deint_shuf+128] punpcklwd m4, m2, m3 punpckhwd m2, m3 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m4 punpckldq m0, m4 punpckldq m4, m5, m2 punpckhdq m5, m2 vpblendd m3, m0, m1, 0x33 vpblendd m0, m1, 0xcc shufpd m2, m5, m4, 0x05 shufpd m4, m5, 0x05 vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 vinserti128 m0, xm3, 1 ; 0 3 2 1 vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? 
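; note (.pass2_main): the 32-bit pass-1 rows are packed to 16 bits and
; rearranged here into the row order the 8bpc AVX2 iadst_4x16 kernel
; expects (0 3 2 1 / 4 7 6 5 / b 8 9 a / c f e d); its .main2 is then
; reused for pass 2, and pw_2896x8 (2896/4096 ~= 1/sqrt(2)) applies the
; remaining scale factor to the middle eight outputs.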
vinserti128 m2, xm4, 1 ; b 8 9 a call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m5, [pw_2896x8] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 ret ALIGN function_align .main: vbroadcasti128 m0, [cq+16* 0] vbroadcasti128 m4, [cq+16* 2] vbroadcasti128 m1, [cq+16*15] vbroadcasti128 m5, [cq+16*13] vbroadcasti128 m2, [cq+16* 4] vbroadcasti128 m6, [cq+16* 6] vbroadcasti128 m3, [cq+16*11] vbroadcasti128 m7, [cq+16* 9] shufpd m0, m4, 0x0c ; 0 2 shufpd m1, m5, 0x0c ; 15 13 shufpd m2, m6, 0x0c ; 4 6 shufpd m3, m7, 0x0c ; 11 9 vbroadcasti128 m4, [cq+16* 8] vbroadcasti128 m6, [cq+16*10] vbroadcasti128 m5, [cq+16* 7] vbroadcasti128 m7, [cq+16* 5] shufpd m4, m6, 0x0c ; 8 10 shufpd m5, m7, 0x0c ; 7 5 vbroadcasti128 m6, [cq+16*12] vbroadcasti128 m7, [cq+16*14] shufpd m6, m7, 0x0c ; 12 14 vbroadcasti128 m7, [cq+16* 3] vbroadcasti128 m8, [cq+16* 1] shufpd m7, m8, 0x0c ; 3 1 .main2: ; expects: m12 = clip_min m13 = clip_max vpbroadcastd m11, [pd_2048] ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 psubd m8, m0, m4 ; t8a t10a paddd m0, m4 ; t0a t2a psubd m4, m1, m5 ; t9a t11a paddd m1, m5 ; t1a t3a psubd m5, m2, m6 ; t12a t14a paddd m2, m6 ; t4a t6a psubd m6, m3, m7 ; t13a t15a paddd m3, m7 ; t5a t7a REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 psubd m7, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 psubd m3, m4, m6 ; t12a t14a paddd m4, m6 ; t8a t10a psubd m6, m8, m5 ; t13a t15a paddd m8, m5 ; t9a t11a REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 punpcklqdq m5, m3, m7 ; t12a t4 punpckhqdq m3, m7 ; t14a t6 punpckhqdq m7, m6, m2 ; t15a t7 punpcklqdq m6, m2 ; t13a t5 ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 vpbroadcastd m10, [pd_2896] vbroadcasti128 m9, [pw_2048_m2048] ; + + - - punpckhqdq m2, m4, m0 ; t10a t2 punpcklqdq m4, m0 ; t8a t0 punpckhqdq m0, m8, m1 ; t11a t3 punpcklqdq m8, m1 ; t9a t1 paddd m1, m6, m7 ; out2 -out3 psubd m6, m7 ; t14a t6 paddd m7, m5, m3 ; -out13 out12 psubd m5, m3 ; t15a t7 psubd m3, m8, m0 ; t11 t3a paddd m8, m0 ; out14 -out15 paddd m0, m4, m2 ; -out1 out0 psubd m4, m2 ; t10 t2a REPX {pmaxsd x, m12}, m6, m5, m3, m4 REPX {pminsd x, m13}, m6, m5, m3, m4 REPX {pmulld x, m10}, m6, m5, m3, m4 paddd m6, m11 paddd m4, m11 paddd m2, m6, m5 ; -out5 out4 psubd m6, m5 ; out10 -out11 psubd m5, m4, m3 ; -out9 out8 paddd m3, m4 ; out6 -out7 REPX {psrad x, 12}, m2, m3, m5, m6 REPX {psignd x, m9}, m1, m8, m3, m6 pshufd m9, m9, q1032 REPX {psignd x, m9}, m0, m7, m2, m5 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 .pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_6144] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m3, 13 psrad m1, m2, 13 psrad m2, m5, 13 psrad m3, m4, 13 psrad m4, m7, 13 psrad m5, m6, 13 psrad m6, m9, 13 psrad m7, m8, 13 jmp tx2q .pass2: call m(iadst_4x16_internal_10bpc).pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, 
[pixel_10bpc_max] lea r6, [strideq*3] vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 pxor m7, m7 psubw m9, m7, m5 vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 pmulhrsw m0, m4, m9 call .write_4x4 pmulhrsw m0, m2, m9 call .write_4x4 pmulhrsw m0, m1, m9 call .write_4x4 pmulhrsw m0, m3, m9 call .write_4x4 RET ALIGN function_align .write_4x4: movq xm4, [dstq+strideq*0] movhps xm4, [dstq+r6 ] vpbroadcastq m5, [dstq+strideq*1] vpbroadcastq m6, [dstq+strideq*2] mova [cq+32*0], m7 mova [cq+32*1], m7 add cq, 32*2 vpblendd m4, m5, 0x30 vpblendd m4, m6, 0xc0 paddw m4, m0 pmaxsw m4, m7 pminsw m4, m8 vextracti128 xm5, m4, 1 movq [dstq+strideq*0], xm4 movq [dstq+strideq*1], xm5 movhps [dstq+strideq*2], xm5 movhps [dstq+r6 ], xm4 lea dstq, [dstq+strideq*4] ret INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 vpbroadcastd m7, [pd_5793] pmulld m0, m7, [cq+32*0] pmulld m4, m7, [cq+32*1] pmulld m1, m7, [cq+32*2] pmulld m5, m7, [cq+32*3] pmulld m2, m7, [cq+32*4] pmulld m6, m7, [cq+32*5] pmulld m3, m7, [cq+32*6] pmulld m7, [cq+32*7] vpbroadcastd m8, [pd_6144] REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 jmp tx2q .pass2: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m7, [pw_1697x16] vpbroadcastd m8, [pw_2048] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 vpbroadcastd m4, [pixel_10bpc_max] call .pass2_end RET ALIGN function_align .pass2_end: punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 lea r6, [strideq*5] pxor m3, m3 punpckhdq m5, m0, m2 ; 2 3 6 7 punpckldq m0, m2 ; 0 1 4 5 punpckldq m6, m7, m1 ; 8 9 c d punpckhdq m7, m1 ; a b e f pmulhrsw m0, m8 call .write_2x4x2 pmulhrsw m0, m5, m8 call .write_2x4x2 pmulhrsw m0, m6, m8 lea dstq, [dstq+strideq*4] call .write_2x4x2 pmulhrsw m0, m7, m8 call .write_2x4x2 ret ALIGN function_align .write_2x4x2: movq xm1, [dstq+strideq*0] movhps xm1, [dstq+strideq*1] vpbroadcastq m2, [dstq+strideq*4] vpblendd m1, m2, 0x30 vpbroadcastq m2, [dstq+r6 ] vpblendd m1, m2, 0xc0 mova [cq+32*0], m3 mova [cq+32*1], m3 add cq, 32*2 paddw m1, m0 pmaxsw m1, m3 pminsw m1, m4 vextracti128 xm2, m1, 1 movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 movq [dstq+strideq*4], xm2 movhps [dstq+r6 ], xm2 lea dstq, [dstq+strideq*2] ret INV_TXFM_4X16_FN dct, dct, 12 INV_TXFM_4X16_FN dct, identity, 12 INV_TXFM_4X16_FN dct, adst, 12 INV_TXFM_4X16_FN dct, flipadst, 12 cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_4x16_internal_10bpc).pass1 .pass2: punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m9, m2, m3 punpckhdq m2, m3 punpckldq m1, m4, m5 punpckhdq m4, m5 punpckldq m3, m6, m7 punpckhdq m6, m7 punpcklqdq m5, m0, m2 ; 2 6 punpckhqdq m12, m0, m2 ; 3 7 punpcklqdq m0, m8, m9 ; 0 4 punpckhqdq m10, m8, m9 ; 1 5 punpcklqdq m2, m1, m3 ; 8 12 punpckhqdq m13, m1, m3 ; 9 13 punpcklqdq m9, m4, m6 ; 10 14 punpckhqdq m4, m6 ; 11 15 vperm2i128 m1, m5, m9, 0x20 ; 2 10 vperm2i128 m3, m9, m5, 0x31 ; 14 6 vpermq m11, m4, q1302 ; 15 11 ; interleave REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3, 
m10, m11, m12, m13 REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 call m(idct_16x4_internal_10bpc).pass1_main vpermq m6, m12, q1302 ; 7 3 vpermq m5, m13, q3120 ; 9 13 call m(idct_16x4_internal_10bpc).pass1_main2 call m(idct_16x4_internal_10bpc).pass1_main3 REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 mova m4, [idct16_12_shuf] REPX {vpermd x, m4, x}, m0, m1, m2, m3 vpbroadcastd m9, [pw_16384] vpbroadcastd m8, [pixel_12bpc_max] call m(idct_4x16_internal_10bpc).pass2_end RET INV_TXFM_4X16_FN adst, dct, 12 INV_TXFM_4X16_FN adst, adst, 12 INV_TXFM_4X16_FN adst, flipadst, 12 INV_TXFM_4X16_FN adst, identity, 12 cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 call .main_pass1 psrad m0, m4, 12 psrad m1, m5, 12 psrad m2, 12 psrad m3, 12 psrad m4, m8, 12 psrad m5, m9, 12 psrad m6, 12 psrad m7, 12 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose_16x4 call m(iadst_4x16_internal_10bpc).main2 pshufd m4, m5, q1032 psrad m5, m6, 3 pshufd m6, m7, q1032 psrad m7, m8, 3 REPX {pshufd x, x, q1032}, m0, m2 REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 .pass2_end: packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 mova m4, [iadst16_12_shuf] REPX {vpermd x, m4, x}, m0, m1, m2, m3 vpbroadcastd m9, [pw_16384] vpbroadcastd m8, [pixel_12bpc_max] lea r6, [strideq*3] pxor m7, m7 pmulhrsw m0, m9 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m1 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m2 call m(iadst_4x16_internal_10bpc).write_4x4 pmulhrsw m0, m9, m3 call m(iadst_4x16_internal_10bpc).write_4x4 RET ALIGN function_align .transpose_16x4: ; transpose & interleave punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m9, m2, m3 punpckhdq m2, m3 punpckldq m1, m4, m5 punpckhdq m4, m5 punpckldq m3, m6, m7 punpckhdq m6, m7 punpcklqdq m10, m8, m0 punpckhqdq m0, m8 punpcklqdq m11, m9, m2 punpckhqdq m2, m9 punpcklqdq m8, m1, m4 punpckhqdq m4, m1 punpcklqdq m9, m3, m6 punpckhqdq m6, m3 vperm2i128 m5, m0, m2, 0x31 ; 7 5 vperm2i128 m7, m0, m2, 0x20 ; 3 1 vperm2i128 m0, m10, m11, 0x20 ; 0 2 vperm2i128 m2, m10, m11, 0x31 ; 4 6 vperm2i128 m1, m4, m6, 0x31 ; 15 13 vperm2i128 m3, m4, m6, 0x20 ; 11 9 vperm2i128 m4, m8, m9, 0x20 ; 8 10 vperm2i128 m6, m8, m9, 0x31 ; 12 14 ret ALIGN function_align .main_pass1: call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_3072] paddd m10, m4, m5 psubd m4, m3 psubd m5, m3 paddd m3, m10 psubd m8, m7, m1 paddd m7, m9 psubd m9, m1 paddd m7, m1 REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 paddd m6, m0 ret INV_TXFM_4X16_FN flipadst, dct, 12 INV_TXFM_4X16_FN flipadst, adst, 12 INV_TXFM_4X16_FN flipadst, flipadst, 12 INV_TXFM_4X16_FN flipadst, identity, 12 cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 call m(iadst_4x16_internal_12bpc).main_pass1 psrad m0, m3, 12 psrad m1, m2, 12 psrad m2, m5, 12 psrad m3, m4, 12 psrad m4, m7, 12 psrad m5, m6, 12 psrad m6, m9, 12 psrad m7, m8, 12 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_4x16_internal_12bpc).transpose_16x4 call m(iadst_4x16_internal_10bpc).main2 pshufd m4, m3, q1032 psrad m3, m5, 3 psrad m5, m2, 3 pshufd m2, 
m6, q1032 pshufd m6, m1, q1032 psrad m1, m7, 3 psrad m7, m0, 3 pshufd m0, m8, q1032 REPX {psrad x, 3}, m0, m2, m4, m6 jmp m(iadst_4x16_internal_12bpc).pass2_end INV_TXFM_4X16_FN identity, dct, 12 INV_TXFM_4X16_FN identity, adst, 12 INV_TXFM_4X16_FN identity, flipadst, 12 INV_TXFM_4X16_FN identity, identity, 12 cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_1697] mova m0, [cq+32*0] mova m4, [cq+32*1] mova m1, [cq+32*2] mova m5, [cq+32*3] vpbroadcastd m9, [pd_6144] pmulld m2, m8, m0 pmulld m6, m8, m4 pmulld m3, m8, m1 pmulld m7, m8, m5 mova m10, [cq+32*4] mova m11, [cq+32*5] mova m12, [cq+32*6] mova m13, [cq+32*7] REPX {paddd x, m9}, m2, m6, m3, m7 REPX {psrad x, 12}, m2, m6, m3, m7 paddd m0, m2 pmulld m2, m8, m10 paddd m4, m6 pmulld m6, m8, m11 paddd m1, m3 pmulld m3, m8, m12 paddd m5, m7 pmulld m7, m8, m13 REPX {psrad x, 1 }, m0, m4, m1, m5 REPX {paddd x, m9}, m2, m6, m3, m7 REPX {psrad x, 12}, m2, m6, m3, m7 paddd m2, m10 paddd m6, m11 paddd m3, m12 paddd m7, m13 REPX {psrad x, 1 }, m2, m6, m3, m7 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m8, [pd_5793] vpbroadcastd m9, [pd_1024] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m8, [pw_16384] vpbroadcastd m4, [pixel_12bpc_max] call m(iidentity_4x16_internal_10bpc).pass2_end RET %macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 %else jmp m(inv_txfm_add_dct_dct_8x4_10bpc).dconly %endif %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, identity INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .pass1: vbroadcasti128 m1, [cq+16*1] vbroadcasti128 m0, [cq+16*5] vbroadcasti128 m2, [cq+16*3] vbroadcasti128 m3, [cq+16*7] vpbroadcastd m6, [pd_2896] shufpd m1, m0, 0x0c ; 1 5 shufpd m3, m2, 0x0c ; 7 3 vbroadcasti128 m0, [cq+16*0] vbroadcasti128 m4, [cq+16*2] vbroadcasti128 m2, [cq+16*4] vbroadcasti128 m5, [cq+16*6] vpbroadcastd m7, [pd_2048] shufpd m0, m4, 0x0c ; 0 2 shufpd m2, m5, 0x0c ; 4 6 REPX {pmulld x, m6}, m1, m3, m0, m2 REPX {paddd x, m7}, m1, m3, m0, m2 REPX {psrad x, 12}, m1, m3, m0, m2 call .main psubd m3, m0, m4 ; out7 out6 (interleaved) paddd m0, m4 ; out0 out1 (interleaved) paddd m1, m2, m5 ; out3 out2 (interleaved) psubd m2, m5 ; out4 out5 (interleaved) pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp tx2q .pass2: vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 vperm2i128 m1, m0, m2, 0x31 vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q2031 ; out2 out3 jmp m(iadst_8x4_internal_10bpc).end ALIGN function_align .main: ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 vpbroadcastd m6, [pd_2896] punpcklqdq m4, m1, m3 ; t4a t7a punpckhqdq m1, m3 ; t5a t6a psubd m3, m4, m1 ; t5a t6a paddd m4, m1 ; t4 t7 REPX 
{pmaxsd x, m8}, m3, m4, m0, m2 REPX {pminsd x, m9}, m3, m4, m0, m2 pmulld m3, m6 pshufd m1, m3, q1032 paddd m3, m7 psubd m5, m3, m1 paddd m1, m3 psrad m5, 12 psrad m1, 12 vpblendd m5, m4, 0x33 ; t4 t5 punpckhqdq m4, m1 ; t7 t6 ret INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_10bpc).main vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 jmp tx2q .pass2: call .pass2_main vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q3120 ; out2 out3 .end: vpbroadcastd m1, [pw_2048] pmulhrsw m0, m1 pmulhrsw m1, m2 vpbroadcastd m5, [pixel_10bpc_max] .end2: mova xm2, [dstq+strideq*0] vinserti128 m2, [dstq+strideq*1], 1 lea r6, [dstq+strideq*2] mova xm3, [r6 +strideq*0] vinserti128 m3, [r6 +strideq*1], 1 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [r6 +strideq*0], xm1 vextracti128 [r6 +strideq*1], m1, 1 RET ALIGN function_align .pass2_main: vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 lea r6, [deint_shuf+128] vperm2i128 m1, m0, m2, 0x31 vinserti128 m0, xm2, 1 pshufb m0, m4 pshufb m1, m4 jmp m(iadst_8x4_internal_8bpc).main ALIGN function_align .main: vpbroadcastd m1, [pd_2896] pmulld m0, m1, [cq+32*0] pmulld m3, m1, [cq+32*3] pmulld m2, m1, [cq+32*2] pmulld m1, [cq+32*1] vpbroadcastd m4, [pd_2048] REPX {paddd x, m4}, m0, m3, m2, m1 REPX {psrad x, 12}, m0, m3, m2, m1 .main2: IADST4_1D ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_10bpc).main shufpd m3, m4, m0, 0x05 shufpd m0, m4, 0x05 psignd m2, m6 pshufd m6, m6, q1032 pshufd m1, m2, q1032 psignd m2, m5, m6 jmp tx2q .pass2: call m(iadst_8x4_internal_10bpc).pass2_main vpermq m2, m0, q2031 vpermq m0, m1, q2031 jmp m(iadst_8x4_internal_10bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m4, [pd_2896] vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpermq m2, [cq+32*2], q3120 vpermq m3, [cq+32*3], q3120 vpbroadcastd m7, [pd_2048] REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {paddd x, x }, m0, m1, m2, m3 jmp tx2q .pass2: vpbroadcastd m5, [pixel_10bpc_max] vpbroadcastd m4, [pw_1697x8] packssdw m0, m1 packssdw m2, m3 pmulhrsw m1, m4, m0 pmulhrsw m4, m2 paddsw m0, m1 paddsw m2, m4 packssdw m7, m7 ; pw_2048 .pass2_end: punpckhwd m1, m0, m2 punpcklwd m0, m2 lea r6, [dstq+strideq*2] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmulhrsw m2, m7 pmulhrsw m0, m7 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova xm2, [dstq+strideq*0] vinserti128 m2, [r6 +strideq*0], 1 mova xm3, [dstq+strideq*1] vinserti128 m3, [r6 +strideq*1], 1 pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 vextracti128 [r6 +strideq*0], m0, 1 vextracti128 [r6 +strideq*1], m1, 1 RET INV_TXFM_8X4_FN dct, dct, 12 
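; 12-bit 8x4 variants follow: pass 1 largely reuses the 10-bit code above,
; only with the intermediate clipping widened to 20 bits (clip_20b_min/max);
; each .pass2 re-clamps to the 18-bit range before the second 1-D transform
; and writes back against pixel_12bpc_max instead of pixel_10bpc_max.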
INV_TXFM_8X4_FN dct, identity, 12 INV_TXFM_8X4_FN dct, adst, 12 INV_TXFM_8X4_FN dct, flipadst, 12 cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] jmp m(idct_8x4_internal_10bpc).pass1 .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).transpose_4x8 IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 jmp m(iadst_8x4_internal_12bpc).end INV_TXFM_8X4_FN adst, dct, 12 INV_TXFM_8X4_FN adst, adst, 12 INV_TXFM_8X4_FN adst, flipadst, 12 INV_TXFM_8X4_FN adst, identity, 12 cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] call m(iadst_4x8_internal_10bpc).main2 vpblendd m3, m0, m4, 0x33 ; out6 out7 vpblendd m0, m4, 0xcc ; out0 out1 pshufd m1, m5, q1032 psignd m2, m6 ; out4 out5 psignd m1, m6 ; out2 out3 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call .pass2_main vpbroadcastd m5, [pd_2048] paddd m0, m5, m4 paddd m1, m5, m6 paddd m2, m5 paddd m3, m5 .pass2_end: REPX {psrad x, 12}, m0, m1, m2, m3 .end: vpbroadcastd m4, [pw_16384] REPX {psrad x, 3}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 pmulhrsw m0, m4 pmulhrsw m1, m2, m4 vpermq m0, m0, q3120 ; out0 out1 vpermq m1, m1, q3120 ; out2 out3 vpbroadcastd m5, [pixel_12bpc_max] jmp m(iadst_8x4_internal_10bpc).end2 ALIGN function_align .pass2_main: call .transpose_4x8 jmp m(iadst_8x4_internal_10bpc).main2 ALIGN function_align .transpose_4x8: ; deinterleave pshufd m0, m0, q3120 pshufd m1, m1, q3120 pshufd m2, m2, q3120 pshufd m3, m3, q3120 ; transpose punpcklqdq m4, m0, m1 punpckhqdq m0, m1 punpcklqdq m5, m2, m3 punpckhqdq m2, m3 vperm2i128 m1, m0, m2, 0x20 ; out1 vperm2i128 m3, m0, m2, 0x31 ; out3 vperm2i128 m2, m4, m5, 0x31 ; out2 vperm2i128 m0, m4, m5, 0x20 ; out0 ret INV_TXFM_8X4_FN flipadst, dct, 12 INV_TXFM_8X4_FN flipadst, adst, 12 INV_TXFM_8X4_FN flipadst, flipadst, 12 INV_TXFM_8X4_FN flipadst, identity, 12 cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] call m(iadst_4x8_internal_10bpc).main2 shufpd m3, m4, m0, 0x05 shufpd m0, m4, 0x05 psignd m2, m6 pshufd m6, m6, q1032 pshufd m1, m2, q1032 psignd m2, m5, m6 jmp tx2q .pass2: vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).pass2_main vpbroadcastd m5, [pd_2048] paddd m0, m5, m3 paddd m1, m5, m2 paddd m3, m5, m4 paddd m2, m5, m6 jmp m(iadst_8x4_internal_12bpc).pass2_end INV_TXFM_8X4_FN identity, dct, 12 INV_TXFM_8X4_FN identity, adst, 12 INV_TXFM_8X4_FN identity, flipadst, 12 INV_TXFM_8X4_FN identity, identity, 12 cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_8x4_internal_10bpc).pass1 .pass2: ; m0 = in0 in1 (interleaved) ; m1 = in2 in3 (interleaved) ; m2 = in4 in5 (interleaved) ; m3 = in6 in7 (interleaved) vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 REPX {pminsd x, m9}, m0, m1, m2, m3 vpbroadcastd m4, [pd_5793] REPX {pmulld x, m4}, m0, m1, m2, m3 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 15}, m0, m1, m2, m3 vpbroadcastd m5, [pixel_12bpc_max] vpbroadcastd m7, [pw_16384] packssdw m0, m1 packssdw m2, m3 jmp 
m(iidentity_8x4_internal_10bpc).pass2_end %macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 8x8, %3 %ifidn %1_%2, dct_dct vpbroadcastd m2, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 .dconly2: add r6d, 384 sar r6d, 9 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm2 vpbroadcastw m0, xm0 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 paddsw m1, m0 psubusw m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %else jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly %endif %endif %endmacro %macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2] ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a psubd m%9, m%3, m%7 ; t6 paddd m%3, m%7 ; t2 psubd m%7, m%1, m%5 ; t4 paddd m%1, m%5 ; t0 psubd m%5, m%6, m%2 ; t7 paddd m%6, m%2 ; t3 psubd m%2, m%8, m%4 ; t5 paddd m%8, m%4 ; t1 REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8 ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a psubd m%10, m%7, m%9 ; t7 paddd m%7, m%9 ; out6 vpbroadcastd m%9, [pd_1448] psubd m%4, m%8, m%6 ; t3 paddd m%8, m%6 ; -out7 psubd m%6, m%1, m%3 ; t2 paddd m%1, m%3 ; out0 psubd m%3, m%2, m%5 ; t6 paddd m%2, m%5 ; -out1 REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10 REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10 REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10 psubd m%5, m%6, m%4 ; (t2 - t3) * 1448 paddd m%4, m%6 ; (t2 + t3) * 1448 psubd m%6, m%3, m%10 ; (t6 - t7) * 1448 paddd m%3, m%10 ; (t6 + t7) * 1448 %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, identity INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] mova m4, [cq+32*4] mova m5, [cq+32*5] mova m6, [cq+32*6] mova m7, [cq+32*7] vpbroadcastd m11, [pd_2048] call .main call .round_shift1 jmp tx2q .pass2: call .transpose_8x8_packed call m(idct_8x8_internal_8bpc).main vpbroadcastd m12, [pw_2048] vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call .write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call .write_8x4 RET ALIGN function_align .write_8x4_start: vpbroadcastd m11, [pixel_10bpc_max] lea r6, [strideq*3] pxor m10, m10 .write_8x4: mova xm8, [dstq+strideq*0] vinserti128 m8, [dstq+strideq*1], 1 mova xm9, [dstq+strideq*2] vinserti128 m9, [dstq+r6 ], 1 mova [cq+32*0], m10 mova [cq+32*1], m10 mova [cq+32*2], m10 mova [cq+32*3], m10 add cq, 32*4 paddw m0, m8 paddw m1, m9 pmaxsw m0, m10 pmaxsw m1, m10 pminsw m0, m11 pminsw m1, m11 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+strideq*4] ret ALIGN function_align .transpose_8x8_packed: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 lea r6, [deint_shuf+128] punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhdq m2, m4, m1 
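; note (.transpose_8x8_packed, continued): the rows were packed to 16 bits
; with packssdw and are transposed here via word/dword unpacks plus 128-bit
; lane moves, leaving two 8-coefficient rows packed per ymm register for the
; 8bpc pass-2 kernels.  The .main_rect2 entry point below finishes the
; rectangular-size pre-scale (callers multiply by pd_2896 first; 2896/4096
; ~= 1/sqrt(2)): round with pd_2048, shift right by 12, then fall through
; to the regular .main.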
punpckldq m4, m1 vinserti128 m1, m3, xm2, 1 vperm2i128 m3, m2, 0x31 vperm2i128 m2, m0, m4, 0x31 vinserti128 m0, xm4, 1 ret ALIGN function_align .main_rect2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main: ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3 paddd m8, m1, m5 ; t4 psubd m1, m5 ; t5a paddd m9, m7, m3 ; t7 psubd m7, m3 ; t6a vpbroadcastd m3, [pd_2896] REPX {pmaxsd x, m12}, m1, m8, m7, m9 REPX {pminsd x, m13}, m1, m8, m7, m9 REPX {pmulld x, m3 }, m0, m4, m7, m1 paddd m0, m11 paddd m7, m11 psubd m5, m0, m4 paddd m0, m4 psubd m4, m7, m1 paddd m7, m1 REPX {psrad x, 12 }, m5, m0, m4, m7 psubd m3, m0, m6 ; dct4 out3 paddd m0, m6 ; dct4 out0 paddd m6, m5, m2 ; dct4 out1 psubd m5, m2 ; dct4 out2 REPX {pmaxsd x, m12}, m0, m6, m5, m3 REPX {pminsd x, m13}, m0, m6, m5, m3 ret ALIGN function_align .round_shift1: pcmpeqd m1, m1 REPX {psubd x, m1}, m0, m6, m5, m3 paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 paddd m0, m9 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call .main call .main_end jmp tx2q .pass2: call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m5, [pw_2048] vpbroadcastd xm12, [pw_4096] psubw m12, m5 REPX {vpermq x, x, q3120}, m0, m1, m2, m3 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .main: mova m0, [cq+32*0] mova m7, [cq+32*7] mova m1, [cq+32*1] mova m6, [cq+32*6] mova m2, [cq+32*2] mova m5, [cq+32*5] mova m3, [cq+32*3] mova m4, [cq+32*4] vpbroadcastd m11, [pd_2048] .main2: IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 psrld m8, 10 ; pd_1 vpbroadcastd m9, [pd_3072] ret ALIGN function_align .main_end: paddd m0, m8 psubd m1, m8, m1 paddd m6, m8 psubd m7, m8, m7 REPX {psrad x, 1 }, m0, m1, m6, m7 ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12 ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12 psubd m8, m9, m8 ; pd_3071 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 REPX {psrad x, 12}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call m(iadst_8x8_internal_10bpc).main call .main_end jmp tx2q .pass2: call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m12, [pw_2048] vpbroadcastd xm5, [pw_4096] psubw m12, m5 vpermq m8, m3, q2031 vpermq m9, m2, q2031 vpermq m2, m1, q2031 vpermq m3, m0, q2031 pmulhrsw m0, m8, m12 pmulhrsw m1, m9, m12 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .main_end: paddd m10, m8, m0 psubd m0, 
m8, m7 psubd m7, m8, m1 paddd m1, m8, m6 psrad m0, 1 psrad m1, 1 psrad m6, m7, 1 psrad m7, m10, 1 psubd m8, m9, m8 ; pd_6143 psubd m10, m8, m5 paddd m5, m9, m2 psubd m2, m8, m3 paddd m3, m9, m4 psrad m4, m2, 12 psrad m2, m10, 12 psrad m3, 12 psrad m5, 12 ret INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 .pass1: mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] mova m4, [cq+32*4] mova m5, [cq+32*5] mova m6, [cq+32*6] mova m7, [cq+32*7] jmp tx2q .pass2: packssdw m3, m7 vpbroadcastd m7, [pixel_10bpc_max] .pass2_main: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 vpbroadcastd m12, [pw_4096] punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m1 punpckhdq m4, m1 punpckhqdq m1, m0, m2 ; 1 5 punpcklqdq m0, m2 ; 0 4 punpcklqdq m2, m3, m4 ; 2 6 punpckhqdq m3, m4 ; 3 7 pmulhrsw m0, m12 pmulhrsw m1, m12 call .write_2x8x2_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call .write_2x8x2_zero RET .write_2x8x2_start: lea r6, [strideq*5] pxor m6, m6 .write_2x8x2_zero: mova [cq+32*0], m6 mova [cq+32*1], m6 mova [cq+32*2], m6 mova [cq+32*3], m6 add cq, 32*4 .write_2x8x2: mova xm4, [dstq+strideq*0] vinserti128 m4, [dstq+strideq*4], 1 mova xm5, [dstq+strideq*1] vinserti128 m5, [dstq+r6 ], 1 paddw m0, m4 paddw m1, m5 pmaxsw m0, m6 pmaxsw m1, m6 pminsw m0, m7 pminsw m1, m7 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 vextracti128 [dstq+strideq*4], m0, 1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+strideq*2] ret %macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4] punpckldq m%9, m%1, m%2 ; aibj emfn punpckhdq m%1, m%2 ; ckdl gohp punpckldq m%10, m%3, m%4 ; qyrz uCvD punpckhdq m%3, m%4 ; sAtB wExF punpckldq m%11, m%5, m%6 ; GOHP KSLT punpckhdq m%5, m%6 ; IQJR MUNV punpckldq m%12, m%7, m%8 ; WeXf aibj punpckhdq m%7, m%8 ; YgZh ckdl punpcklqdq m%2, m%9, m%10 ; aiqy emuC punpckhqdq m%9, m%10 ; bjrz fnvD punpcklqdq m%4, m%1, m%3 ; cksA gowE punpckhqdq m%10, m%1, m%3 ; dltB hpxF punpcklqdq m%6, m%11, m%12 ; GOWe KSai punpckhqdq m%11, m%12 ; HPXf LTbj punpcklqdq m%8, m%5, m%7 ; IQYg MUck punpckhqdq m%12, m%5, m%7 ; JRZh NVdl vperm2i128 m%1, m%2, m%6, 0x20 ; out0 vperm2i128 m%5, m%2, m%6, 0x31 ; out4 vperm2i128 m%2, m%9, m%11, 0x20 ; out1 vperm2i128 m%6, m%9, m%11, 0x31 ; out5 vperm2i128 m%3, m%4, m%8, 0x20 ; out2 vperm2i128 m%7, m%4, m%8, 0x31 ; out6 vperm2i128 m%4, m%10, m%12, 0x20 ; out3 vperm2i128 m%8, m%10, m%12, 0x31 ; out7 %endmacro INV_TXFM_8X8_FN dct, dct, 12 INV_TXFM_8X8_FN dct, identity, 12 INV_TXFM_8X8_FN dct, adst, 12 INV_TXFM_8X8_FN dct, flipadst, 12 cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x8_internal_10bpc).pass1 .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose_8x8 vpbroadcastd m11, [pd_2048] call m(idct_8x8_internal_10bpc).main call .round_shift4 jmp m(iadst_8x8_internal_12bpc).pass2_end ALIGN function_align .write_8x4_start: vpbroadcastd m11, [pixel_12bpc_max] lea r6, [strideq*3] pxor m10, m10 ret ALIGN function_align .transpose_8x8: TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 ret ALIGN function_align .round_shift4: vpbroadcastd m1, [pd_8] REPX {paddd x, m1}, m0, m6, 
m5, m3 paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 paddd m0, m9 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_8X8_FN adst, dct, 12 INV_TXFM_8X8_FN adst, adst, 12 INV_TXFM_8X8_FN adst, flipadst, 12 INV_TXFM_8X8_FN adst, identity, 12 cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x8_internal_10bpc).pass1 .pass2: call .pass2_main .pass2_end: packssdw m0, m1 packssdw m1, m2, m3 REPX {vpermq x, x, q3120}, m0, m1 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 packssdw m0, m4, m5 packssdw m1, m6, m7 REPX {vpermq x, x, q3120}, m0, m1 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .pass2_main: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_12bpc).transpose_8x8 vpbroadcastd m11, [pd_2048] .pass2_main2: call m(iadst_8x8_internal_10bpc).main2 pslld m9, m8, 3 ; pd_8 paddd m0, m9 psubd m1, m9, m1 ; 8+x paddd m6, m9 psubd m7, m9, m7 REPX {psrad x, 4}, m0, m1, m6, m7 vpbroadcastd m9, [pd_17408] psubd m8, m9, m8 ; 17407 paddd m2, m9 psubd m3, m8, m3 paddd m4, m9 psubd m5, m8, m5 REPX {psrad x, 15}, m2, m3, m4, m5 ret INV_TXFM_8X8_FN flipadst, dct, 12 INV_TXFM_8X8_FN flipadst, adst, 12 INV_TXFM_8X8_FN flipadst, flipadst, 12 INV_TXFM_8X8_FN flipadst, identity, 12 cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x8_internal_10bpc).pass1 .pass2: call m(iadst_8x8_internal_12bpc).pass2_main packssdw m7, m7, m6 packssdw m6, m1, m0 packssdw m1, m5, m4 vpermq m0, m7, q3120 vpermq m1, m1, q3120 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 packssdw m0, m3, m2 vpermq m0, m0, q3120 vpermq m1, m6, q3120 call m(idct_8x8_internal_10bpc).write_8x4 RET INV_TXFM_8X8_FN identity, dct, 12 INV_TXFM_8X8_FN identity, adst, 12 INV_TXFM_8X8_FN identity, flipadst, 12 INV_TXFM_8X8_FN identity, identity, 12 cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(iidentity_8x8_internal_10bpc).pass1 .pass2: packssdw m3, m7 vpbroadcastd m7, [pixel_12bpc_max] jmp m(iidentity_8x8_internal_10bpc).pass2_main %macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 8x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd m2, [dconly_%4bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, identity, 35 INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call .pass1_main sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call .pass1_main mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call 
.pass1_main pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call .transpose call m(idct_8x16_internal_8bpc).main vpbroadcastd m12, [pw_2048] REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 .end: pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m0, m2, m12 pmulhrsw m1, m3, m12 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m4, m12 pmulhrsw m1, m5, m12 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m0, m6, m12 pmulhrsw m1, m7, m12 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .transpose: packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m7, m15 lea r6, [deint_shuf+128] punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpcklwd m3, m4, m5 punpckhwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhdq m7, m3, m6 punpckldq m3, m6 punpckhdq m6, m4, m5 punpckldq m4, m5 punpckhdq m5, m8, m1 punpckldq m8, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 vperm2i128 m2, m0, m3, 0x31 vinserti128 m0, xm3, 1 vperm2i128 m3, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m7, m5, m6, 0x31 vinserti128 m5, xm6, 1 vperm2i128 m6, m8, m4, 0x31 vinserti128 m4, m8, xm4, 1 ret ALIGN function_align .pass1_main: pmulld m0, m14, [cq+32* 0] pmulld m1, m14, [cq+32* 2] pmulld m2, m14, [cq+32* 4] pmulld m3, m14, [cq+32* 6] pmulld m4, m14, [cq+32* 8] pmulld m5, m14, [cq+32*10] pmulld m6, m14, [cq+32*12] pmulld m7, m14, [cq+32*14] call m(idct_8x8_internal_10bpc).main_rect2 jmp m(idct_8x8_internal_10bpc).round_shift1 ALIGN function_align .main_evenhalf: paddd m1, m6, m7 ; idct8 out1 psubd m6, m7 ; idct8 out6 psubd m7, m0, m9 ; idct8 out7 paddd m0, m9 ; idct8 out0 paddd m2, m5, m4 ; idct8 out2 psubd m5, m4 ; idct8 out5 psubd m4, m3, m8 ; idct8 out4 paddd m3, m8 ; idct8 out3 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 ret .main_oddhalf_fast_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_fast: ; lower half zero vpbroadcastd m7, [pd_4076] vpbroadcastd m8, [pd_401] vpbroadcastd m6, [pd_m1189] vpbroadcastd m9, [pd_3920] vpbroadcastd m5, [pd_3612] vpbroadcastd m10, [pd_1931] vpbroadcastd m4, [pd_m2598] vpbroadcastd m15, [pd_3166] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 pmulld m1, m9 pmulld m5, m2 pmulld m2, m10 pmulld m4, m3 pmulld m3, m15 jmp .main_oddhalf_fast2 .main_oddhalf_rect2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf: ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a .main_oddhalf_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 psubd m8, m0, m4 ; t9 paddd m0, m4 ; t8 psubd m4, m6, m2 ; t10 paddd m2, m6 ; t11 psubd m6, m1, m5 ; t13 paddd m5, m1 ; t12 psubd m1, m7, m3 ; t14 paddd m7, m3 ; t15 REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2 psubd m3, m1, m4 ; t10 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a paddd m0, m2 ; t8a psubd m2, m8, m6 ; t13 paddd m6, m8 ; t14 psubd m8, m7, m5 
; t12a paddd m7, m5 ; t15a REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pmulld x, m14}, m2, m8, m3, m4 paddd m2, m11 paddd m8, m11 paddd m5, m2, m3 ; t13a psubd m2, m3 ; t10a psubd m3, m8, m4 ; t11 paddd m4, m8 ; t12 REPX {psrad x, 12}, m5, m2, m3, m4 mova [r6-32*4], m7 mova [r6-32*3], m6 mova [r6-32*2], m5 mova [r6-32*1], m4 mova [r6+32*0], m3 mova [r6+32*1], m2 mova [r6+32*2], m1 mova [r6+32*3], m0 ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity, 35 cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call .pass1_main call m(iadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call .pass1_main call m(iadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call .pass1_main call m(iadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [pw_2048] vpbroadcastd xm12, [pw_4096] REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 psubw m12, m8 jmp m(idct_8x16_internal_10bpc).end ALIGN function_align .pass1_main: pmulld m0, m14, [cq+32* 0] pmulld m7, m14, [cq+32*14] pmulld m1, m14, [cq+32* 2] pmulld m6, m14, [cq+32*12] pmulld m2, m14, [cq+32* 4] pmulld m5, m14, [cq+32*10] pmulld m3, m14, [cq+32* 6] pmulld m4, m14, [cq+32* 8] REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 jmp m(iadst_8x8_internal_10bpc).main2 INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity, 35 cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] cmp eobd, 43 jl .fast add cq, 32 call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end sub cq, 32 mova [cq+32* 1], m0 mova [cq+32* 3], m1 mova [cq+32* 5], m2 mova [cq+32* 7], m3 mova [cq+32* 9], m4 mova [cq+32*11], m5 mova [cq+32*13], m6 mova m15, m7 call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end mova m8, [cq+32* 1] mova m9, [cq+32* 3] mova m10, [cq+32* 5] mova m11, [cq+32* 7] mova m12, [cq+32* 9] mova m13, [cq+32*11] mova m14, [cq+32*13] jmp tx2q .fast: call m(iadst_8x16_internal_10bpc).pass1_main call m(iflipadst_8x8_internal_10bpc).main_end pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m12, [pw_2048] vpbroadcastd xm13, [pw_4096] mova m11, m0 vpermq m0, m7, q2031 mova m10, m1 vpermq m1, m6, q2031 mova m9, m2 vpermq m2, m5, q2031 mova m8, m3 vpermq m3, m4, q2031 vpermq m4, m8, q3120 vpermq m5, 
m9, q3120 vpermq m6, m10, q3120 vpermq m7, m11, q3120 psubw m12, m13 jmp m(idct_8x16_internal_10bpc).end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 %ifnum %4 pmulhrsw m%2, m%4 %else ; without rounding psraw m%2, 1 %endif %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m8, m15, [cq+32* 1] pmulld m1, m15, [cq+32* 2] pmulld m9, m15, [cq+32* 3] pmulld m2, m15, [cq+32* 4] pmulld m10, m15, [cq+32* 5] pmulld m3, m15, [cq+32* 6] pmulld m11, m15, [cq+32* 7] pmulld m4, m15, [cq+32* 8] pmulld m12, m15, [cq+32* 9] pmulld m5, m15, [cq+32*10] pmulld m13, m15, [cq+32*11] pmulld m6, m15, [cq+32*12] pmulld m14, m15, [cq+32*13] pmulld m7, m15, [cq+32*14] pmulld m15, [cq+32*15] mova [cq], m7 vpbroadcastd m7, [pd_2048] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m13, m7, m15 vpbroadcastd m8, [pw_1697x16] REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13 vpbroadcastd m7, [pixel_10bpc_max] vpbroadcastd m12, [pw_2048] call .pass2_end RET ALIGN function_align .pass2_end: punpckhwd m9, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m13 punpcklwd m6, m13 punpckhwd m13, m4, m5 punpcklwd m4, m5 punpcklwd m5, m2, m3 punpckhwd m2, m3 punpckhdq m3, m0, m5 punpckldq m0, m5 punpckhdq m11, m9, m2 punpckldq m9, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckldq m6, m13, m1 punpckhdq m13, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m8, m9, m6 punpckhqdq m9, m6 punpcklqdq m10, m11, m13 punpckhqdq m11, m13 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(iidentity_8x8_internal_10bpc).write_2x8x2_start pmulhrsw m0, m12, m2 pmulhrsw m1, m12, m3 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m8 pmulhrsw m1, m12, m9 lea dstq, [dstq+strideq*4] call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero pmulhrsw m0, m12, m10 pmulhrsw m1, m12, m11 call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero ret INV_TXFM_8X16_FN dct, dct, 0, 12 INV_TXFM_8X16_FN dct, identity, 35, 12 INV_TXFM_8X16_FN dct, adst, 0, 12 INV_TXFM_8X16_FN dct, flipadst, 0, 12 cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call .transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] mova [cq+32* 8], m0 mova [cq+32*10], m2 mova [cq+32*12], m4 mova [cq+32*14], m6 pmaxsd m0, m12, [cq+32* 1] pmaxsd m4, m12, m1 pmaxsd m1, m12, [cq+32* 3] pmaxsd m2, m12, [cq+32* 5] pmaxsd m6, m12, m5 pmaxsd m5, m12, m3 pmaxsd m3, m12, [cq+32* 7] pmaxsd m7, m12 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(idct_8x16_internal_10bpc).main_oddhalf pmaxsd m0, m12, [cq+32* 0] pmaxsd m1, m12, [cq+32* 2] pmaxsd m2, m12, [cq+32* 4] pmaxsd m3, m12, [cq+32* 6] pmaxsd m4, m12, [cq+32* 8] pmaxsd m5, m12, [cq+32*10] pmaxsd m6, m12, [cq+32*12] pmaxsd m7, m12, 
[cq+32*14] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf vpbroadcastd m11, [pd_8] REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_16x8_internal_10bpc).pass1_rotations REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 .end: packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 packssdw m4, m8, m9 packssdw m5, m10, m11 packssdw m6, m12, m13 packssdw m7, m14, m15 vpermq m0, m0, q3120 vpermq m1, m1, q3120 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m2, q3120 vpermq m1, m3, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m4, q3120 vpermq m1, m5, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m6, q3120 vpermq m1, m7, q3120 call m(idct_8x8_internal_10bpc).write_8x4 RET ALIGN function_align .transpose: mova [cq+32* 8], m8 mova [cq+32* 9], m9 mova [cq+32*10], m10 mova [cq+32*11], m11 call m(idct_8x8_internal_12bpc).transpose_8x8 mova [cq+32* 0], m0 mova [cq+32* 1], m1 mova [cq+32* 2], m2 mova [cq+32* 3], m3 mova [cq+32* 4], m4 mova [cq+32* 5], m5 mova [cq+32* 6], m6 mova [cq+32* 7], m7 mova m0, [cq+32* 8] mova m1, [cq+32* 9] mova m2, [cq+32*10] mova m3, [cq+32*11] mova m4, m12 mova m5, m13 mova m6, m14 mova m7, m15 jmp m(idct_8x8_internal_12bpc).transpose_8x8 INV_TXFM_8X16_FN adst, dct, 0, 12 INV_TXFM_8X16_FN adst, adst, 0, 12 INV_TXFM_8X16_FN adst, flipadst, 0, 12 INV_TXFM_8X16_FN adst, identity, 35, 12 cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call .pass2_main call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_end: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 jmp m(idct_8x16_internal_12bpc).end ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] mova [cq+32* 8], m0 mova [cq+32*11], m3 mova [cq+32*12], m4 mova [cq+32*15], m7 pmaxsd m0, m13, [cq+32* 2] ; 2 pmaxsd m3, m13, m1 ; 9 pmaxsd m1, m13, m5 ; 13 pmaxsd m4, m13, m2 ; 10 pmaxsd m2, m13, [cq+32* 6] ; 6 pmaxsd m5, m13, [cq+32* 5] ; 5 pmaxsd m6, m13, m6 ; 14 pmaxsd m7, m13, [cq+32* 1] ; 1 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m12, [pd_2048] vpbroadcastd m15, [pd_2896] call m(iadst_16x8_internal_10bpc).main_part1 pmaxsd m0, m13, [cq+32* 0] ; 0 pmaxsd m1, m13, [cq+32*15] ; 15 pmaxsd m2, m13, [cq+32* 4] ; 4 pmaxsd m3, m13, [cq+32*11] ; 11 pmaxsd m4, m13, [cq+32* 8] ; 8 pmaxsd m5, m13, [cq+32* 7] ; 7 pmaxsd m6, m13, [cq+32*12] ; 12 pmaxsd m7, m13, [cq+32* 3] ; 3 REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_16x8_internal_10bpc).main_part2 vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret INV_TXFM_8X16_FN flipadst, dct, 0, 12 INV_TXFM_8X16_FN flipadst, adst, 0, 12 INV_TXFM_8X16_FN flipadst, flipadst, 0, 12 INV_TXFM_8X16_FN flipadst, identity, 35, 12 cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x16_internal_10bpc).pass1 .pass2: lea r6, [rsp+32*4] call m(iadst_8x16_internal_12bpc).pass2_main call m(iflipadst_16x8_internal_10bpc).pass1_rotations jmp 
m(iadst_8x16_internal_12bpc).pass2_end INV_TXFM_8X16_FN identity, dct, 0, 12 INV_TXFM_8X16_FN identity, adst, 0, 12 INV_TXFM_8X16_FN identity, flipadst, 0, 12 INV_TXFM_8X16_FN identity, identity, 0, 12 cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_8x16_internal_10bpc).pass1 .pass2: call .pass2_main packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m13, m7, m15 vpbroadcastd m7, [pixel_12bpc_max] vpbroadcastd m12, [pw_16384] call m(iidentity_8x16_internal_10bpc).pass2_end RET ALIGN function_align .pass2_main: mova [cq], m7 vpbroadcastd m7, [clip_18b_min] REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmaxsd m7, [cq] mova [cq], m15 vpbroadcastd m15, [clip_18b_max] REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pminsd m15, [cq] mova [cq], m7 vpbroadcastd m7, [pd_5793] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 pmulld m7, [cq] mova [cq], m15 vpbroadcastd m15, [pd_1024] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq] REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret %macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x4, %3 %ifidn %1_%2, dct_dct vpbroadcastd m3, [dconly_%3bpc] %if %3 = 10 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 4 .dconly2: add r6d, 384 sar r6d, 9 .dconly3: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm3 vpbroadcastw m0, xm0 .dconly_loop: paddsw m1, m0, [dstq+strideq*0] paddsw m2, m0, [dstq+strideq*1] psubusw m1, m3 psubusw m2, m3 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %else jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly %endif %endif %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, identity INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] .pass1: vbroadcasti128 m0, [cq+16* 0] vbroadcasti128 m4, [cq+16* 4] vbroadcasti128 m1, [cq+16* 2] vbroadcasti128 m7, [cq+16* 6] vbroadcasti128 m5, [cq+16*10] vbroadcasti128 m2, [cq+16* 8] vbroadcasti128 m6, [cq+16*12] vbroadcasti128 m3, [cq+16*14] shufpd m0, m4, 0x0c ; 0 4 shufpd m1, m5, 0x0c ; 2 10 shufpd m2, m6, 0x0c ; 8 12 shufpd m3, m7, 0x0c ; 14 6 call .pass1_main vbroadcasti128 m10, [cq+16* 1] vbroadcasti128 m4, [cq+16* 5] vbroadcasti128 m11, [cq+16*15] vbroadcasti128 m5, [cq+16*11] shufpd m10, m4, 0x0c ; 1 5 shufpd m11, m5, 0x0c ; 15 11 vbroadcasti128 m5, [cq+16* 9] vbroadcasti128 m4, [cq+16*13] shufpd m5, m4, 0x0c ; 9 13 vbroadcasti128 m6, [cq+16* 7] vbroadcasti128 m4, [cq+16* 3] shufpd m6, m4, 0x0c ; 7 3 call .pass1_main2 pcmpeqd m4, m4 REPX {psubd x, m4}, m0, m1, m2, m3 call .pass1_main3 REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call .transpose_4x16_packed lea r6, [deint_shuf+128] call m(idct_16x4_internal_8bpc).main .end: vpbroadcastd m4, [pw_2048] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 vpbroadcastd m5, [pixel_10bpc_max] .end2: paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] .end3: lea r6, [dstq+strideq*2] paddw m2, [r6 +strideq*0] paddw m3, [r6 +strideq*1] pxor m4, m4 REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmaxsw x, m4}, m0, m1, m2, m3 REPX {pminsw x, m5}, 
m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [r6 +strideq*0], m2 mova [r6 +strideq*1], m3 RET ALIGN function_align .pass1_main: vpbroadcastd m7, [pd_2048] call m(idct_8x4_internal_10bpc).main psubd m3, m0, m4 ; idct8 out7 out6 paddd m0, m4 ; idct8 out0 out1 paddd m1, m2, m5 ; idct8 out3 out2 psubd m2, m5 ; idct8 out4 out5 ret ALIGN function_align .pass1_main2: ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1 ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1 vbroadcasti128 m12, [pd_3784_m3784] psubd m4, m10, m5 paddd m10, m5 ; t8 t11 psignd m4, m12 ; t9 t10 psubd m5, m11, m6 paddd m11, m6 ; t15 t12 psignd m5, m12 ; t14 t13 vpbroadcastd m6, [pd_1567] vpbroadcastd m13, [pd_3784] REPX {pmaxsd x, m8}, m5, m4 REPX {pminsd x, m9}, m5, m4 pmulld m12, m5 pmulld m5, m6 vbroadcasti128 m6, [pd_1567_m1567] pmulld m13, m4 pmulld m4, m6 REPX {pmaxsd x, m8}, m10, m11, m0, m1 REPX {pminsd x, m9}, m10, m11, m0, m1 paddd m12, m7 paddd m5, m7 paddd m4, m12 psubd m5, m13 psrad m4, 12 ; t14a t10a psrad m5, 12 ; t9a t13a vpbroadcastd m12, [pd_2896] punpckhqdq m6, m11, m5 punpcklqdq m11, m4 punpckhqdq m4, m10, m4 punpcklqdq m10, m5 psubd m5, m11, m6 ; t12a t13 paddd m11, m6 ; t15a t14 psubd m6, m10, m4 ; t11a t10 paddd m10, m4 ; t8a t9 REPX {pmaxsd x, m8}, m5, m6 REPX {pminsd x, m9}, m5, m6 pmulld m5, m12 pmulld m6, m12 REPX {pmaxsd x, m8}, m2, m3, m11, m10 REPX {pminsd x, m9}, m2, m3, m11, m10 ret ALIGN function_align .pass1_main3: paddd m5, m7 psubd m4, m5, m6 paddd m5, m6 psrad m4, 12 ; t11 t10a psrad m5, 12 ; t12 t13a psubd m7, m0, m11 ; out15 out14 paddd m0, m11 ; out0 out1 psubd m6, m1, m5 ; out12 out13 paddd m1, m5 ; out3 out2 psubd m5, m2, m4 ; out11 out10 paddd m2, m4 ; out4 out5 psubd m4, m3, m10 ; out8 out9 paddd m3, m10 ; out7 out6 REPX {pshufd x, x, q1032}, m1, m3, m5, m7 ret ALIGN function_align .transpose_4x16_packed: vbroadcasti128 m8, [deint_shuf] packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 REPX {pshufb x, m8}, m0, m2, m4, m6 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpckhqdq m2, m4, m6 punpcklqdq m4, m6 vperm2i128 m3, m1, m2, 0x31 vinserti128 m1, xm2, 1 vperm2i128 m2, m0, m4, 0x31 vinserti128 m0, xm4, 1 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 paddd m4, m5, m11 paddd m5, m6, m11 paddd m6, m7, m11 paddd m7, m8, m11 .pass1_end: REPX {pshufd x, x, q1032}, m0, m2, m4, m6 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main jmp m(idct_16x4_internal_10bpc).end ALIGN function_align .main: vpbroadcastd m6, [pd_1321] mova m0, [cq+32*0] mova m1, [cq+32*1] vpbroadcastd m7, [pd_2482] mova m2, [cq+32*6] mova m3, [cq+32*7] pmulld m4, m0, m6 pmulld m5, m1, m6 ; 1321*in0 pmulld m9, m2, m7 pmulld m8, m3, m7 ; 2482*in3 paddd m4, m9 paddd m8, m5 ; 1321*in0 + 2482*in3 pmulld m5, m0, m7 pmulld m9, m1, m7 ; 2482*in0 paddd m0, m2 paddd m1, m3 ; in0 + in3 paddd m7, m6 ; pd_3803 pmulld m2, m7 pmulld m3, m7 ; 3803*in3 psubd m5, m2 psubd m9, m3 ; 2482*in0 - 3803*in3 mova m2, [cq+32*4] pmulld m10, m7, m2 pmulld m3, m6, m2 psubd m2, m0 mova m0, [cq+32*5] pmulld m7, m0 ; 3803*in2 pmulld m6, m0 ; 1321*in2 psubd m0, 
m1 ; in2 - in0 - in3 vpbroadcastd m1, [pd_m3344] paddd m4, m10 paddd m7, m8 ; t0 psubd m5, m3 psubd m9, m6 ; t1 pmulld m2, m1 pmulld m0, m1 ; t2 pmulld m3, m1, [cq+32*2] pmulld m1, [cq+32*3] ; -t3 ret ALIGN function_align .main_end: ; expects: m6 = rnd paddd m5, m6 paddd m9, m6 paddd m10, m4, m5 paddd m4, m6 paddd m8, m7, m6 paddd m7, m9 psubd m4, m3 ; out0 (unshifted) psubd m5, m3 ; out1 (unshifted) paddd m2, m6 ; out2 (unshifted) paddd m3, m10 ; out3 (unshifted) psubd m8, m1 ; out4 (unshifted) psubd m9, m1 ; out5 (unshifted) paddd m6, m0 ; out6 (unshifted) paddd m7, m1 ; out7 (unshifted) ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: call m(iadst_4x16_internal_10bpc).main psrad m11, 11 ; pd_1 paddd m4, m3, m11 paddd m3, m5, m11 paddd m5, m2, m11 paddd m2, m6, m11 paddd m6, m1, m11 paddd m1, m7, m11 paddd m7, m0, m11 paddd m0, m8, m11 jmp m(iadst_16x4_internal_10bpc).pass1_end .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [pw_2048] pmulhrsw m5, m3, m4 pmulhrsw m6, m2, m4 pmulhrsw m2, m1, m4 pmulhrsw m3, m0, m4 paddw m0, m5, [dstq+strideq*0] paddw m1, m6, [dstq+strideq*1] vpbroadcastd m5, [pixel_10bpc_max] jmp m(idct_16x4_internal_10bpc).end3 INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_5793] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m4, [cq+32*4], q3120 ; 8 9 vpermq m5, [cq+32*5], q3120 ; a b vpermq m6, [cq+32*6], q3120 ; c d vpermq m7, [cq+32*7], q3120 ; e f vpbroadcastd m9, [pd_3072] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call m(idct_16x4_internal_10bpc).transpose_4x16_packed vpbroadcastd m7, [pw_1697x8] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(idct_16x4_internal_10bpc).end INV_TXFM_16X4_FN dct, dct, 12 INV_TXFM_16X4_FN dct, identity, 12 INV_TXFM_16X4_FN dct, adst, 12 INV_TXFM_16X4_FN dct, flipadst, 12 cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [clip_20b_min] vpbroadcastd m9, [clip_20b_max] jmp m(idct_16x4_internal_10bpc).pass1 .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 ; deinterleave REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 ; transpose punpcklqdq m8, m0, m1 punpckhqdq m0, m1 punpcklqdq m9, m2, m3 punpckhqdq m2, m3 punpcklqdq m10, m4, m5 punpckhqdq m4, m5 punpcklqdq m11, m6, m7 punpckhqdq m6, m7 vperm2i128 m3, m0, m2, 0x31 ; out6 vperm2i128 m1, m0, m2, 0x20 ; out2 vperm2i128 m7, m4, m6, 0x31 ; out7 vperm2i128 m5, m4, m6, 0x20 ; out3 vperm2i128 m13, m10, m11, 0x31 ; out5 vperm2i128 m12, m10, m11, 0x20 ; out1 vperm2i128 m11, m8, m9, 0x31 ; out4 vperm2i128 m10, m8, m9, 0x20 ; out0 call m(idct_4x16_internal_10bpc).pass1_main pmulld m0, m6, m10 pmulld m2, m6, m11 
pmulld m4, m6, m12 pmulld m6, m13 vpbroadcastd m10, [pd_17408] call m(idct_4x16_internal_10bpc).pass1_main2 REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m5, [pixel_12bpc_max] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN adst, dct, 12 INV_TXFM_16X4_FN adst, adst, 12 INV_TXFM_16X4_FN adst, flipadst, 12 INV_TXFM_16X4_FN adst, identity, 12 cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iadst_16x4_internal_10bpc).pass1 .pass2: call .pass2_main REPX {vpermq x, x, q3120}, m0, m1, m2, m3 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 jmp m(idct_16x4_internal_10bpc).end2 ALIGN function_align .pass2_main: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7 pmaxsd m8, m4, m12 pmaxsd m9, m5, m12 REPX {pminsd x, m13}, m0, m1, m2, m3 call m(iadst_8x4_internal_12bpc).transpose_4x8 mova [cq+32*0], m0 mova [cq+32*2], m1 mova [cq+32*4], m2 mova [cq+32*6], m3 pminsd m0, m8, m13 pminsd m1, m9, m13 pminsd m2, m6, m13 pminsd m3, m7, m13 call m(iadst_8x4_internal_12bpc).transpose_4x8 mova [cq+32*1], m0 mova [cq+32*3], m1 mova [cq+32*5], m2 mova [cq+32*7], m3 call m(iadst_16x4_internal_10bpc).main vpbroadcastd m6, [pd_2048] call m(iadst_16x4_internal_10bpc).main_end psrad m0, m4, 15 psrad m1, m5, 15 psrad m2, 15 psrad m3, 15 psrad m4, m8, 15 psrad m5, m9, 15 psrad m6, 15 psrad m7, 15 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpbroadcastd m4, [pw_16384] vpbroadcastd m5, [pixel_12bpc_max] ret INV_TXFM_16X4_FN flipadst, dct, 12 INV_TXFM_16X4_FN flipadst, adst, 12 INV_TXFM_16X4_FN flipadst, flipadst, 12 INV_TXFM_16X4_FN flipadst, identity, 12 cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_16x4_internal_10bpc).pass1 .pass2: call m(iadst_16x4_internal_12bpc).pass2_main vpermq m7, m0, q3120 vpermq m6, m1, q3120 vpermq m1, m2, q3120 vpermq m0, m3, q3120 pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m6, m4 pmulhrsw m3, m7, m4 jmp m(idct_16x4_internal_10bpc).end2 INV_TXFM_16X4_FN identity, dct, 12 INV_TXFM_16X4_FN identity, adst, 12 INV_TXFM_16X4_FN identity, flipadst, 12 INV_TXFM_16X4_FN identity, identity, 12 cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 vpbroadcastd m8, [pd_1697] vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m1, [cq+32*1], q3120 ; 2 3 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m3, [cq+32*3], q3120 ; 6 7 vpbroadcastd m9, [pd_3072] pmulld m4, m8, m0 pmulld m5, m8, m1 pmulld m6, m8, m2 pmulld m7, m8, m3 vpermq m10, [cq+32*4], q3120 ; 8 9 vpermq m11, [cq+32*5], q3120 ; a b vpermq m12, [cq+32*6], q3120 ; c d vpermq m13, [cq+32*7], q3120 ; e f REPX {paddd x, m9}, m4, m5, m6, m7 REPX {psrad x, 12}, m4, m5, m6, m7 paddd m0, m4 pmulld m4, m8, m10 paddd m1, m5 pmulld m5, m8, m11 paddd m2, m6 pmulld m6, m8, m12 paddd m3, m7 pmulld m7, m8, m13 REPX {paddd x, m9}, m4, m5, m6, m7 REPX {psrad x, 12}, m4, m5, m6, m7 paddd m4, m10 paddd m5, m11 paddd m6, m12 paddd m7, m13 jmp tx2q .pass2: vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m8, [pd_5793] vpbroadcastd m9, [pd_2048] REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m9}, m0, m1, m2, m3, 
m4, m5, m6, m7 REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_16x4_internal_10bpc).transpose_4x16_packed vpbroadcastd m4, [pw_16384] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 vpbroadcastd m5, [pixel_12bpc_max] jmp m(idct_16x4_internal_10bpc).end2 %macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth INV_TXFM_FN %1, %2, 0, 16x8, %3 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd m3, [dconly_%3bpc] mov [cq], eobd ; 0 or r3d, 8 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 %endif %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, identity INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m14, [pd_2896] pmulld m0, m14, [cq+32* 1] pmulld m1, m14, [cq+32* 3] pmulld m2, m14, [cq+32* 5] pmulld m3, m14, [cq+32* 7] pmulld m4, m14, [cq+32* 9] pmulld m5, m14, [cq+32*11] pmulld m6, m14, [cq+32*13] pmulld m7, m14, [cq+32*15] vpbroadcastd m11, [pd_2048] lea r6, [rsp+32*4] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+32* 0] pmulld m1, m14, [cq+32* 2] pmulld m2, m14, [cq+32* 4] pmulld m3, m14, [cq+32* 6] pmulld m4, m14, [cq+32* 8] pmulld m5, m14, [cq+32*10] pmulld m6, m14, [cq+32*12] pmulld m7, m14, [cq+32*14] call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf psrld m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 call .pass1_rotations REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call .transpose call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [pw_2048] .end: pmulhrsw m0, m10 pmulhrsw m1, m10 pmulhrsw m2, m10 pmulhrsw m3, m10 call .write_16x4_start .end2: pmulhrsw m0, m4, m10 pmulhrsw m1, m5, m10 pmulhrsw m2, m6, m10 pmulhrsw m3, m7, m10 call .write_16x4_zero RET ALIGN function_align .pass1_rotations: mova m14, [r6-32*4] mova m13, [r6-32*3] mova m12, [r6-32*2] mova m11, [r6-32*1] mova m10, [r6+32*0] mova m9, [r6+32*1] mova m8, [r6+32*2] psubd m15, m0, m14 ; out15 paddd m0, m14 ; out0 psubd m14, m1, m13 ; out14 paddd m1, m13 ; out1 psubd m13, m2, m12 ; out13 paddd m2, m12 ; out2 psubd m12, m3, m11 ; out12 paddd m3, m11 ; out3 psubd m11, m4, m10 ; out11 paddd m4, m10 ; out4 psubd m10, m5, m9 ; out10 paddd m5, m9 ; out5 psubd m9, m6, m8 ; out9 paddd m6, m8 ; out6 psubd m8, m7, [r6+32*3] ; out8 paddd m7, [r6+32*3] ; out7 ret ALIGN function_align .transpose: lea r6, [deint_shuf+128] .transpose2: packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 packssdw m3, m11 packssdw m4, m12 packssdw m5, m13 packssdw m6, m14 packssdw m7, m15 .transpose3: punpckhwd m8, m0, m1 punpcklwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 punpckhwd m3, m4, m5 punpcklwd m4, m5 punpckhwd m5, m6, m7 punpcklwd m6, m7 punpckhdq m7, m4, m6 punpckldq m4, m6 punpckldq m6, m8, m2 punpckhdq m8, m2 punpckhdq m2, m0, m1 punpckldq m0, m1 punpckhdq m1, m3, m5 punpckldq m3, m5 punpcklqdq m5, m6, m3 punpckhqdq m6, m3 punpckhqdq m3, m2, m7 punpcklqdq m2, m7 punpcklqdq m7, m8, m1 punpckhqdq m8, m1 punpckhqdq m1, m0, m4 punpcklqdq m0, m4 vperm2i128 m4, m0, m5, 0x31 vinserti128 m0, xm5, 1 vperm2i128 m5, m1, m6, 0x31 vinserti128 m1, xm6, 1 vperm2i128 m6, m2, m7, 0x31 vinserti128 m2, xm7, 1 vperm2i128 m7, m3, m8, 0x31 vinserti128 m3, xm8, 1 ret ALIGN function_align .write_16x4_start: vpbroadcastd m9, [pixel_10bpc_max] lea r3, [strideq*3] pxor m8, m8 .write_16x4_zero: REPX 
{mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7 add cq, 32*8 .write_16x4: paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] paddw m2, [dstq+strideq*2] paddw m3, [dstq+r3 ] REPX {pmaxsw x, m8}, m0, m1, m2, m3 REPX {pminsw x, m9}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r3 ], m3 lea dstq, [dstq+strideq*4] ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] .pass1: lea r6, [rsp+32*4] call .main vpbroadcastd m14, [pd_3072] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_3071 call .pass1_rotations .pass1_end: REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 pmulhrsw m0, m10 pmulhrsw m1, m11 pmulhrsw m2, m10 pmulhrsw m3, m11 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m4, m10 pmulhrsw m1, m5, m11 pmulhrsw m2, m6, m10 pmulhrsw m3, m7, m11 call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align .pass1_rotations: paddd m0, m15 psubd m1, m15, m1 paddd m2, m15 psubd m3, m15, m3 paddd m4, m14 psubd m5, m13, m5 paddd m6, m14 psubd m7, m13, m7 paddd m8, m14, m9 psubd m9, m13, m10 paddd m10, m14, m11 psubd m11, m13, m12 paddd m12, m15, [r6-32*1] psubd m13, m15, [r6-32*2] paddd m14, m15, [r6-32*3] psubd m15, [r6-32*4] ret ALIGN function_align .main: ; expects: m13 = clip_min m14 = clip_max vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 2] pmulld m1, m15, [cq+32*13] pmulld m2, m15, [cq+32* 6] pmulld m3, m15, [cq+32* 9] pmulld m4, m15, [cq+32*10] pmulld m5, m15, [cq+32* 5] pmulld m6, m15, [cq+32*14] pmulld m7, m15, [cq+32* 1] vpbroadcastd m12, [pd_2048] REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 call .main_part1 pmulld m0, m15, [cq+32* 0] pmulld m1, m15, [cq+32*15] pmulld m2, m15, [cq+32* 4] pmulld m3, m15, [cq+32*11] pmulld m4, m15, [cq+32* 8] pmulld m5, m15, [cq+32* 7] pmulld m6, m15, [cq+32*12] pmulld m7, m15, [cq+32* 3] REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_part2: ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091 ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703 ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751 ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380 psubd m8, m0, m4 ; t8a paddd m0, m4 ; t0a psubd m4, m1, m5 ; t9a paddd m1, m5 ; t1a psubd m5, m2, m6 ; t12a paddd m2, m6 ; t4a psubd m6, m3, m7 ; t13a paddd m7, m3 ; t5a REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 vpbroadcastd m11, [pd_4017] vpbroadcastd m10, [pd_799] ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 psubd m3, m0, m2 ; t4 paddd m0, m2 ; t0 psubd m2, m1, m7 ; t5 paddd m1, m7 ; t1 psubd m7, m4, m6 ; t12a paddd m4, m6 ; t8a psubd m6, m8, m5 ; t13a paddd m5, m8 ; t9a REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5 vpbroadcastd m11, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11 ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11 pminsd m10, m14, [r6-32*4] ; t2 pminsd m8, m14, [r6-32*3] ; t3 
psubd m9, m0, m10 ; t2a paddd m0, m10 ; out0 psubd m10, m1, m8 ; t3a paddd m1, m8 ; -out15 pmaxsd m9, m13 pmaxsd m10, m13 pminsd m9, m14 pminsd m10, m14 mova [r6-32*4], m1 mova m11, [r6-32*1] ; t7a mova m1, [r6-32*2] ; t6a psubd m8, m3, m11 ; t7 paddd m11, m3 ; out12 paddd m3, m2, m1 ; -out3 psubd m2, m1 ; t6 pmaxsd m8, m13 pmaxsd m2, m13 pminsd m8, m14 pminsd m2, m14 mova [r6-32*1], m11 mova [r6-32*3], m2 mova m1, [r6+32*3] ; t15 mova m2, [r6+32*2] ; t14 paddd m12, m7, m1 ; -out13 psubd m7, m1 ; t15a psubd m11, m6, m2 ; t14a paddd m2, m6 ; out2 pmaxsd m7, m13 pmaxsd m11, m13 pminsd m7, m14 pminsd m11, m14 mova [r6-32*2], m12 pminsd m1, m14, [r6+32*0] ; t10a pminsd m12, m14, [r6+32*1] ; t11a psubd m6, m4, m1 ; t10 paddd m1, m4 ; -out1 psubd m4, m5, m12 ; t11 paddd m5, m12 ; out14 vpbroadcastd m12, [pd_1448] pmaxsd m6, m13 pmaxsd m4, m13 pminsd m6, m14 pminsd m4, m14 REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4 pmulld m12, [r6-32*3] ; t6 mova [r6-32*3], m5 paddd m5, m11, m7 ; -out5 (unshifted) psubd m11, m7 ; out10 (unshifted) paddd m7, m9, m10 ; -out7 (unshifted) psubd m9, m10 ; out8 (unshifted) psubd m10, m6, m4 ; -out9 (unshifted) paddd m6, m4 ; out6 (unshifted) paddd m4, m12, m8 ; out4 (unshifted) psubd m12, m8 ; -out11 (unshifted) ret .main_part1: ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973 ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290 ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106 ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601 psubd m8, m0, m4 ; t10a paddd m0, m4 ; t2a psubd m4, m1, m5 ; t11a paddd m1, m5 ; t3a psubd m5, m2, m6 ; t14a paddd m2, m6 ; t6a psubd m6, m3, m7 ; t15a paddd m7, m3 ; t7a REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7 vpbroadcastd m11, [pd_2276] vpbroadcastd m10, [pd_3406] ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11 ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10 psubd m3, m0, m2 ; t6 paddd m0, m2 ; t2 psubd m2, m1, m7 ; t7 paddd m1, m7 ; t3 psubd m7, m4, m6 ; t14a paddd m4, m6 ; t10a psubd m6, m8, m5 ; t15a paddd m5, m8 ; t11a REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later vpbroadcastd m11, [pd_1567] vpbroadcastd m10, [pd_3784] ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11 ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6-32*2], m2 mova [r6-32*1], m3 mova [r6+32*2], m6 mova [r6+32*3], m7 ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] .pass1: lea r6, [rsp+32*4] call m(iadst_16x8_internal_10bpc).main vpbroadcastd m14, [pd_3072] psrld m15, 11 psubd m13, m14, m15 call .pass1_rotations jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end vpbroadcastd m10, [pw_2048] pxor m11, m11 psubw m11, m10 mova m12, m0 pmulhrsw m0, m7, m11 mova m7, m1 pmulhrsw m1, m6, m10 mova m6, m2 pmulhrsw m2, m5, m11 mova m5, m3 pmulhrsw m3, m4, m10 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m5, m11 pmulhrsw m1, m6, m10 pmulhrsw m2, m7, m11 pmulhrsw m3, m12, m10 call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align .pass1_rotations: psubd m8, m13, m7 paddd m7, m14, m9 paddd m9, m14, m6 psubd m6, m13, m10 psubd m10, 
m13, m5 paddd m5, m14, m11 paddd m11, m14, m4 psubd m4, m13, m12 psubd m12, m15, m3 paddd m3, m15, [r6-32*1] paddd m13, m15, m2 psubd m2, m15, [r6-32*2] psubd m14, m15, m1 mova m1, m15 paddd m15, m0 psubd m0, m1, [r6-32*4] paddd m1, [r6-32*3] ret INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 .pass1: vpbroadcastd m15, [pd_2896] pmulld m0, m15, [cq+32* 0] pmulld m1, m15, [cq+32* 1] pmulld m2, m15, [cq+32* 2] pmulld m3, m15, [cq+32* 3] pmulld m4, m15, [cq+32* 4] pmulld m5, m15, [cq+32* 5] pmulld m6, m15, [cq+32* 6] pmulld m7, m15, [cq+32* 7] pmulld m8, m15, [cq+32* 8] pmulld m9, m15, [cq+32* 9] pmulld m10, m15, [cq+32*10] pmulld m11, m15, [cq+32*11] pmulld m12, m15, [cq+32*12] pmulld m13, m15, [cq+32*13] pmulld m14, m15, [cq+32*14] pmulld m15, [cq+32*15] mova [rsp], m7 vpbroadcastd m7, [pd_2048] REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [rsp] REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 mova [rsp], m15 vpbroadcastd m15, [pd_5793] REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pmulld m15, [rsp] mova [rsp], m7 vpbroadcastd m7, [pd_3072] REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [rsp] REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_16x8_internal_10bpc).transpose vpbroadcastd m10, [pw_4096] jmp m(idct_16x8_internal_10bpc).end INV_TXFM_16X8_FN dct, dct, 12 INV_TXFM_16X8_FN dct, identity, 12 INV_TXFM_16X8_FN dct, adst, 12 INV_TXFM_16X8_FN dct, flipadst, 12 cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x8_internal_10bpc).pass1 .pass2: call .pass2_main RET ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m11, [pd_2048] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x8_internal_12bpc).round_shift4 mova [cq+32* 8], m0 mova [cq+32* 9], m1 mova [cq+32*10], m2 mova [cq+32*11], m3 mova [cq+32*12], m4 mova [cq+32*13], m5 mova [cq+32*14], m6 mova [cq+32*15], m7 pmaxsd m0, m12, [cq+32*0] pmaxsd m1, m12, [cq+32*1] pmaxsd m2, m12, [cq+32*2] pmaxsd m3, m12, [cq+32*3] pmaxsd m4, m12, [cq+32*4] pmaxsd m5, m12, [cq+32*5] pmaxsd m6, m12, [cq+32*6] pmaxsd m7, m12, [cq+32*7] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x8_internal_12bpc).round_shift4 .end: packssdw m0, [cq+32* 8] packssdw m1, [cq+32* 9] packssdw m2, [cq+32*10] packssdw m3, [cq+32*11] packssdw m4, [cq+32*12] packssdw m5, [cq+32*13] packssdw m6, [cq+32*14] packssdw m7, [cq+32*15] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 call .write_16x4_start call m(idct_16x8_internal_10bpc).write_16x4_zero vpermq m0, m4, q3120 vpermq m1, m5, q3120 vpermq m2, m6, q3120 vpermq m3, m7, q3120 jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .write_16x4_start: vpbroadcastd m9, [pixel_12bpc_max] lea r3, [strideq*3] pxor m8, m8 ret INV_TXFM_16X8_FN adst, dct, 12 INV_TXFM_16X8_FN adst, adst, 12 INV_TXFM_16X8_FN adst, flipadst, 12 INV_TXFM_16X8_FN 
adst, identity, 12 cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_min] vpbroadcastd m14, [clip_20b_max] jmp m(iadst_16x8_internal_10bpc).pass1 .pass2: call .pass2_main call m(idct_16x8_internal_12bpc).end RET ALIGN function_align .pass2_main: call m(idct_8x16_internal_12bpc).transpose vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m11, [pd_2048] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_8x8_internal_12bpc).pass2_main2 mova [cq+32* 8], m0 mova [cq+32* 9], m1 mova [cq+32*10], m2 mova [cq+32*11], m3 mova [cq+32*12], m4 mova [cq+32*13], m5 mova [cq+32*14], m6 mova [cq+32*15], m7 pmaxsd m0, m12, [cq+32*0] pmaxsd m1, m12, [cq+32*1] pmaxsd m2, m12, [cq+32*2] pmaxsd m3, m12, [cq+32*3] pmaxsd m4, m12, [cq+32*4] pmaxsd m5, m12, [cq+32*5] pmaxsd m6, m12, [cq+32*6] pmaxsd m7, m12, [cq+32*7] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(iadst_8x8_internal_12bpc).pass2_main2 ret INV_TXFM_16X8_FN flipadst, dct, 12 INV_TXFM_16X8_FN flipadst, adst, 12 INV_TXFM_16X8_FN flipadst, flipadst, 12 INV_TXFM_16X8_FN flipadst, identity, 12 cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_min] vpbroadcastd m14, [clip_20b_max] jmp m(iflipadst_16x8_internal_10bpc).pass1 .pass2: call m(iadst_16x8_internal_12bpc).pass2_main packssdw m13, m0, [cq+32* 8] packssdw m12, m1, [cq+32* 9] packssdw m11, m2, [cq+32*10] packssdw m10, m3, [cq+32*11] packssdw m3, m4, [cq+32*12] packssdw m2, m5, [cq+32*13] packssdw m1, m6, [cq+32*14] packssdw m0, m7, [cq+32*15] REPX {vpermq x, x, q3120}, m0, m1, m2, m3 call m(idct_16x8_internal_12bpc).write_16x4_start call m(idct_16x8_internal_10bpc).write_16x4_zero vpermq m0, m10, q3120 vpermq m1, m11, q3120 vpermq m2, m12, q3120 vpermq m3, m13, q3120 call m(idct_16x8_internal_10bpc).write_16x4_zero RET INV_TXFM_16X8_FN identity, dct, 12 INV_TXFM_16X8_FN identity, adst, 12 INV_TXFM_16X8_FN identity, flipadst, 12 INV_TXFM_16X8_FN identity, identity, 12 cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_16x8_internal_10bpc).pass1 .pass2: call m(idct_16x8_internal_10bpc).transpose2 vpbroadcastd m10, [pw_4096] pmulhrsw m0, m10 pmulhrsw m1, m10 pmulhrsw m2, m10 pmulhrsw m3, m10 call m(idct_16x8_internal_12bpc).write_16x4_start call m(idct_16x8_internal_10bpc).write_16x4_zero jmp m(idct_16x8_internal_10bpc).end2 %macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth INV_TXFM_FN %1, %2, %3, 16x16, %4 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 vpbroadcastd m3, [dconly_%4bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, identity, 28 INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] .pass1: vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 jl .fast add cq, 32 call .main sub cq, 32 mova m10, [r6-32*4] mova m9, [r6-32*3] mova m8, [r6-32*2] psubd m15, m0, m10 ; out15 paddd m0, m10 ; out0 psubd m10, m1, m9 ; out14 paddd m1, m9 ; out1 psubd m9, m2, m8 ; out13 paddd m2, m8 ; out2 REPX {psrad x, 2}, m0, m1, m2 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova m2, [r6-32*1] mova m1, [r6+32*0] mova m0, 
[r6+32*1] REPX {psrad x, 2}, m9, m10, m15 psubd m8, m3, m2 ; out12 paddd m3, m2 ; out3 psubd m2, m4, m1 ; out11 paddd m4, m1 ; out4 psubd m1, m5, m0 ; out10 paddd m5, m0 ; out5 REPX {psrad x, 2}, m3, m4, m5 mova [r6-32*1], m3 mova [r6+32*0], m4 mova [r6+32*1], m5 mova m4, [r6+32*2] mova m3, [r6+32*3] REPX {psrad x, 2}, m1, m2, m8 psubd m5, m6, m4 ; out9 paddd m6, m4 ; out6 psubd m4, m7, m3 ; out8 paddd m7, m3 ; out7 REPX {psrad x, 2}, m6, m7, m4, m5 mova [r6+32*2], m6 mova [r6+32*3], m7 add r6, 32*8 mova [r6-32*4], m4 mova [r6-32*3], m5 mova [r6-32*2], m1 mova [r6-32*1], m2 mova [r6+32*0], m8 mova [r6+32*1], m9 mova [r6+32*2], m10 mova [r6+32*3], m15 .fast: add r6, 32*8 call .main mova m14, [r6-32*4] mova m13, [r6-32*3] mova m12, [r6-32*2] mova m11, [r6-32*1] mova m10, [r6+32*0] mova m9, [r6+32*1] mova m8, [r6+32*2] psubd m15, m0, m14 ; out15 paddd m0, m14 ; out0 psubd m14, m1, m13 ; out14 paddd m1, m13 ; out1 psubd m13, m2, m12 ; out13 paddd m2, m12 ; out2 psubd m12, m3, m11 ; out12 paddd m3, m11 ; out3 psubd m11, m4, m10 ; out11 paddd m4, m10 ; out4 psubd m10, m5, m9 ; out10 paddd m5, m9 ; out5 psubd m9, m6, m8 ; out9 paddd m6, m8 ; out6 psubd m8, m7, [r6+32*3] ; out8 paddd m7, [r6+32*3] ; out7 sub r6, 32*8 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call .transpose lea r6, [pw_5+128] mova [rsp], m15 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] .end: call .write_16x16 RET ALIGN function_align .write_16x16: mova [rsp+gprsize+32*0], m8 mova [rsp+gprsize+32*1], m9 mova [rsp+gprsize+32*2], m12 vpbroadcastd m12, [pw_2048] pmulhrsw m0, m12 pmulhrsw m1, m12 pmulhrsw m2, m12 pmulhrsw m3, m12 call m(idct_16x8_internal_10bpc).write_16x4_start .write_16x16_2: pmulhrsw m0, m12, m4 pmulhrsw m1, m12, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m12, m7 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+gprsize+32*0] pmulhrsw m1, m12, [rsp+gprsize+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m12, m11 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+gprsize+32*2] pmulhrsw m1, m12, m13 pmulhrsw m2, m12, m14 pmulhrsw m3, m12, m15 jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .transpose: test eobd, eobd jl .transpose_fast packssdw m8, [r6-32*4] packssdw m9, [r6-32*3] packssdw m10, [r6-32*2] packssdw m11, [r6-32*1] packssdw m12, [r6+32*0] packssdw m13, [r6+32*1] packssdw m14, [r6+32*2] packssdw m15, [r6+32*3] sub r6, 32*8 packssdw m0, [r6-32*4] packssdw m1, [r6-32*3] packssdw m2, [r6-32*2] packssdw m3, [r6-32*1] packssdw m4, [r6+32*0] packssdw m5, [r6+32*1] packssdw m6, [r6+32*2] packssdw m7, [r6+32*3] mova [r6], m8 punpckhwd m8, m0, m1 punpcklwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 punpckhwd m3, m6, m7 punpcklwd m6, m7 punpcklwd m7, m4, m5 punpckhwd m4, m5 punpckldq m5, m8, m2 punpckhdq m8, m2 punpckhdq m2, m0, m1 punpckldq m0, m1 punpckhdq m1, m7, m6 punpckldq m7, m6 punpckhdq m6, m4, m3 punpckldq m4, m3 punpckhqdq m3, m2, m1 punpcklqdq m2, m1 punpckhqdq m1, m0, m7 punpcklqdq m0, m7 punpcklqdq m7, m8, m6 punpckhqdq m8, m6 punpckhqdq m6, m5, m4 punpcklqdq m5, m4 mova m4, [r6] mova [r6], m8 punpcklwd m8, m4, m9 punpckhwd m4, m9 punpcklwd m9, m10, m11 punpckhwd m10, m11 punpckhwd m11, m14, m15 punpcklwd m14, m15 punpckhwd m15, m12, m13 punpcklwd m12, m13 punpckldq m13, m4, m10 punpckhdq m4, m10 punpckhdq m10, m8, m9 punpckldq m8, m9 punpckhdq m9, m12, m14 punpckldq m12, m14 punpckhdq m14, m15, m11 punpckldq m15, m11 punpckhqdq m11, m10, m9 punpcklqdq m10, m9 punpckhqdq 
m9, m8, m12 punpcklqdq m8, m12 punpcklqdq m12, m13, m15 punpckhqdq m13, m15 punpckhqdq m15, m4, m14 punpcklqdq m14, m4, m14 vperm2i128 m4, m0, m8, 0x31 vinserti128 m0, xm8, 1 vinserti128 m8, m5, xm12, 1 vperm2i128 m12, m5, 0x13 vperm2i128 m5, m1, m9, 0x31 vinserti128 m1, xm9, 1 vinserti128 m9, m6, xm13, 1 vperm2i128 m13, m6, 0x13 vperm2i128 m6, m2, m10, 0x31 vinserti128 m2, xm10, 1 vinserti128 m10, m7, xm14, 1 vperm2i128 m14, m7, 0x13 vperm2i128 m7, m3, m11, 0x31 vinserti128 m3, xm11, 1 mova xm11, [r6] vinserti128 m11, xm15, 1 vinserti128 m15, [r6+16], 0 ret .transpose_fast: call m(idct_16x8_internal_10bpc).transpose2 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 ret ALIGN function_align .main: mova m0, [cq+64* 1] mova m1, [cq+64* 3] mova m2, [cq+64* 5] mova m3, [cq+64* 7] mova m4, [cq+64* 9] mova m5, [cq+64*11] mova m6, [cq+64*13] mova m7, [cq+64*15] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+64* 0] mova m1, [cq+64* 2] mova m2, [cq+64* 4] mova m3, [cq+64* 6] mova m4, [cq+64* 8] mova m5, [cq+64*10] mova m6, [cq+64*12] mova m7, [cq+64*14] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf psrld m10, m11, 10 ; pd_2 REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] .pass1: vpbroadcastd m15, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 jl .fast add cq, 32 call .main sub cq, 32 vpbroadcastd m8, [pd_5120] paddd m4, m8 paddd m6, m8 paddd m9, m8 paddd m11, m8 vpbroadcastd m8, [pd_5119] psubd m5, m8, m5 psubd m7, m8, m7 psubd m10, m8, m10 psubd m12, m8, m12 REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 psrld m4, m15, 10 ; pd_2 paddd m0, m4 psubd m1, m4, m1 paddd m2, m4 psubd m3, m4, m3 psubd m7, m4, [r6-32*4] paddd m6, m4, [r6-32*3] psubd m5, m4, [r6-32*2] paddd m4, [r6-32*1] REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova [r6-32*1], m3 add r6, 32*8 mova [r6-32*4], m9 mova [r6-32*3], m10 mova [r6-32*2], m11 mova [r6-32*1], m12 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 .fast: add r6, 32*8 call .main vpbroadcastd m14, [pd_5120] vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 paddd m0, m15 psubd m1, m15, m1 paddd m2, m15 psubd m3, m15, m3 paddd m4, m14 psubd m5, m13, m5 paddd m6, m14 psubd m7, m13, m7 paddd m8, m14, m9 psubd m9, m13, m10 paddd m10, m14, m11 psubd m11, m13, m12 paddd m12, m15, [r6-32*1] psubd m13, m15, [r6-32*2] paddd m14, m15, [r6-32*3] psubd m15, [r6-32*4] .pass1_end: REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 sub r6, 32*8 jmp tx2q .pass2: call m(idct_16x16_internal_10bpc).transpose lea r6, [pw_5+128] mova [rsp], m15 call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*0], m8 mova [rsp+32*2], m12 mova [rsp+32*3], m13 vpbroadcastd m12, [pw_2048] pxor m13, m13 psubw m13, m12 pmulhrsw m0, m12 pmulhrsw m1, m13, [rsp+32*1] mova [rsp+32*1], m9 pmulhrsw m2, m12 pmulhrsw m3, m13 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m12, m4 pmulhrsw m1, m13, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m13, m7 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+32*0] pmulhrsw m1, 
m13, [rsp+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m13, m11 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m12, [rsp+32*2] pmulhrsw m1, m13, [rsp+32*3] pmulhrsw m2, m12, m14 pmulhrsw m3, m13, m15 call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align .main: mova m0, [cq+64* 2] mova m1, [cq+64*13] mova m2, [cq+64* 6] mova m3, [cq+64* 9] mova m4, [cq+64*10] mova m5, [cq+64* 5] mova m6, [cq+64*14] mova m7, [cq+64* 1] vpbroadcastd m12, [pd_2048] call m(iadst_16x8_internal_10bpc).main_part1 mova m0, [cq+64* 0] mova m1, [cq+64*15] mova m2, [cq+64* 4] mova m3, [cq+64*11] mova m4, [cq+64* 8] mova m5, [cq+64* 7] mova m6, [cq+64*12] mova m7, [cq+64* 3] jmp m(iadst_16x8_internal_10bpc).main_part2 INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] .pass1: vpbroadcastd m15, [pd_2896] lea r6, [rsp+32*4] sub eobd, 36 jl .fast add cq, 32 call m(iadst_16x16_internal_10bpc).main sub cq, 32 vpbroadcastd m8, [pd_5120] paddd m11, m8 paddd m9, m8 paddd m6, m8 paddd m4, m8 vpbroadcastd m8, [pd_5119] psubd m12, m8, m12 psubd m10, m8, m10 psubd m7, m8, m7 psubd m5, m8, m5 REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 mova [r6+32*0], m12 mova [r6+32*1], m11 mova [r6+32*2], m10 mova [r6+32*3], m9 psrld m9, m15, 10 ; pd_2 psubd m3, m9, m3 paddd m2, m9 psubd m1, m9, m1 paddd m0, m9 psubd m12, m9, [r6-32*4] paddd m11, m9, [r6-32*3] psubd m10, m9, [r6-32*2] paddd m9, [r6-32*1] REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 mova [r6-32*4], m12 mova [r6-32*3], m11 mova [r6-32*2], m10 mova [r6-32*1], m9 add r6, 32*8 mova [r6-32*4], m7 mova [r6-32*3], m6 mova [r6-32*2], m5 mova [r6-32*1], m4 mova [r6+32*0], m3 mova [r6+32*1], m2 mova [r6+32*2], m1 mova [r6+32*3], m0 .fast: add r6, 32*8 call m(iadst_16x16_internal_10bpc).main vpbroadcastd m14, [pd_5120] vpbroadcastd m13, [pd_5119] psrld m15, 10 ; pd_2 psubd m8, m13, m7 paddd m7, m14, m9 paddd m9, m14, m6 psubd m6, m13, m10 psubd m10, m13, m5 paddd m5, m14, m11 paddd m11, m14, m4 psubd m4, m13, m12 psubd m12, m15, m3 paddd m3, m15, [r6-32*1] paddd m13, m15, m2 psubd m2, m15, [r6-32*2] psubd m14, m15, m1 mova m1, m15 paddd m15, m0 psubd m0, m1, [r6-32*4] paddd m1, [r6-32*3] jmp m(iadst_16x16_internal_10bpc).pass1_end .pass2: call m(idct_16x16_internal_10bpc).transpose lea r6, [pw_5+128] mova [rsp], m15 call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end mova [rsp+32*3], m3 mova [rsp+32*2], m2 mova [rsp+32*0], m0 mova m2, m13 mova m3, m12 vpbroadcastd m12, [pw_2048] pxor m13, m13 psubw m13, m12 pmulhrsw m0, m13, m15 pmulhrsw m1, m12, m14 pmulhrsw m2, m13 pmulhrsw m3, m12 mova m14, m8 mova m15, m9 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m13, m11 pmulhrsw m1, m12, m10 pmulhrsw m2, m13, m15 pmulhrsw m3, m12, m14 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, m7 pmulhrsw m1, m12, m6 pmulhrsw m2, m13, m5 pmulhrsw m3, m12, m4 call m(idct_16x8_internal_10bpc).write_16x4_zero pmulhrsw m0, m13, [rsp+32*3] pmulhrsw m1, m12, [rsp+32*2] pmulhrsw m2, m13, [rsp+32*1] pmulhrsw m3, m12, [rsp+32*0] call m(idct_16x8_internal_10bpc).write_16x4_zero RET INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m15, [pd_5793] vpbroadcastd m7, [pd_5120] lea r6, 
[rsp+32*4] sub eobd, 36 jl .fast mov r3, -32*8*4 .righthalf: pmulld m0, m15, [cq+r3+32*33] pmulld m1, m15, [cq+r3+32*35] pmulld m2, m15, [cq+r3+32*37] pmulld m3, m15, [cq+r3+32*39] add r6, 32*4 REPX {paddd x, m7}, m0, m1, m2, m3 REPX {psrad x, 13}, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 add r3, 32*8 jl .righthalf .fast: pmulld m0, m15, [cq+64* 0] pmulld m1, m15, [cq+64* 1] pmulld m2, m15, [cq+64* 2] pmulld m3, m15, [cq+64* 3] pmulld m4, m15, [cq+64* 4] pmulld m5, m15, [cq+64* 5] pmulld m6, m15, [cq+64* 6] pmulld m8, m15, [cq+64* 7] mova [cq], m8 pmulld m8, m15, [cq+64* 8] pmulld m9, m15, [cq+64* 9] pmulld m10, m15, [cq+64*10] pmulld m11, m15, [cq+64*11] pmulld m12, m15, [cq+64*12] pmulld m13, m15, [cq+64*13] pmulld m14, m15, [cq+64*14] pmulld m15, [cq+64*15] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ m8, m9, m10, m11, m12, m13, m14, m15 paddd m7, [cq] REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: call m(idct_16x16_internal_10bpc).transpose mova [cq+32*0], m15 mova [cq+32*1], m0 vpbroadcastd m15, [pw_1697x16] REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14 mova m0, [cq+32*1] mova [cq+32*1], m1 IDTX16 0, 1, 15 mova m1, [cq+32*0] pmulhrsw m15, m1 paddsw m1, m1 paddsw m15, m1 mova m1, [cq+32*1] jmp m(idct_16x16_internal_10bpc).end INV_TXFM_16X16_FN dct, dct, 0, 12 INV_TXFM_16X16_FN dct, identity, 28, 12 INV_TXFM_16X16_FN dct, adst, 0, 12 INV_TXFM_16X16_FN dct, flipadst, 0, 12 cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x16_internal_10bpc).pass1 .pass2: mova [cq+32* 8], m8 mova [cq+32* 9], m9 mova [cq+32*10], m10 mova [cq+32*11], m11 mova [cq+32*12], m12 mova [cq+32*13], m13 mova [cq+32*14], m14 mova [cq+32*15], m15 call .pass2_main packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 packssdw m4, m8, m9 packssdw m5, m10, m11 packssdw m6, m12, m13 packssdw m7, m14, m15 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova [r6-32*1], m3 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 mova m0, [cq+32* 8] mova m1, [cq+32* 9] mova m2, [cq+32*10] mova m3, [cq+32*11] mova m4, [cq+32*12] mova m5, [cq+32*13] mova m6, [cq+32*14] mova m7, [cq+32*15] mov r5, r6 add r6, 32*16 call .pass2_main jmp m(iadst_16x16_internal_12bpc).end ALIGN function_align .write_16x16: mova [rsp+gprsize+32*0], m8 mova [rsp+gprsize+32*1], m9 mova [rsp+gprsize+32*2], m12 vpbroadcastd m12, [pw_16384] pmulhrsw m0, m12 pmulhrsw m1, m12 pmulhrsw m2, m12 pmulhrsw m3, m12 call m(idct_16x8_internal_12bpc).write_16x4_start call m(idct_16x8_internal_10bpc).write_16x4_zero jmp m(idct_16x16_internal_10bpc).write_16x16_2 ALIGN function_align .pass2_main: call m(idct_8x8_internal_12bpc).transpose_8x8 mova [cq+32* 0], m0 mova [cq+32* 1], m2 mova [cq+32* 2], m4 mova [cq+32* 3], m6 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] pmaxsd m0, m12, m1 pmaxsd m1, m12, m3 pmaxsd m2, m12, m5 pmaxsd m3, m12, m7 REPX {pminsd x, m13}, m0, m1, m2, m3 test eobd, eobd jge .pass2_slow pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 jmp .pass2_fast .pass2_slow: sub r6, 32*8 mova m8, [r6-32*4] mova m4, [r6-32*3] mova m10, [r6-32*2] mova m5, [r6-32*1] mova m12, [r6+32*0] mova m6, [r6+32*1] mova m14, [r6+32*2] mova m7, [r6+32*3] TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 mova [cq+32* 4], m8 mova [cq+32* 5], m10 mova [cq+32* 
6], m12 mova [cq+32* 7], m14 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m4, m5, m6, m7 REPX {pminsd x, m13}, m4, m5, m6, m7 .pass2_fast: vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(idct_8x16_internal_10bpc).main_oddhalf pmaxsd m0, m12, [cq+32* 0] pmaxsd m1, m12, [cq+32* 1] pmaxsd m2, m12, [cq+32* 2] pmaxsd m3, m12, [cq+32* 3] REPX {pminsd x, m13}, m0, m1, m2, m3 test eobd, eobd jge .pass2_slow2 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 jmp .pass2_fast2 .pass2_slow2: pmaxsd m4, m12, [cq+32* 4] pmaxsd m5, m12, [cq+32* 5] pmaxsd m6, m12, [cq+32* 6] pmaxsd m7, m12, [cq+32* 7] REPX {pminsd x, m13}, m4, m5, m6, m7 .pass2_fast2: call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf psrad m11, 8 ; pd_8 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_16x8_internal_10bpc).pass1_rotations REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ret INV_TXFM_16X16_FN adst, dct, 0, 12 INV_TXFM_16X16_FN adst, adst, 0, 12 INV_TXFM_16X16_FN adst, flipadst, 0, 12 cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_min] vpbroadcastd m14, [clip_20b_max] jmp m(iadst_16x16_internal_10bpc).pass1 .pass2: call .pass2_part1 call m(iadst_16x8_internal_10bpc).pass1_rotations call .pass2_part2 call m(iadst_16x8_internal_10bpc).pass1_rotations .pass2_part3: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 .end: packssdw m15, m14 packssdw m14, m13, m12 packssdw m13, m11, m10 packssdw m12, m9, m8 packssdw m11, m7, m6 packssdw m10, m5, m4 packssdw m7, m3, m2 packssdw m6, m1, m0 vpblendd m0, m6, [r5-32*4], 0x33 vpblendd m1, m6, [r5-32*4], 0xcc vpblendd m2, m7, [r5-32*3], 0x33 vpblendd m3, m7, [r5-32*3], 0xcc vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 call m(idct_16x8_internal_12bpc).write_16x4_start call m(idct_16x8_internal_10bpc).write_16x4_zero vpblendd m0, m10, [r5-32*2], 0x33 vpblendd m1, m10, [r5-32*2], 0xcc vpblendd m2, m11, [r5-32*1], 0x33 vpblendd m3, m11, [r5-32*1], 0xcc vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 call m(idct_16x8_internal_10bpc).write_16x4_zero vpblendd m0, m12, [r5+32*0], 0x33 vpblendd m1, m12, [r5+32*0], 0xcc vpblendd m2, m13, [r5+32*1], 0x33 vpblendd m3, m13, [r5+32*1], 0xcc vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 call m(idct_16x8_internal_10bpc).write_16x4_zero vpblendd m0, m14, [r5+32*2], 0x33 vpblendd m1, m14, [r5+32*2], 0xcc vpblendd m2, m15, [r5+32*3], 0x33 vpblendd m3, m15, [r5+32*3], 0xcc vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 call m(idct_16x8_internal_10bpc).write_16x4_zero RET ALIGN function_align .pass2_part1: mova [cq+32* 8], m8 mova [cq+32* 9], m9 mova [cq+32*10], m10 mova [cq+32*11], m11 mova [cq+32*12], m12 mova [cq+32*13], m13 mova [cq+32*14], m14 mova [cq+32*15], m15 .pass2_main: call m(idct_8x8_internal_12bpc).transpose_8x8 mova [cq+32* 0], m0 mova [cq+32* 1], m3 mova [cq+32* 2], m4 mova [cq+32* 3], m7 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] pmaxsd m0, m13, m2 pmaxsd m2, m13, m6 pmaxsd m5, m13, m5 pmaxsd m7, m13, m1 REPX {pminsd x, m14}, m0, m2, m5, m7 test eobd, eobd jge .pass2_slow pxor m1, m1 REPX {mova x, m1}, m3, m4, m6 jmp .pass2_fast .pass2_slow: sub r6, 32*8 mova m8, [r6-32*4] mova m3, [r6-32*3] mova m4, [r6-32*2] mova m11, 
[r6-32*1] mova m12, [r6+32*0] mova m1, [r6+32*1] mova m6, [r6+32*2] mova m15, [r6+32*3] TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 mova [cq+32* 4], m8 mova [cq+32* 5], m11 mova [cq+32* 6], m12 mova [cq+32* 7], m15 vpbroadcastd m13, [clip_18b_min] vpbroadcastd m14, [clip_18b_max] REPX {pmaxsd x, m13}, m1, m3, m4, m6 REPX {pminsd x, m14}, m1, m3, m4, m6 .pass2_fast: vpbroadcastd m12, [pd_2048] vpbroadcastd m15, [pd_2896] call m(iadst_16x8_internal_10bpc).main_part1 pmaxsd m0, m13, [cq+32* 0] ; 0 pmaxsd m7, m13, [cq+32* 1] ; 3 pmaxsd m2, m13, [cq+32* 2] ; 4 pmaxsd m5, m13, [cq+32* 3] ; 7 REPX {pminsd x, m14}, m0, m2, m5, m7 test eobd, eobd jge .pass2_slow2 pxor m1, m1 REPX {mova x, m1}, m3, m4, m6 jmp .pass2_fast2 .pass2_slow2: pmaxsd m4, m13, [cq+32* 4] ; 8 pmaxsd m3, m13, [cq+32* 5] ; 11 pmaxsd m6, m13, [cq+32* 6] ; 12 pmaxsd m1, m13, [cq+32* 7] ; 15 REPX {pminsd x, m14}, m1, m3, m4, m6 .pass2_fast2: call m(iadst_16x8_internal_10bpc).main_part2 vpbroadcastd m14, [pd_17408] psrld m15, 11 ; pd_1 psubd m13, m14, m15 ; pd_17407 pslld m15, 3 ; pd_8 ret ALIGN function_align .pass2_part2: REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 packssdw m0, m1 packssdw m1, m2, m3 packssdw m2, m4, m5 packssdw m3, m6, m7 packssdw m4, m8, m9 packssdw m5, m10, m11 packssdw m6, m12, m13 packssdw m7, m14, m15 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova [r6-32*1], m3 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 mova m0, [cq+32* 8] mova m1, [cq+32* 9] mova m2, [cq+32*10] mova m3, [cq+32*11] mova m4, [cq+32*12] mova m5, [cq+32*13] mova m6, [cq+32*14] mova m7, [cq+32*15] mov r5, r6 add r6, 32*16 jmp .pass2_main INV_TXFM_16X16_FN flipadst, dct, 0, 12 INV_TXFM_16X16_FN flipadst, adst, 0, 12 INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m13, [clip_20b_min] vpbroadcastd m14, [clip_20b_max] jmp m(iflipadst_16x16_internal_10bpc).pass1 .pass2: call m(iadst_16x16_internal_12bpc).pass2_part1 call m(iflipadst_16x8_internal_10bpc).pass1_rotations call m(iadst_16x16_internal_12bpc).pass2_part2 call m(iflipadst_16x8_internal_10bpc).pass1_rotations jmp m(iadst_16x16_internal_12bpc).pass2_part3 INV_TXFM_16X16_FN identity, dct, -92, 12 INV_TXFM_16X16_FN identity, identity, 0, 12 %macro IDTX16_12BPC 1 ; src pmulld m6, m7, m%1 paddd m6, m15 psrad m6, 12 paddd m6, m%1 psrad m%1, m6, 1 %endmacro cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 vpbroadcastd m7, [pd_1697] vpbroadcastd m15, [pd_5120] lea r6, [rsp+32*4] sub eobd, 36 jl .fast mov r3, -32*8*4 .righthalf: mova m10, [cq+r3+32*33] mova m11, [cq+r3+32*35] mova m12, [cq+r3+32*37] mova m13, [cq+r3+32*39] add r6, 32*4 pmulld m0, m7, m10 pmulld m1, m7, m11 pmulld m2, m7, m12 pmulld m3, m7, m13 REPX {paddd x, m15}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 paddd m0, m10 paddd m1, m11 paddd m2, m12 paddd m3, m13 REPX {psrad x, 1 }, m0, m1, m2, m3 mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 add r3, 32*8 jl .righthalf .fast: mova m0, [cq+64* 0] mova m1, [cq+64* 1] mova m2, [cq+64* 2] mova m3, [cq+64* 3] mova m4, [cq+64* 4] mova m5, [cq+64* 5] mova m8, [cq+64* 6] mova m9, [cq+64* 7] REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 mova [cq+64*0], m8 mova [cq+64*1], m9 mova m8, [cq+64* 8] mova m9, [cq+64* 9] mova m10, [cq+64*10] mova m11, [cq+64*11] mova m12, [cq+64*12] mova m13, [cq+64*13] mova m14, 
[cq+64*14] REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 mova m6, [cq+64*15] pmulld m7, m6 paddd m7, m15 psrad m7, 12 paddd m7, m6 mova m6, [cq+64*0] psrad m15, m7, 1 mova m7, [cq+64*1] jmp tx2q .pass2: call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x16_internal_10bpc).transpose_fast test eobd, eobd jl .pass2_fast mova [cq+32* 8], m0 mova [cq+32* 9], m1 mova [cq+32*10], m2 mova [cq+32*11], m3 mova [cq+32*12], m4 mova [cq+32*13], m5 mova [cq+32*14], m6 mova [cq+32*15], m7 mova m8, [r6-32*4] mova m9, [r6-32*3] mova m10, [r6-32*2] mova m11, [r6-32*1] mova m12, [r6+32*0] mova m13, [r6+32*1] mova m14, [r6+32*2] mova m15, [r6+32*3] sub r6, 32*8 mova m0, [r6-32*4] mova m1, [r6-32*3] mova m2, [r6-32*2] mova m3, [r6-32*1] mova m4, [r6+32*0] mova m5, [r6+32*1] mova m6, [r6+32*2] mova m7, [r6+32*3] call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x8_internal_10bpc).transpose2 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 mova m12, m4 mova m13, m5 mova m14, m6 mova m15, m7 mova m0, [cq+32* 8] mova m1, [cq+32* 9] mova m2, [cq+32*10] mova m3, [cq+32*11] mova m4, [cq+32*12] mova m5, [cq+32*13] mova m6, [cq+32*14] mova m7, [cq+32*15] .pass2_fast: call m(idct_16x16_internal_12bpc).write_16x16 RET %macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack mova m%4, [r6+32*(%1-4)] mova m%2, [r5+32*(3-%1)] mova m%5, [r4+32*(%1-4)] psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 pmaxsd m%3, m12 pminsd m%1, m13 pminsd m%3, m13 paddd m%1, m11 paddd m%3, m11 psubd m%4, m%1, m%2 ; out31 - n paddd m%1, m%2 ; out0 + n paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 %if %7 & 1 packssdw m%1, m%3 ; out0 + n, out16 + n packssdw m%2, m%4 ; out15 - n, out31 - n %endif %endmacro cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vbroadcasti128 m14, [idct32_shuf] mov r4, cq call .pass1_main mova [rsp+32*0], m2 mova [rsp+32*1], m3 cmp eobd, 43 jge .eob43 pxor m4, m4 REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 jmp .pass1_end_fast .eob43: lea r6, [rsp+32*8] mova [r6-32*4], m0 mova [r6-32*3], m1 call .pass1_main mova [rsp+32*2], m2 cmp eobd, 107 jge .eob107 mova m11, m3 mova m2, m0 mova m3, m1 mova m0, [r6-32*4] mova m1, [r6-32*3] pxor m4, m4 .pass1_end_fast: vpbroadcastd m10, [pw_2048] lea r6, [deint_shuf+128] REPX {mova x, m4}, m5, m6, m7 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .end .eob107: mova [rsp+32*3], m3 mova [r6-32*2], m0 mova [r6-32*1], m1 call .pass1_main cmp eobd, 171 jge .eob171 pshufd m12, m2, q1032 pshufd m13, m3, q1032 mova m4, m0 mova m5, m1 pxor m6, m6 REPX {mova x, m6}, m7, m14, m15 jmp .pass1_end .eob171: mova [r6+32*0], m0 mova [r6+32*1], m1 mova [r6+32*2], m2 mova [r6+32*3], m3 call .pass1_main pshufd m12, [r6+32*2], q1032 ; out19 out17 pshufd m13, [r6+32*3], q1032 ; out23 out21 mova m4, [r6+32*0] ; out16 out18 mova m5, [r6+32*1] ; out20 out22 pshufd m14, m2, q1032 ; out27 out25 pshufd m15, m3, q1032 ; out31 out29 mova m6, m0 ; out24 out26 mova m7, m1 ; out28 out30 .pass1_end: mova m0, [r6-32*4] ; out0 out2 mova m1, [r6-32*3] ; out4 out6 mova m2, [r6-32*2] ; out8 out10 mova m3, [r6-32*1] ; out12 out14 lea r6, [deint_shuf+128] mova m11, [rsp+32*3] ; out13 out15 vpbroadcastd m10, [pw_2048] call m(inv_txfm_add_dct_dct_8x32_8bpc).main .end: ; [rsp+0*32] = m12 vpbroadcastd m12, 
[pw_2048] mov cq, r4 mova [rsp+32*1], m8 mova [rsp+32*2], m9 mova [rsp+32*3], m10 mova [rsp+32*4], m11 vpermq m0, m0, q3120 vpermq m1, m1, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4_start vpermq m0, m2, q3120 vpermq m1, m3, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m4, q3120 vpermq m1, m5, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m6, q3120 vpermq m1, m7, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*1], q3120 vpermq m1, [rsp+32*2], q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*3], q3120 vpermq m1, [rsp+32*4], q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [rsp+32*0], q3120 vpermq m1, m13, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m14, q3120 vpermq m1, m15, q2031 pmulhrsw m0, m12 pmulhrsw m1, m12 call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: imul r6d, [cq], 181 vpbroadcastd m2, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 ALIGN function_align .pass1_main_part1: mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] mova m3, [cq+128*3] mova m4, [cq+128*4] mova m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] call m(idct_8x8_internal_10bpc).main psrld m1, m11, 10 ; pd_2 REPX {paddd x, m1}, m0, m6, m5, m3 paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 paddd m0, m9 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 ret ALIGN function_align .pass1_main: call .pass1_main_part1 add cq, 32 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 pshufb m0, m14 pshufb m2, m14 pshufb m4, m14 pshufb m6, m14 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 vperm2i128 m1, m0, m2, 0x31 ; 4 6 vinserti128 m0, xm2, 1 ; 0 2 vinserti128 m2, m3, xm4, 1 ; 1 3 vperm2i128 m3, m4, 0x31 ; 5 7 ret .main_oddhalf_part1_fast_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_part1_fast: ; lower half zero vpbroadcastd m7, [pd_4091] vpbroadcastd m8, [pd_201] vpbroadcastd m6, [pd_m1380] vpbroadcastd m9, [pd_3857] vpbroadcastd m5, [pd_3703] vpbroadcastd m10, [pd_1751] vpbroadcastd m4, [pd_m2751] vpbroadcastd m15, [pd_3035] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 pmulld m1, m9 pmulld m5, m2 pmulld m2, m10 pmulld m4, m3 pmulld m3, m15 jmp .main_oddhalf_part1_fast2 .main_oddhalf_part1_rect2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a .main_oddhalf_part1_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 psubd m8, m0, m4 ; t17 paddd m0, m4 ; t16 psubd m4, m6, m2 ; t18 paddd m6, m2 ; t19 psubd m2, m1, m5 ; t29 paddd m1, m5 ; t28 psubd m5, m7, m3 ; t30 paddd m7, m3 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 vpbroadcastd m15, 
[pd_4017] vpbroadcastd m10, [pd_799] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a paddd m7, m1 ; t31a psubd m1, m5, m4 ; t18 paddd m5, m4 ; t17 psubd m4, m8, m2 ; t29 paddd m8, m2 ; t30 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 mova [r6-32*4], m0 mova [r6-32*3], m5 mova [r6-32*2], m4 mova [r6-32*1], m6 mova [r6+32*0], m3 mova [r6+32*1], m1 mova [r6+32*2], m8 mova [r6+32*3], m7 ret .main_oddhalf_part2_fast_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_oddhalf_part2_fast: ; lower half zero vpbroadcastd m7, [pd_m601] vpbroadcastd m8, [pd_4052] vpbroadcastd m6, [pd_3973] vpbroadcastd m9, [pd_995] vpbroadcastd m5, [pd_m2106] vpbroadcastd m10, [pd_3513] vpbroadcastd m4, [pd_3290] vpbroadcastd m15, [pd_2440] pmulld m7, m0 pmulld m0, m8 pmulld m6, m1 pmulld m1, m9 pmulld m5, m2 pmulld m2, m10 pmulld m4, m3 pmulld m3, m15 jmp .main_oddhalf_part2_fast2 .main_oddhalf_part2_rect2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a .main_oddhalf_part2_fast2: REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 psubd m8, m0, m4 ; t25 paddd m0, m4 ; t24 psubd m4, m6, m2 ; t26 paddd m6, m2 ; t27 psubd m2, m1, m5 ; t21 paddd m1, m5 ; t20 psubd m5, m7, m3 ; t22 paddd m7, m3 ; t23 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 vpbroadcastd m15, [pd_2276] vpbroadcastd m10, [pd_3406] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m1 ; t20a paddd m7, m1 ; t23a psubd m1, m5, m4 ; t21 paddd m5, m4 ; t22 psubd m4, m8, m2 ; t26 paddd m8, m2 ; t25 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 vpbroadcastd m15, [pd_3784] vpbroadcastd m10, [pd_1567] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 mova m9, [r6-32*4] ; t16a mova m10, [r6-32*3] ; t17 psubd m2, m9, m7 ; t23 paddd m9, m7 ; t16 psubd m7, m10, m5 ; t22a paddd m10, m5 ; t17a REPX {pmaxsd x, m12}, m9, m10, m2, m7 REPX {pminsd x, m13}, m9, m10, m2, m7 mova [r6-32*4], m9 mova [r6-32*3], m10 mova m9, [r6-32*2] ; t18a mova m10, [r6-32*1] ; t19 psubd m5, m9, m1 ; t21 paddd m9, m1 ; t18 psubd m1, m10, m6 ; t20a paddd m10, m6 ; t19a REPX {pmaxsd x, m12}, m9, m10, m5, m1 REPX {pminsd x, m13}, m9, m10, m5, m1 mova [r6-32*2], m9 mova [r6-32*1], m10 mova m9, [r6+32*0] ; t28 mova m10, [r6+32*1] ; t29a psubd m6, m9, m3 ; t27a paddd m9, m3 ; t28a psubd m3, m10, m4 ; t26 paddd m10, m4 ; t29 REPX {pmaxsd x, m12}, m9, m10, m6, m3 REPX {pminsd x, m13}, m9, m10, m6, m3 REPX {pmulld x, m14}, m6, m3, m1, m5 paddd m6, m11 paddd m3, m11 psubd m4, m6, m1 ; t20 paddd m6, m1 ; t27 psubd m1, m3, m5 ; t21a paddd 
m3, m5 ; t26a REPX {psrad x, 12 }, m4, m1, m3, m6 mova [r6+32*0], m4 mova [r6+32*1], m1 mova m4, [r6+32*2] ; t30 mova m1, [r6+32*3] ; t31a psubd m5, m4, m8 ; t25a paddd m4, m8 ; t30a psubd m8, m1, m0 ; t24 paddd m1, m0 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m1 REPX {pminsd x, m13}, m8, m5, m4, m1 REPX {pmulld x, m14}, m5, m8, m7, m2 paddd m5, m11 paddd m8, m11 psubd m0, m5, m7 ; t22 paddd m5, m7 ; t25 psubd m7, m8, m2 ; t23a paddd m2, m8 ; t24a REPX {psrad x, 12 }, m0, m7, m2, m5 mova [r6+32*2], m0 mova [r6+32*3], m7 mov r4, r6 add r6, 32*8 mova [r6-32*4], m2 mova [r6-32*3], m5 mova [r6-32*2], m3 mova [r6-32*1], m6 mova [r6+32*0], m9 mova [r6+32*1], m10 mova [r6+32*2], m4 mova [r6+32*3], m1 mov r5, r6 add r6, 32*8 ret ALIGN function_align .main_end: psrld m11, 10 ; pd_2 IDCT32_END 0, 15, 8, 9, 10, 2 IDCT32_END 1, 14, 8, 9, 10, 2 punpckhwd m8, m0, m1 ; 16 17 punpcklwd m0, m1 ; 0 1 punpcklwd m1, m14, m15 ; 14 15 punpckhwd m14, m15 ; 30 31 mova [r5+32*3], m8 mova [r5+32*2], m14 IDCT32_END 2, 15, 8, 9, 10, 2 IDCT32_END 3, 14, 8, 9, 10, 2 punpckhwd m8, m2, m3 ; 18 19 punpcklwd m2, m3 ; 2 3 punpcklwd m3, m14, m15 ; 12 13 punpckhwd m14, m15 ; 28 29 mova [r5+32*1], m8 mova [r5+32*0], m14 IDCT32_END 4, 15, 8, 9, 10, 2 IDCT32_END 5, 14, 8, 9, 10, 2 punpckhwd m8, m4, m5 ; 20 21 punpcklwd m4, m5 ; 4 5 punpcklwd m5, m14, m15 ; 10 11 punpckhwd m14, m15 ; 26 27 mova [r5-32*1], m8 mova [r5-32*2], m14 IDCT32_END 6, 15, 8, 9, 10, 2 IDCT32_END 7, 14, 8, 9, 10, 2 punpckhwd m8, m6, m7 ; 22 23 punpcklwd m6, m7 ; 6 7 punpcklwd m7, m14, m15 ; 8 9 punpckhwd m14, m15 ; 24 25 mova [r5-32*3], m8 mova [r5-32*4], m14 .transpose: punpckhdq m15, m3, m1 punpckldq m3, m1 punpckhdq m1, m4, m6 punpckldq m4, m6 punpckhdq m6, m0, m2 punpckldq m0, m2 punpckhdq m2, m7, m5 punpckldq m7, m5 punpcklqdq m5, m2, m15 punpckhqdq m2, m15 punpckhqdq m15, m7, m3 punpcklqdq m7, m3 punpckhqdq m3, m6, m1 punpcklqdq m6, m1 punpckhqdq m1, m0, m4 punpcklqdq m0, m4 vperm2i128 m4, m0, m7, 0x31 vinserti128 m0, xm7, 1 vperm2i128 m7, m3, m2, 0x31 vinserti128 m3, xm2, 1 vinserti128 m2, m6, xm5, 1 vperm2i128 m6, m5, 0x31 vperm2i128 m5, m1, m15, 0x31 vinserti128 m1, xm15, 1 ret cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m5, [pw_5] pxor m6, m6 mov r6d, eobd add eobb, 21 cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 lea r6, [strideq*3] lea r5, [strideq*5] lea r4, [strideq+r6*2] ; strideq*7 .loop: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {paddsw x, m5}, m0, m1, m2, m3 REPX {psraw x, 3 }, m0, m1, m2, m3 call .main_zero add cq, 32 lea dstq, [dstq+strideq*8] sub eobd, 64 jge .loop RET ALIGN function_align .main_zero: REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 .main: punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m4 punpcklwd m0, m4 punpckhwd m4, m2, m1 punpcklwd m2, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 mova xm4, [dstq+strideq*0] vinserti128 m4, [dstq+strideq*4], 1 paddw m0, m4 mova xm4, [dstq+strideq*1] vinserti128 m4, [dstq+r5 ], 1 paddw m1, m4 mova xm4, [dstq+strideq*2] vinserti128 m4, [dstq+r6*2 ], 1 paddw m2, m4 mova xm4, [dstq+r6 ] vinserti128 m4, [dstq+r4 ], 1 paddw m3, m4 REPX {pmaxsw x, m6}, m0, m1, m2, m3 REPX {pminsw x, m7}, m0, m1, m2, m3 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*4], m0, 1 mova 
[dstq+strideq*1], xm1 vextracti128 [dstq+r5 ], m1, 1 mova [dstq+strideq*2], xm2 vextracti128 [dstq+r6*2 ], m2, 1 mova [dstq+r6 ], xm3 vextracti128 [dstq+r4 ], m3, 1 ret cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] mov r4, cq lea r6, [rsp+32*4] call .pass1_main cmp eobd, 43 jge .eob43 jmp .pass2_fast .eob43: call .pass1_main cmp eobd, 107 jge .eob107 .pass2_fast: mov cq, r4 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] pmaxsd m0, m12, [cq+128*1+ 0] pmaxsd m1, m12, [cq+128*7+ 0] pmaxsd m2, m12, [cq+128*1+32] pmaxsd m3, m12, [cq+128*7+32] REPX {pminsd x, m13}, m0, m1, m2, m3 vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast pmaxsd m0, m12, [cq+128*3+ 0] pmaxsd m1, m12, [cq+128*5+ 0] pmaxsd m2, m12, [cq+128*3+32] pmaxsd m3, m12, [cq+128*5+32] REPX {pminsd x, m13}, m0, m1, m2, m3 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast pmaxsd m0, m12, [cq+128*2+ 0] pmaxsd m1, m12, [cq+128*6+ 0] pmaxsd m2, m12, [cq+128*2+32] pmaxsd m3, m12, [cq+128*6+32] REPX {pminsd x, m13}, m0, m1, m2, m3 call m(idct_8x16_internal_10bpc).main_oddhalf_fast pmaxsd m0, m12, [cq+128*0+ 0] pmaxsd m1, m12, [cq+128*4+ 0] pmaxsd m2, m12, [cq+128*0+32] pmaxsd m3, m12, [cq+128*4+32] REPX {pminsd x, m13}, m0, m1, m2, m3 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf jmp .pass2_end .eob107: call .pass1_main cmp eobd, 171 jge .eob171 jmp .pass2 .eob171: call .pass1_main .pass2: mov cq, r4 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] pmaxsd m0, m12, [cq+128*1+ 0] pmaxsd m1, m12, [cq+128*7+ 0] pmaxsd m2, m12, [cq+128*1+32] pmaxsd m3, m12, [cq+128*7+32] pmaxsd m4, m12, [cq+128*1+64] pmaxsd m5, m12, [cq+128*7+64] pmaxsd m6, m12, [cq+128*1+96] pmaxsd m7, m12, [cq+128*7+96] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 pmaxsd m0, m12, [cq+128*3+ 0] pmaxsd m1, m12, [cq+128*5+ 0] pmaxsd m2, m12, [cq+128*3+32] pmaxsd m3, m12, [cq+128*5+32] pmaxsd m4, m12, [cq+128*3+64] pmaxsd m5, m12, [cq+128*5+64] pmaxsd m6, m12, [cq+128*3+96] pmaxsd m7, m12, [cq+128*5+96] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 pmaxsd m0, m12, [cq+128*2+ 0] pmaxsd m1, m12, [cq+128*6+ 0] pmaxsd m2, m12, [cq+128*2+32] pmaxsd m3, m12, [cq+128*6+32] pmaxsd m4, m12, [cq+128*2+64] pmaxsd m5, m12, [cq+128*6+64] pmaxsd m6, m12, [cq+128*2+96] pmaxsd m7, m12, [cq+128*6+96] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x16_internal_10bpc).main_oddhalf pmaxsd m0, m12, [cq+128*0+ 0] pmaxsd m1, m12, [cq+128*4+ 0] pmaxsd m2, m12, [cq+128*0+32] pmaxsd m3, m12, [cq+128*4+32] pmaxsd m4, m12, [cq+128*0+64] pmaxsd m5, m12, [cq+128*4+64] pmaxsd m6, m12, [cq+128*0+96] pmaxsd m7, m12, [cq+128*4+96] REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf .pass2_end: psrld m11, 8 ; pd_8 IDCT32_END 0, 15, 8, 9, 10, 4 IDCT32_END 1, 14, 8, 9, 10, 4 punpckhqdq m8, m0, m1 ; 16 17 (interleaved) punpcklqdq m0, m1 ; 0 1 (interleaved) punpcklqdq m1, m14, m15 ; 14 15 (interleaved) punpckhqdq m14, m15 ; 30 31 (interleaved) mova [r5+32*3], m8 mova [r5+32*2], m14 IDCT32_END 
2, 15, 8, 9, 10, 4 IDCT32_END 3, 14, 8, 9, 10, 4 punpckhqdq m8, m2, m3 ; 18 19 (interleaved) punpcklqdq m2, m3 ; 2 3 (interleaved) punpcklqdq m3, m14, m15 ; 12 13 (interleaved) punpckhqdq m14, m15 ; 28 29 (interleaved) mova [r5+32*1], m8 mova [r5+32*0], m14 IDCT32_END 4, 15, 8, 9, 10, 4 IDCT32_END 5, 14, 8, 9, 10, 4 punpckhqdq m8, m4, m5 ; 20 21 (interleaved) punpcklqdq m4, m5 ; 4 5 (interleaved) punpcklqdq m5, m14, m15 ; 10 11 (interleaved) punpckhqdq m14, m15 ; 26 27 (interleaved) mova [r5-32*1], m8 mova [r5-32*2], m14 IDCT32_END 6, 15, 8, 9, 10, 4 IDCT32_END 7, 14, 8, 9, 10, 4 punpckhqdq m8, m6, m7 ; 22 23 (interleaved) punpcklqdq m6, m7 ; 6 7 (interleaved) punpcklqdq m7, m14, m15 ; 8 9 (interleaved) punpckhqdq m14, m15 ; 24 25 (interleaved) mova [r5-32*3], m8 mova [r5-32*4], m14 mova m15, m1 .end: vpermq m0, m0, q3120 vpermq m1, m2, q3120 call m(idct_8x8_internal_12bpc).write_8x4_start call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m4, q3120 vpermq m1, m6, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m7, q3120 vpermq m1, m5, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, m3, q3120 vpermq m1, m15, q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [r5+32*3], q3120 vpermq m1, [r5+32*1], q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [r5-32*1], q3120 vpermq m1, [r5-32*3], q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [r5-32*4], q3120 vpermq m1, [r5-32*2], q3120 call m(idct_8x8_internal_10bpc).write_8x4 vpermq m0, [r5+32*0], q3120 vpermq m1, [r5+32*2], q3120 call m(idct_8x8_internal_10bpc).write_8x4 RET .dconly: imul r6d, [cq], 181 vpbroadcastd m2, [dconly_12bpc] mov [cq], eobd ; 0 or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 ALIGN function_align .pass1_main: call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 mova [cq+128*0], m0 mova [cq+128*1], m1 mova [cq+128*2], m2 mova [cq+128*3], m3 mova [cq+128*4], m4 mova [cq+128*5], m5 mova [cq+128*6], m6 mova [cq+128*7], m7 add cq, 32 ret ALIGN function_align .main_end: psrld m11, 10 ; pd_2 IDCT32_END 0, 15, 8, 9, 10, 2, 0 mova [cq+32*16], m8 mova [cq+32*31], m9 IDCT32_END 1, 14, 8, 9, 10, 2, 0 mova [cq+32*17], m8 mova [cq+32*30], m9 mova [cq+32*14], m14 IDCT32_END 2, 14, 8, 9, 10, 2, 0 mova [cq+32*18], m8 mova [cq+32*29], m9 mova [cq+32*13], m14 IDCT32_END 3, 14, 8, 9, 10, 2, 0 mova [cq+32*19], m8 mova [cq+32*28], m9 mova [cq+32*12], m14 IDCT32_END 4, 14, 8, 9, 10, 2, 0 mova [cq+32*20], m8 mova [cq+32*27], m9 mova [cq+32* 0], m0 mova [cq+32* 1], m1 mova [cq+32* 2], m2 IDCT32_END 5, 10, 0, 1, 2, 2, 0 mova [cq+32*21], m0 mova [cq+32*26], m1 IDCT32_END 6, 9, 0, 1, 2, 2, 0 mova [cq+32*22], m0 mova [cq+32*25], m1 IDCT32_END 7, 8, 0, 1, 2, 2, 0 mova [cq+32*23], m0 mova [cq+32*24], m1 mova m0, [cq+32* 0] mova m1, [cq+32* 1] mova m2, [cq+32* 2] mova m11, m14 mova m12, [cq+32*12] mova m13, [cq+32*13] mova m14, [cq+32*14] ret cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 8 .dconly: add r6d, 640 sar r6d, 10 .dconly2: imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm3 vpbroadcastw m0, xm0 .dconly_loop: paddsw m1, m0, [dstq+32*0] paddsw m2, m0, [dstq+32*1] psubusw m1, m3 psubusw m2, m3 
mova [dstq+32*0], m1 mova [dstq+32*1], m2 add dstq, strideq dec r3d jg .dconly_loop RET .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob lea r6, [rsp+32*4] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] call .pass1 call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end lea r6, [deint_shuf+128] vpbroadcastd m11, [pw_2048] mov r4, dstq call .pass2 mova m0, [r5+32*3] ; 16 17 mova m1, [r5+32*2] ; 30 31 mova m2, [r5+32*1] ; 18 19 mova m3, [r5+32*0] ; 28 29 mova m4, [r5-32*1] ; 20 21 mova m5, [r5-32*2] ; 26 27 mova m6, [r5-32*3] ; 22 23 mova m7, [r5-32*4] ; 24 25 call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose lea dstq, [r4+32] call .pass2 RET ALIGN function_align .pass2: call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 call m(idct_16x8_internal_10bpc).write_16x4_start pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 pmulhrsw m2, m11, m6 pmulhrsw m3, m11, m7 jmp m(idct_16x8_internal_10bpc).write_16x4_zero ALIGN function_align .pass1: mova m0, [cq+32* 1] mova m1, [cq+32* 7] mova m2, [cq+32* 9] mova m3, [cq+32*15] mova m4, [cq+32*17] mova m5, [cq+32*23] mova m6, [cq+32*25] mova m7, [cq+32*31] vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+32* 3] mova m1, [cq+32* 5] mova m2, [cq+32*11] mova m3, [cq+32*13] mova m4, [cq+32*19] mova m5, [cq+32*21] mova m6, [cq+32*27] mova m7, [cq+32*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+32* 2] mova m1, [cq+32* 6] mova m2, [cq+32*10] mova m3, [cq+32*14] mova m4, [cq+32*18] mova m5, [cq+32*22] mova m6, [cq+32*26] mova m7, [cq+32*30] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+32* 0] mova m1, [cq+32* 4] mova m2, [cq+32* 8] mova m3, [cq+32*12] mova m4, [cq+32*16] mova m5, [cq+32*20] mova m6, [cq+32*24] mova m7, [cq+32*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m5, [pw_4096] pxor m6, m6 mov r6d, eobd add eobb, 21 cmovc eobd, r6d lea r6, [strideq*3] lea r5, [strideq*5] lea r4, [strideq+r6*2] ; strideq*7 .loop: mova m0, [cq+32*0] packssdw m0, [cq+32*1] mova m1, [cq+32*2] packssdw m1, [cq+32*3] REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 add cq, 32*8 mova m2, [cq-32*4] packssdw m2, [cq-32*3] mova m3, [cq-32*2] packssdw m3, [cq-32*1] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 call m(inv_txfm_add_identity_identity_8x32_10bpc).main add dstq, 16 sub eobd, 64 jge .loop RET cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .full imul r6d, [cq], 181 vpbroadcastd m3, [dconly_12bpc] mov [cq], eobd ; 0 or r3d, 8 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .full: PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob lea r6, [rsp+32*4] vpbroadcastd m12, [clip_20b_min] vpbroadcastd m13, [clip_20b_max] call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end mov r4, dstq call m(idct_16x8_internal_12bpc).pass2_main mova m0, [cq+32* 0] ; 16 mova m1, [cq+32* 1] ; 17 mova m2, [cq+32* 2] ; 18 mova m3, [cq+32* 3] ; 19 mova m4, [cq+32* 4] ; 20 mova m5, [cq+32* 5] ; 21 mova m6, [cq+32* 6] ; 22 mova m7, [cq+32* 7] ; 23 mova m8, [cq+32* 8] ; 24 mova m9, [cq+32* 9] ; 25 mova m10, [cq+32*10] ; 26 mova m11, [cq+32*11] ; 27 mova m12, [cq+32*12] ; 28 mova m13, [cq+32*13] ; 29 mova m14, [cq+32*14] ; 30 mova m15, [cq+32*15] ; 31 lea dstq, [r4+32] call 
m(idct_16x8_internal_12bpc).pass2_main RET cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 %macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 psubsw m%1, m%4 %if %1 == 0 pxor m6, m6 %endif pmulhrsw m%3, m15 pmulhrsw m%1, m15 paddw m%3, [dstq+%5] paddw m%1, [r2+%6] pmaxsw m%3, m6 pmaxsw m%1, m6 pminsw m%3, m7 pminsw m%1, m7 mova [dstq+%5], m%3 mova [r2+%6], m%1 %endmacro cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*16] lea r4, [r6+32*8] lea r5, [r6+32*16] call .main sub eobd, 44 jge .eob44 vperm2i128 m2, m0, m3, 0x31 ; 5 vinserti128 m0, xm3, 1 ; 1 vperm2i128 m3, m1, m4, 0x31 ; 7 vinserti128 m1, xm4, 1 ; 3 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 jmp .fast .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 .eob44: mova [r4+16*0], xm0 mova [r4+16*1], xm3 mova [r4+16*2], xm1 mova [r4+16*3], xm4 vextracti128 [r4+16*4], m0, 1 vextracti128 [r4+16*5], m3, 1 vextracti128 [r4+16*6], m1, 1 vextracti128 [r4+16*7], m4, 1 call .main sub eobd, 107 jge .eob151 vperm2i128 m7, m1, m4, 0x31 ; 15 vinserti128 m5, m1, xm4, 1 ; 11 vperm2i128 m6, m0, m3, 0x31 ; 13 vinserti128 m4, m0, xm3, 1 ; 9 mova m0, [r4+32*0] mova m1, [r4+32*1] mova m2, [r4+32*2] mova m3, [r4+32*3] .fast: lea r6, [pw_5+128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 .eob151: mova [r4-16*8], xm0 mova [r4-16*7], xm3 mova [r4-16*6], xm1 mova [r4-16*5], xm4 vextracti128 [r4-16*4], m0, 1 vextracti128 [r4-16*3], m3, 1 vextracti128 [r4-16*2], m1, 1 vextracti128 [r4-16*1], m4, 1 call .main sub eobd, 128 jge .eob279 vperm2i128 m10, m0, m3, 0x31 ; 21 vinserti128 m8, m0, xm3, 1 ; 17 vperm2i128 m11, m1, m4, 0x31 ; 23 vinserti128 m9, m1, xm4, 1 ; 19 pxor m12, m12 REPX {mova x, m12}, m13, m14, m15 REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 jmp .full .eob279: mova [r5+16*0], xm0 mova [r5+16*1], xm3 mova [r5+16*2], xm1 mova [r5+16*3], xm4 vextracti128 [r5+16*4], m0, 1 vextracti128 [r5+16*5], m3, 1 vextracti128 [r5+16*6], m1, 1 vextracti128 [r5+16*7], m4, 1 call .main vperm2i128 m14, m0, m3, 0x31 ; 29 vinserti128 m12, m0, xm3, 1 ; 25 vperm2i128 m15, m1, m4, 0x31 ; 31 vinserti128 m13, m1, xm4, 1 ; 27 mova m8, [r5+32*0] mova m9, [r5+32*1] mova m10, [r5+32*2] mova m11, [r5+32*3] .full: mova m0, [r4+32*0] mova m1, [r4+32*1] mova m2, [r4+32*2] mova m3, [r4+32*3] mova m4, [r4-32*4] mova m5, [r4-32*3] mova m6, [r4-32*2] mova m7, [r4-32*1] lea r6, [pw_5 + 128] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea r3, [rsp+32*8] mova m8, [r3+32*0] mova m9, [r3+32*1] mova m10, [r3+32*2] mova m11, [r3+32*3] mova m12, [r3-32*4] mova m13, [r3-32*3] mova m14, [r3-32*2] mova m15, [r3-32*1] .idct16: lea r3, [rsp+32*16] mova m0, [r3+32*0] mova m1, [r3+32*1] mova m2, [r3+32*2] mova m3, [r3+32*3] mova m4, [r3-32*4] mova m5, [r3-32*3] mova m6, [r3-32*2] mova m7, [r3-32*1] mova [rsp], m15 call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq call .pass2_end RET ALIGN function_align 
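; .main below is the first-pass helper for the 16x32 transform; it is called
; once per 8-dword-wide strip of cq (cq advances 32 bytes per call). the
; coefficients are pre-multiplied by pd_2896 (2896/4096 ~= 1/sqrt(2); the
; rect2 scaling for the 2:1 rectangular size), run through the idct16 in
; 32-bit precision, rounded with pd_1 (pd_2048 >> 11) and shifted right by 1,
; then packed to words, transposed and spilled to the scratch buffer at r6.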
.main: pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128* 3] pmulld m2, m14, [cq+128* 5] pmulld m3, m14, [cq+128* 7] pmulld m4, m14, [cq+128* 9] pmulld m5, m14, [cq+128*11] pmulld m6, m14, [cq+128*13] pmulld m7, m14, [cq+128*15] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 2] pmulld m2, m14, [cq+128* 4] pmulld m3, m14, [cq+128* 6] pmulld m4, m14, [cq+128* 8] pmulld m5, m14, [cq+128*10] pmulld m6, m14, [cq+128*12] pmulld m7, m14, [cq+128*14] call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf psrld m15, m11, 11 ; pd_1 mova m8, [r6-32*4] mova m9, [r6-32*3] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m10, m0, m8 ; out15 paddd m0, m8 ; out0 mova m8, [r6-32*2] paddd m15, m1, m9 ; out1 psubd m1, m9 ; out14 mova m9, [r6-32*1] REPX {psrad x, 1}, m0, m15, m10, m1 packssdw m0, m15 packssdw m1, m10 psubd m10, m2, m8 ; out13 paddd m2, m8 ; out2 mova m8, [r6+32*0] paddd m15, m3, m9 ; out3 psubd m3, m9 ; out12 mova m9, [r6+32*1] REPX {psrad x, 1}, m2, m15, m10, m3 packssdw m2, m15 packssdw m3, m10 psubd m10, m4, m8 ; out11 paddd m4, m8 ; out4 mova m8, [r6+32*2] paddd m15, m5, m9 ; out5 psubd m5, m9 ; out10 mova m9, [r6+32*3] REPX {psrad x, 1}, m4, m10, m15, m5 packssdw m4, m15 packssdw m5, m10 psubd m10, m6, m8 ; out9 paddd m6, m8 ; out6 paddd m15, m7, m9 ; out7 psubd m7, m9 ; out8 REPX {psrad x, 1}, m6, m10, m15, m7 packssdw m6, m15 packssdw m7, m10 punpckhwd m8, m0, m2 punpcklwd m0, m2 punpckhwd m2, m3, m1 punpcklwd m3, m1 punpckhwd m1, m4, m6 punpcklwd m4, m6 punpcklwd m6, m7, m5 punpckhwd m7, m5 pxor m5, m5 mov r7d, 128*13 .main_zero_loop: mova [cq+r7-128*1], m5 mova [cq+r7+128*0], m5 mova [cq+r7+128*1], m5 mova [cq+r7+128*2], m5 sub r7d, 128*4 jg .main_zero_loop add cq, 32 punpcklwd m5, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m1 punpckhwd m4, m1 punpckhwd m1, m0, m8 punpcklwd m0, m8 punpckhwd m8, m6, m7 punpcklwd m6, m7 punpcklqdq m7, m1, m4 punpckhqdq m1, m4 punpckhqdq m4, m8, m3 punpcklqdq m8, m3 punpckhqdq m3, m6, m5 punpcklqdq m6, m5 punpcklqdq m5, m0, m2 punpckhqdq m0, m2 mova [r6+16*0], xm5 mova [r6+16*1], xm6 mova [r6+16*2], xm7 mova [r6+16*3], xm8 vextracti128 [r6+16*4], m5, 1 vextracti128 [r6+16*5], m6, 1 vextracti128 [r6+16*6], m7, 1 vextracti128 [r6+16*7], m8, 1 sub r6, 32*4 ret ALIGN function_align .pass2_end: mova [rsp+gprsize+32*0], m6 mova [rsp+gprsize+32*2], m7 mova [rsp+gprsize+32*3], m15 vpbroadcastd m15, [pw_2048] vpbroadcastd m7, [pixel_10bpc_max] IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4 IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8 IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*1] IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4 IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8 IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*0] IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4 IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8 IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4 IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*2] mova m2, [rsp+gprsize+32*3] IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4 IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8 IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4 
IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0 ret cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m11, [pw_8192] lea r6, [strideq*5] pxor m6, m6 paddw m10, m11, m11 ; pw_16384 mov r5, dstq call .main sub eobd, 36 jl .ret add cq, 128*8 lea dstq, [r5+16] call .main sub cq, 128*8-32 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 107 ; eob < 143 jl .ret add cq, 128*8 lea dstq, [r5+16] call .main sub cq, 128*8-32 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 128 ; eob < 271 jl .ret add cq, 128*8 lea dstq, [r5+16] call .main sub cq, 128*8-32 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 128 ; eob < 399 jl .ret add cq, 128*8 lea dstq, [r5+16] call .main .ret: RET ALIGN function_align .main: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 .main2: punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m4 punpcklwd m0, m4 punpcklwd m4, m2, m1 punpckhwd m2, m1 punpckhqdq m1, m0, m4 punpcklqdq m0, m4 call m(iidentity_8x8_internal_10bpc).write_2x8x2 punpcklqdq m0, m3, m2 punpckhqdq m1, m3, m2 jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1 cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*4] call .main cmp eobd, 36 jge .full call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp] jmp .end .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .full: add cq, 32 mova [r4+32*3], m0 mova [r4+32*2], m1 mova [r4+32*1], m2 mova [r4+32*0], m3 mova [r4-32*1], m4 mova [r4-32*2], m5 mova [r4-32*3], m6 mova [r4-32*4], m7 call .main sub r4, 32*16 ; topleft 16x8 call .transpose_16x16 lea r6, [pw_5+128] mov r7, dstq call m(idct_16x16_internal_8bpc).main call .write_16x16 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] add r4, 32*8 ; bottomleft 16x8 call .transpose_16x16 .end: lea dstq, [r7+32] call m(idct_16x16_internal_8bpc).main call .write_16x16 RET ALIGN function_align .transpose_16x16: punpckhdq m8, m3, m1 punpckldq m3, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckhdq m2, m7, m5 punpckldq m7, m5 punpckhdq m5, m4, m6 punpckldq m4, m6 punpckhqdq m6, m0, m4 punpcklqdq m0, m4 punpckhqdq m4, 
m1, m5 punpcklqdq m1, m5 punpckhqdq m5, m7, m3 punpcklqdq m7, m3 punpckhqdq m3, m2, m8 punpcklqdq m2, m8 vinserti128 m8, m0, xm7, 1 vperm2i128 m12, m0, m7, 0x31 vinserti128 m9, m6, xm5, 1 vperm2i128 m13, m6, m5, 0x31 vinserti128 m10, m1, xm2, 1 vperm2i128 m14, m1, m2, 0x31 vinserti128 m11, m4, xm3, 1 vperm2i128 m15, m4, m3, 0x31 mova m0, [r4+32*3] mova m1, [r4+32*2] mova m2, [r4+32*1] mova m3, [r4+32*0] mova m4, [r4-32*1] mova m5, [r4-32*2] mova m6, [r4-32*3] mova m7, [r4-32*4] mova [rsp+gprsize], m15 jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose ALIGN function_align .main: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] pmulld m0, m14, [cq+64* 1] pmulld m1, m14, [cq+64* 7] pmulld m2, m14, [cq+64* 9] pmulld m3, m14, [cq+64*15] pmulld m4, m14, [cq+64*17] pmulld m5, m14, [cq+64*23] pmulld m6, m14, [cq+64*25] pmulld m7, m14, [cq+64*31] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+64* 3] pmulld m1, m14, [cq+64* 5] pmulld m2, m14, [cq+64*11] pmulld m3, m14, [cq+64*13] pmulld m4, m14, [cq+64*19] pmulld m5, m14, [cq+64*21] pmulld m6, m14, [cq+64*27] pmulld m7, m14, [cq+64*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+64* 2] pmulld m1, m14, [cq+64* 6] pmulld m2, m14, [cq+64*10] pmulld m3, m14, [cq+64*14] pmulld m4, m14, [cq+64*18] pmulld m5, m14, [cq+64*22] pmulld m6, m14, [cq+64*26] pmulld m7, m14, [cq+64*30] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+64* 0] pmulld m1, m14, [cq+64* 4] pmulld m2, m14, [cq+64* 8] pmulld m3, m14, [cq+64*12] pmulld m4, m14, [cq+64*16] pmulld m5, m14, [cq+64*20] pmulld m6, m14, [cq+64*24] pmulld m7, m14, [cq+64*28] call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf pxor m8, m8 mov r7d, 64*30 .main_zero_loop: mova [cq+r7-64*2], m8 mova [cq+r7-64*1], m8 mova [cq+r7+64*0], m8 mova [cq+r7+64*1], m8 sub r7d, 64*4 jg .main_zero_loop .main_end: psrld m11, 11 ; pd_1 IDCT32_END 0, 15, 8, 9, 10, 1 IDCT32_END 1, 14, 8, 9, 10, 1 punpckhwd m8, m0, m1 ; 16 17 punpcklwd m0, m1 ; 0 1 punpcklwd m1, m14, m15 ; 14 15 punpckhwd m14, m15 ; 30 31 mova [r5+32*3], m8 mova [r5+32*2], m14 IDCT32_END 2, 15, 8, 9, 10, 1 IDCT32_END 3, 14, 8, 9, 10, 1 punpckhwd m8, m2, m3 ; 18 19 punpcklwd m2, m3 ; 2 3 punpcklwd m3, m14, m15 ; 12 13 punpckhwd m14, m15 ; 28 29 mova [r5+32*1], m8 mova [r5+32*0], m14 IDCT32_END 4, 15, 8, 9, 10, 1 IDCT32_END 5, 14, 8, 9, 10, 1 punpckhwd m8, m4, m5 ; 20 21 punpcklwd m4, m5 ; 4 5 punpcklwd m5, m14, m15 ; 10 11 punpckhwd m14, m15 ; 26 27 mova [r5-32*1], m8 mova [r5-32*2], m14 IDCT32_END 6, 15, 8, 9, 10, 1 IDCT32_END 7, 14, 8, 9, 10, 1 punpckhwd m8, m6, m7 ; 22 23 punpcklwd m6, m7 ; 6 7 punpcklwd m7, m14, m15 ; 8 9 punpckhwd m14, m15 ; 24 25 mova [r5-32*3], m8 mova [r5-32*4], m14 ret ALIGN function_align .write_16x16: mova m1, [rsp+gprsize+32*1] mova [rsp+gprsize+32*0], m8 mova [rsp+gprsize+32*1], m9 mova [rsp+gprsize+32*2], m12 vpbroadcastd m12, [pw_2048] vpbroadcastd m9, [pixel_10bpc_max] lea r3, [strideq*3] pxor m8, m8 pmulhrsw m0, m12 pmulhrsw m1, m12 pmulhrsw m2, m12 pmulhrsw m3, m12 call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, m4 pmulhrsw m1, m12, m5 pmulhrsw m2, m12, m6 pmulhrsw m3, m12, m7 call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, [rsp+gprsize+32*0] pmulhrsw m1, m12, [rsp+gprsize+32*1] pmulhrsw m2, m12, m10 pmulhrsw m3, m12, m11 call m(idct_16x8_internal_10bpc).write_16x4 pmulhrsw m0, m12, [rsp+gprsize+32*2] pmulhrsw m1, m12, m13 pmulhrsw m2, m12, m14 pmulhrsw m3, m12, 
m15 jmp m(idct_16x8_internal_10bpc).write_16x4 cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m10, [pw_4096] lea r6, [strideq*5] pxor m6, m6 mov r5, dstq call .main sub eobd, 36 jl .ret add cq, 32 lea dstq, [dstq+strideq*4] call .main add cq, 64*8-32 lea dstq, [r5+16*1] call .main sub eobd, 107 ; eob < 143 jl .ret add cq, 32 lea dstq, [dstq+strideq*4] call .main add cq, 64*8-32 lea dstq, [r5+16*2] call .main sub eobd, 128 ; eob < 271 jl .ret add cq, 32 lea dstq, [dstq+strideq*4] call .main add cq, 64*8-32 lea dstq, [r5+16*3] call .main sub eobd, 128 ; eob < 399 jl .ret add cq, 32 lea dstq, [dstq+strideq*4] call .main .ret: RET ALIGN function_align .main: mova m0, [cq+64*0] packssdw m0, [cq+64*1] mova m1, [cq+64*2] packssdw m1, [cq+64*3] mova m2, [cq+64*4] packssdw m2, [cq+64*5] mova m3, [cq+64*6] packssdw m3, [cq+64*7] REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 REPX {paddsw x, x }, m0, m1, m2, m3 REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1 cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*7] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly .fast: lea r4, [rsp+32*71] pxor m0, m0 .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r3, [rsp+32*3] mov r4, r6 lea r5, [r6+32*8] lea r6, [pw_5+128] call .pass2_oddhalf call .pass2_evenhalf imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] add dstq, 32 lea r3, [rsp+32*11] call .pass2_oddhalf call .pass2_evenhalf lea r3, [strideq*3] call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end RET ALIGN function_align .main: mova m0, [cq+128* 1] mova m1, [cq+128* 7] mova m2, [cq+128* 9] mova m3, [cq+128*15] mova m4, [cq+128*17] mova m5, [cq+128*23] mova m6, [cq+128*25] mova m7, [cq+128*31] vpbroadcastd m11, [pd_2048] vpbroadcastd m14, [pd_2896] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 mova m0, [cq+128* 3] mova m1, [cq+128* 5] mova m2, [cq+128*11] mova m3, [cq+128*13] mova m4, [cq+128*19] mova m5, [cq+128*21] mova m6, [cq+128*27] mova m7, [cq+128*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 mova m0, [cq+128* 2] mova m1, [cq+128* 6] mova m2, [cq+128*10] mova m3, [cq+128*14] mova m4, [cq+128*18] mova m5, [cq+128*22] mova m6, [cq+128*26] mova m7, [cq+128*30] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 4] mova m2, [cq+128* 8] mova m3, [cq+128*12] mova m4, [cq+128*16] mova m5, [cq+128*20] mova m6, [cq+128*24] mova m7, [cq+128*28] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end pxor m15, m15 mov r7d, 128*29 
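; m15 is zero here; the loop below walks r7 from 128*29 downwards in steps of
; 128*4, storing zeros over the coefficient rows that were just consumed so
; the cq buffer is left cleared, then cq advances 32 bytes to the next
; 8-dword column group for the following .main call.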
.main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 ret ALIGN function_align .pass2_oddhalf: mova m0, [r3+32* 1] ; 1 mova m1, [r3+32* 3] ; 3 mova m2, [r3+32* 5] ; 5 mova m3, [r3+32* 7] ; 7 mova m4, [r3+32*17] ; 9 mova m5, [r3+32*19] ; 11 mova m6, [r3+32*21] ; 13 mova m7, [r3+32*23] ; 15 mova m8, [r3+32*33] ; 17 mova m9, [r3+32*35] ; 19 mova m10, [r3+32*37] ; 21 mova m11, [r3+32*39] ; 23 mova m12, [r3+32*49] ; 25 mova m13, [r3+32*51] ; 27 mova m14, [r3+32*53] ; 29 mova m15, [r3+32*55] ; 31 jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf ALIGN function_align .pass2_evenhalf: mova m0, [r3+32* 0] ; 0 mova m1, [r3+32* 2] ; 2 mova m2, [r3+32* 4] ; 4 mova m3, [r3+32* 6] ; 6 mova m4, [r3+32*16] ; 8 mova m5, [r3+32*18] ; 10 mova m6, [r3+32*20] ; 12 mova m7, [r3+32*22] ; 14 mova m8, [r3+32*32] ; 16 mova m9, [r3+32*34] ; 18 mova m10, [r3+32*36] ; 20 mova m11, [r3+32*38] ; 22 mova m12, [r3+32*48] ; 24 mova m13, [r3+32*50] ; 26 mova m14, [r3+32*52] ; 28 mova m15, [r3+32*54] ; 30 mova [rsp+gprsize], m15 jmp m(idct_16x16_internal_8bpc).main cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob %undef cmp vpbroadcastd m7, [pixel_10bpc_max] .pass1: vpbroadcastd m5, [pw_8192] pxor m6, m6 lea r6, [strideq*3] lea r5, [strideq*5] lea r4, [strideq+r6*2] ; strideq*7 call .main ; 0 cmp eobd, 36 jl .ret add cq, 128*8 ; 0 1 mov r7, dstq ; 1 add dstq, 16 call .main call .main2 cmp eobd, 136 jl .ret add cq, 128*16-32 ; 0 1 2 lea dstq, [r7+16*2] ; 1 2 call .main ; 2 call .main2 call .main2 cmp eobd, 300 jl .ret add cq, 128*24-64 ; 0 1 2 3 add r7, 16*3 ; 1 2 3 mov dstq, r7 ; 2 3 call .main ; 3 call .main2 call .main2 call .main2 cmp eobd, 535 jl .ret add cq, 128*24-64 ; 0 1 2 3 lea dstq, [r7+strideq*8] ; 1 2 3 4 mov r7, dstq ; 2 3 4 call .main ; 3 4 call .main2 call .main2 cmp eobd, 755 jl .ret add cq, 128*16-32 ; 0 1 2 3 lea dstq, [r7+strideq*8] ; 1 2 3 4 call .main ; 2 3 4 5 call .main2 ; 3 4 5 cmp eobd, 911 jl .ret add cq, 128*8 ; 0 1 2 3 add dstq, 16 ; 1 2 3 4 call .main ; 2 3 4 5 .ret: ; 3 4 5 6 RET ALIGN function_align .main2: sub cq, 128*8-32 lea dstq, [dstq+strideq*8-16] .main: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob vpbroadcastd m7, [pixel_12bpc_max] jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1 %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n mova m%4, [r4-32*(14+%1)] ; idct32 out31-n %else mova m%5, [r4-32*(45-%1)] mova m%4, [r5-32*(20+%1)] %endif paddsw m%6, m%5, m%4 ; idct32 out 0+n psubsw m%5, m%4 ; idct32 out31-n paddsw m%4, m%5, m%3 ; out31-n psubsw m%5, m%3 ; out32+n paddsw m%3, 
m%6, m%2 ; out 0+n psubsw m%6, m%2 ; out63-n REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3 %if %1 & 1 %define %%d0 r2 %define %%d1 dstq %else %define %%d0 dstq %define %%d1 r2 %endif paddw m%3, [%%d0+%7 ] paddw m%4, [%%d1+%8 ] paddw m%5, [%%d0+%9 ] paddw m%6, [%%d1+%10] pxor m%2, m%2 REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6 vpbroadcastd m%2, [pixel_10bpc_max] REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6 mova [%%d0+%7 ], m%3 mova [%%d1+%8 ], m%4 mova [%%d0+%9 ], m%5 mova [%%d1+%10], m%6 %endmacro cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*6] call .main sub eobd, 44 jl .fast call .main sub eobd, 107 jl .fast call .main sub eobd, 128 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 64 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 .fast: lea r4, [rsp+32*38] pxor m0, m0 .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r6, [pw_5+128] mova m0, [rsp+32* 2] ; in0 mova m1, [rsp+32* 6] ; in4 mova m2, [rsp+32*10] ; in8 mova m3, [rsp+32*14] ; in12 mova m4, [rsp+32*18] ; in16 mova m5, [rsp+32*22] ; in20 mova m6, [rsp+32*26] ; in24 mova m7, [rsp+32*30] ; in28 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*38] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [rsp+32* 4] ; in2 mova m1, [rsp+32* 8] ; in6 mova m2, [rsp+32*12] ; in10 mova m3, [rsp+32*16] ; in14 mova m4, [rsp+32*20] ; in18 mova m5, [rsp+32*24] ; in22 mova m6, [rsp+32*28] ; in26 mova m7, [rsp+32*32] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [rsp+32* 3] ; in1 mova m1, [rsp+32*33] ; in31 mova m2, [rsp+32*19] ; in17 mova m3, [rsp+32*17] ; in15 mova m4, [rsp+32*11] ; in9 mova m5, [rsp+32*25] ; in23 mova m6, [rsp+32*27] ; in25 mova m7, [rsp+32* 9] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [rsp+32* 7] ; in5 mova m1, [rsp+32*29] ; in27 mova m2, [rsp+32*23] ; in21 mova m3, [rsp+32*13] ; in11 mova m4, [rsp+32*15] ; in13 mova m5, [rsp+32*21] ; in19 mova m6, [rsp+32*31] ; in29 mova m7, [rsp+32* 5] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 call .main_part2_pass2 RET ALIGN function_align .main: mova m0, [cq+128* 1] mova m1, [cq+128* 3] mova m2, [cq+128* 5] mova m3, [cq+128* 7] mova m4, [cq+128* 9] mova m5, [cq+128*11] mova m6, [cq+128*13] mova m7, [cq+128*15] call m(idct_8x16_internal_10bpc).main_oddhalf mova m0, [cq+128* 0] mova m1, [cq+128* 2] mova m2, [cq+128* 4] mova m3, [cq+128* 6] mova m4, [cq+128* 8] mova m5, [cq+128*10] mova m6, [cq+128*12] mova m7, [cq+128*14] call m(idct_8x8_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_evenhalf pxor m15, m15 mov r7d, 128*13 
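; same clearing pattern as the other first passes; after the loop, m15 is
; recycled to hold pd_2 (pd_2048 >> 10), the rounding bias for the 2-bit
; right shift that finishes pass 1 of the 16x64 transform before the results
; are packed to words and transposed into the r6 buffer.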
.main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 psrld m15, m11, 10 ; pd_2 mova m8, [r6-32*4] mova m9, [r6+32*3] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m10, m0, m8 ; out15 paddd m0, m8 ; out0 mova m8, [r6-32*3] psubd m15, m7, m9 ; out8 paddd m7, m9 ; out7 mova m9, [r6+32*2] REPX {psrad x, 2}, m0, m15, m10, m7 packssdw m0, m15 packssdw m7, m10 psubd m10, m1, m8 ; out14 paddd m1, m8 ; out1 mova m8, [r6-32*2] psubd m15, m6, m9 ; out9 paddd m6, m9 ; out6 mova m9, [r6+32*1] REPX {psrad x, 2}, m1, m15, m10, m6 packssdw m1, m15 packssdw m6, m10 psubd m10, m2, m8 ; out13 paddd m2, m8 ; out2 mova m8, [r6-32*1] psubd m15, m5, m9 ; out10 paddd m5, m9 ; out5 mova m9, [r6+32*0] REPX {psrad x, 2}, m2, m15, m10, m5 packssdw m2, m15 packssdw m5, m10 psubd m10, m3, m8 ; out12 paddd m3, m8 ; out3 psubd m15, m4, m9 ; out11 paddd m4, m9 ; out4 REPX {psrad x, 2}, m3, m15, m10, m4 packssdw m3, m15 packssdw m4, m10 call m(idct_16x8_internal_10bpc).transpose3 mova [r6-32*4], m0 mova [r6-32*3], m1 mova [r6-32*2], m2 mova [r6-32*1], m3 mova [r6+32*0], m4 mova [r6+32*1], m5 mova [r6+32*2], m6 mova [r6+32*3], m7 add r6, 32*8 ret .main_part2_pass2: vpbroadcastd m11, [pw_1567_3784] vpbroadcastd m12, [pw_m3784_1567] vpbroadcastd m13, [pw_2896_2896] lea r6, [pw_5+128] lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [pw_m2896_2896] call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal vpbroadcastd m14, [pw_2048] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8 IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8 add dstq, strideq sub r2, strideq cmp r4, r5 jne .main_part2_pass2_loop ret ALIGN function_align .main_part1_rect2: REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_part1: ; idct64 steps 1-5 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vpbroadcastd m7, [r5+4*0] vpbroadcastd m8, [r5+4*1] vpbroadcastd m6, [r5+4*2] vpbroadcastd m9, [r5+4*3] vpbroadcastd m5, [r5+4*4] vpbroadcastd m10, [r5+4*5] vpbroadcastd m4, [r5+4*6] vpbroadcastd m15, [r5+4*7] pmulld m7, m0 ; t63a pmulld m0, m8 ; t32a pmulld m6, m1 ; t62a pmulld m1, m9 ; t33a pmulld m5, m2 ; t61a pmulld m2, m10 ; t34a pmulld m4, m3 ; t60a pmulld m3, m15 ; t35a vpbroadcastd m10, [r5+4*8] vpbroadcastd m15, [r5+4*9] REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 psubd m8, m0, m1 ; t33 paddd m0, m1 ; t32 psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m5 ; t61 paddd m4, m5 ; t60 REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 vpbroadcastd m10, [r5+4*10] vpbroadcastd m15, [r5+4*11] psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m8, m2 ; t61 paddd m8, m2 ; t62 REPX {pmaxsd x, m12}, m5, m3, m4, m6 REPX {pminsd x, m13}, m5, m3, m4, m6 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 ITX_MULSUB_2D 6, 4, 2, 9, 
_, 11, 10, 15 ; t34a, t61a REPX {pmaxsd x, m12}, m0, m7, m1, m8 REPX {pminsd x, m13}, m0, m7, m1, m8 add r5, 4*12 mova [r6-32*4], m0 mova [r6+32*3], m7 mova [r6-32*3], m1 mova [r6+32*2], m8 mova [r6-32*2], m6 mova [r6+32*1], m4 mova [r6-32*1], m3 mova [r6+32*0], m5 add r6, 32*8 ret .main_part2: ; idct64 steps 6-9 lea r5, [r6+32*3] sub r6, 32*4 vpbroadcastd m10, [pd_1567] vpbroadcastd m15, [pd_3784] .main_part2_loop: mova m0, [r6-32*32] ; t32a mova m1, [r5-32*24] ; t39a mova m2, [r5-32*32] ; t63a mova m3, [r6-32*24] ; t56a mova m4, [r6-32*16] ; t40a mova m5, [r5-32* 8] ; t47a mova m6, [r5-32*16] ; t55a mova m7, [r6-32* 8] ; t48a psubd m8, m0, m1 ; t39 paddd m0, m1 ; t32 psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a paddd m2, m7 ; t63a psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m8, m4 ; t55 paddd m8, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 REPX {pmaxsd x, m12}, m6, m7, m5, m4 REPX {pminsd x, m13}, m6, m7, m5, m4 REPX {pmulld x, m14}, m6, m7, m5, m4 REPX {pmaxsd x, m12}, m2, m0, m8, m1 REPX {pminsd x, m13}, m2, m0, m8, m1 paddd m6, m11 paddd m5, m11 psubd m3, m6, m7 ; t47 paddd m6, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m3, m6, m7, m5 mova [r5-32* 8], m2 mova [r6-32*32], m0 mova [r6-32* 8], m8 mova [r5-32*32], m1 mova [r5-32*24], m3 mova [r6-32*16], m6 mova [r6-32*24], m7 mova [r5-32*16], m5 add r6, 32 sub r5, 32 cmp r6, r5 jl .main_part2_loop ret cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob %undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*6] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 vpbroadcastd m3, [dconly_10bpc] mov [cq], eobd ; 0 or r3d, 64 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2 .fast: lea r4, [rsp+32*70] pxor m0, m0 .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r6, [pw_5 + 128] mov r10, rsp lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 .pass2_loop: mova m0, [r10+32* 2] ; in0 mova m1, [r10+32* 6] ; in4 mova m2, [r10+32*18] ; in8 mova m3, [r10+32*22] ; in12 mova m4, [r10+32*34] ; in16 mova m5, [r10+32*38] ; in20 mova m6, [r10+32*50] ; in24 mova m7, [r10+32*54] ; in28 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] lea r4, [rsp+32*70] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [r10+32* 4] ; in2 mova m1, [r10+32* 8] ; in6 mova m2, [r10+32*20] ; in10 mova m3, [r10+32*24] ; in14 mova m4, [r10+32*36] ; in18 mova m5, [r10+32*40] ; in22 mova m6, [r10+32*52] ; in26 mova m7, 
[r10+32*56] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10+32* 3] ; in1 mova m1, [r10+32*57] ; in31 mova m2, [r10+32*35] ; in17 mova m3, [r10+32*25] ; in15 mova m4, [r10+32*19] ; in9 mova m5, [r10+32*41] ; in23 mova m6, [r10+32*51] ; in25 mova m7, [r10+32* 9] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10+32* 7] ; in5 mova m1, [r10+32*53] ; in27 mova m2, [r10+32*39] ; in21 mova m3, [r10+32*21] ; in11 mova m4, [r10+32*23] ; in13 mova m5, [r10+32*37] ; in19 mova m6, [r10+32*55] ; in29 mova m7, [r10+32* 5] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub r4, 32*98 ; rsp+32*16 sub dstq, r8 add dstq, 32 cmp r10, r4 jl .pass2_loop RET ALIGN function_align .main: vpbroadcastd m14, [pd_2896] vpbroadcastd m11, [pd_2048] pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128* 7] pmulld m2, m14, [cq+128* 9] pmulld m3, m14, [cq+128*15] pmulld m4, m14, [cq+128*17] pmulld m5, m14, [cq+128*23] pmulld m6, m14, [cq+128*25] pmulld m7, m14, [cq+128*31] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, m14, [cq+128* 5] pmulld m2, m14, [cq+128*11] pmulld m3, m14, [cq+128*13] pmulld m4, m14, [cq+128*19] pmulld m5, m14, [cq+128*21] pmulld m6, m14, [cq+128*27] pmulld m7, m14, [cq+128*29] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128* 6] pmulld m2, m14, [cq+128*10] pmulld m3, m14, [cq+128*14] pmulld m4, m14, [cq+128*18] pmulld m5, m14, [cq+128*22] pmulld m6, m14, [cq+128*26] pmulld m7, m14, [cq+128*30] call m(idct_8x16_internal_10bpc).main_oddhalf_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 4] pmulld m2, m14, [cq+128* 8] pmulld m3, m14, [cq+128*12] pmulld m4, m14, [cq+128*16] pmulld m5, m14, [cq+128*20] pmulld m6, m14, [cq+128*24] pmulld m7, m14, [cq+128*28] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop add cq, 32 call m(idct_8x8_internal_10bpc).main_rect2 call m(idct_8x16_internal_10bpc).main_evenhalf call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 mova m0, [r5+32*3] mova m1, [r5+32*2] mova m2, [r5+32*1] mova m3, [r5+32*0] mova m4, [r5-32*1] mova m5, [r5-32*2] mova m6, [r5-32*3] mova m7, [r5-32*4] call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 ret cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jnz .normal imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 16 .dconly: add r6d, 640 sar r6d, 10 .dconly2: vpbroadcastd m5, [dconly_10bpc] imul r6d, 181 add r6d, 2176 sar r6d, 12 movd xm0, r6d paddsw xm0, xm5 vpbroadcastw m0, xm0 .dconly_loop: paddsw m1, m0, [dstq+32*0] paddsw m2, m0, [dstq+32*1] paddsw m3, m0, [dstq+32*2] paddsw m4, m0, [dstq+32*3] REPX {psubusw x, m5}, m1, m2, m3, m4 mova [dstq+32*0], m1 mova [dstq+32*1], m2 mova [dstq+32*2], m3 mova [dstq+32*3], m4 add dstq, strideq dec r3d jg .dconly_loop RET 
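; full 64x16 path: pass 1 runs the 64-point idct (.main) followed by a 2-bit
; shift and transpose (.shift_transpose), twice when eob >= 36; for smaller
; eob the remaining scratch rows are zero-filled in .fast instead. pass 2
; reuses the 16-bit idct16 kernel (idct_16x16_internal_8bpc) and adds the
; result to dst in 16x16 tiles via write_16x16.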
.normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*4] call .main call .shift_transpose cmp eobd, 36 jl .fast call .main call .shift_transpose jmp .pass2 .fast: pxor m0, m0 mov r3d, 4 .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 dec r3d jg .fast_loop .pass2: lea r7, [r6-32*64] lea r4, [r6-32*32] lea r6, [pw_5+128] mov r5, dstq .pass2_loop: mova m0, [r7-32*4] mova m1, [r7-32*3] mova m2, [r7-32*2] mova m3, [r7-32*1] mova m4, [r7+32*0] mova m5, [r7+32*1] mova m6, [r7+32*2] mova m7, [r7+32*3] add r7, 32*32 mova m8, [r7-32*4] mova m9, [r7-32*3] mova m10, [r7-32*2] mova m11, [r7-32*1] mova m12, [r7+32*0] mova m13, [r7+32*1] mova m14, [r7+32*2] mova m15, [r7+32*3] sub r7, 32*24 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16 add r5, 32 mov dstq, r5 cmp r7, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+64* 2] mova m1, [cq+64*14] mova m2, [cq+64*18] mova m3, [cq+64*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+64* 6] mova m1, [cq+64*10] mova m2, [cq+64*22] mova m3, [cq+64*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+64* 4] mova m1, [cq+64*12] mova m2, [cq+64*20] mova m3, [cq+64*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] mova m3, [cq+64*24] pxor m15, m15 mov r7d, 64*30 .main_zero_loop: mova [cq+r7-64*2], m15 mova [cq+r7-64*1], m15 mova [cq+r7+64*0], m15 mova [cq+r7+64*1], m15 sub r7d, 64*4 jg .main_zero_loop .main_end: psrld m15, m11, 10 ; pd_2 .main_end2: add cq, 32 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_10bpc).main add r6, 32*8 call m(idct_8x16_internal_10bpc).main_evenhalf mova [r6+32*2], m1 mova [r6+32*1], m2 mova [r6+32*0], m3 mova [r6-32*1], m4 mova [r6-32*2], m5 mova [r6-32*3], m6 mova [r6-32*4], m7 jmp .main_end_loop_start .main_end_loop: mova m0, [r6+32* 3] ; idct8 0 + n .main_end_loop_start: mova m1, [r5+32* 4] ; idct16 15 - n mova m2, [r5-32*12] ; idct32 16 + n mova m3, [r6-32*13] ; idct32 31 - n mova m4, [r6-32*29] ; idct64 63 - n mova m5, [r5-32*28] ; idct64 48 + n mova m6, [r6-32*45] ; idct64 47 - n mova m7, [r5-32*44] ; idct64 32 + n paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 REPX {pminsd x, m13}, m8, m0 paddd m1, m8, m3 ; idct32 out0 + n psubd m8, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, m12}, m1, m8, m3, m0 REPX {pminsd x, m13}, m1, m3, m8, m0 REPX {paddd x, m15}, m1, m3, m0, m8 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) psubd m1, m4 ; idct64 out63 - n (unshifted) paddd m4, m3, m5 ; idct64 out15 - n (unshifted) psubd m3, m5 
; idct64 out48 + n (unshifted) paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m8, m7 ; idct64 out31 - n (unshifted) psubd m8, m7 ; idct64 out32 + n (unshifted) mova [r5-32*44], m2 mova [r6+32* 3], m1 mova [r6-32*45], m4 mova [r5+32* 4], m3 mova [r5-32*28], m5 mova [r6-32*13], m0 mova [r6-32*29], m6 mova [r5-32*12], m8 add r5, 32 sub r6, 32 cmp r5, r6 jl .main_end_loop ret .shift_transpose: %macro IDCT64_SHIFT_TRANSPOSE 1 ; shift sub r6, 32*48 mov r5, r6 %%loop: mova m0, [r6-32* 4] mova m4, [r6+32* 4] mova m1, [r6-32* 3] mova m5, [r6+32* 5] mova m2, [r6-32* 2] mova m6, [r6+32* 6] mova m3, [r6-32* 1] mova m7, [r6+32* 7] REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 mova m4, [r6+32* 0] mova m6, [r6+32* 8] mova m5, [r6+32* 1] mova m7, [r6+32* 9] REPX {psrad x, %1}, m4, m6, m5, m7 packssdw m4, m6 packssdw m5, m7 mova m6, [r6+32* 2] mova m8, [r6+32*10] mova m7, [r6+32* 3] mova m9, [r6+32*11] REPX {psrad x, %1}, m6, m8, m7, m9 packssdw m6, m8 packssdw m7, m9 call m(idct_16x8_internal_10bpc).transpose3 mova [r5-32*4], m0 mova [r5-32*3], m1 mova [r5-32*2], m2 mova [r5-32*1], m3 mova [r5+32*0], m4 mova [r5+32*1], m5 mova [r5+32*2], m6 mova [r5+32*3], m7 add r6, 32*16 add r5, 32*8 cmp r5, r4 jl %%loop mov r6, r4 %endmacro IDCT64_SHIFT_TRANSPOSE 2 ret cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .fast: pxor m0, m0 lea r4, [rsp+32*135] .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r7, [r6-32*32] lea r5, [r6+32*8] lea r6, [pw_5+128] imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq .pass2_loop: mova m0, [r7-32*99] mova m1, [r7-32*97] mova m2, [r7-32*95] mova m3, [r7-32*93] mova m4, [r7-32*67] mova m5, [r7-32*65] mova m6, [r7-32*63] mova m7, [r7-32*61] mova m8, [r7-32*35] mova m9, [r7-32*33] mova m10, [r7-32*31] mova m11, [r7-32*29] mova m12, [r7-32* 3] mova m13, [r7-32* 1] mova m14, [r7+32* 1] mova m15, [r7+32* 3] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf mova m0, [r7-32*100] mova m1, [r7-32*98] mova m2, [r7-32*96] mova m3, [r7-32*94] mova m4, [r7-32*68] mova m5, [r7-32*66] mova m6, [r7-32*64] mova m7, [r7-32*62] mova m8, [r7-32*36] mova m9, [r7-32*34] mova m10, [r7-32*32] mova m11, [r7-32*30] mova m12, [r7-32* 4] mova m13, [r7-32* 2] mova m14, [r7+32* 0] mova m15, [r7+32* 2] add r7, 32*8 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end sub dstq, r3 lea r2, [r2+r3+32] add dstq, 32 cmp r7, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] pmulld m0, m14, [cq+128* 1] pmulld m1, m14, [cq+128*31] pmulld m2, m14, [cq+128*17] pmulld m3, m14, [cq+128*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 7] pmulld m1, m14, [cq+128*25] pmulld m2, m14, [cq+128*23] pmulld m3, m14, [cq+128* 9] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, 
m14, [cq+128* 5] pmulld m1, m14, [cq+128*27] pmulld m2, m14, [cq+128*21] pmulld m3, m14, [cq+128*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 pmulld m0, m14, [cq+128* 3] pmulld m1, m14, [cq+128*29] pmulld m2, m14, [cq+128*19] pmulld m3, m14, [cq+128*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 pmulld m0, m14, [cq+128* 2] pmulld m1, m14, [cq+128*14] pmulld m2, m14, [cq+128*18] pmulld m3, m14, [cq+128*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2 pmulld m0, m14, [cq+128* 6] pmulld m1, m14, [cq+128*10] pmulld m2, m14, [cq+128*22] pmulld m3, m14, [cq+128*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2 pmulld m0, m14, [cq+128* 4] pmulld m1, m14, [cq+128*12] pmulld m2, m14, [cq+128*20] pmulld m3, m14, [cq+128*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2 pmulld m0, m14, [cq+128* 0] pmulld m1, m14, [cq+128* 8] pmulld m2, m14, [cq+128*16] pmulld m3, m14, [cq+128*24] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop psrld m15, m11, 11 ; pd_1 REPX {paddd x, m11}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2 IDCT64_SHIFT_TRANSPOSE 1 ret cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob %undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] vpbroadcastd m14, [pd_2896] lea r6, [rsp+32*7] call .main cmp eobd, 36 jl .fast call .main cmp eobd, 136 jl .fast call .main cmp eobd, 300 jl .fast call .main jmp .pass2 .dconly: imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly .fast: pxor m0, m0 lea r4, [rsp+32*135] .fast_loop: REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 add r6, 32*8 cmp r6, r4 jl .fast_loop .pass2: lea r10, [r6-32*32] lea r6, [pw_5+128] lea r8, [strideq*4] lea r9, [strideq*5] lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 .pass2_loop: mova m0, [r10-32*100] ; in0 mova m1, [r10-32*96] ; in4 mova m2, [r10-32*68] ; in8 mova m3, [r10-32*64] ; in12 mova m4, [r10-32*36] ; in16 mova m5, [r10-32*32] ; in20 mova m6, [r10-32* 4] ; in24 mova m7, [r10+32* 0] ; in28 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [r4-32*4], m0 mova [r4-32*3], m1 mova [r4-32*2], m2 mova [r4-32*1], m3 mova [r4+32*0], m4 mova [r4+32*1], m5 mova [r4+32*2], m6 mova [r4+32*3], m7 add r4, 32*8 mova [r4-32*4], m8 mova [r4-32*3], m9 mova [r4-32*2], m10 mova [r4-32*1], m11 mova [r4+32*0], m12 mova [r4+32*1], m13 mova [r4+32*2], m14 mova [r4+32*3], m15 mova m0, [r10-32*98] ; in2 mova m1, [r10-32*94] ; in6 mova m2, [r10-32*66] ; in10 mova m3, [r10-32*62] ; in14 mova m4, [r10-32*34] ; in18 mova m5, [r10-32*30] ; in22 mova m6, [r10-32* 2] ; in26 mova m7, [r10+32* 2] ; in30 lea r5, [r4+32*16] add r4, 32*8 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova m0, [r10-32*99] ; in1 mova m1, [r10+32* 3] ; in31 mova m2, [r10-32*35] ; in17 mova m3, [r10-32*61] ; in15 mova m4, [r10-32*67] ; in9 mova m5, [r10-32*29] ; in23 mova m6, [r10-32* 3] ; in25 mova m7, [r10-32*93] ; in7 lea r6, [idct64_mul - 8] add r4, 32*16 add r5, 32*32 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 mova m0, [r10-32*95] 
; in5 mova m1, [r10-32* 1] ; in27 mova m2, [r10-32*31] ; in21 mova m3, [r10-32*65] ; in11 mova m4, [r10-32*63] ; in13 mova m5, [r10-32*33] ; in19 mova m6, [r10+32* 1] ; in29 mova m7, [r10-32*97] ; in3 add r6, 8 add r4, 32*8 sub r5, 32*8 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2 add r10, 32*8 sub dstq, r8 sub r4, 32*44 add dstq, 32 cmp r10, r4 jl .pass2_loop RET ALIGN function_align .main: lea r5, [idct64_mul_16bpc] mova m0, [cq+128* 1] mova m1, [cq+128*31] mova m2, [cq+128*17] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 7] mova m1, [cq+128*25] mova m2, [cq+128*23] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 5] mova m1, [cq+128*27] mova m2, [cq+128*21] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 mova m0, [cq+128* 3] mova m1, [cq+128*29] mova m2, [cq+128*19] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2 mova m0, [cq+128* 2] mova m1, [cq+128*14] mova m2, [cq+128*18] mova m3, [cq+128*30] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6] mova m1, [cq+128*10] mova m2, [cq+128*22] mova m3, [cq+128*26] call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast mova m0, [cq+128* 4] mova m1, [cq+128*12] mova m2, [cq+128*20] mova m3, [cq+128*28] call m(idct_8x16_internal_10bpc).main_oddhalf_fast mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m2, [cq+128*16] mova m3, [cq+128*24] pxor m15, m15 mov r7d, 128*29 .main_zero_loop: mova [cq+r7-128*1], m15 mova [cq+r7+128*0], m15 mova [cq+r7+128*1], m15 mova [cq+r7+128*2], m15 sub r7d, 128*4 jg .main_zero_loop call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/itx16_avx512.asm000064400000000000000000007075411046102023000146640ustar 00000000000000; Copyright © 2022-2023, VideoLAN and dav1d authors ; Copyright © 2022-2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 idct8x8p: db 0, 1, 4, 5, 2, 3, 6, 7, 16, 17, 20, 21, 18, 19, 22, 23 db 8, 9, 12, 13, 10, 11, 14, 15, 24, 25, 28, 29, 26, 27, 30, 31 db 32, 33, 36, 37, 34, 35, 38, 39, 48, 49, 52, 53, 50, 51, 54, 55 db 40, 41, 44, 45, 42, 43, 46, 47, 56, 57, 60, 61, 58, 59, 62, 63 idtx8x8p: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 idct8x16p: db 54, 55, 2, 3, 22, 23, 34, 35, 38, 39, 18, 19, 6, 7, 50, 51 db 62, 63, 10, 11, 30, 31, 42, 43, 46, 47, 26, 27, 14, 15, 58, 59 db 52, 53, 4, 5, 20, 21, 36, 37, 32, 33, 0, 1, 48, 49, 16, 17 db 60, 61, 12, 13, 28, 29, 44, 45, 40, 41, 8, 9, 56, 57, 24, 25 iadst8x16p: db 0, 1, 54, 55, 48, 49, 6, 7, 16, 17, 38, 39, 32, 33, 22, 23 db 8, 9, 62, 63, 56, 57, 14, 15, 24, 25, 46, 47, 40, 41, 30, 31 db 4, 5, 50, 51, 52, 53, 2, 3, 20, 21, 34, 35, 36, 37, 18, 19 db 12, 13, 58, 59, 60, 61, 10, 11, 28, 29, 42, 43, 44, 45, 26, 27 permA: db 0, 1, 0, 8, 4, 5, 1, 9, 8, 9, 4, 12, 12, 13, 5, 13 db 16, 17, 16, 24, 20, 21, 17, 25, 24, 25, 20, 28, 28, 29, 21, 29 db 2, 3, 2, 10, 6, 7, 3, 11, 10, 11, 6, 14, 14, 15, 7, 15 db 18, 19, 18, 26, 22, 23, 19, 27, 26, 27, 22, 30, 30, 31, 23, 31 permB: db 4, 2, 1, 8, 0, 0, 1, 0, 12, 3, 3, 10, 8, 1, 3, 2 db 5, 10, 5, 12, 1, 8, 5, 4, 13, 11, 7, 14, 9, 9, 7, 6 db 6, 6, 13, 4, 2, 4, 4, 5, 14, 7, 15, 6, 10, 5, 6, 7 db 7, 14, 9, 0, 3, 12, 0, 1, 15, 15, 11, 2, 11, 13, 2, 3 permC: db 0, 9, 0, 0, 0, 1, 4, 4, 2, 11, 2, 2, 2, 3, 6, 6 db 1, 8, 1, 8, 4, 5, 5, 12, 3, 10, 3, 10, 6, 7, 7, 14 db 9, 1, 8, 1, 1, 0, 12, 5, 11, 3, 10, 3, 3, 2, 14, 7 db 8, 0, 9, 9, 5, 4, 13, 13, 10, 2, 11, 11, 7, 6, 15, 15 idct8x32p: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 idct32x8p: db 2, 18, 0, 16, 3, 19, 1, 17, 10, 26, 8, 24, 11, 27, 9, 25 db 34, 50, 32, 48, 35, 51, 33, 49, 42, 58, 40, 56, 43, 59, 41, 57 db 6, 22, 4, 20, 7, 23, 5, 21, 14, 30, 12, 28, 15, 31, 13, 29 db 38, 54, 36, 52, 39, 55, 37, 53, 46, 62, 44, 60, 47, 63, 45, 61 idtx32x8p: db 0, 8, 16, 24, 4, 12, 20, 28, 2, 10, 18, 26, 6, 14, 22, 30 db 32, 40, 48, 56, 36, 44, 52, 60, 34, 42, 50, 58, 38, 46, 54, 62 db 1, 9, 17, 25, 5, 13, 21, 29, 3, 11, 19, 27, 7, 15, 23, 31 db 33, 41, 49, 57, 37, 45, 53, 61, 35, 43, 51, 59, 39, 47, 55, 63 pw_2048_m2048: times 16 dw 2048 pw_m2048_2048: times 16 dw -2048 pw_2048: times 16 dw 2048 ; flags: 0 = ++, 1 = +-, 2 = -+, 3 = ++-, 4=-- %macro COEF_PAIR 2-3 0 ; a, b, flags %if %3 == 1 pd_%1_m%2: dd %1, %1, -%2, -%2 %define pd_%1 (pd_%1_m%2 + 4*0) %define pd_m%2 (pd_%1_m%2 + 4*2) %elif %3 == 2 pd_m%1_%2: dd -%1, -%1, %2, %2 %define pd_m%1 (pd_m%1_%2 + 4*0) %define pd_%2 (pd_m%1_%2 + 4*2) %elif %3 == 4 pd_m%1_m%2: dd -%1, -%1, -%2, -%2 %define pd_m%1 (pd_m%1_m%2 + 4*0) %define pd_m%2 (pd_m%1_m%2 + 4*2) %else pd_%1_%2: dd %1, %1, %2, %2 %define pd_%1 (pd_%1_%2 + 4*0) %define pd_%2 (pd_%1_%2 + 4*2) %if %3 == 3 %define pd_%2_m%2 pd_%2 dd -%2, -%2 %endif %endif %endmacro COEF_PAIR 101, 501 COEF_PAIR 201, 601, 1 COEF_PAIR 201, 995 COEF_PAIR 401, 1189, 1 COEF_PAIR 401, 1931 COEF_PAIR 401, 3920 COEF_PAIR 401, 4076 COEF_PAIR 700, 301, 4 COEF_PAIR 799, 2276, 1 COEF_PAIR 799, 3406 COEF_PAIR 799, 4017 COEF_PAIR 
1380, 601 COEF_PAIR 1751, 2440 COEF_PAIR 2598, 1189 COEF_PAIR 2598, 1931, 2 COEF_PAIR 2598, 3612 COEF_PAIR 2751, 2106 COEF_PAIR 2896, 1567, 3 COEF_PAIR 2896, 3784, 3 COEF_PAIR 3035, 3513 COEF_PAIR 3166, 1931 COEF_PAIR 3166, 3612 COEF_PAIR 3166, 3920 COEF_PAIR 3703, 3290 COEF_PAIR 3857, 4052 COEF_PAIR 4017, 2276 COEF_PAIR 4017, 3406 COEF_PAIR 4036, 4085 COEF_PAIR 4076, 1189 COEF_PAIR 4076, 3612 COEF_PAIR 4076, 3920 COEF_PAIR 4091, 3973 COEF_PAIR 4091, 4052 COEF_PAIR 4095, 4065 pb_32: times 4 db 32 pw_5: times 2 dw 5 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pw_1697x16: times 2 dw 1697*16 pw_2896x8: times 2 dw 2896*8 pixel_10bpc_max: times 2 dw 0x03ff dconly_10bpc: times 2 dw 0x7c00 clip_18b_min: dd -0x20000 clip_18b_max: dd 0x1ffff pd_1: dd 1 pd_2: dd 2 pd_1448: dd 1448 pd_2048: dd 2048 pd_3071: dd 3071 ; 1024 + 2048 - 1 pd_3072: dd 3072 ; 1024 + 2048 pd_5119: dd 5119 ; 1024 + 4096 - 1 pd_5120: dd 5120 ; 1024 + 4096 pd_5793: dd 5793 cextern dup16_perm cextern int8_permA cextern idct64_mul_16bpc cextern idct_8x8_internal_8bpc_avx512icl.main cextern iadst_8x8_internal_8bpc_avx512icl.main_pass2 cextern idct_8x16_internal_8bpc_avx512icl.main cextern idct_8x16_internal_8bpc_avx512icl.main2 cextern idct_8x16_internal_8bpc_avx512icl.main_fast cextern idct_8x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_8x16_internal_8bpc_avx512icl.main2 cextern idct_16x8_internal_8bpc_avx512icl.main cextern iadst_16x8_internal_8bpc_avx512icl.main_pass2 cextern idct_16x16_internal_8bpc_avx512icl.main cextern idct_16x16_internal_8bpc_avx512icl.main2 cextern idct_16x16_internal_8bpc_avx512icl.main_fast cextern idct_16x16_internal_8bpc_avx512icl.main_fast2 cextern iadst_16x16_internal_8bpc_avx512icl.main_pass2b cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_fast2 cextern inv_txfm_add_dct_dct_8x32_8bpc_avx512icl.main_end cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_16x32_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x8_8bpc_avx512icl.main cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast3 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast3 cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part1_fast2 cextern inv_txfm_add_dct_dct_32x64_8bpc_avx512icl.main_part2 SECTION .text %define o_base (pw_2048+4*128) %define o_base_8bpc (int8_permA+64*18) %define o(x) (r5 - o_base + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) INIT_ZMM avx512icl ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 ; flags: 1 = inv_dst1, 2 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], 
tmp[1-3], rnd, coef[1-2], flags %if %8 < 32 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else %if %8 < 4096 vpbroadcastd m%3, [o(pd_%8)] %else vbroadcasti32x4 m%3, [o(pd_%8)] %endif pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif %if %7 < 32 pmulld m%1, m%7 pmulld m%2, m%7 %else %if %7 < 4096 vpbroadcastd m%5, [o(pd_%7)] %else vbroadcasti32x4 m%5, [o(pd_%7)] %endif pmulld m%1, m%5 pmulld m%2, m%5 %endif %if %9 & 2 psubd m%4, m%6, m%4 psubd m%2, m%4, m%2 %else %ifnum %6 paddd m%4, m%6 %endif paddd m%2, m%4 %endif %ifnum %6 paddd m%1, m%6 %endif %if %9 & 1 psubd m%1, m%3, m%1 %else psubd m%1, m%3 %endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 %endif %endmacro %macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_10bpc) lea r5, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. lea tx2q, [m(i%2_%4_internal_10bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else %if %3 add eobd, %3 %endif ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 8x8 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 .dconly: add r6d, 384 sar r6d, 9 .dconly2: vpbroadcastd ym2, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw ym1, r6d paddsw ym1, ym2 .dconly_loop: mova xm0, [dstq+strideq*0] vinserti32x4 ym0, [dstq+strideq*1], 1 paddsw ym0, ym1 psubusw ym0, ym2 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call .load vpermi2q m1, m0, m2 ; 1 5 vpermi2q m3, m6, m4 ; 7 3 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call .main call .main_end mova m4, [o(idct8x8p)] packssdw m0, m2 ; 0 1 4 5 packssdw m1, m3 ; 3 2 7 6 vpermb m0, m4, m0 vprolq m1, 32 vpermb m2, m4, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 jmp tx2q .pass2: lea r5, [o_base_8bpc] vextracti32x8 ym2, m0, 1 vextracti32x8 ym3, m1, 1 call m(idct_8x8_internal_8bpc).main mova m10, [permC] vpbroadcastd m12, [pw_2048] .end: vpermt2q m0, m10, m1 vpermt2q m2, m10, m3 .end2: vpbroadcastd m11, [pixel_10bpc_max] lea r6, [strideq*3] pxor m10, m10 pmulhrsw m8, m12, m0 call .write_8x4_start pmulhrsw m8, m12, m2 .write_8x4: lea dstq, [dstq+strideq*4] add cq, 64*2 .write_8x4_start: mova xm9, [dstq+strideq*0] vinserti32x4 ym9, [dstq+strideq*1], 1 vinserti32x4 m9, [dstq+strideq*2], 2 vinserti32x4 m9, [dstq+r6 ], 3 mova [cq+64*0], m10 mova [cq+64*1], m10 paddw m9, m8 pmaxsw m9, m10 pminsw m9, m11 mova [dstq+strideq*0], xm9 vextracti32x4 [dstq+strideq*1], ym9, 1 vextracti32x4 [dstq+strideq*2], m9, 2 vextracti32x4 [dstq+r6 ], m9, 3 ret ALIGN function_align .load: mova m0, [cq+64*0] ; 0 1 mova m4, [cq+64*1] ; 2 3 mova m1, [o(permB)] mova m2, [cq+64*2] ; 4 5 mova m6, [cq+64*3] ; 6 7 vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] psrlq m5, m1, 32 vpbroadcastd m12, [o(pd_2896)] mova m3, m1 vpbroadcastd m11, [o(pd_1)] ret ALIGN function_align .main_fast: ; bottom half is zero vbroadcasti32x4 m3, [o(pd_4017_3406)] vbroadcasti32x4 m8, 
[o(pd_799_m2276)] vbroadcasti32x4 m2, [o(pd_2896_3784)] vbroadcasti32x4 m9, [o(pd_2896_1567)] pmulld m3, m1 ; t4a t5a pmulld m1, m8 ; t7a t6a pmulld m2, m0 ; t0 t3 pmulld m0, m9 ; t1 t2 jmp .main2 .main: ITX_MULSUB_2D 1, 3, 8, 9, 10, _, 799_3406, 4017_2276 ITX_MULSUB_2D 0, 2, 8, 9, 10, _, 2896_1567, 2896_3784 .main2: REPX {paddd x, m13}, m1, m3, m0, m2 REPX {psrad x, 12 }, m1, m3, m0, m2 punpcklqdq m8, m1, m3 ; t4a t7a punpckhqdq m1, m3 ; t5a t6a psubd m3, m8, m1 ; t5a t6a paddd m8, m1 ; t4 t7 pmaxsd m3, m14 punpckhqdq m1, m2, m0 ; t3 t2 pminsd m3, m15 punpcklqdq m2, m0 ; t0 t1 pmulld m3, m12 paddd m0, m2, m1 ; dct4 out0 out1 psubd m2, m1 ; dct4 out3 out2 REPX {pmaxsd x, m14}, m8, m0, m2 REPX {pminsd x, m15}, m8, m0, m2 .main3: pshufd m1, m3, q1032 paddd m3, m13 psubd m9, m3, m1 paddd m3, m1 psrad m9, 12 psrad m3, 12 punpckhqdq m1, m8, m3 ; t7 t6 shufpd m8, m9, 0xaa ; t4 t5 ret .main_end: paddd m0, m11 paddd m2, m11 psubd m3, m0, m1 ; out7 out6 paddd m0, m1 ; out0 out1 paddd m1, m2, m8 ; out3 out2 psubd m2, m8 ; out4 out5 REPX {vpsravd x, m11}, m0, m2, m3, m1 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity INV_TXFM_8X8_FN adst, adst cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x8_internal_10bpc).load vpermi2q m1, m6, m2 ; 7 5 vpermi2q m3, m4, m0 ; 3 1 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call .main punpckldq m1, m2, m4 ; out4 out6 punpckhdq m2, m0 ; -out5 -out7 punpckldq m0, m3 ; out0 out2 punpckhdq m4, m3 ; -out1 -out3 paddd m1, m11 psubd m3, m11, m2 paddd m0, m11 psubd m4, m11, m4 .pass1_end: REPX {psrad x, 1}, m1, m0, m3, m4 packssdw m0, m1 ; 0 2 4 6 packssdw m4, m3 ; 1 3 5 7 psrlq m1, [o(permB)], 8 punpckhwd m3, m0, m4 punpcklwd m0, m4 psrlq m2, m1, 32 vpermi2q m1, m0, m3 vpermt2q m0, m2, m3 jmp tx2q .pass2: call .main_pass2 movu m10, [permC+2] vbroadcasti32x8 m12, [pw_2048_m2048+16] jmp m(idct_8x8_internal_10bpc).end .main_pass2: vextracti32x8 ym2, m0, 1 vextracti32x8 ym3, m1, 1 lea r5, [o_base_8bpc] pshufd ym4, ym0, q1032 pshufd ym5, ym1, q1032 jmp m(iadst_8x8_internal_8bpc).main_pass2 ALIGN function_align .main: ITX_MULSUB_2D 1, 0, 4, 5, 6, 13, 401_1931, 4076_3612 ITX_MULSUB_2D 3, 2, 4, 5, 6, 13, 3166_3920, 2598_1189 psubd m4, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 REPX {pmaxsd x, m14}, m4, m2, m0, m1 REPX {pminsd x, m15}, m4, m2, m0, m1 pxor m5, m5 psubd m5, m4 shufpd m4, m2, 0xaa ; t4 t7 shufpd m2, m5, 0xaa ; t5 -t6 ITX_MULSUB_2D 4, 2, 3, 5, 6, 13, 1567, 3784 punpckhqdq m3, m0, m1 punpcklqdq m0, m1 psubd m1, m0, m3 ; t2 t3 paddd m0, m3 ; out0 -out7 punpckhqdq m3, m4, m2 ; t7a t6a punpcklqdq m4, m2 ; t5a t4a psubd m2, m4, m3 ; t7 t6 paddd m4, m3 ; out6 -out1 REPX {pmaxsd x, m14}, m1, m2 REPX {pminsd x, m15}, m1, m2 shufpd m3, m1, m2, 0xaa shufpd m1, m2, 0x55 pmulld m3, m12 pmulld m1, m12 paddd m3, m13 psubd m2, m3, m1 paddd m3, m1 psrad m2, 12 ; out4 -out5 pshufd m3, m3, q1032 psrad m3, 12 ; out2 -out3 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, identity INV_TXFM_8X8_FN flipadst, flipadst cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x8_internal_10bpc).load vpermi2q m1, m6, m2 ; 7 5 vpermi2q m3, m4, m0 ; 3 1 vpermt2q m0, m5, m4 ; 0 2 vpermt2q m2, m5, m6 ; 4 6 call m(iadst_8x8_internal_10bpc).main punpckhdq m1, m3, m4 ; -out3 -out1 punpckldq m3, m0 ; out2 out0 punpckhdq m0, m2 ; -out7 -out5 punpckldq m4, m2 ; out6 out4 psubd m1, m11, m1 paddd m3, m11 psubd m0, 
m11, m0 paddd m4, m11 jmp m(iadst_8x8_internal_10bpc).pass1_end .pass2: call m(iadst_8x8_internal_10bpc).main_pass2 movu m10, [permC+1] vbroadcasti32x8 m12, [pw_m2048_2048+16] lea r6, [strideq*3] vpermt2q m0, m10, m1 ; 7 6 5 4 vpbroadcastd m11, [pixel_10bpc_max] vpermt2q m2, m10, m3 ; 3 2 1 0 pxor m10, m10 pmulhrsw m8, m12, m2 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m8, m12, m0 jmp m(idct_8x8_internal_10bpc).write_8x4 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 mova m1, [cq+64*0] packssdw m1, [cq+64*2] ; 0 4 1 5 mova m2, [cq+64*1] ; 2 6 3 7 packssdw m2, [cq+64*3] mova m0, [o(idtx8x8p)] vpermb m1, m0, m1 vpermb m2, m0, m2 punpckldq m0, m1, m2 ; 0 1 4 5 punpckhdq m1, m2 ; 2 3 6 7 jmp tx2q .pass2: movu m3, [o(permC+2)] vpbroadcastd m12, [o(pw_4096)] psrlq m2, m3, 32 vpermi2q m2, m0, m1 vpermt2q m0, m3, m1 jmp m(idct_8x8_internal_10bpc).end2 %macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 8x16 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 16 add r6d, 128 sar r6d, 8 imul r6d, 181 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, identity, 35 INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, adst cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 43 jl .fast call .load call .main call .main_end .pass1_end: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 jmp tx2q .pass2: mova m8, [o(idct8x16p)] REPX {vpermb x, m8, x}, m0, m1, m2, m3 punpckhdq m5, m0, m1 punpckldq m0, m1 punpckhdq m4, m2, m3 punpckldq m2, m3 punpcklqdq m8, m0, m2 ; 15 1 punpckhqdq m0, m2 ; 7 9 punpckhqdq m1, m5, m4 ; 3 13 punpcklqdq m5, m4 ; 11 5 lea r5, [o_base_8bpc] vextracti32x8 ym7, m8, 1 ; 14 2 vextracti32x8 ym3, m0, 1 ; 6 10 vextracti32x8 ym6, m1, 1 ; 12 4 vextracti32x8 ym9, m5, 1 ; 8 0 call m(idct_8x16_internal_8bpc).main2 mova m8, [permC] vpbroadcastd m12, [pw_2048] vpermt2q m0, m8, m1 lea r6, [strideq*3] vpermt2q m2, m8, m3 vpbroadcastd m11, [pixel_10bpc_max] vpermt2q m4, m8, m5 pxor m10, m10 vpermt2q m6, m8, m7 pmulhrsw m8, m12, m0 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m8, m12, m2 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m8, m12, m4 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m8, m12, m6 jmp m(idct_8x8_internal_10bpc).write_8x4 .fast: mova ym0, [cq+64*0] mova ym4, [cq+64*2] mova ym1, [cq+64*1] mova ym5, [cq+64*5] mova ym2, [cq+64*4] mova ym6, [cq+64*6] mova ym3, [cq+64*7] mova ym7, [cq+64*3] call .round_input_fast call m(idct_8x8_internal_10bpc).main call m(idct_8x8_internal_10bpc).main_end movu m6, [o(permC+3)] packssdw m3, m1, m3 packssdw m1, m0, m2 vprolq m3, 32 vpermd m1, m6, m1 vpermd m3, m6, m3 mova ym0, ym1 ; 0 4 vextracti32x8 ym1, m1, 1 ; 1 5 mova ym2, ym3 ; 2 6 vextracti32x8 ym3, m3, 1 ; 3 7 jmp tx2q ALIGN function_align .round_input_fast: movshdup m8, [o(permB)] vpbroadcastd m12, [o(pd_2896)] vpermt2q m0, m8, m4 vpermt2q m1, m8, m5 vpermt2q m2, m8, m6 vpermt2q m3, m8, m7 vpbroadcastd m13, [o(pd_2048)] REPX {pmulld x, m12}, m0, m1, m2, m3 vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] REPX {paddd x, m13}, m0, m1, m2, m3 vpbroadcastd m11, [o(pd_1)] REPX {psrad x, 12 }, m0, m1, m2, m3 ret ALIGN function_align .load: vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] .load2: vpbroadcastd 
m12, [o(pd_2896)] pmulld m0, m12, [cq+64*0] pmulld m1, m12, [cq+64*1] pmulld m2, m12, [cq+64*2] pmulld m3, m12, [cq+64*3] vpbroadcastd m13, [o(pd_2048)] pmulld m4, m12, [cq+64*4] pmulld m5, m12, [cq+64*5] pmulld m6, m12, [cq+64*6] pmulld m7, m12, [cq+64*7] .round: REPX {paddd x, m13}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 REPX {paddd x, m13}, m4, m5, m6, m7 REPX {psrad x, 12 }, m4, m5, m6, m7 ret ALIGN function_align .main_fast2_rect2: REPX {paddd x, m13}, m0, m1 REPX {psrad x, 12 }, m0, m1 .main_fast2: pmulld m0, m12 pmulld m6, m1, [o(pd_4017)] {1to16} ; t7a pmulld m8, m1, [o(pd_799)] {1to16} ; t4a REPX {paddd x, m13}, m0, m6, m8 REPX {psrad x, 12 }, m0, m6, m8 pmulld m5, m6, m12 pmulld m1, m8, m12 paddd m5, m13 psubd m4, m5, m1 paddd m5, m1 REPX {psrad x, 12 }, m4, m5 REPX {mova x, m0 }, m1, m2, m3 ret .main_fast_rect2: REPX {paddd x, m13}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_fast: pmulld m0, m12 pmulld m5, m3, [o(pd_2276)] {1to16} ; t5a pmulld m3, [o(pd_3406)] {1to16} ; t6a pmulld m7, m1, [o(pd_4017)] {1to16} ; t7a pmulld m1, [o(pd_799)] {1to16} ; t4a pmulld m6, m2, [o(pd_3784)] {1to16} ; t3 pmulld m2, [o(pd_1567)] {1to16} ; t2 paddd m0, m13 psubd m5, m13, m5 psrad m0, 12 ; t0 mova m9, m0 ; t1 jmp .main2 .main_rect2: call .round .main: pmulld m0, m12 ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 pmulld m4, m12 paddd m0, m13 paddd m5, m13 psubd m9, m0, m4 ; t1 paddd m0, m4 ; t0 psrad m9, 12 psrad m0, 12 .main2: REPX {paddd x, m13}, m3, m1, m7 REPX {psrad x, 12 }, m5, m1, m3, m7 paddd m8, m1, m5 ; t4 psubd m1, m5 ; t5a psubd m5, m7, m3 ; t6a paddd m7, m3 ; t7 pmaxsd m5, m14 pmaxsd m1, m14 paddd m2, m13 paddd m6, m13 pminsd m5, m15 pminsd m1, m15 pmulld m5, m12 pmulld m1, m12 pmaxsd m8, m14 pmaxsd m7, m14 pminsd m8, m15 paddd m5, m13 psubd m4, m5, m1 paddd m5, m1 REPX {psrad x, 12 }, m2, m6, m5, m4 paddd m1, m9, m2 ; dct4 out1 psubd m2, m9, m2 ; dct4 out2 psubd m3, m0, m6 ; dct4 out3 paddd m0, m6 ; dct4 out0 pminsd m6, m15, m7 REPX {pmaxsd x, m14}, m0, m1, m2, m3 REPX {pminsd x, m15}, m0, m1, m2, m3 ret .main_end: vpbroadcastd m11, [o(pd_1)] .main_end2: REPX {paddd x, m11}, m0, m1, m2, m3 psubd m7, m0, m6 ; out7 paddd m0, m6 ; out0 psubd m6, m1, m5 ; out6 paddd m1, m5 ; out1 psubd m5, m2, m4 ; out5 paddd m2, m4 ; out2 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 REPX {vpsravd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, identity, 35 INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, adst cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 43 jl .fast call m(idct_8x16_internal_10bpc).load call .main psrad m0, 1 psrad m1, 1 psrad m6, m10, 1 psrad m7, m11, 1 psrad m2, 12 psrad m3, 12 psrad m4, m8, 12 psrad m5, m9, 12 jmp m(idct_8x16_internal_10bpc).pass1_end .fast: call .fast_main punpcklqdq m1, m2, m4 ; out4 out6 punpckhqdq m2, m0 ; -out5 -out7 punpcklqdq m0, m3 ; out0 out2 punpckhqdq m4, m3 ; -out1 -out3 paddd m1, m11 psubd m3, m11, m2 paddd m0, m11 psubd m4, m11, m4 .fast_end: movu m5, [o(permC+3)] REPX {psrad x, 1}, m1, m0, m3, m4 packssdw m2, m0, m1 ; 0 2 4 6 packssdw m3, m4, m3 ; 1 3 5 7 vpermd m2, m5, m2 vpermd m3, m5, m3 mova ym0, ym2 vextracti32x8 ym2, m2, 1 mova ym1, ym3 vextracti32x8 ym3, m3, 1 jmp tx2q .pass2: call .pass2_main movu m4, [permB+2] vbroadcasti32x8 m12, [pw_2048_m2048+16] psrlq m7, m4, 8 vpermi2q m4, m0, m3 ; 0 1 2 3 psrlq m5, m7, 24 
vpermi2q m7, m0, m3 ; 12 13 14 15 psrlq m6, m5, 8 vpermq m5, m5, m1 ; 4 5 6 7 vpermq m6, m6, m2 ; 8 9 10 11 .pass2_end: vpbroadcastd m11, [pixel_10bpc_max] pxor m10, m10 lea r6, [strideq*3] pmulhrsw m8, m12, m4 call m(idct_8x8_internal_10bpc).write_8x4_start pmulhrsw m8, m12, m5 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m8, m12, m6 call m(idct_8x8_internal_10bpc).write_8x4 pmulhrsw m8, m12, m7 jmp m(idct_8x8_internal_10bpc).write_8x4 ALIGN function_align .main: ITX_MULSUB_2D 7, 0, 8, 9, 10, 13, 401, 4076 ; t1a, t0a ITX_MULSUB_2D 1, 6, 8, 9, 10, 13, 3920, 1189 ; t7a, t6a ITX_MULSUB_2D 5, 2, 8, 9, 10, 13, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D 3, 4, 8, 9, 10, 13, 3166, 2598 ; t5a, t4a psubd m8, m2, m6 ; t6 paddd m2, m6 ; t2 psubd m6, m0, m4 ; t4 paddd m0, m4 ; t0 psubd m4, m5, m1 ; t7 paddd m5, m1 ; t3 psubd m1, m7, m3 ; t5 paddd m7, m3 ; t1 REPX {pmaxsd x, m14}, m6, m1, m8, m4, m2, m0, m5, m7 REPX {pminsd x, m15}, m6, m1, m8, m4, m2, m0, m5, m7 vpbroadcastd m10, [o(pd_1567)] vpbroadcastd m11, [o(pd_3784)] ITX_MULSUB_2D 6, 1, 3, 9, _, 13, 10, 11 ; t5a, t4a ITX_MULSUB_2D 4, 8, 3, 9, _, 13, 11, 10 ; t6a, t7a vpbroadcastd m12, [o(pd_1448)] psubd m9, m6, m8 ; t7 paddd m6, m8 ; out6 psubd m3, m7, m5 ; t3 paddd m7, m5 ; -out7 psubd m5, m0, m2 ; t2 paddd m0, m2 ; out0 psubd m2, m1, m4 ; t6 paddd m1, m4 ; -out1 REPX {pmaxsd x, m14}, m5, m3, m2, m9 REPX {pminsd x, m15}, m5, m3, m2, m9 REPX {pmulld x, m12}, m5, m3, m2, m9 vpbroadcastd m4, [o(pd_1)] psubd m8, m5, m3 ; (t2 - t3) * 1448 paddd m3, m5 ; (t2 + t3) * 1448 psubd m5, m2, m9 ; (t6 - t7) * 1448 paddd m2, m9 ; (t6 + t7) * 1448 vpbroadcastd m9, [o(pd_3072)] paddd m0, m4 psubd m1, m4, m1 paddd m10, m6, m4 psubd m11, m4, m7 paddd m2, m9 paddd m8, m9 vpbroadcastd m9, [o(pd_3071)] psubd m3, m9, m3 psubd m9, m5 ret ALIGN function_align .fast_main: mova ym0, [cq+64*0] mova ym4, [cq+64*2] mova ym1, [cq+64*7] mova ym5, [cq+64*5] mova ym2, [cq+64*4] mova ym6, [cq+64*6] mova ym3, [cq+64*3] mova ym7, [cq+64*1] call m(idct_8x16_internal_10bpc).round_input_fast jmp m(iadst_8x8_internal_10bpc).main ALIGN function_align .pass2_main: mova m8, [o(iadst8x16p)] REPX {vpermb x, m8, x}, m0, m1, m2, m3 vpbroadcastd m10, [o(pw_2896x8)] punpckhdq m5, m0, m1 punpckldq m0, m1 punpckhdq m1, m2, m3 punpckldq m2, m3 lea r5, [o_base_8bpc] punpckhqdq m4, m0, m2 ; 12 3 14 1 punpcklqdq m0, m2 ; 0 15 2 13 punpckhqdq m6, m5, m1 ; 8 7 10 5 punpcklqdq m5, m1 ; 4 11 6 9 call m(iadst_8x16_internal_8bpc).main2 paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m10 ; -out7 out4 out6 -out5 pmulhrsw m2, m10 ; out8 -out11 -out9 out10 ret INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, identity, 35 INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 43 jl .fast call m(idct_8x16_internal_10bpc).load call m(iadst_8x16_internal_10bpc).main psrad m7, m0, 1 psrad m0, m11, 1 psrad m6, m1, 1 psrad m1, m10, 1 psrad m5, m2, 12 psrad m2, m9, 12 psrad m4, m3, 12 psrad m3, m8, 12 jmp m(idct_8x16_internal_10bpc).pass1_end .fast: call m(iadst_8x16_internal_10bpc).fast_main punpckhqdq m1, m3, m4 ; -out3 -out1 punpcklqdq m3, m0 ; out2 out0 punpckhqdq m0, m2 ; -out7 -out5 punpcklqdq m4, m2 ; out6 out4 psubd m1, m11, m1 paddd m3, m11 psubd m0, m11, m0 paddd m4, m11 jmp m(iadst_8x16_internal_10bpc).fast_end .pass2: call m(iadst_8x16_internal_10bpc).pass2_main movu m7, [permB+2] vbroadcasti32x8 m12, [pw_m2048_2048+16] psrlq m4, m7, 8 vpermi2q m7, m3, m0 ; 3 2 1 0 psrlq m5, m4, 24 vpermi2q m4, 
m3, m0 ; 15 14 13 12 psrlq m6, m5, 8 vpermq m5, m5, m2 ; 11 10 9 8 vpermq m6, m6, m1 ; 7 6 5 4 jmp m(iadst_8x16_internal_10bpc).pass2_end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x16_internal_10bpc).load2 jmp m(idct_8x16_internal_10bpc).pass1_end .pass2: vpbroadcastd m8, [o(pw_1697x16)] pmulhrsw m4, m8, m0 pmulhrsw m5, m8, m1 pmulhrsw m6, m8, m2 pmulhrsw m7, m8, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 vpbroadcastd m7, [o(pw_2048)] punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 vpbroadcastd m6, [o(pixel_10bpc_max)] punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m1 punpckhdq m4, m1 pxor m5, m5 punpckhqdq m1, m0, m2 ; 1 5 9 13 punpcklqdq m0, m2 ; 0 4 8 12 punpcklqdq m2, m3, m4 ; 2 6 10 14 punpckhqdq m3, m4 ; 3 7 11 15 lea r6, [strideq*3] pmulhrsw m0, m7 call .write_8x4_start pmulhrsw m0, m7, m1 call .write_8x4 pmulhrsw m0, m7, m2 call .write_8x4 pmulhrsw m0, m7, m3 .write_8x4: add dstq, strideq add cq, 64*2 .write_8x4_start: mova xm4, [dstq+strideq*0] vinserti32x4 ym4, [dstq+strideq*4], 1 vinserti32x4 m4, [dstq+strideq*8], 2 vinserti32x4 m4, [dstq+r6*4 ], 3 mova [cq+64*0], m5 mova [cq+64*1], m5 paddw m4, m0 pmaxsw m4, m5 pminsw m4, m6 mova [dstq+strideq*0], xm4 vextracti32x4 [dstq+strideq*4], ym4, 1 vextracti32x4 [dstq+strideq*8], m4, 2 vextracti32x4 [dstq+r6*4 ], m4, 3 ret %macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 16x8 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 8 .dconly: add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 .dconly2: vpbroadcastd m2, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw m1, r6d paddsw m1, m2 .dconly_loop: mova ym0, [dstq+strideq*0] vinserti32x8 m0, [dstq+strideq*1], 1 paddsw m0, m1 psubusw m0, m2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %endif %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, identity, -21 INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, adst cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m12, [o(pd_2896)] pmulld m4, m12, [cq+64*0] ; 0 1 pmulld m9, m12, [cq+64*1] ; 2 3 pmulld m8, m12, [cq+64*2] ; 4 5 pmulld m7, m12, [cq+64*3] ; 6 7 vpbroadcastd m13, [o(pd_2048)] pxor m2, m2 mova m15, [o(permB)] REPX {mova [cq+64*x], m2}, 0, 1, 2, 3 psrlq m0, m15, 32 REPX {paddd x, m13}, m4, m9, m8, m7 vpbroadcastd m14, [o(clip_18b_min)] REPX {psrad x, 12 }, m4, m8, m9, m7 mova m1, m0 vpermi2q m0, m4, m8 ; 0 4 cmp eobd, 43 jl .fast pmulld m5, m12, [cq+64*4] ; 8 9 pmulld m10, m12, [cq+64*5] ; 10 11 pmulld m11, m12, [cq+64*6] ; 12 13 pmulld m6, m12, [cq+64*7] ; 14 15 REPX {mova [cq+64*x], m2}, 4, 5, 6, 7 REPX {paddd x, m13}, m5, m10, m11, m6 REPX {psrad x, 12 }, m10, m5, m11, m6 mova m2, m1 vpermi2q m1, m9, m10 ; 2 10 mova m3, m2 vpermi2q m2, m5, m11 ; 8 12 vpermi2q m3, m6, m7 ; 14 6 vpermt2q m4, m15, m11 ; 1 13 vpermt2q m6, m15, m9 ; 15 3 vpermt2q m5, m15, m8 ; 9 5 vpermt2q m7, m15, m10 ; 7 11 vpbroadcastd m15, [o(clip_18b_max)] call m(idct_8x8_internal_10bpc).main call .main jmp .pass1_end .fast: vpermi2q m1, m9, m7 ; 2 6 vpermt2q m4, m15, m9 ; 1 3 vpermt2q m7, m15, m8 ; 7 5 vpbroadcastd m15, [o(clip_18b_max)] call m(idct_8x8_internal_10bpc).main_fast call 
.main_fast .pass1_end: call m(idct_8x16_internal_10bpc).main_end mova m8, [o(permA)] psrlq m9, m8, 8 .pass1_end2: mova m10, m9 mova m11, m8 call .transpose_16x8 jmp tx2q .pass2: lea r5, [o_base_8bpc] call m(idct_16x8_internal_8bpc).main movshdup m4, [permC] vpbroadcastd m11, [pw_2048] psrlq m5, m4, 8 .end: vpbroadcastd m13, [pixel_10bpc_max] pxor m12, m12 vpermq m8, m4, m0 vpermq m9, m5, m1 lea r6, [strideq*3] call .write_16x4 vpermq m8, m4, m2 vpermq m9, m5, m3 .write_16x4: pmulhrsw m8, m11 pmulhrsw m9, m11 .write_16x4_noround: mova ym10, [dstq+strideq*0] vinserti32x8 m10, [dstq+strideq*1], 1 paddw m8, m10 mova ym10, [dstq+strideq*2] vinserti32x8 m10, [dstq+r6 ], 1 paddw m9, m10 pmaxsw m8, m12 pmaxsw m9, m12 pminsw m8, m13 pminsw m9, m13 mova [dstq+strideq*0], ym8 vextracti32x8 [dstq+strideq*1], m8, 1 mova [dstq+strideq*2], ym9 vextracti32x8 [dstq+r6 ], m9, 1 lea dstq, [dstq+strideq*4] ret ALIGN function_align .main_fast: ; bottom half is zero vbroadcasti32x4 m6, [o(pd_4076_3920)] vbroadcasti32x4 m3, [o(pd_401_m1189)] vbroadcasti32x4 m5, [o(pd_m2598_1931)] vbroadcasti32x4 m9, [o(pd_3166_3612)] pmulld m6, m4 ; t15a t12a pmulld m4, m3 ; t8a t11a pmulld m5, m7 ; t9a t10a pmulld m7, m9 ; t14a t13a jmp .main2 .main: ITX_MULSUB_2D 4, 6, 3, 9, 10, _, 401_3920, 4076_1189 ITX_MULSUB_2D 5, 7, 3, 9, 10, _, 3166_1931, 2598_3612 .main2: REPX {paddd x, m13}, m4, m6, m5, m7 REPX {psrad x, 12 }, m4, m5, m6, m7 paddd m9, m4, m5 ; t8 t11 psubd m4, m5 ; t9 t10 psubd m5, m6, m7 ; t14 t13 paddd m6, m7 ; t15 t12 REPX {pmaxsd x, m14}, m5, m4, m9, m6 REPX {pminsd x, m15}, m5, m4, m9, m6 .main3: psubd m3, m0, m1 ; dct8 out7 out6 paddd m0, m1 ; dct8 out0 out1 vbroadcasti32x4 m7, [o(pd_3784_m3784)] pmulld m7, m5 vpmulld m5, [o(pd_1567)] {1to16} paddd m1, m2, m8 ; dct8 out3 out2 psubd m2, m8 ; dct8 out4 out5 vbroadcasti32x4 m8, [o(pd_1567_m1567)] pmulld m8, m4 vpmulld m4, [o(pd_3784)] {1to16} REPX {pmaxsd x, m14}, m0, m1 REPX {pminsd x, m15}, m0, m1 paddd m7, m13 paddd m5, m13 paddd m7, m8 psubd m5, m4 psrad m7, 12 ; t14a t10a psrad m5, 12 ; t9a t13a punpckhqdq m4, m9, m7 punpcklqdq m8, m9, m5 punpckhqdq m5, m6, m5 punpcklqdq m6, m7 psubd m7, m8, m4 ; t11a t10 paddd m8, m4 ; t8a t9 psubd m4, m6, m5 ; t12a t13 paddd m6, m5 ; t15a t14 REPX {pmaxsd x, m14}, m4, m7 REPX {pminsd x, m15}, m4, m7 pmulld m4, m12 pmulld m7, m12 REPX {pmaxsd x, m14}, m2, m3, m6, m8 REPX {pminsd x, m15}, m2, m3, m6, m8 paddd m4, m13 paddd m5, m4, m7 psubd m4, m7 psrad m4, 12 ; t11 t10a psrad m5, 12 ; t12 t13a ret ALIGN function_align .transpose_16x8: packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 vpermi2d m8, m0, m2 vpermt2d m0, m9, m2 vpermi2d m10, m1, m3 vpermi2d m11, m1, m3 punpckhwd m3, m8, m0 punpcklwd m1, m8, m0 punpckhwd m4, m10, m11 punpcklwd m2, m10, m11 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, identity, -21 INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, adst cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp call .main_pass1 vpbroadcastd m9, [o(pd_1)] paddd m0, m9 psubd m1, m9, m1 paddd m2, m9 psubd m3, m9, m3 paddd m4, m9, m5 psubd m5, m9, m6 paddd m6, m9, m7 psubd m7, m9, m8 .pass1_end: mova m9, [o(permA)] psrlq m8, m9, 8 REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7 jmp m(idct_16x8_internal_10bpc).pass1_end2 .pass2: call .main_pass2 vpermq m8, m11, m0 vpermq m9, m11, m1 call m(idct_16x8_internal_10bpc).write_16x4_noround vpermq m8, m11, m2 vpermq m9, m11, m3 jmp 
m(idct_16x8_internal_10bpc).write_16x4_noround ALIGN function_align .main_pass1: vpbroadcastd m12, [o(pd_2896)] pmulld m2, m12, [cq+64*0] pmulld m7, m12, [cq+64*1] pmulld m1, m12, [cq+64*2] pmulld m5, m12, [cq+64*3] vpbroadcastd m13, [o(pd_2048)] pxor m4, m4 mova m10, [o(permB)] REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 REPX {paddd x, m13}, m2, m7, m1, m5 psrlq m6, m10, 32 REPX {psrad x, 12 }, m2, m7, m1, m5 mova m0, m6 vpermi2q m0, m2, m7 ; 0 2 vpermt2q m7, m10, m2 ; 3 1 mova m2, m6 vpermi2q m2, m1, m5 ; 4 6 vpermt2q m5, m10, m1 ; 7 5 cmp eobd, 43 jl .main_fast pmulld m8, m12, [cq+64*4] pmulld m3, m12, [cq+64*5] pmulld m9, m12, [cq+64*6] pmulld m1, m12, [cq+64*7] REPX {mova [cq+64*x], m4}, 4, 5, 6, 7 REPX {paddd x, m13}, m8, m3, m9, m1 REPX {psrad x, 12 }, m8, m3, m9, m1 mova m4, m6 vpermi2q m4, m8, m3 ; 8 10 vpermt2q m3, m10, m8 ; 11 9 vpermi2q m6, m9, m1 ; 12 14 vpermt2q m1, m10, m9 ; 15 13 .main: ITX_MULSUB_2D 1, 0, 8, 9, 10, _, 201_995, 4091_3973, 1 ITX_MULSUB_2D 3, 2, 8, 9, 10, _, 1751_2440, 3703_3290, 1 ITX_MULSUB_2D 5, 4, 8, 9, 10, _, 3035_3513, 2751_2106 ITX_MULSUB_2D 7, 6, 8, 9, 10, _, 3857_4052, 1380_601 jmp .main2 .main_fast: vbroadcasti32x4 m1, [o(pd_4091_3973)] vbroadcasti32x4 m8, [o(pd_201_995)] vbroadcasti32x4 m3, [o(pd_3703_3290)] vbroadcasti32x4 m9, [o(pd_1751_2440)] vbroadcasti32x4 m4, [o(pd_2751_2106)] vbroadcasti32x4 m10, [o(pd_3035_3513)] vbroadcasti32x4 m6, [o(pd_1380_601)] vbroadcasti32x4 m11, [o(pd_3857_4052)] pmulld m1, m0 pmulld m0, m8 pmulld m3, m2 pmulld m2, m9 pmulld m4, m5 pmulld m5, m10 pmulld m6, m7 pmulld m7, m11 .main2: vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] REPX {psubd x, m13, x}, m1, m3 REPX {paddd x, m13 }, m0, m2, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m4, m1, m5, m2, m6, m3, m7 psubd m8, m0, m4 ; t8a t10a paddd m0, m4 ; t0a t2a psubd m4, m1, m5 ; t9a t11a paddd m1, m5 ; t1a t3a psubd m5, m2, m6 ; t12a t14a paddd m2, m6 ; t4a t6a psubd m6, m3, m7 ; t13a t15a paddd m3, m7 ; t5a t7a REPX {pmaxsd x, m14}, m8, m4, m5, m6 REPX {pminsd x, m15}, m8, m4, m5, m6 vbroadcasti32x4 m11, [o(pd_4017_2276)] vbroadcasti32x4 m10, [o(pd_799_3406)] ITX_MULSUB_2D 8, 4, 7, 9, _, 13, 10, 11 ITX_MULSUB_2D 6, 5, 7, 9, _, 13, 11, 10 REPX {pmaxsd x, m14}, m0, m2, m1, m3 REPX {pminsd x, m15}, m0, m2, m1, m3 psubd m7, m0, m2 ; t4 t6 paddd m0, m2 ; t0 t2 psubd m2, m1, m3 ; t5 t7 paddd m1, m3 ; t1 t3 psubd m3, m4, m6 ; t12a t14a paddd m4, m6 ; t8a t10a psubd m6, m8, m5 ; t13a t15a paddd m8, m5 ; t9a t11a REPX {pmaxsd x, m14}, m7, m3, m2, m6 REPX {pminsd x, m15}, m7, m3, m2, m6 punpcklqdq m5, m3, m7 ; t12a t4 punpckhqdq m3, m7 ; t14a t6 punpckhqdq m7, m6, m2 ; t15a t7 punpcklqdq m6, m2 ; t13a t5 vpbroadcastd m11, [o(pd_1567)] vpbroadcastd m10, [o(pd_3784)] ITX_MULSUB_2D 7, 3, 2, 9, 10, 13, 10, 11 ITX_MULSUB_2D 5, 6, 2, 9, 10, 13, 11, 10 REPX {pmaxsd x, m14}, m0, m4, m1, m8 REPX {pminsd x, m15}, m0, m4, m1, m8 punpckhqdq m2, m4, m0 ; t10a t2 punpcklqdq m4, m0 ; t8a t0 punpckhqdq m0, m8, m1 ; t11a t3 punpcklqdq m8, m1 ; t9a t1 paddd m1, m6, m7 ; out2 -out3 psubd m6, m7 ; t14a t6 paddd m7, m5, m3 ; -out13 out12 psubd m5, m3 ; t15a t7 psubd m3, m8, m0 ; t11 t3a paddd m8, m0 ; out14 -out15 paddd m0, m4, m2 ; -out1 out0 psubd m4, m2 ; t10 t2a REPX {pmaxsd x, m14}, m6, m5, m3, m4 mov r6d, 0x3333 REPX {pminsd x, m15}, m6, m5, m3, m4 kmovw k1, r6d REPX {pmulld x, m12}, m6, m5, m3, m4 pxor m9, m9 REPX {vpsubd x{k1}, m9, x}, m0, m1, m7, m8 paddd m6, m13 paddd m4, m13 paddd m2, m6, m5 ; -out5 out4 psubd m6, m5 ; out10 -out11 psubd m5, m4, m3 ; -out9 out8 paddd m3, 
m4 ; out6 -out7 REPX {psrad x, 12}, m2, m3, m5, m6 REPX {vpsubd x{k1}, m9, x}, m2, m3, m5, m6 ret ALIGN function_align .main_pass2: lea r5, [o_base_8bpc] pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_16x8_internal_8bpc).main_pass2 movshdup m11, [permC] pmulhrsw m0, m6 pmulhrsw m1, m6 vpbroadcastd m13, [pixel_10bpc_max] pxor m12, m12 lea r6, [strideq*3] ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, identity, -21 INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_10bpc).main_pass1 vpbroadcastd m9, [o(pd_1)] psubd m4, m9, m3 paddd m3, m9, m5 paddd m5, m9, m2 psubd m2, m9, m6 psubd m6, m9, m1 paddd m1, m9, m7 paddd m7, m9, m0 psubd m0, m9, m8 jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: call m(iadst_16x8_internal_10bpc).main_pass2 psrlq m11, 8 vpermq m8, m11, m3 vpermq m9, m11, m2 call m(idct_16x8_internal_10bpc).write_16x4_noround vpermq m8, m11, m1 vpermq m9, m11, m0 jmp m(idct_16x8_internal_10bpc).write_16x4_noround INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 call m(idct_8x16_internal_10bpc).load2 vpbroadcastd m8, [o(pd_5793)] vpbroadcastd m13, [o(pd_3072)] pxor m10, m10 REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {mova [cq+64*x], m10}, 0, 1, 2, 3, 4, 5, 6, 7 call m(idct_8x16_internal_10bpc).round psrlq m8, [o(permA)], 16 psrlq m9, m8, 8 mova m10, m8 mova m11, m9 call m(idct_16x8_internal_10bpc).transpose_16x8 jmp tx2q .pass2: movshdup m4, [o(permC)] vpbroadcastd m11, [o(pw_4096)] mova m5, m4 jmp m(idct_16x8_internal_10bpc).end %macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 16x16 %ifidn %1_%2, dct_dct imul r6d, [cq], 181 mov [cq], eobd ; 0 or r3d, 16 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, identity, 28 INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, adst cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 36 jl .fast mova m0, [cq+64* 0] mova m1, [cq+64* 2] mova m2, [cq+64* 4] mova m3, [cq+64* 6] mova m4, [cq+64* 8] mova m5, [cq+64*10] mova m6, [cq+64*12] mova m7, [cq+64*14] %if WIN64 movaps [cq+16*0], xmm6 movaps [cq+16*1], xmm7 %endif call m(idct_8x16_internal_10bpc).main mova m16, [cq+64* 1] mova m17, [cq+64* 3] mova m18, [cq+64* 5] mova m19, [cq+64* 7] mova m20, [cq+64* 9] mova m21, [cq+64*11] mova m22, [cq+64*13] mova m23, [cq+64*15] call .main call .main_end .pass1_end: %if WIN64 movaps xmm6, [cq+16*0] movaps xmm7, [cq+16*1] %endif vzeroupper .pass1_end2: call .main_end3 .pass1_end3: mov r6d, 64*12 pxor m8, m8 .zero_loop: mova [cq+r6+64*3], m8 mova [cq+r6+64*2], m8 mova [cq+r6+64*1], m8 mova [cq+r6+64*0], m8 sub r6d, 64*4 jge .zero_loop jmp tx2q .pass2: lea r5, [o_base_8bpc] call m(idct_16x16_internal_8bpc).main movshdup m12, [permC] vpbroadcastd m11, [pw_2048] psrlq m13, m12, 8 vpermq m8, m12, m0 vpermq m0, m13, m7 vpermq m7, m13, m1 vpermq m1, m12, m6 vpermq m6, m12, m2 vpermq m2, m13, m5 vpermq m5, m13, m3 vpermq m3, m12, m4 .pass2_end: lea r6, [strideq*3] vpbroadcastd m13, [pixel_10bpc_max] pxor m12, m12 pmulhrsw m8, m11, m8 pmulhrsw m9, m11, m7 call 
m(idct_16x8_internal_10bpc).write_16x4_noround pmulhrsw m8, m11, m6 pmulhrsw m9, m11, m5 call m(idct_16x8_internal_10bpc).write_16x4_noround pmulhrsw m8, m11, m3 pmulhrsw m9, m11, m2 call m(idct_16x8_internal_10bpc).write_16x4_noround pmulhrsw m8, m11, m1 pmulhrsw m9, m11, m0 jmp m(idct_16x8_internal_10bpc).write_16x4_noround .fast: mova ym0, [cq+64*0] mova ym2, [cq+64*4] movshdup m8, [o(permB)] mova ym1, [cq+64*2] mova ym3, [cq+64*6] mova ym4, [cq+64*1] mova ym5, [cq+64*3] mova ym6, [cq+64*5] mova ym7, [cq+64*7] vpermt2q m0, m8, m2 ; 0 4 vpermt2q m1, m8, m3 ; 2 6 vpermt2q m4, m8, m5 ; 1 3 vpermt2q m7, m8, m6 ; 7 5 call m(idct_8x8_internal_10bpc).main_fast call m(idct_16x8_internal_10bpc).main_fast vpbroadcastd m11, [o(pd_2)] call m(idct_8x16_internal_10bpc).main_end2 mova m8, [o(permA)] psrlq m9, m8, 8 jmp m(iadst_16x16_internal_10bpc).pass1_fast_end2 ALIGN function_align .main_fast2_rect2: REPX {paddd x, m13}, m16, m17 REPX {psrad x, 12 }, m16, m17 .main_fast2: pmulld m22, m16, [o(pd_4076)] {1to16} ; t15a pmulld m9, m16, [o(pd_401)] {1to16} ; t8a pmulld m18, m17, [o(pd_1189)] {1to16} ; t11a pmulld m17, [o(pd_3920)] {1to16} ; t12a psubd m18, m13, m18 REPX {paddd x, m13}, m22, m9, m17 REPX {psrad x, 12 }, m18, m22, m9, m17 mova m20, m9 mova m16, m18 mova m23, m22 mova m19, m17 jmp .main3 .main_fast_rect2: REPX {paddd x, m13}, m16, m17, m18, m19 REPX {psrad x, 12 }, m16, m17, m18, m19 .main_fast: pmulld m23, m16, [o(pd_4076)] {1to16} ; t15a pmulld m16, [o(pd_401)] {1to16} ; t8a pmulld m20, m19, [o(pd_2598)] {1to16} ; t9a pmulld m19, [o(pd_3166)] {1to16} ; t14a pmulld m22, m17, [o(pd_1189)] {1to16} ; t11a pmulld m17, [o(pd_3920)] {1to16} ; t12a pmulld m21, m18, [o(pd_3612)] {1to16} ; t13a pmulld m18, [o(pd_1931)] {1to16} ; t10a psubd m20, m13, m20 psubd m22, m13, m22 call .round2 jmp .main2 .main_rect2: call .round .main: ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 401, 4076 ; t8a, t15a ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3166, 2598 ; t9a, t14a ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3920, 1189 ; t11a, t12a ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1931, 3612 ; t10a, t13a call .round .main2: paddd m9, m20, m16 ; t8 psubd m20, m16, m20 ; t9 psubd m16, m22, m18 ; t10 paddd m18, m22 ; t11 paddd m22, m23, m19 ; t15 psubd m23, m19 ; t14 psubd m19, m17, m21 ; t13 paddd m17, m21 ; t12 REPX {pmaxsd x, m14}, m20, m23, m16, m19 REPX {pminsd x, m15}, m20, m23, m16, m19 REPX {pmaxsd x, m14}, m9, m18, m22, m17 REPX {pminsd x, m15}, m9, m18, m22, m17 .main3: vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] ITX_MULSUB_2D 23, 20, 21, 7, _, 13, 10, 11 ITX_MULSUB_2D 19, 16, 21, 7, _, 13, 10, 11, 2 paddd m21, m20, m19 ; t14 psubd m20, m19 ; t13 psubd m19, m9, m18 ; t11a paddd m9, m18 ; t8a psubd m18, m23, m16 ; t10 paddd m16, m23 ; t9 psubd m23, m22, m17 ; t12a paddd m22, m17 ; t15a REPX {pmaxsd x, m14}, m20, m23, m18, m19 REPX {pminsd x, m15}, m20, m23, m18, m19 REPX {pmulld x, m12}, m20, m23, m18, m19 psubd m7, m0, m6 ; dct8 out7 paddd m0, m6 ; dct8 out0 psubd m6, m1, m5 ; dct8 out6 paddd m1, m5 ; dct8 out1 REPX {pmaxsd x, m14}, m7, m0, m6, m1 psubd m5, m2, m4 ; dct8 out5 paddd m2, m4 ; dct8 out2 REPX {pminsd x, m15}, m7, m0, m6, m1 psubd m4, m3, m8 ; dct8 out4 paddd m3, m8 ; dct8 out3 REPX {pmaxsd x, m14}, m5, m2, m4, m3 paddd m20, m13 paddd m23, m13 REPX {pminsd x, m15}, m5, m2, m4, m3 psubd m17, m20, m18 ; t10a paddd m20, m18 ; t13a REPX {pmaxsd x, m14}, m22, m21, m16, m9 psubd m18, m23, m19 ; t11 paddd m19, m23 ; t12 REPX {pminsd x, m15}, m22, m21, m16, m9 REPX {psrad x, 12 }, m20, m19, m18, m17 ret .main_end: 
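; round the first-pass outputs with a +2 bias and a 2-bit arithmetic shift,
; combining the even-half results in m0-m7 with the odd-half results in
; m9 and m16-m22 to produce out0-out15, then pack down to 16-bit words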
vpbroadcastd m11, [o(pd_2)] .main_end2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 psubd m23, m0, m22 ; out15 paddd m0, m22 ; out0 psubd m22, m1, m21 ; out14 paddd m1, m21 ; out1 psubd m21, m2, m20 ; out13 paddd m2, m20 ; out2 psubd m20, m3, m19 ; out12 paddd m3, m19 ; out3 psubd m19, m4, m18 ; out11 paddd m4, m18 ; out4 psubd m18, m5, m17 ; out10 paddd m5, m17 ; out5 psubd m17, m6, m16 ; out9 paddd m6, m16 ; out6 psubd m16, m7, m9 ; out8 paddd m7, m9 ; out7 REPX {vpsravd x, m11}, m0, m16, m1, m17, m2, m18, m3, m19, \ m4, m20, m5, m21, m6, m22, m7, m23 packssdw m0, m16 packssdw m1, m17 packssdw m2, m18 packssdw m3, m19 packssdw m4, m20 packssdw m5, m21 packssdw m6, m22 packssdw m7, m23 ret .main_end3: punpckhwd m8, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m4, m5 punpcklwd m4, m5 punpcklwd m5, m6, m7 punpckhwd m6, m7 punpckhdq m7, m0, m2 punpckldq m0, m2 punpckhdq m2, m8, m1 punpckldq m8, m1 punpckhdq m1, m4, m5 punpckldq m4, m5 punpckhdq m5, m3, m6 punpckldq m3, m6 vshufi32x4 m6, m0, m4, q3232 vinserti32x8 m0, ym4, 1 vinserti32x8 m4, m8, ym3, 1 vshufi32x4 m8, m3, q3232 vinserti32x8 m3, m7, ym1, 1 vshufi32x4 m7, m1, q3232 vshufi32x4 m1, m2, m5, q3232 vinserti32x8 m2, ym5, 1 vshufi32x4 m5, m7, m1, q2020 ; 10 11 vshufi32x4 m7, m1, q3131 ; 14 15 vshufi32x4 m1, m3, m2, q2020 ; 2 3 vshufi32x4 m3, m2, q3131 ; 6 7 vshufi32x4 m2, m0, m4, q3131 ; 4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 ret ALIGN function_align .round: paddd m20, m13 paddd m22, m13 .round2: paddd m16, m13 paddd m18, m13 .round3: REPX {psrad x, 12 }, m16, m18, m20, m22 REPX {paddd x, m13}, m17, m19, m21, m23 REPX {psrad x, 12 }, m17, m19, m21, m23 ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, flipadst INV_TXFM_16X16_FN adst, adst cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 36 jl .fast call .main_pass1 packssdw m0, m16 packssdw m1, m17 packssdw m2, m18 packssdw m3, m19 packssdw m4, m5, m20 packssdw m5, m6, m21 packssdw m6, m7, m22 packssdw m7, m8, m23 jmp m(idct_16x16_internal_10bpc).pass1_end .fast: call .main_pass1_fast vpbroadcastd m9, [o(pd_2)] paddd m0, m9 psubd m1, m9, m1 paddd m2, m9 psubd m3, m9, m3 paddd m4, m9, m5 psubd m5, m9, m6 paddd m6, m9, m7 psubd m7, m9, m8 .pass1_fast_end: mova m9, [o(permA)] psrlq m8, m9, 8 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 .pass1_fast_end2: mova m10, m9 mova m11, m8 call m(idct_16x8_internal_10bpc).transpose_16x8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7 jmp tx2q .pass2: lea r5, [o_base_8bpc] call m(iadst_16x16_internal_8bpc).main_pass2b movshdup m12, [permC] mova m11, [pw_2048_m2048] psrlq m13, m12, 8 vpermq m8, m13, m0 vpermq m0, m12, m7 vpermq m7, m13, m1 vpermq m1, m12, m6 vpermq m6, m13, m2 vpermq m2, m12, m5 vpermq m5, m13, m3 vpermq m3, m12, m4 jmp m(idct_16x16_internal_10bpc).pass2_end ALIGN function_align .main_pass1: mova m0, [cq+64* 0] %if WIN64 movaps [cq+16*0], xmm6 movaps [cq+16*1], xmm7 %endif mova m23, [cq+64*15] vpbroadcastd m13, [o(pd_2048)] ITX_MULSUB_2D 23, 0, 8, 9, 10, 13, 201, 4091 ; t1 t0 mova m7, [cq+64* 7] mova m16, [cq+64* 8] ITX_MULSUB_2D 7, 16, 8, 9, 10, 13, 3035, 2751 ; t9 t8 mova m2, [cq+64* 2] mova m21, [cq+64*13] ITX_MULSUB_2D 21, 2, 8, 9, 10, 13, 995, 3973 ; t3 t2 mova m5, [cq+64* 5] mova m18, [cq+64*10] ITX_MULSUB_2D 5, 18, 8, 9, 10, 13, 3513, 2106 ; t11 t10 mova m4, [cq+64* 4] mova m19, [cq+64*11] ITX_MULSUB_2D 19, 4, 8, 9, 10, 13, 1751, 
3703 ; t5 t4 mova m3, [cq+64* 3] mova m20, [cq+64*12] ITX_MULSUB_2D 3, 20, 8, 9, 10, 13, 3857, 1380 ; t13 t12 mova m6, [cq+64* 6] mova m17, [cq+64* 9] ITX_MULSUB_2D 17, 6, 8, 9, 10, 13, 2440, 3290 ; t7 t6 mova m1, [cq+64* 1] mova m22, [cq+64*14] ITX_MULSUB_2D 1, 22, 8, 9, 10, 13, 4052, 601 ; t15 t14 vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] psubd m9, m23, m7 ; t9a paddd m23, m7 ; t1a psubd m7, m2, m18 ; t10a paddd m18, m2 ; t2a REPX {pmaxsd x, m14}, m9, m23, m7, m18 psubd m2, m17, m1 ; t15a paddd m17, m1 ; t7a REPX {pminsd x, m15}, m9, m23, m7, m18 psubd m1, m21, m5 ; t11a paddd m21, m5 ; t3a REPX {pmaxsd x, m14}, m2, m17, m1, m21 psubd m5, m4, m20 ; t12a paddd m4, m20 ; t4a REPX {pminsd x, m15}, m2, m17, m1, m21 psubd m20, m19, m3 ; t13a paddd m19, m3 ; t5a REPX {pmaxsd x, m14}, m5, m4, m20, m19 psubd m8, m6, m22 ; t14a paddd m6, m22 ; t6a REPX {pminsd x, m15}, m5, m4, m20, m19 psubd m22, m0, m16 ; t8a paddd m16, m0 ; t0a REPX {pmaxsd x, m14}, m8, m6, m22, m16 vpbroadcastd m11, [o(pd_4017)] vpbroadcastd m10, [o(pd_799)] REPX {pminsd x, m15}, m8, m6, m22, m16 ITX_MULSUB_2D 22, 9, 0, 3, _, 13, 10, 11 ; t9 t8 ITX_MULSUB_2D 20, 5, 0, 3, _, 13, 11, 10 ; t12 t13 vpbroadcastd m11, [o(pd_2276)] vpbroadcastd m10, [o(pd_3406)] ITX_MULSUB_2D 7, 1, 0, 3, _, 13, 10, 11 ; t11 t10 ITX_MULSUB_2D 2, 8, 0, 3, _, 13, 11, 10 ; t14 t15 paddd m0, m16, m4 ; t0 psubd m16, m4 ; t4 psubd m3, m23, m19 ; t5 paddd m23, m19 ; t1 REPX {pmaxsd x, m14}, m0, m16, m3, m23 psubd m19, m18, m6 ; t6 paddd m18, m6 ; t2 REPX {pminsd x, m15}, m0, m16, m3, m23 psubd m6, m21, m17 ; t7 paddd m21, m17 ; t3 REPX {pmaxsd x, m14}, m19, m18, m6, m21 paddd m17, m9, m20 ; t8a psubd m9, m20 ; t12a REPX {pminsd x, m15}, m19, m18, m6, m21 psubd m20, m22, m5 ; t13a paddd m22, m5 ; t9a REPX {pmaxsd x, m14}, m17, m9, m20, m22 psubd m5, m1, m2 ; t14a paddd m1, m2 ; t10a REPX {pminsd x, m15}, m17, m9, m20, m22 psubd m2, m7, m8 ; t15a paddd m7, m8 ; t11a REPX {pmaxsd x, m14}, m5, m1, m2, m7 vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] REPX {pminsd x, m15}, m5, m1, m2, m7 ITX_MULSUB_2D 16, 3, 4, 8, _, 13, 10, 11 ; t5a t4a ITX_MULSUB_2D 6, 19, 4, 8, _, 13, 11, 10 ; t6a t7a ITX_MULSUB_2D 9, 20, 4, 8, _, 13, 10, 11 ; t13 t12 ITX_MULSUB_2D 2, 5, 4, 8, _, 13, 11, 10 ; t14 t15 psubd m8, m0, m18 ; t2a paddd m0, m18 ; out0 psubd m18, m23, m21 ; t3a paddd m23, m21 ; -out15 paddd m21, m9, m5 ; -out13 psubd m9, m5 ; t15a psubd m5, m3, m6 ; t6 paddd m3, m6 ; -out3 REPX {pmaxsd x, m14}, m8, m18, m9, m5 psubd m6, m20, m2 ; t14a paddd m2, m20 ; out2 paddd m20, m16, m19 ; out12 psubd m16, m19 ; t7 REPX {pminsd x, m15}, m8, m18, m9, m5 psubd m19, m22, m7 ; t11 paddd m22, m7 ; out14 psubd m7, m17, m1 ; t10 paddd m1, m17 ; -out1 REPX {pmaxsd x, m14}, m6, m16, m19, m7 vpbroadcastd m12, [o(pd_1448)] vpbroadcastd m4, [o(pd_2)] vpbroadcastd m10, [o(pd_5120)] vpbroadcastd m11, [o(pd_5119)] REPX {pminsd x, m15}, m6, m16, m19, m7 psubd m17, m7, m19 ; -out9 paddd m7, m19 ; out6 psubd m19, m5, m16 ; -out11 paddd m5, m16 ; out4 REPX {pmulld x, m12}, m17, m7, m19, m5 psubd m16, m8, m18 ; out8 paddd m8, m18 ; -out7 psubd m18, m6, m9 ; out10 paddd m6, m9 ; -out5 REPX {pmulld x, m12}, m16, m8, m18, m6 REPX {paddd x, m4 }, m0, m2, m20, m22 REPX {psubd x, m4, x}, m1, m3, m21, m23 REPX {paddd x, m10 }, m7, m5, m16, m18 REPX {psubd x, m11, x}, m17, m19, m8, m6 REPX {psrad x, 2 }, m20, m22, m0, m2, m21, m23, m1, m3 REPX {psrad x, 13}, m17, m19, m5, m7, m16, m18, m6, m8 ret ALIGN function_align .main_pass1_fast: mova ym0, [cq+64*0] mova ym1, 
[cq+64*2] movshdup m8, [o(permB)] mova ym6, [cq+64*1] mova ym7, [cq+64*3] mova ym2, [cq+64*4] mova ym3, [cq+64*6] mova ym4, [cq+64*5] mova ym5, [cq+64*7] vpermt2q m0, m8, m1 ; 0 2 vpermt2q m7, m8, m6 ; 3 1 vpermt2q m2, m8, m3 ; 4 6 vpermt2q m5, m8, m4 ; 7 5 vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m12, [o(pd_2896)] jmp m(iadst_16x8_internal_10bpc).main_fast INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp cmp eobd, 36 jl .fast call m(iadst_16x16_internal_10bpc).main_pass1 packssdw m4, m19, m3 packssdw m3, m20, m5 packssdw m5, m18, m2 packssdw m2, m21, m6 packssdw m6, m17, m1 packssdw m1, m22, m7 packssdw m7, m16, m0 packssdw m0, m23, m8 jmp m(idct_16x16_internal_10bpc).pass1_end .fast: call m(iadst_16x16_internal_10bpc).main_pass1_fast vpbroadcastd m9, [o(pd_2)] psubd m4, m9, m3 paddd m3, m9, m5 paddd m5, m9, m2 psubd m2, m9, m6 psubd m6, m9, m1 paddd m1, m9, m7 paddd m7, m9, m0 psubd m0, m9, m8 jmp m(iadst_16x16_internal_10bpc).pass1_fast_end .pass2: lea r5, [o_base_8bpc] call m(iadst_16x16_internal_8bpc).main_pass2b movshdup m12, [permC] movu m11, [pw_m2048_2048] psrlq m13, m12, 8 vpermq m8, m13, m7 vpermq m7, m13, m6 vpermq m6, m13, m5 vpermq m5, m13, m4 vpermq m3, m12, m3 vpermq m2, m12, m2 vpermq m1, m12, m1 vpermq m0, m12, m0 jmp m(idct_16x16_internal_10bpc).pass2_end INV_TXFM_16X16_FN identity, dct, -92 INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2 %undef cmp vpbroadcastd m10, [o(pd_5793)] vpbroadcastd m11, [o(pd_5120)] mov r6, cq cmp eobd, 36 jl .fast call .pass1_main packssdw m0, m6, m8 packssdw m1, m7, m9 call .pass1_main packssdw m2, m6, m8 packssdw m3, m7, m9 call .pass1_main packssdw m4, m6, m8 packssdw m5, m7, m9 call .pass1_main packssdw m6, m8 packssdw m7, m9 jmp m(idct_16x16_internal_10bpc).pass1_end2 .fast: call .pass1_main_fast packssdw m0, m6, m7 call .pass1_main_fast packssdw m1, m6, m7 call .pass1_main_fast packssdw m2, m6, m7 call .pass1_main_fast packssdw m3, m6, m7 punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckldq m3, m4, m1 punpckhdq m4, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 pxor m7, m7 vshufi32x4 m2, m0, m3, q3131 vshufi32x4 m0, m3, q2020 vshufi32x4 m3, m1, m4, q3131 vshufi32x4 m1, m4, q2020 REPX {mova x, m7}, m4, m5, m6 jmp m(idct_16x16_internal_10bpc).pass1_end3 .pass2: movshdup m14, [o(permC)] vpbroadcastd m15, [o(pw_1697x16)] lea r6, [strideq*3] vpbroadcastd m11, [o(pw_2048)] pxor m12, m12 vpbroadcastd m13, [pixel_10bpc_max] vpermq m8, m14, m0 vpermq m9, m14, m1 call .pass2_main vpermq m8, m14, m2 vpermq m9, m14, m3 call .pass2_main vpermq m8, m14, m4 vpermq m9, m14, m5 call .pass2_main vpermq m8, m14, m6 vpermq m9, m14, m7 .pass2_main: pmulhrsw m0, m15, m8 pmulhrsw m1, m15, m9 paddsw m8, m8 paddsw m9, m9 paddsw m8, m0 paddsw m9, m1 jmp m(idct_16x8_internal_10bpc).write_16x4 ALIGN function_align .pass1_main: pmulld m6, m10, [r6+64*0] pmulld m7, m10, [r6+64*1] pmulld m8, m10, [r6+64*8] pmulld m9, m10, [r6+64*9] add r6, 64*2 REPX {paddd x, m11}, m6, m7, m8, m9 REPX {psrad x, 13 }, m6, m8, m7, m9 ret ALIGN function_align .pass1_main_fast: mova ym6, [r6+64* 0] vinserti32x8 m6, [r6+64* 4], 1 mova ym7, [r6+64* 8] vinserti32x8 m7, [r6+64*12], 1 add r6, 64 REPX {pmulld x, m10}, m6, m7 REPX {paddd x, m11}, m6, m7 REPX {psrad x, 13 }, m6, m7 ret cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 22, dst, stride, c, eob %undef cmp 
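; dct_dct 8x32 entry point: eob == 0 takes the DC-only shortcut (.dconly),
; eob < 43 the reduced .fast path, eob < 107 a single .pass1_main pass, and
; larger eobs the .full path with a second .pass1_main over the remaining
; coefficient half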
lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] vpbroadcastd m11, [o(pd_2)] mova m20, [o(idct8x32p)] pxor m21, m21 cmp eobd, 43 jl .fast call .pass1_main punpcklwd m16, m0, m1 punpcklwd m17, m2, m3 punpckhwd m18, m0, m1 punpckhwd m19, m2, m3 cmp eobd, 107 jge .full punpckldq m0, m16, m17 ; 0 2 punpckhdq m1, m16, m17 ; 4 6 punpckldq m2, m18, m19 ; 8 10 punpckhdq m3, m18, m19 ; 12 14 lea r5, [o_base_8bpc] vextracti32x8 ym14, m0, 1 vextracti32x8 ym15, m1, 1 vextracti32x8 ym16, m2, 1 vextracti32x8 ym17, m3, 1 call m(idct_8x16_internal_8bpc).main_fast call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .end .full: add cq, 64 call .pass1_main punpcklwd m5, m0, m1 punpcklwd m6, m2, m3 punpckhwd m7, m0, m1 punpckhwd m8, m2, m3 punpckldq m0, m16, m17 ; 0 2 punpckhdq m1, m16, m17 ; 4 6 punpckldq m2, m18, m19 ; 8 10 punpckhdq m3, m18, m19 ; 12 14 punpckldq m4, m5, m6 ; 16 18 punpckhdq m5, m6 ; 20 22 punpckldq m6, m7, m8 ; 24 26 punpckhdq m7, m8 ; 28 30 lea r5, [o_base_8bpc] vextracti32x8 ym14, m0, 1 vextracti32x8 ym15, m1, 1 vextracti32x8 ym16, m2, 1 vextracti32x8 ym17, m3, 1 vextracti32x8 ym18, m4, 1 vextracti32x8 ym19, m5, 1 vextracti32x8 ym20, m6, 1 vextracti32x8 ym21, m7, 1 call m(idct_8x16_internal_8bpc).main REPX {pshufd x, x, q1032}, ym18, ym19, ym20, ym21 call m(inv_txfm_add_dct_dct_8x32_8bpc).main jmp .end .fast: movshdup m8, [o(permB)] mova ym1, [cq+128*1] mova ym5, [cq+128*5] mova ym7, [cq+128*3] mova ym3, [cq+128*7] mova ym0, [cq+128*0] mova ym4, [cq+128*2] mova ym2, [cq+128*4] mova ym6, [cq+128*6] vpermt2q m1, m8, m5 ; 1 5 vpermt2q m3, m8, m7 ; 7 3 vpermt2q m0, m8, m4 ; 0 2 vpermt2q m2, m8, m6 ; 4 6 mova [cq+128*0], ym21 REPX {vmovdqa32 [cq+128*x], ym21}, 1, 2, 3, 4, 5, 6, 7 call m(idct_8x8_internal_10bpc).main call m(idct_8x8_internal_10bpc).main_end packssdw m0, m2 packssdw m1, m3 vpermb m0, m20, m0 vprold m20, 16 vpermb m2, m20, m1 punpckhdq m1, m0, m2 punpckldq m0, m2 lea r5, [o_base_8bpc] vextracti32x8 ym14, m0, 1 vextracti32x8 ym15, m1, 1 call m(idct_8x16_internal_8bpc).main_fast2 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast2 .end: call m(inv_txfm_add_dct_dct_8x32_8bpc).main_end ; performs vzeroupper lea r3, [strideq*2] vpbroadcastd m12, [pixel_10bpc_max] lea r6, [strideq*3] pxor m11, m11 lea r3, [dstq+r3*8] pmulhrsw m0, m10 pmulhrsw m1, m10 call .write_8x4x2 pmulhrsw m0, m10, m2 pmulhrsw m1, m10, m3 call .write_8x4x2 pmulhrsw m0, m10, m4 pmulhrsw m1, m10, m5 call .write_8x4x2 pmulhrsw m0, m10, m6 pmulhrsw m1, m10, m7 .write_8x4x2: mova xm8, [dstq+strideq*0] vinserti32x4 ym8, [dstq+strideq*1], 1 vinserti32x4 m8, [dstq+strideq*2], 2 vinserti32x4 m8, [dstq+r6 ], 3 mova xm9, [r3 +r6 ] vinserti32x4 ym9, [r3 +strideq*2], 1 vinserti32x4 m9, [r3 +strideq*1], 2 vinserti32x4 m9, [r3 +strideq*0], 3 paddw m8, m0 paddw m9, m1 pmaxsw m8, m11 pmaxsw m9, m11 pminsw m8, m12 pminsw m9, m12 mova [dstq+strideq*0], xm8 vextracti32x4 [dstq+strideq*1], ym8, 1 vextracti32x4 [dstq+strideq*2], m8, 2 vextracti32x4 [dstq+r6 ], m8, 3 lea dstq, [dstq+strideq*4] vextracti32x4 [r3 +strideq*0], m9, 3 vextracti32x4 [r3 +strideq*1], m9, 2 vextracti32x4 [r3 +strideq*2], ym9, 1 mova [r3 +r6 ], xm9 lea r3, [r3+strideq*4] ret .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2 ALIGN function_align .pass1_main: mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] mova m3, [cq+128*3] mova m4, [cq+128*4] mova 
m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] REPX {mova [cq+128*x], m21}, 0, 1, 2, 3, 4, 5, 6, 7 call m(idct_8x16_internal_10bpc).main call m(idct_8x16_internal_10bpc).main_end2 packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 REPX {vpermb x, m20, x}, m0, m1, m2, m3 ret cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 8, 12, dst, stride, c, eob vpbroadcastd m9, [pw_5] lea r4, [strideq*3] pxor m10, m10 lea r5, [strideq*5] vpbroadcastd m11, [pixel_10bpc_max] sub eobd, 107 lea r6, [strideq+r4*2] .loop: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] lea r7, [dstq+strideq*8] REPX {mova [cq+128*x], m10}, 0, 1, 2, 3 REPX {paddsw x, m9}, m0, m1, m2, m3 REPX {mova [cq+128*x], m10}, 4, 5, 6, 7 REPX {psraw x, 3 }, m0, m1, m2, m3 add cq, 64 mova xm4, [dstq+strideq*0] mova xm5, [dstq+strideq*1] mova xm6, [dstq+strideq*2] mova xm7, [dstq+r4 *1] punpckhwd m8, m0, m1 vinserti32x4 ym4, [dstq+strideq*4], 1 punpcklwd m0, m1 vinserti32x4 ym5, [dstq+r5 *1], 1 punpckhwd m1, m2, m3 vinserti32x4 ym6, [dstq+r4 *2], 1 punpcklwd m2, m3 vinserti32x4 ym7, [dstq+r6 *1], 1 punpckhwd m3, m0, m8 vinserti32x4 m4, [r7 +strideq*0], 2 punpcklwd m0, m8 vinserti32x4 m5, [r7 +strideq*1], 2 punpckhwd m8, m2, m1 vinserti32x4 m6, [r7 +strideq*2], 2 punpcklwd m2, m1 vinserti32x4 m7, [r7 +r4 *1], 2 punpckhqdq m1, m0, m2 vinserti32x4 m4, [r7 +strideq*4], 3 punpcklqdq m0, m2 vinserti32x4 m5, [r7 +r5 *1], 3 punpcklqdq m2, m3, m8 vinserti32x4 m6, [r7 +r4 *2], 3 punpckhqdq m3, m8 vinserti32x4 m7, [r7 +r6 *1], 3 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 REPX {pmaxsw x, m10}, m0, m1, m2, m3 REPX {pminsw x, m11}, m0, m1, m2, m3 mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 mova [dstq+strideq*2], xm2 mova [dstq+r4 *1], xm3 vextracti32x4 [dstq+strideq*4], ym0, 1 vextracti32x4 [dstq+r5 *1], ym1, 1 vextracti32x4 [dstq+r4 *2], ym2, 1 vextracti32x4 [dstq+r6 *1], ym3, 1 lea dstq, [r7+strideq*8] vextracti32x4 [r7 +strideq*0], m0, 2 vextracti32x4 [r7 +strideq*1], m1, 2 vextracti32x4 [r7 +strideq*2], m2, 2 vextracti32x4 [r7 +r4 *1], m3, 2 vextracti32x4 [r7 +strideq*4], m0, 3 vextracti32x4 [r7 +r5 *1], m1, 3 vextracti32x4 [r7 +r4 *2], m2, 3 vextracti32x4 [r7 +r6 *1], m3, 3 add eobd, 0x80000000 jnc .loop RET cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly mova m11, [o(permB)] mova m0, [cq+64* 0] ; 0 1 mova m4, [cq+64* 1] ; 2 3 mova m1, [cq+64* 2] ; 4 5 mova m8, [cq+64* 3] ; 6 7 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] psrlq m10, m11, 32 %if WIN64 movaps [cq+16*0], xmm6 movaps [cq+16*1], xmm7 %endif mova m16, m11 vpermi2q m16, m0, m1 ; 1 5 mova m17, m11 vpermi2q m17, m8, m4 ; 7 3 cmp eobd, 43 jl .fast mova m18, [cq+64* 4] ; 8 9 mova m20, [cq+64* 5] ; 10 11 mova m6, [cq+64* 6] ; 12 13 mova m7, [cq+64* 7] ; 14 15 vpermt2q m0, m10, m18 ; 0 8 vpermt2q m18, m11, m6 ; 9 13 mova m19, m11 vpermi2q m19, m7, m20 ; 15 11 cmp eobd, 107 jge .full vpermt2q m1, m10, m6 ; 4 12 vpermt2q m4, m10, m8 ; 2 6 vpermt2q m7, m10, m20 ; 14 10 mov r6d, 64*1 call m(idct_8x8_internal_10bpc).main_fast call m(idct_16x8_internal_10bpc).main_fast call .main_fast call m(idct_16x16_internal_10bpc).main_end jmp .end .full: mova m2, [cq+64* 8] ; 16 17 mova m5, [cq+64* 9] ; 18 19 mova m9, [cq+64*10] ; 20 21 mova m21, [cq+64*11] ; 22 23 vpermt2q m1, 
m10, m9 ; 4 20 vpermt2q m7, m10, m21 ; 14 22 vpermt2q m21, m11, m5 ; 23 19 vpermt2q m5, m10, m20 ; 18 10 mova m20, m11 vpermi2q m20, m2, m9 ; 17 21 mova m22, [cq+64*12] ; 24 25 mova m9, [cq+64*13] ; 26 27 mova m3, [cq+64*14] ; 28 29 mova m23, [cq+64*15] ; 30 31 vpermt2q m2, m10, m22 ; 16 24 vpermt2q m22, m11, m3 ; 25 29 vpermt2q m3, m10, m6 ; 28 12 vpermt2q m4, m10, m9 ; 2 26 mova m6, m10 vpermi2q m6, m23, m8 ; 30 6 vpermt2q m23, m11, m9 ; 31 27 mov r6d, 64*3 call m(idct_8x8_internal_10bpc).main call m(idct_16x8_internal_10bpc).main call .main call m(idct_16x16_internal_10bpc).main_end jmp .end .fast: vpermq m0, m10, m0 ; 0 0 vpermq m1, m10, m1 ; 4 4 vpermt2q m4, m10, m8 ; 2 6 xor r6d, r6d call .main_fast2 call m(idct_16x16_internal_10bpc).main_end .end: %if WIN64 movaps xmm6, [cq+16*0] movaps xmm7, [cq+16*1] %endif vzeroupper call .transpose_8x32 pxor m14, m14 .zero_loop: mova [cq+r6*4+64*3], m14 mova [cq+r6*4+64*2], m14 mova [cq+r6*4+64*1], m14 mova [cq+r6*4+64*0], m14 sub r6d, 64 jge .zero_loop lea r5, [o_base_8bpc] punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 call m(inv_txfm_add_dct_dct_32x8_8bpc).main pxor m12, m12 .write_32x8_start: vpbroadcastd m11, [pw_2048] vpbroadcastd m13, [pixel_10bpc_max] lea r3, [strideq*3] .write_32x8: pmulhrsw m0, m11 pmulhrsw m1, m11 pmulhrsw m2, m11 pmulhrsw m3, m11 call .write_32x4 pmulhrsw m0, m11, m4 pmulhrsw m1, m11, m5 pmulhrsw m2, m11, m6 pmulhrsw m3, m11, m7 .write_32x4: paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] paddw m2, [dstq+strideq*2] paddw m3, [dstq+r3 ] REPX {pmaxsw x, m12}, m0, m1, m2, m3 REPX {pminsw x, m13}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r3 ], m3 lea dstq, [dstq+strideq*4] ret .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 8 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 ALIGN function_align .main_fast3: ; assuming m0=in0 in0, m4=in2 in2, and m16=in1 in3 vbroadcasti32x4 m5, [o(pd_401_4076)] pmulld m3, m0, m12 pmulld m4, m5 REPX {paddd x, m13}, m3, m4 REPX {psrad x, 12 }, m3, m4 ; m3=idct8:t0-7, m4=t8a t15a ; t8a t15a -> t8/9 t14/15 vbroadcasti32x4 m5, [o(pd_3784_m3784)] pshufd m7, m4, q1032 pmulld m6, m4, [o(pd_1567)]{bcstd} pmulld m5, m7 paddd m6, m13 paddd m5, m6 psrad m5, 12 ; m5=t9a t14a ; t14a t9a -> t13/14 t9/10 [m5] & t8 15 -> t8/11a t12/15a [m4] shufps m6, m4, m5, q1032 ; t12 t13 shufps m8, m4, m5, q3210 ; t11a t10 pmulld m9, m6, m12 pmulld m7, m8, m12 paddd m9, m13 paddd m5, m9, m7 ; t12 t13a psubd m4, m9, m7 ; t11 t10a REPX {psrad x, 12 }, m5, m4 psubd m7, m3, m6 ; dct16 out15 out14 paddd m0, m3, m6 ; dct16 out0 out1 psubd m6, m3, m5 ; dct16 out12 out13 paddd m1, m3, m5 ; dct16 out3 out2 psubd m5, m3, m4 ; dct16 out11 out10 paddd m2, m3, m4 ; dct16 out4 out5 psubd m4, m3, m8 ; dct16 out8 out9 paddd m3, m8 ; dct16 out7 out6 REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 ; idct32_bottomhalf vbroadcasti32x4 m18, [o(pd_201_m601)] vbroadcasti32x4 m19, [o(pd_4091_4052)] pmulld m17, m16, m19 pmulld m16, m18 REPX {paddd x, m13}, m17, m16 REPX {psrad x, 12 }, m17, m16 ; m17: t31a t24a -> t30/31 t24/25, m16: t16a t23a -> t16/17 t22/23 [step2] vbroadcasti32x4 m10, [o(pd_799_m2276)] vbroadcasti32x4 m11, [o(pd_4017_3406)] pmulld m18, m17, m10 pmulld m19, m17, m11 pmulld m8, m16, m11 pmulld m9, m16, m10 REPX {paddd x, m13}, m18, m19 psubd m18, m8 paddd m19, 
m9 REPX {psrad x, 12 }, m18, m19 ; m17=t31 t24 -> t28/31a t24/27a, m16=t16 t23 -> t16/19a t20/23a ; m18=t17a t22a -> t17/18 t21/22, m19=t30a t25a -> t29/30 t25/26 punpckhqdq m23, m17, m19 ; t24a t25 [or t27a t26] punpcklqdq m20, m16, m18 ; t16a t17 [or t19a t18] punpckhqdq m22, m16, m18 ; t23a t22 [or t20a t21] punpcklqdq m16, m17, m19 ; t28a t29 [or t31a t30] mova m21, m23 mova m18, m20 mova m17, m22 mova m19, m16 jmp .main4 .main_fast2: ; bottom three-quarters are zero vbroadcasti32x4 m8, [o(pd_799_4017)] pmulld m8, m1 ; t4 t7 vpmulld m0, [o(pd_2896)] {1to16} ; t0 t1 REPX {paddd x, m13}, m8, m0 REPX {psrad x, 12 }, m8, m0 pmulld m3, m8, m12 mova m2, m0 ; t3 t2 call m(idct_8x8_internal_10bpc).main3 vbroadcasti32x4 m6, [o(pd_4076_3920)] vbroadcasti32x4 m3, [o(pd_401_m1189)] pmulld m6, m4 ; t15 t12 pmulld m4, m3 ; t9 t10 REPX {paddd x, m13}, m6, m4 REPX {psrad x, 12 }, m6, m4 mova m5, m6 ; t14 t13 mova m9, m4 ; t8 t11 call m(idct_16x8_internal_10bpc).main3 vbroadcasti32x4 m23, [o(pd_4091_3973)] vbroadcasti32x4 m7, [o(pd_201_995)] vbroadcasti32x4 m22, [o(pd_1380_601)] vbroadcasti32x4 m9, [o(pd_3857_4052)] pmulld m23, m16 ; t16 t20 pmulld m16, m7 ; t31 t27 pmulld m22, m17 ; -t19 -t25 pmulld m17, m9 ; t28 t24 REPX {paddd x, m13}, m23, m16, m17 psubd m22, m13, m22 REPX {psrad x, 12 }, m23, m16, m22, m17 mova m20, m23 ; t30 t26 mova m9, m16 ; t17 t21 mova m19, m22 ; t18 t22 mova m18, m17 ; t29 t25 jmp .main3 .main_fast: ; bottom half is zero vbroadcasti32x4 m23, [o(pd_4091_3973)] vbroadcasti32x4 m7, [o(pd_201_995)] vbroadcasti32x4 m20, [o(pd_2751_2106)] vbroadcasti32x4 m9, [o(pd_3035_3513)] vbroadcasti32x4 m21, [o(pd_3703_3290)] vbroadcasti32x4 m10, [o(pd_1751_2440)] vbroadcasti32x4 m22, [o(pd_1380_601)] vbroadcasti32x4 m11, [o(pd_3857_4052)] pmulld m23, m16 ; t16a t20a pmulld m16, m7 ; t31a t27a pmulld m20, m19 ; -t17a -t21a pmulld m19, m9 ; t30a t26a pmulld m21, m18 ; t18a t22a pmulld m18, m10 ; t29a t25a pmulld m22, m17 ; -t19a -t25a pmulld m17, m11 ; t28a t24a psubd m20, m13, m20 psubd m22, m13, m22 jmp .main2 .main: ITX_MULSUB_2D 16, 23, 7, 9, 10, _, 201_995, 4091_3973 ITX_MULSUB_2D 20, 19, 7, 9, 10, _, 3035_3513, 2751_2106 ITX_MULSUB_2D 18, 21, 7, 9, 10, _, 1751_2440, 3703_3290 ITX_MULSUB_2D 22, 17, 7, 9, 10, _, 3857_4052, 1380_601 paddd m20, m13 paddd m22, m13 .main2: REPX {paddd x, m13}, m16, m23, m19 REPX {psrad x, 12 }, m16, m20, m23, m19 psubd m9, m16, m20 ; t17 t21 paddd m16, m20 ; t16 t20 psubd m20, m23, m19 ; t30 t26 paddd m23, m19 ; t31 t27 REPX {pmaxsd x, m14}, m9, m16, m20, m23 REPX {paddd x, m13}, m21, m18, m17 REPX {psrad x, 12 }, m18, m22, m21, m17 psubd m19, m22, m18 ; t18 t22 paddd m22, m18 ; t19 t23 psubd m18, m17, m21 ; t29 t25 paddd m17, m21 ; t28 t24 REPX {pmaxsd x, m14}, m19, m22, m18, m17 REPX {pminsd x, m15}, m20, m9, m18, m19, m16, m23, m22, m17 .main3: vbroadcasti32x4 m11, [o(pd_4017_2276)] vbroadcasti32x4 m10, [o(pd_799_3406)] psubd m7, m0, m6 ; dct16 out15 out14 paddd m0, m6 ; dct16 out0 out1 psubd m6, m1, m5 ; dct16 out12 out13 paddd m1, m5 ; dct16 out3 out2 psubd m5, m2, m4 ; dct16 out11 out10 paddd m2, m4 ; dct16 out4 out5 psubd m4, m3, m8 ; dct16 out8 out9 paddd m3, m8 ; dct16 out7 out6 ITX_MULSUB_2D 20, 9, 8, 21, _, 13, 10, 11 ITX_MULSUB_2D 18, 19, 8, 21, _, 13, 10, 11, 2 REPX {pmaxsd x, m14}, m7, m0, m6, m1, m5, m2, m4, m3 punpckhqdq m21, m16, m20 ; t20 t21a punpcklqdq m16, m20 ; t16 t17a punpcklqdq m20, m22, m19 ; t19 t18a punpckhqdq m22, m19 ; t23 t22a REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7 punpcklqdq m19, m23, m9 ; t31 t30a punpckhqdq m23, 
m9 ; t27 t26a punpckhqdq m9, m17, m18 ; t24 t25a punpcklqdq m17, m18 ; t28 t29a psubd m18, m16, m20 ; t19a t18 paddd m20, m16 ; t16a t17 psubd m16, m19, m17 ; t28a t29 paddd m19, m17 ; t31a t30 psubd m17, m22, m21 ; t20a t21 paddd m22, m21 ; t23a t22 psubd m21, m9, m23 ; t27a t26 paddd m23, m9 ; t24a t25 REPX {pmaxsd x, m14}, m18, m16, m17, m21 REPX {pminsd x, m15}, m16, m18, m21, m17 REPX {pmaxsd x, m14}, m20, m22, m19, m23 REPX {pminsd x, m15}, m20, m22, m19, m23 .main4: vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] ITX_MULSUB_2D 16, 18, 8, 9, _, 13, 10, 11 ITX_MULSUB_2D 21, 17, 8, 9, _, 13, 10, 11, 2 paddd m9, m20, m22 ; t16 t17a psubd m20, m22 ; t23 t22a paddd m22, m19, m23 ; t31 t30a psubd m19, m23 ; t24 t25a psubd m23, m16, m17 ; t20a t21 paddd m16, m17 ; t19a t18 psubd m17, m18, m21 ; t27a t26 paddd m21, m18 ; t28a t29 REPX {pmaxsd x, m14}, m20, m19, m23, m17 REPX {pminsd x, m15}, m19, m20, m17, m23 REPX {pmulld x, m12}, m19, m20, m17, m23 REPX {pmaxsd x, m14}, m22, m21, m16, m9 paddd m19, m13 paddd m17, m13 REPX {pminsd x, m15}, m22, m21, m16, m9 psubd m18, m19, m20 ; t23a t22 paddd m19, m20 ; t24a t25 paddd m20, m17, m23 ; t27 t26a psubd m17, m23 ; t20 t21a REPX {psrad x, 12 }, m20, m19, m18, m17 ret .transpose_8x32: mova m10, [o(idct32x8p)] psrlw m8, m10, 8 mova m9, m8 vpermi2w m8, m1, m5 vpermt2w m1, m10, m5 vprold m5, m9, 16 vpermi2w m9, m3, m7 vpermt2w m3, m10, m7 vprold m10, 16 mova m7, m5 vpermi2w m5, m0, m4 vpermt2w m0, m10, m4 vpermi2w m7, m2, m6 vpermt2w m2, m10, m6 punpckhdq m6, m5, m8 punpckldq m5, m8 punpckhdq m8, m7, m9 punpckldq m7, m9 punpckhdq m4, m2, m3 punpckldq m2, m3 punpckhdq m3, m0, m1 punpckldq m0, m1 ret cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 10, dst, stride, c, eob vpbroadcastd m5, [pw_4096] lea r4, [strideq*3] mova m6, [idtx32x8p] lea r5, [strideq*5] vpbroadcastd m9, [pixel_10bpc_max] lea r6, [strideq+r4*2] pxor m8, m8 sub eobd, 107 psrlw m7, m6, 8 .loop: mova m0, [cq+64*0] packssdw m0, [cq+64*1] ; 02 13 mova m1, [cq+64*2] packssdw m1, [cq+64*3] ; 46 57 mova m2, [cq+64*4] packssdw m2, [cq+64*5] ; 8a 9b mova m3, [cq+64*6] packssdw m3, [cq+64*7] ; ce df REPX {pmulhrsw x, m5}, m0, m1, m2, m3 REPX {mova [cq+64*x], m8}, 0, 1, 2, 3 mova m4, m6 vpermi2w m4, m1, m3 vpermt2w m1, m7, m3 REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 mova m3, m7 vpermi2w m3, m0, m2 vpermt2w m0, m6, m2 add cq, 64*8 punpcklqdq m2, m3, m1 ; 4 5 punpckhqdq m3, m1 ; 6 7 punpckhqdq m1, m0, m4 ; 2 3 punpcklqdq m0, m4 ; 0 1 mova ym4, [dstq+strideq*0] vinserti32x8 m4, [dstq+strideq*1], 1 paddw m0, m4 mova ym4, [dstq+strideq*2] vinserti32x8 m4, [dstq+r4 *1], 1 paddw m1, m4 mova ym4, [dstq+strideq*4] vinserti32x8 m4, [dstq+r5 *1], 1 paddw m2, m4 mova ym4, [dstq+r4 *2] vinserti32x8 m4, [dstq+r6 *1], 1 paddw m3, m4 REPX {pmaxsw x, m8}, m0, m1, m2, m3 REPX {pminsw x, m9}, m0, m1, m2, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+r4 *1], m1, 1 mova [dstq+strideq*4], ym2 vextracti32x8 [dstq+r5 *1], m2, 1 mova [dstq+r4 *2], ym3 vextracti32x8 [dstq+r6 *1], m3, 1 add dstq, 32 add eobd, 0x80000000 jnc .loop RET cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif cmp eobd, 36 jl .fast call .pass1 cmp eobd, 151 jge .full lea r5, [o_base_8bpc] pxor 
m9, m9 punpcklwd m8, m1, m1 ; 2 punpckhwd m14, m1, m1 ; 3 punpcklwd m1, m3, m3 ; 6 punpckhwd m15, m3, m3 ; 7 punpcklwd m3, m6, m6 ; 12 punpckhwd m19, m6, m6 ; 13 punpcklwd m6, m9, m4 ; __ 8 punpckhwd m20, m4, m4 ; 9 punpckhwd m16, m5, m5 ; 11 punpcklwd m5, m5 ; 10 punpcklwd m9, m0 ; __ 0 punpckhwd m21, m0, m0 ; 1 punpcklwd m0, m7, m7 ; 14 punpckhwd m17, m7, m7 ; 15 punpcklwd m7, m2, m2 ; 4 punpckhwd m18, m2, m2 ; 5 call m(idct_16x16_internal_8bpc).main_fast call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mov r6d, 64*3 pxor m8, m8 .zero_loop: REPX {mova [cq+r6*8+128*x], m8}, 3, 2, 1, 0 sub r6d, 64 jge .zero_loop jmp .pass2_end .full: mova [cq+128*0], m0 mova [cq+128*1], m1 mova [cq+128*2], m2 mova [cq+128*3], m3 mova [cq+128*4], m4 mova [cq+128*5], m5 mova [cq+128*6], m6 mova [cq+128*7], m7 add cq, 64 call .pass1 mova m9, [cq-64* 1] ; 0 1 mova m14, [cq+64* 1] ; 2 3 mova m18, [cq+64* 3] ; 4 5 mova m15, [cq+64* 5] ; 6 7 mova m20, [cq+64* 7] ; 8 9 mova m16, [cq+64* 9] ; 10 11 mova m22, [cq+64*11] ; 12 13 mova m19, [cq+64*13] ; 14 15 lea r5, [o_base_8bpc] punpcklwd m8, m7, m14 ; 30 2 punpckhwd m21, m7, m9 ; 31 1 punpcklwd m7, m6, m18 ; 28 4 punpckhwd m14, m6 ; 3 29 punpcklwd m9, m0, m9 ; 16 0 punpckhwd m17, m19, m0 ; 15 17 punpcklwd m0, m19, m1 ; 14 18 punpckhwd m19, m1, m22 ; 19 13 punpcklwd m1, m15, m5 ; 6 26 punpckhwd m18, m5, m18 ; 27 5 punpcklwd m6, m4, m20 ; 24 8 punpckhwd m15, m4 ; 7 25 punpcklwd m5, m3, m16 ; 22 10 punpckhwd m20, m3, m20 ; 23 9 punpcklwd m3, m22, m2 ; 12 20 punpckhwd m16, m2 ; 11 21 call m(idct_16x16_internal_8bpc).main2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf mov r6d, 32*7 pxor m8, m8 .full_zero_loop: REPX {mova [cq+r6*8+64*x], m8}, 2, 1, 0, -1 sub r6d, 32 jge .full_zero_loop jmp .pass2_end .fast: mova ym0, [cq+128*0] mova ym2, [cq+128*4] movshdup m8, [o(permB)] mova ym1, [cq+128*2] mova ym3, [cq+128*6] mova ym4, [cq+128*1] mova ym5, [cq+128*3] mova ym6, [cq+128*5] mova ym7, [cq+128*7] vpermt2q m0, m8, m2 ; 0 4 vpermt2q m1, m8, m3 ; 2 6 vpermt2q m4, m8, m5 ; 1 3 vpermt2q m7, m8, m6 ; 7 5 REPX {pmulld x, m12}, m0, m1, m4, m7 pxor ym16, ym16 mova [cq+128*0], ym16 REPX {vmovdqa32 [cq+128*x], ym16}, 1, 2, 3, 4, 5, 6, 7 REPX {paddd x, m13}, m0, m1, m4, m7 REPX {psrad x, 12 }, m0, m1, m4, m7 call m(idct_8x8_internal_10bpc).main_fast call m(idct_16x8_internal_10bpc).main_fast vpbroadcastd m11, [o(pd_1)] call m(idct_8x16_internal_10bpc).main_end2 mova m8, [o(idct8x32p)] packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 mova m6, [dup16_perm] vpermb m0, m8, m0 vpermb m2, m8, m2 vprold m8, 16 vpermb m1, m8, m1 vpermb m3, m8, m3 punpckldq m4, m0, m2 punpckhdq m0, m2 punpckldq m2, m1, m3 punpckhdq m1, m3 punpckldq m21, m4, m2 punpckhdq m14, m4, m2 punpckldq m18, m0, m1 punpckhdq m15, m0, m1 vpermb m8, m6, m14 ; 2 vpermb m1, m6, m15 ; 6 vpermb m7, m6, m18 ; 4 pmovzxwd m9, ym21 ; 0 vpord m6, [o(pb_32)] {1to16} lea r5, [o_base_8bpc] vpermb m21, m6, m21 ; 1 vpermb m15, m6, m15 ; 7 vpermb m18, m6, m18 ; 5 vpermb m14, m6, m14 ; 3 pslld m9, 16 call m(idct_16x16_internal_8bpc).main_fast2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 .pass2_end: movshdup m22, [permC] vpbroadcastd m11, [pw_2048] vpbroadcastd m13, [pixel_10bpc_max] lea r6, [strideq*3] pxor m12, m12 psrlq m23, m22, 8 vpermq m8, m22, m0 vpermq m9, m23, m1 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m2 vpermq m9, m23, m3 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m4 vpermq m9, m23, m5 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m6 
vpermq m9, m23, m7 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m14 vpermq m9, m23, m15 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m16 vpermq m9, m23, m17 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m18 vpermq m9, m23, m19 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m22, m20 vpermq m9, m23, m21 %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif vzeroupper jmp m(idct_16x8_internal_10bpc).write_16x4 .pass1: pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 2] pmulld m2, m12, [cq+128* 4] pmulld m3, m12, [cq+128* 6] pmulld m4, m12, [cq+128* 8] pmulld m5, m12, [cq+128*10] pmulld m6, m12, [cq+128*12] pmulld m7, m12, [cq+128*14] call m(idct_8x16_internal_10bpc).main_rect2 pmulld m16, m12, [cq+128* 1] pmulld m17, m12, [cq+128* 3] pmulld m18, m12, [cq+128* 5] pmulld m19, m12, [cq+128* 7] pmulld m20, m12, [cq+128* 9] pmulld m21, m12, [cq+128*11] pmulld m22, m12, [cq+128*13] pmulld m23, m12, [cq+128*15] call m(idct_16x16_internal_10bpc).main_rect2 vpbroadcastd m11, [o(pd_1)] call m(idct_16x16_internal_10bpc).main_end2 jmp m(idct_16x16_internal_10bpc).main_end3 .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 16, dst, stride, c, eob %undef cmp vpbroadcastd m10, [pw_2896x8] vpbroadcastd m11, [pw_1697x16] vpbroadcastd m13, [pw_8192] vpbroadcastd m15, [pixel_10bpc_max] lea r6, [strideq*9] pxor m14, m14 paddw m12, m13, m13 ; pw_16384 cmp eobd, 151 jl .main call .main add cq, 64-128*4 lea dstq, [dstq+strideq*8] .main: call .main_internal add cq, 128*4 pmulhrsw m1, m13, m2 pmulhrsw m3, m13, m4 pmulhrsw m5, m13, m6 pmulhrsw m7, m13, m8 call .main_internal .main2: pmulhrsw m2, m13 pmulhrsw m4, m13 pmulhrsw m6, m13 pmulhrsw m8, m13 punpcklqdq m0, m1, m2 ; 0 8 punpckhqdq m1, m2 ; 1 9 call .write_16x2x2 punpcklqdq m0, m3, m4 ; 2 10 punpckhqdq m1, m3, m4 ; 3 11 call .write_16x2x2 punpcklqdq m0, m5, m6 ; 4 12 punpckhqdq m1, m5, m6 ; 5 13 call .write_16x2x2 punpcklqdq m0, m7, m8 ; 6 14 punpckhqdq m1, m7, m8 ; 7 15 .write_16x2x2: mova ym2, [dstq+strideq*0] vinserti32x8 m2, [dstq+strideq*8], 1 mova ym9, [dstq+strideq*1] vinserti32x8 m9, [dstq+r6 ], 1 paddw m0, m2 paddw m1, m9 pmaxsw m0, m14 pmaxsw m1, m14 pminsw m0, m15 pminsw m1, m15 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*8], m0, 1 mova [dstq+strideq*1], ym1 vextracti32x8 [dstq+r6 ], m1, 1 lea dstq, [dstq+strideq*2] ret .main_internal: mova m8, [cq+128* 0] packssdw m8, [cq+128* 8] mova m6, [cq+128* 1] packssdw m6, [cq+128* 9] mova m0, [cq+128* 2] packssdw m0, [cq+128*10] mova m2, [cq+128* 3] packssdw m2, [cq+128*11] REPX {pmulhrsw x, m10}, m8, m6, m0, m2 REPX {vpermq x, x, q3120}, m8, m6, m0, m2 pmulhrsw m4, m11, m8 pmulhrsw m9, m11, m6 REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 pmulhrsw m4, m12 pmulhrsw m9, m12 paddsw m8, m4 paddsw m6, m9 pmulhrsw m4, m11, m0 pmulhrsw m9, m11, m2 REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 pmulhrsw m4, m12 pmulhrsw m9, m12 paddsw m0, m4 paddsw m2, m9 punpcklwd m4, m8, m6 punpckhwd m8, m6 punpcklwd m6, m0, m2 punpckhwd m0, m2 punpckldq m2, m4, m6 ; 0 1 punpckhdq m4, m6 ; 2 3 punpckldq m6, m8, m0 ; 4 5 punpckhdq m8, m0 ; 6 7 ret cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif 
mov r6d, 8*12 cmp eobd, 36 jl .fast pmulld m0, m12, [cq+64* 0] pmulld m1, m12, [cq+64* 4] pmulld m2, m12, [cq+64* 8] pmulld m3, m12, [cq+64*12] pmulld m16, m12, [cq+64* 2] pmulld m17, m12, [cq+64* 6] pmulld m18, m12, [cq+64*10] pmulld m19, m12, [cq+64*14] cmp eobd, 151 jge .full call m(idct_8x16_internal_10bpc).main_fast_rect2 call m(idct_16x16_internal_10bpc).main_fast_rect2 call .idct16_sumsub call .pass1_load_spill call .main_fast_rect2 jmp .pass1_end .full: pmulld m4, m12, [cq+64*16] pmulld m5, m12, [cq+64*20] pmulld m6, m12, [cq+64*24] pmulld m7, m12, [cq+64*28] pmulld m20, m12, [cq+64*18] pmulld m21, m12, [cq+64*22] pmulld m22, m12, [cq+64*26] pmulld m23, m12, [cq+64*30] add r6d, 8*16 call m(idct_8x16_internal_10bpc).main_rect2 call m(idct_16x16_internal_10bpc).main_rect2 call .idct16_sumsub call .pass1_load_spill pmulld m16, m12, [cq+64*17] pmulld m17, m12, [cq+64*19] pmulld m18, m12, [cq+64*21] pmulld m19, m12, [cq+64*23] pmulld m20, m12, [cq+64*25] pmulld m21, m12, [cq+64*27] pmulld m22, m12, [cq+64*29] pmulld m23, m12, [cq+64*31] call .main_rect2 .pass1_end: vpbroadcastd m11, [o(pd_1)] lea r4, [cq+64] call .idct32_pass1_end lea r5, [o_base_8bpc] punpckhqdq m19, m5, m16 ; 11 punpcklqdq m5, m16 ; 10 punpckhqdq m16, m2, m1 ; 5 punpcklqdq m2, m1 ; 4 punpcklqdq m1, m15, m4 ; 2 punpckhqdq m15, m4 ; 3 punpcklqdq m4, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpckhqdq m14, m0, m20 ; 1 punpcklqdq m0, m20 ; 0 punpckhqdq m20, m6, m17 ; 13 punpcklqdq m6, m17 ; 12 punpckhqdq m17, m3, m21 ; 7 punpcklqdq m3, m21 ; 6 punpckhqdq m21, m7, m8 ; 15 punpcklqdq m7, m8 ; 14 call m(inv_txfm_add_dct_dct_32x8_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf jmp .end .fast: pmulld ym0, ym12, [cq+64*0] pmulld ym1, ym12, [cq+64*4] movshdup m7, [o(permB)] mova ym4, [cq+64*2] mova ym5, [cq+64*6] mova ym16, [cq+64*1] mova ym2, [cq+64*5] mova ym3, [cq+64*3] mova ym17, [cq+64*7] vpermt2q m4, m7, m5 ; 2 6 vpermt2q m16, m7, m2 ; 1 5 vpermt2q m17, m7, m3 ; 7 3 paddd ym0, ym13 paddd ym1, ym13 psrad ym0, 12 psrad ym1, 12 vpermq m0, m7, m0 ; 0 0 vpermq m1, m7, m1 ; 4 4 REPX {pmulld x, m12}, m4, m16, m17 REPX {paddd x, m13}, m4, m16, m17 REPX {psrad x, 12 }, m4, m16, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 vpbroadcastd m11, [o(pd_1)] call m(idct_16x16_internal_10bpc).main_end2 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 lea r5, [o_base_8bpc] punpckhqdq m14, m0, m2 ; 1 punpcklqdq m0, m2 ; 0 punpcklqdq m1, m3, m4 ; 2 punpckhqdq m15, m3, m4 ; 3 punpcklqdq m2, m5, m7 ; 4 punpckhqdq m16, m5, m7 ; 5 punpcklqdq m3, m6, m8 ; 6 punpckhqdq m17, m6, m8 ; 7 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast .end: %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif pxor m12, m12 .zero_loop: mova [cq+r6*8+64*3], m12 mova [cq+r6*8+64*2], m12 mova [cq+r6*8+64*1], m12 mova [cq+r6*8+64*0], m12 sub r6d, 8*4 jge .zero_loop call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start pmulhrsw m0, m11, m14 pmulhrsw m1, m11, m15 pmulhrsw m2, m11, m16 pmulhrsw m3, m11, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m18 pmulhrsw m1, m11, m19 pmulhrsw m2, m11, m20 pmulhrsw m3, m11, m21 vzeroupper jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 16 .dconly3: add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 .dconly2: vpbroadcastd m3, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw m2, r6d paddsw m2, m3 .dconly_loop: paddsw m0, m2, [dstq+strideq*0] paddsw m1, m2, [dstq+strideq*1] 
psubusw m0, m3 psubusw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET ALIGN function_align .idct16_sumsub: psubd m23, m0, m22 ; t15 paddd m0, m22 ; t0 psubd m22, m1, m21 ; t14 paddd m1, m21 ; t1 REPX {pmaxsd x, m14}, m23, m0, m22, m1 psubd m21, m2, m20 ; t13 paddd m2, m20 ; t2 REPX {pminsd x, m15}, m23, m0, m22, m1 psubd m20, m3, m19 ; t12 paddd m3, m19 ; t3 REPX {pmaxsd x, m14}, m21, m2, m20, m3 psubd m19, m4, m18 ; t11 paddd m4, m18 ; t4 REPX {pminsd x, m15}, m21, m2, m20, m3 psubd m18, m5, m17 ; t10 paddd m5, m17 ; t5 REPX {pmaxsd x, m14}, m19, m4, m18, m5 psubd m17, m6, m16 ; t9 paddd m6, m16 ; t6 REPX {pminsd x, m15}, m19, m4, m18, m5 psubd m16, m7, m9 ; t8 paddd m7, m9 ; t7 REPX {pmaxsd x, m14}, m17, m6, m16, m7 REPX {pminsd x, m15}, m17, m6, m16, m7 ret .idct32_pass1_end: psrlq m12, [o(permC)], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 %macro IDCT32_PASS1_END 2 ; low, high paddd m8, m11, [r4+128*%1] paddd m9, m11, [cq+128*%1] psubd m10, m8, m%1 ; out 16+n paddd m8, m%1 ; out 15-n paddd m%1, m9, m%2 ; out 0+n psubd m9, m%2 ; out 31-n REPX {vpsravd x, m11}, m10, m%1, m8, m9 packssdw m%1, m10 ; 0+n 16+n packssdw m%2, m8, m9 ; 15-n 31-n %endmacro IDCT32_PASS1_END 0, 23 ; 0 16, 15 31 IDCT32_PASS1_END 7, 16 ; 7 23, 8 24 IDCT32_PASS1_END 1, 22 ; 1 17, 14 30 IDCT32_PASS1_END 6, 17 ; 6 22, 9 25 IDCT32_PASS1_END 2, 21 ; 2 18, 13 29 IDCT32_PASS1_END 5, 18 ; 5 21, 10 26 IDCT32_PASS1_END 3, 20 ; 3 19, 12 28 IDCT32_PASS1_END 4, 19 ; 4 20, 11 27 .transpose_16x32: mova m14, m13 vpermi2q m14, m0, m16 vpermt2q m0, m12, m16 mova m15, m13 vpermi2q m15, m1, m17 vpermt2q m1, m12, m17 mova m16, m13 vpermi2q m16, m2, m18 vpermt2q m2, m12, m18 mova m17, m13 vpermi2q m17, m3, m19 vpermt2q m3, m12, m19 mova m18, m13 vpermi2q m18, m4, m20 vpermt2q m4, m12, m20 mova m19, m13 vpermi2q m19, m5, m21 vpermt2q m5, m12, m21 mova m20, m13 vpermi2q m20, m6, m22 vpermt2q m6, m12, m22 mova m21, m13 vpermi2q m21, m7, m23 vpermt2q m7, m12, m23 punpckhwd m8, m2, m3 ; c04 d04 c05 d05 c06 d06 c07 d07 punpcklwd m2, m3 ; c00 d00 c01 d01 c02 d02 c03 d03 punpckhwd m3, m0, m1 ; a04 b04 a05 b05 a06 b06 a07 b07 punpcklwd m0, m1 ; a00 b00 a01 b01 a02 b02 a03 b03 punpckhwd m1, m4, m5 ; e04 f04 e05 f05 e06 f06 e07 f07 punpcklwd m4, m5 ; e00 f00 e01 f01 e02 f02 e03 f03 punpckhwd m5, m6, m7 ; g04 h04 g05 h05 g06 h06 g07 h07 punpcklwd m6, m7 ; g00 h00 g01 h01 g02 h02 g03 h03 punpckhwd m7, m14, m15 ; a12 b12 a13 b13 a14 b14 a15 b15 punpcklwd m14, m15 ; a08 b08 a09 b09 a10 b10 a11 b11 punpckhwd m15, m16, m17 ; c12 d12 c13 d13 c14 d14 c15 d15 punpcklwd m16, m17 ; c08 d08 c09 d09 c10 d10 c11 d11 punpckhwd m17, m18, m19 ; e12 f12 e13 f13 e14 f14 e15 f15 punpcklwd m18, m19 ; e08 f08 e09 f09 e10 f10 e11 f11 punpckhwd m19, m20, m21 ; g12 h12 g13 h13 g14 h14 g15 h15 punpcklwd m20, m21 ; g08 h08 g09 h09 g10 h10 g11 h11 punpckhdq m21, m1, m5 ; e06 f06 g06 h06 e07 f07 g07 h07 punpckldq m1, m5 ; e04 f04 g04 h04 e05 f05 g05 h05 punpckhdq m5, m14, m16 ; a10 b10 c10 d10 a11 b11 c11 d11 punpckldq m14, m16 ; a08 b08 c08 d08 a09 b09 c09 d09 punpckhdq m16, m18, m20 ; e10 f10 g10 h10 e11 f11 g11 h11 punpckldq m18, m20 ; e08 f08 g08 h08 e09 f09 g09 h09 punpckldq m20, m4, m6 ; e00 f00 g00 h00 e01 f01 g01 h01 punpckhdq m4, m6 ; e02 f02 g02 h02 e03 f03 g03 h03 punpckldq m6, m7, m15 ; a12 b12 c12 d12 a13 b13 c13 d13 punpckhdq m7, m15 ; a14 b14 c14 d14 a15 b15 c15 d15 punpckhdq m15, m0, m2 ; a02 b02 c02 d02 a03 b03 c03 d03 punpckldq m0, m2 ; a00 b00 c00 d00 a01 b01 c01 d01 
punpckldq m2, m3, m8 ; a04 b04 c04 d04 a05 b05 c05 d05 punpckhdq m3, m8 ; a06 b06 c06 d06 a07 b07 c07 d07 punpckhdq m8, m17, m19 ; e14 f14 g14 h14 e15 f15 g15 h15 punpckldq m17, m19 ; e12 f12 g12 h12 e13 f13 g13 h13 ret .pass1_load_spill: mova [cq+64* 0], m0 mova [cq+64* 2], m1 mova [cq+64* 4], m2 mova [cq+64* 6], m3 mova [cq+64* 8], m4 mova [cq+64*10], m5 mova [cq+64*12], m6 mova [cq+64*14], m7 pmulld m0, m12, [cq+64* 1] pmulld m1, m12, [cq+64* 3] pmulld m2, m12, [cq+64* 5] pmulld m3, m12, [cq+64* 7] pmulld m4, m12, [cq+64* 9] pmulld m5, m12, [cq+64*11] pmulld m6, m12, [cq+64*13] pmulld m7, m12, [cq+64*15] mova [cq+64* 1], m23 mova [cq+64* 3], m22 mova [cq+64* 5], m21 mova [cq+64* 7], m20 mova [cq+64* 9], m19 mova [cq+64*11], m18 mova [cq+64*13], m17 mova [cq+64*15], m16 ret .main_fast2_rect2: REPX {paddd x, m13}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_fast2: ; bottom 3/4 is zero pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a pmulld m0, [o(pd_201)] {1to16} ; t16a pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a pmulld m3, [o(pd_3857)] {1to16} ; t28a pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a pmulld m2, [o(pd_995)] {1to16} ; t20a pmulld m6, m1, [o(pd_601)] {1to16} ; t23a pmulld m17, m1, [o(pd_4052)] {1to16} ; t24a REPX {psubd x, m13, x}, m20, m6 REPX {paddd x, m13}, m23, m0, m3, m21, m2, m17 REPX {psrad x, 12 }, m20, m6, m23, m0, m3, m21, m2, m17 mova m8, m0 mova m16, m23 mova m7, m20 mova m4, m3 mova m19, m2 mova m18, m21 mova m5, m6 mova m22, m17 jmp .main3 .main_fast_rect2: call m(idct_8x16_internal_10bpc).round .main_fast: ; bottom half is zero pmulld m23, m0, [o(pd_4091)] {1to16} ; t31a pmulld m0, [o(pd_201)] {1to16} ; t16a pmulld m16, m7, [o(pd_2751)] {1to16} ; t17a pmulld m7, [o(pd_3035)] {1to16} ; t30a pmulld m19, m4, [o(pd_3703)] {1to16} ; t29a pmulld m4, [o(pd_1751)] {1to16} ; t18a pmulld m20, m3, [o(pd_1380)] {1to16} ; t19a pmulld m3, [o(pd_3857)] {1to16} ; t28a pmulld m21, m2, [o(pd_3973)] {1to16} ; t27a pmulld m2, [o(pd_995)] {1to16} ; t20a pmulld m18, m5, [o(pd_2106)] {1to16} ; t21a pmulld m5, [o(pd_3513)] {1to16} ; t26a pmulld m17, m6, [o(pd_3290)] {1to16} ; t25a pmulld m6, [o(pd_2440)] {1to16} ; t22a pmulld m22, m1, [o(pd_601)] {1to16} ; t23a pmulld m1, [o(pd_4052)] {1to16} ; t24a REPX {psubd x, m13, x}, m16, m20, m18, m22 call m(idct_16x16_internal_10bpc).round3 jmp .main2 .main_rect2: call m(idct_8x16_internal_10bpc).round call m(idct_16x16_internal_10bpc).round .main: ITX_MULSUB_2D 0, 23, 8, 9, 10, _, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 16, 7, 8, 9, 10, _, 3035, 2751 ; t17a, t30a ITX_MULSUB_2D 4, 19, 8, 9, 10, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 20, 3, 8, 9, 10, _, 3857, 1380 ; t19a, t28a ITX_MULSUB_2D 2, 21, 8, 9, 10, _, 995, 3973 ; t20a, t27a ITX_MULSUB_2D 18, 5, 8, 9, 10, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 6, 17, 8, 9, 10, _, 2440, 3290 ; t22a, t25a ITX_MULSUB_2D 22, 1, 8, 9, 10, _, 4052, 601 ; t23a, t24a call m(idct_16x16_internal_10bpc).round .main2: call m(idct_8x16_internal_10bpc).round psubd m8, m0, m16 ; t17 paddd m0, m16 ; t16 psubd m16, m23, m7 ; t30 paddd m23, m7 ; t31 REPX {pmaxsd x, m14}, m8, m0, m16, m23 paddd m7, m20, m4 ; t19 psubd m20, m4 ; t18 REPX {pminsd x, m15}, m8, m0, m16, m23 paddd m4, m3, m19 ; t28 psubd m3, m19 ; t29 REPX {pmaxsd x, m14}, m7, m20, m4, m3 psubd m19, m2, m18 ; t21 paddd m2, m18 ; t20 REPX {pminsd x, m15}, m7, m20, m4, m3 psubd m18, m21, m5 ; t26 paddd m21, m5 ; t27 REPX {pmaxsd x, m14}, m19, m2, m18, m21 psubd m5, m22, m6 ; t22 paddd m6, m22 ; t23 REPX {pminsd x, m15}, m19, m2, m18, m21 psubd m22, m1, m17 ; 
t25 paddd m17, m1 ; t24 REPX {pmaxsd x, m14}, m5, m6, m22, m17 REPX {pminsd x, m15}, m5, m6, m22, m17 .main3: vpbroadcastd m11, [o(pd_4017)] vpbroadcastd m10, [o(pd_799)] ITX_MULSUB_2D 16, 8, 9, 1, _, 13, 10, 11 ; t17a, t30a ITX_MULSUB_2D 3, 20, 9, 1, _, 13, 10, 11, 2 ; t29a, t18a vpbroadcastd m11, [o(pd_2276)] vpbroadcastd m10, [o(pd_3406)] ITX_MULSUB_2D 18, 19, 9, 1, _, 13, 10, 11 ; t21a, t26a ITX_MULSUB_2D 22, 5, 9, 1, _, 13, 10, 11, 2 ; t25a, t22a paddd m1, m6, m2 ; t23a psubd m6, m2 ; t20a psubd m2, m17, m21 ; t27a paddd m17, m21 ; t24a REPX {pmaxsd x, m14}, m1, m6, m2, m17 psubd m21, m23, m4 ; t28a paddd m23, m4 ; t31a REPX {pminsd x, m15}, m1, m6, m2, m17 psubd m4, m16, m20 ; t18 paddd m16, m20 ; t17 REPX {pmaxsd x, m14}, m21, m23, m4, m16 psubd m20, m0, m7 ; t19a paddd m0, m7 ; t16a REPX {pminsd x, m15}, m21, m23, m4, m16 psubd m7, m8, m3 ; t29 paddd m3, m8 ; t30 REPX {pmaxsd x, m14}, m20, m0, m7, m3 paddd m8, m5, m18 ; t22 psubd m5, m18 ; t21 REPX {pminsd x, m15}, m20, m0, m7, m3 psubd m18, m22, m19 ; t26 paddd m22, m19 ; t25 REPX {pmaxsd x, m14}, m8, m5, m18, m22 vpbroadcastd m11, [o(pd_3784)] vpbroadcastd m10, [o(pd_1567)] REPX {pminsd x, m15}, m8, m5, m18, m22 ITX_MULSUB_2D 21, 20, 9, 19, _, 13, 10, 11 ; t19, t28 ITX_MULSUB_2D 2, 6, 9, 19, _, 13, 10, 11, 2 ; t27, t20 ITX_MULSUB_2D 7, 4, 9, 19, _, 13, 10, 11 ; t18a, t29a ITX_MULSUB_2D 18, 5, 9, 19, _, 13, 10, 11, 2 ; t26a, t21a psubd m19, m0, m1 ; t23 paddd m0, m1 ; t16 paddd m1, m8, m16 ; t17a psubd m8, m16, m8 ; t22a REPX {pmaxsd x, m14}, m19, m0, m1, m8 psubd m16, m23, m17 ; t24 paddd m23, m17 ; t31 REPX {pminsd x, m15}, m19, m0, m1, m8 psubd m17, m3, m22 ; t25a paddd m22, m3 ; t30a REPX {pmaxsd x, m14}, m16, m23, m17, m22 paddd m3, m6, m21 ; t19a psubd m6, m21, m6 ; t20a REPX {pminsd x, m15}, m16, m23, m17, m22 paddd m21, m18, m4 ; t29 psubd m18, m4, m18 ; t26 REPX {pmaxsd x, m14}, m3, m6, m21, m18 psubd m4, m20, m2 ; t27a paddd m20, m2 ; t28a REPX {pminsd x, m15}, m3, m6, m21, m18 paddd m2, m7, m5 ; t18 psubd m7, m5 ; t21 REPX {pmaxsd x, m14}, m4, m20, m2, m7 REPX {pminsd x, m15}, m4, m20, m2, m7 REPX {pmulld x, m12}, m18, m16, m4, m17, m7, m19, m6, m8 REPX {paddd x, m13}, m18, m16, m4, m17 psubd m5, m18, m7 ; t21a paddd m18, m7 ; t26a psubd m7, m16, m19 ; t23a paddd m16, m19 ; t24a REPX {psrad x, 12 }, m5, m18, m7, m16 paddd m19, m4, m6 ; t27 psubd m4, m6 ; t20 psubd m6, m17, m8 ; t22 paddd m17, m8 ; t25 REPX {psrad x, 12 }, m19, m4, m6, m17 ret cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 16, dst, stride, c, eob %undef cmp vpbroadcastd m10, [pw_2896x8] vpbroadcastd m11, [pw_1697x16] vpbroadcastd m13, [pw_2048] vpbroadcastd m15, [pixel_10bpc_max] lea r6, [strideq*9] pxor m14, m14 cmp eobd, 151 jl .main mov r4, dstq call .main add cq, 64*12 lea dstq, [r4+32] .main: call .main_internal add cq, 64*4 pmulhrsw m1, m13, m2 pmulhrsw m3, m13, m4 pmulhrsw m5, m13, m6 pmulhrsw m7, m13, m8 call .main_internal jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 .main_internal: mova m8, [cq+64* 0] packssdw m8, [cq+64* 8] mova m6, [cq+64* 1] packssdw m6, [cq+64* 9] mova m0, [cq+64* 2] packssdw m0, [cq+64*10] mova m2, [cq+64* 3] packssdw m2, [cq+64*11] REPX {pmulhrsw x, m10}, m8, m6, m0, m2 REPX {paddsw x, x }, m8, m6, m0, m2 REPX {vpermq x, x, q3120}, m8, m6, m0, m2 pmulhrsw m4, m11, m8 pmulhrsw m9, m11, m6 paddsw m8, m8 paddsw m6, m6 REPX {mova [cq+64*x], m14}, 0, 1, 2, 3 paddsw m8, m4 paddsw m6, m9 pmulhrsw m4, m11, m0 pmulhrsw m9, m11, m2 paddsw m0, m0 paddsw m2, m2 REPX {mova [cq+64*x], m14}, 8, 9, 10, 11 paddsw m0, 
m4 paddsw m2, m9 punpcklwd m4, m8, m6 punpckhwd m8, m6 punpcklwd m6, m0, m2 punpckhwd m0, m2 punpckldq m2, m4, m6 ; 0 1 punpckhdq m4, m6 ; 2 3 punpckldq m6, m8, m0 ; 4 5 punpckhdq m8, m0 ; 6 7 ret cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] WIN64_SPILL_XMM 30 cmp eobd, 136 jl .fast add cq, 64 cmp eobd, 543 jge .full call .pass1_fast ; bottomright 16x16 zero mov r6d, 16*12 jmp .lefthalf .full: call .pass1 mov r6d, 16*28 .lefthalf: mova [cq+128* 0], m0 mova [cq+128* 1], m1 mova [cq+128* 2], m2 mova [cq+128* 3], m3 mova [cq+128* 4], m14 mova [cq+128* 5], m15 mova [cq+128* 6], m16 mova [cq+128* 7], m17 mova [cq+128* 8], m22 mova [cq+128* 9], m23 mova [cq+128*10], m24 mova [cq+128*11], m25 mova [cq+128*12], m26 mova [cq+128*13], m27 mova [cq+128*14], m28 mova [cq+128*15], m29 sub cq, 64 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] call .pass1 lea r5, [o_base_8bpc] call .pass2_start pxor m12, m12 .right_zero_loop: mova [cq+r6*8+64+128*3], m12 mova [cq+r6*8+64+128*2], m12 mova [cq+r6*8+64+128*1], m12 mova [cq+r6*8+64+128*0], m12 sub r6d, 16*4 jge .right_zero_loop mov r6d, 16*28 jmp .end2 .pass2_start: mova m4, [cq+64+128* 0] mova m5, [cq+64+128* 1] mova m6, [cq+64+128* 2] mova m7, [cq+64+128* 3] mova m18, [cq+64+128* 4] mova m19, [cq+64+128* 5] mova m20, [cq+64+128* 6] mova m21, [cq+64+128* 7] call m(inv_txfm_add_dct_dct_32x8_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+128*0], m14 mova [cq+128*1], m15 mova [cq+128*2], m16 mova [cq+128*3], m17 mova [cq+128*4], m18 mova [cq+128*5], m19 mova [cq+128*6], m20 mova [cq+128*7], m21 mova m14, [cq+64+128* 8] mova m15, [cq+64+128* 9] mova m16, [cq+64+128*10] mova m17, [cq+64+128*11] mova m18, [cq+64+128*12] mova m19, [cq+64+128*13] mova m20, [cq+64+128*14] mova m21, [cq+64+128*15] jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast lea r5, [o_base_8bpc] call .pass2_fast_start jmp .end .pass2_fast_start: call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova [cq+128*0], m14 mova [cq+128*1], m15 mova [cq+128*2], m16 mova [cq+128*3], m17 mova [cq+128*4], m18 mova [cq+128*5], m19 mova [cq+128*6], m20 mova [cq+128*7], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast .fast2: ; topleft 8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*0] mova ym1, [cq+128*4] mova ym4, [cq+128*2] mova ym5, [cq+128*6] mova ym16, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym17, [cq+128*7] mov r6d, 16*4 vpermq m0, m7, m0 ; 0 0 vpermq m1, m7, m1 ; 4 4 vpermt2q m4, m7, m5 ; 2 6 vpermt2q m16, m7, m2 ; 1 5 vpermt2q m17, m7, m3 ; 7 3 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 call m(idct_16x16_internal_10bpc).main_end call .pass2_fast2_start .end: pxor m12, m12 .end2: call .pass2_end .zero_loop: mova [cq+r6*8+128*3], m12 mova [cq+r6*8+128*2], m12 mova [cq+r6*8+128*1], m12 mova [cq+r6*8+128*0], m12 sub r6d, 16*4 jge .zero_loop WIN64_RESTORE_XMM vzeroupper ret .pass2_fast2_start: call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 lea r5, [o_base_8bpc] punpckhqdq m22, m0, m2 ; 1 punpcklqdq m0, m2 ; 0 punpcklqdq m1, m5, m7 ; 4 punpckhqdq m24, m5, m7 ; 5 punpcklqdq m14, m3, m4 ; 2 punpckhqdq m23, m3, m4 ; 3 punpcklqdq m15, m6, m8 ; 6 punpckhqdq 
m25, m6, m8 ; 7 mova m10, m13 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [cq+128*0], m14 mova [cq+128*1], m15 mova [cq+128*2], m16 mova [cq+128*3], m17 mova [cq+128*4], m18 mova [cq+128*5], m19 mova [cq+128*6], m20 mova [cq+128*7], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 .pass2_end: psubsw m9, m0, m29 ; out31 paddsw m0, m29 ; out0 psubsw m29, m1, m28 ; out30 paddsw m1, m28 ; out1 psubsw m28, m2, m27 ; out29 paddsw m2, m27 ; out2 psubsw m27, m3, m26 ; out28 paddsw m3, m26 ; out3 psubsw m26, m4, m25 ; out27 paddsw m4, m25 ; out4 psubsw m25, m5, m24 ; out26 paddsw m5, m24 ; out5 psubsw m24, m6, m23 ; out25 paddsw m6, m23 ; out6 psubsw m23, m7, m22 ; out24 paddsw m7, m22 ; out7 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8_start mova m0, [cq+128*0] mova m1, [cq+128*1] mova m2, [cq+128*2] mova m3, [cq+128*3] mova m4, [cq+128*4] mova m5, [cq+128*5] mova m6, [cq+128*6] mova m7, [cq+128*7] psubsw m22, m0, m21 ; out23 paddsw m0, m21 ; out8 psubsw m21, m1, m20 ; out22 paddsw m1, m20 ; out9 psubsw m20, m2, m19 ; out21 paddsw m2, m19 ; out10 psubsw m19, m3, m18 ; out20 paddsw m3, m18 ; out11 psubsw m18, m4, m17 ; out19 paddsw m4, m17 ; out12 psubsw m17, m5, m16 ; out18 paddsw m5, m16 ; out13 psubsw m16, m6, m15 ; out17 paddsw m6, m15 ; out14 psubsw m15, m7, m14 ; out16 paddsw m7, m14 ; out15 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 pmulhrsw m0, m11, m15 pmulhrsw m1, m11, m16 pmulhrsw m2, m11, m17 pmulhrsw m3, m11, m18 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m19 pmulhrsw m1, m11, m20 pmulhrsw m2, m11, m21 pmulhrsw m3, m11, m22 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m23 pmulhrsw m1, m11, m24 pmulhrsw m2, m11, m25 pmulhrsw m3, m11, m26 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m27 pmulhrsw m1, m11, m28 pmulhrsw m2, m11, m29 pmulhrsw m3, m11, m9 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 32 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly2 .pass1_fast: mova m0, [cq+128* 0] mova m1, [cq+128* 4] mova m2, [cq+128* 8] mova m3, [cq+128*12] mov r6d, 16*12 call m(idct_8x16_internal_10bpc).main_fast mova m16, [cq+128* 2] mova m17, [cq+128* 6] mova m18, [cq+128*10] mova m19, [cq+128*14] call m(idct_16x16_internal_10bpc).main_fast call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast jmp .pass1_end .pass1: mova m0, [cq+128* 0] mova m1, [cq+128* 4] mova m2, [cq+128* 8] mova m3, [cq+128*12] mova m4, [cq+128*16] mova m5, [cq+128*20] mova m6, [cq+128*24] mova m7, [cq+128*28] call m(idct_8x16_internal_10bpc).main mova m16, [cq+128* 2] mova m17, [cq+128* 6] mova m18, [cq+128*10] mova m19, [cq+128*14] mova m20, [cq+128*18] mova m21, [cq+128*22] mova m22, [cq+128*26] mova m23, [cq+128*30] call m(idct_16x16_internal_10bpc).main call .pass1_load_spill mova m16, [cq+128*17] mova m17, [cq+128*19] mova m18, [cq+128*21] mova m19, [cq+128*23] mova m20, [cq+128*25] mova m21, [cq+128*27] mova m22, [cq+128*29] mova m23, [cq+128*31] call m(inv_txfm_add_dct_dct_32x16_10bpc).main .pass1_end: vpbroadcastd m11, [o(pd_2)] lea r4, [cq+128*8] call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end punpckhqdq m22, m0, m20 ; 1 punpcklqdq m0, m20 ; 0 punpckhqdq m24, m2, m1 ; 5 punpcklqdq m1, m2, m1 ; 4 punpcklqdq m2, m14, m18 ; 8 punpckhqdq m26, m14, m18 ; 9 punpcklqdq m14, m15, m4 ; 2 punpckhqdq m23, m15, m4 ; 3 punpckhqdq m25, m3, m21 ; 7 punpcklqdq m15, m3, m21 ; 6 punpckhqdq m28, m6, m17 ; 13 
punpcklqdq m3, m6, m17 ; 12 punpckhqdq m27, m5, m16 ; 11 punpcklqdq m16, m5, m16 ; 10 punpckhqdq m29, m7, m8 ; 15 punpcklqdq m17, m7, m8 ; 14 ret .pass1_load_spill: call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub mova [cq+128* 0], m0 mova m0, [cq+128* 1] mova [cq+128* 1], m1 mova [cq+128* 2], m2 mova m1, [cq+128* 3] mova m2, [cq+128* 5] mova [cq+128* 3], m3 mova [cq+128* 4], m4 mova m3, [cq+128* 7] mova m4, [cq+128* 9] mova [cq+128* 5], m5 mova [cq+128* 6], m6 mova [cq+128* 7], m7 mova m5, [cq+128*11] mova m6, [cq+128*13] mova m7, [cq+128*15] mova [cq+128* 8], m23 mova [cq+128* 9], m22 mova [cq+128*10], m21 mova [cq+128*11], m20 mova [cq+128*12], m19 mova [cq+128*13], m18 mova [cq+128*14], m17 mova [cq+128*15], m16 ret cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eob %undef cmp vpbroadcastd m13, [pw_8192] vpbroadcastd m15, [pixel_10bpc_max] pxor m14, m14 lea r6, [strideq*9] cmp eobd, 136 jl .main mov r4, dstq call .main add cq, 64-128*4 lea dstq, [dstq+strideq*8] call .main add cq, 128*12-64 lea dstq, [r4+32] cmp eobd, 543 jl .main call .main add cq, 64-128*4 lea dstq, [dstq+strideq*8] .main: call .main_internal add cq, 128*4 pmulhrsw m1, m13, m2 pmulhrsw m3, m13, m4 pmulhrsw m5, m13, m6 pmulhrsw m7, m13, m8 call .main_internal jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 .main_internal: mova m8, [cq+128* 0] packssdw m8, [cq+128* 8] mova m6, [cq+128* 1] packssdw m6, [cq+128* 9] mova m0, [cq+128* 2] packssdw m0, [cq+128*10] mova m2, [cq+128* 3] packssdw m2, [cq+128*11] REPX {vpermq x, x, q3120}, m8, m6, m0, m2 REPX {mova [cq+128*x], m14}, 0, 1, 2, 3 punpcklwd m4, m8, m6 punpckhwd m8, m6 punpcklwd m6, m0, m2 punpckhwd m0, m2 REPX {mova [cq+128*x], m14}, 8, 9, 10, 11 punpckldq m2, m4, m6 ; 0 1 punpckhdq m4, m6 ; 2 3 punpckldq m6, m8, m0 ; 4 5 punpckhdq m8, m0 ; 6 7 ret cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 36 jl .fast call .pass1 cmp eobd, 151 jge .full lea r5, [o_base_8bpc] punpckhwd m22, m0, m0 punpckhwd m23, m1, m1 punpckhwd m24, m2, m2 punpckhwd m25, m3, m3 punpckhwd m26, m4, m4 punpckhwd m27, m5, m5 punpckhwd m28, m6, m6 punpckhwd m29, m7, m7 punpcklwd m21, m1, m1 punpcklwd m14, m3, m3 punpcklwd m18, m5, m5 punpcklwd m15, m7, m7 pxor m9, m9 punpcklwd m9, m9, m0 punpcklwd m8, m2, m2 punpcklwd m7, m4, m4 punpcklwd m1, m6, m6 call m(idct_16x16_internal_8bpc).main_fast2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 mova [rsp+mmsize*0], m14 mova [rsp+mmsize*1], m15 mova [rsp+mmsize*2], m16 mova [rsp+mmsize*3], m17 mova [rsp+mmsize*4], m18 mova [rsp+mmsize*5], m19 mova [rsp+mmsize*6], m20 mova [rsp+mmsize*7], m21 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast pxor m12, m12 mov r3d, 64*3 .zero_loop: REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3 sub r3d, 64 jge .zero_loop jmp .pass2_end .full: mova [cq+128*0], m0 mova [cq+128*1], m1 mova [cq+128*2], m2 mova [cq+128*3], m3 mova [cq+128*4], m4 mova [cq+128*5], m5 mova [cq+128*6], m6 mova [cq+128*7], m7 add cq, 64 call .pass1 sub cq, 64 mova m22, [cq+128*0] ; 0 1 mova m23, [cq+128*1] ; 2 3 mova m24, [cq+128*2] ; 4 5 mova m25, [cq+128*3] ; 6 7 mova m26, [cq+128*4] ; 8 9 mova m27, [cq+128*5] ; 10 11 mova m28, [cq+128*6] ; 12 13 mova m29, [cq+128*7] ; 14 15 mova [cq+64* 8], m0 mova [cq+64* 9], m1 mova 
[cq+64*10], m2 mova [cq+64*11], m3 mova [cq+64*12], m4 mova [cq+64*13], m5 mova [cq+64*14], m6 mova [cq+64*15], m7 lea r5, [o_base_8bpc] punpcklwd m20, m1, m1 punpcklwd m16, m3, m3 punpcklwd m19, m5, m5 punpcklwd m17, m7, m7 punpcklwd m8, m24, m24 ; 4 punpcklwd m5, m2, m2 ; 20 punpcklwd m1, m28, m28 ; 12 punpcklwd m7, m26, m26 ; 8 punpcklwd m3, m4, m4 ; 24 punpcklwd m4, m6, m6 ; 28 pxor m9, m9 punpcklwd m6, m9, m0 ; __ 16 mova m0, m4 punpcklwd m9, m9, m22 ; __ 0 call m(idct_16x16_internal_8bpc).main_fast punpcklwd m21, m23, m23 ; 2 punpcklwd m15, m29, m29 ; 14 punpcklwd m18, m27, m27 ; 10 punpcklwd m14, m25, m25 ; 6 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova [rsp+mmsize*0], m14 mova [rsp+mmsize*1], m15 mova [rsp+mmsize*2], m16 mova [rsp+mmsize*3], m17 mova [rsp+mmsize*4], m18 mova [rsp+mmsize*5], m19 mova [rsp+mmsize*6], m20 mova [rsp+mmsize*7], m21 mova m21, [cq+64*15] mova m14, [cq+64* 8] mova m17, [cq+64*11] mova m18, [cq+64*12] mova m19, [cq+64*13] mova m16, [cq+64*10] mova m15, [cq+64* 9] mova m20, [cq+64*14] REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ m24, m19, m16, m27, m28, m15, m20, m23 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf pxor m12, m12 mov r3d, 32*7 .full_zero_loop: REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3 sub r3d, 32 jge .full_zero_loop jmp .pass2_end .fast: mova ym0, [cq+128*0] mova ym2, [cq+128*4] movshdup m8, [o(permB)] mova ym1, [cq+128*2] mova ym3, [cq+128*6] mova ym4, [cq+128*1] mova ym5, [cq+128*3] mova ym6, [cq+128*5] mova ym7, [cq+128*7] vpermt2q m0, m8, m2 ; 0 4 vpermt2q m1, m8, m3 ; 2 6 vpermt2q m4, m8, m5 ; 1 3 vpermt2q m7, m8, m6 ; 7 5 call m(idct_8x8_internal_10bpc).main_fast call m(idct_16x8_internal_10bpc).main_fast vpbroadcastd m11, [o(pd_2)] call m(idct_8x16_internal_10bpc).main_end2 mova m8, [o(idct8x32p)] packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 packssdw m3, m7 mova m6, [dup16_perm] vpermb m0, m8, m0 vpermb m2, m8, m2 vprold m8, 16 vpermb m1, m8, m1 vpermb m3, m8, m3 punpckldq m4, m0, m2 punpckhdq m0, m2 punpckldq m2, m1, m3 punpckhdq m1, m3 punpckldq m21, m4, m2 punpckhdq m14, m4, m2 punpckldq m18, m0, m1 punpckhdq m15, m0, m1 vpord m7, m6, [o(pb_32)] {1to16} vpermb m22, m7, m21 ; 1 pmovzxwd m9, ym21 ; 0 vpermb m8, m6, m18 ; 4 vpermb m24, m7, m18 ; 5 vpermb m21, m6, m14 ; 2 vpermb m23, m7, m14 ; 3 vpermb m14, m6, m15 ; 6 vpermb m25, m7, m15 ; 7 lea r5, [o_base_8bpc] pslld m9, 16 pxor m7, m7 REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29 call m(idct_16x16_internal_8bpc).main_fast2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 mova [rsp+mmsize*0], m14 mova [rsp+mmsize*1], m15 mova [rsp+mmsize*2], m16 mova [rsp+mmsize*3], m17 mova [rsp+mmsize*4], m18 mova [rsp+mmsize*5], m19 mova [rsp+mmsize*6], m20 mova [rsp+mmsize*7], m21 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast pxor m12, m12 REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 .pass2_end: movshdup m30, [permC] vpbroadcastd m11, [pw_2048] vpbroadcastd m13, [pixel_10bpc_max] lea r6, [strideq*3] psrlq m31, m30, 8 vpermq m8, m30, m0 vpermq m9, m31, m1 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m2 vpermq m9, m31, m3 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m4 vpermq m9, m31, m5 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m6 vpermq m9, m31, m7 call m(idct_16x8_internal_10bpc).write_16x4 mova m1, [rsp+mmsize*0] mova m2, [rsp+mmsize*1] mova m3, [rsp+mmsize*2] mova m4, [rsp+mmsize*3] mova m5, [rsp+mmsize*4] mova m6, [rsp+mmsize*5] mova m7, [rsp+mmsize*6] mova m8, 
[rsp+mmsize*7] paddsw m0, m1, m21 psubsw m21, m1, m21 paddsw m1, m2, m20 psubsw m20, m2, m20 paddsw m2, m3, m19 psubsw m19, m3, m19 paddsw m3, m4, m18 psubsw m18, m4, m18 paddsw m4, m5, m17 psubsw m17, m5, m17 paddsw m5, m6, m16 psubsw m16, m6, m16 paddsw m6, m7, m15 psubsw m15, m7, m15 paddsw m7, m8, m14 psubsw m14, m8, m14 vpermq m8, m30, m0 vpermq m9, m31, m1 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m2 vpermq m9, m31, m3 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m4 vpermq m9, m31, m5 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m6 vpermq m9, m31, m7 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m14 vpermq m9, m31, m15 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m16 vpermq m9, m31, m17 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m18 vpermq m9, m31, m19 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m20 vpermq m9, m31, m21 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m22 vpermq m9, m31, m23 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m24 vpermq m9, m31, m25 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m26 vpermq m9, m31, m27 call m(idct_16x8_internal_10bpc).write_16x4 vpermq m8, m30, m28 vpermq m9, m31, m29 call m(idct_16x8_internal_10bpc).write_16x4 RET .pass1: mova m0, [cq+128* 0] mova m1, [cq+128* 2] mova m2, [cq+128* 4] mova m3, [cq+128* 6] mova m4, [cq+128* 8] mova m5, [cq+128*10] mova m6, [cq+128*12] mova m7, [cq+128*14] call m(idct_8x16_internal_10bpc).main mova m16, [cq+128* 1] mova m17, [cq+128* 3] mova m18, [cq+128* 5] mova m19, [cq+128* 7] mova m20, [cq+128* 9] mova m21, [cq+128*11] mova m22, [cq+128*13] mova m23, [cq+128*15] call m(idct_16x16_internal_10bpc).main call m(idct_16x16_internal_10bpc).main_end jmp m(idct_16x16_internal_10bpc).main_end3 .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 64 add r6d, 640 sar r6d, 10 jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 7, 32, -64*40, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 136 jl .fast add cq, 64 cmp eobd, 543 jge .full call .pass1_fast ; bottomright 16x16 zero jmp .lefthalf .full: call .pass1 mov r3d, 16*28 .lefthalf: mova [cq+128* 0], m27 mova [cq+128* 1], m14 mova [cq+128* 2], m28 mova [cq+128* 3], m15 mova [cq+128* 4], m22 mova [cq+128* 5], m23 mova [cq+128* 6], m24 mova [cq+128* 7], m25 mova [cq+128* 8], m0 mova [cq+128* 9], m26 mova [cq+128*10], m20 mova [cq+128*11], m21 mova [cq+128*12], m18 mova [cq+128*13], m16 mova [cq+128*14], m17 mova [cq+128*15], m3 sub cq, 64 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] call .pass1 call .pass2_start pxor m31, m31 .right_zero_loop: REPX {mova [cq+r3*8+64+128*x], m31}, 0, 1, 2, 3 sub r3d, 16*4 jge .right_zero_loop mov r3d, 16*28 jmp .left_zero_loop .pass2_start: vpbroadcastd m10, [o(pd_2048)] lea r5, [o_base_8bpc] lea r4, [rsp+gprsize] mova m1, [cq+128*15+64] mova m2, [cq+128* 8+64] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, m21 mova m1, [cq+128*12+64] mova m2, [cq+128*11+64] mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, m20 mova m1, [cq+128*13+64] mova m2, [cq+128*10+64] mova m3, m16 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 
mova m0, m26 mova m1, [cq+128*14+64] mova m2, [cq+128* 9+64] mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 mova m1, m28 mova m2, [cq+128* 0+64] mova m3, [cq+128* 2+64] mova m16, [cq+128* 1+64] mova m17, [cq+128* 3+64] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m26, [cq+128* 4+64] mova m27, [cq+128* 5+64] mova m28, [cq+128* 6+64] mova m29, [cq+128* 7+64] mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova [rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast vpbroadcastd m10, [o(pd_2048)] call .pass2_fast_start jmp .end .fast2: ; topleft 8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*0] mova ym1, [cq+128*4] mova ym4, [cq+128*2] mova ym5, [cq+128*6] mova ym16, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym17, [cq+128*7] mov r3d, 16*4 vpermq m0, m7, m0 ; 0 0 vpermq m1, m7, m1 ; 4 4 vpermt2q m4, m7, m5 ; 2 6 vpermt2q m16, m7, m2 ; 1 5 vpermt2q m17, m7, m3 ; 7 3 REPX {pmulld x, m12}, m0, m1, m4, m16, m17 REPX {paddd x, m13}, m0, m1, m4, m16, m17 REPX {psrad x, 12 }, m0, m1, m4, m16, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast2 vpbroadcastd m11, [o(pd_1)] call m(idct_16x16_internal_10bpc).main_end2 call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 punpcklqdq m27, m0, m2 ; 0 punpckhqdq m0, m2 ; 1 punpcklqdq m22, m3, m4 ; 2 punpckhqdq m26, m3, m4 ; 3 punpcklqdq m14, m5, m7 ; 4 punpckhqdq m20, m5, m7 ; 5 punpcklqdq m23, m6, m8 ; 6 punpckhqdq m21, m6, m8 ; 7 mova m10, m13 call .pass2_fast2_start .end: pxor m31, m31 .left_zero_loop: REPX {mova [cq+r3*8+128*x], m31}, 0, 1, 2, 3 sub r3d, 16*4 jge .left_zero_loop call .pass2_end RET .pass2_end: DEFINE_ARGS dst, stride, _, dst2, stride32, stklo, stkhi vpbroadcastd m30, [pixel_10bpc_max] vpbroadcastd m13, [pw_2048] mov stride32q, strideq shl stride32q, 5 lea stkhiq, [rsp+31*mmsize+gprsize] lea dst2q, [dstq+stride32q] lea stkloq, [rsp+gprsize] sub dst2q, strideq ; dst31 paddsw m8, m0, m29 ; t0[idct32] psubsw m9, m0, m29 ; t31[idct32] call .end_sumsub_write paddsw m8, m1, m28 ; t1[idct32] psubsw m9, m1, m28 ; t30[idct32] call .end_sumsub_write paddsw m8, m2, m27 ; t2[idct32] psubsw m9, m2, m27 ; t29[idct32] call .end_sumsub_write paddsw m8, m3, m26 ; t3[idct32] psubsw m9, m3, m26 ; t28[idct32] call .end_sumsub_write paddsw m8, m4, m25 ; t4[idct32] psubsw m9, m4, m25 ; t27[idct32] call .end_sumsub_write paddsw m8, m5, m24 ; t5[idct32] psubsw m9, m5, m24 ; t26[idct32] call .end_sumsub_write paddsw m8, m6, m23 ; t6[idct32] psubsw m9, m6, m23 ; t25[idct32] call .end_sumsub_write paddsw m8, m7, m22 ; t7[idct32] psubsw m9, m7, m22 ; t24[idct32] call .end_sumsub_write mova m0, [rsp+64*32+gprsize] mova m1, [rsp+64*33+gprsize] mova m2, [rsp+64*34+gprsize] mova m3, [rsp+64*35+gprsize] mova m4, [rsp+64*36+gprsize] mova m5, [rsp+64*37+gprsize] mova m6, [rsp+64*38+gprsize] mova m7, [rsp+64*39+gprsize] paddsw m8, m0, m21 ; t8[idct32] psubsw m9, m0, m21 ; t23[idct32] call .end_sumsub_write paddsw m8, m1, m20 ; t9[idct32] psubsw m9, m1, m20 ; t22[idct32] call .end_sumsub_write paddsw m8, m2, m19 ; t10[idct32] psubsw m9, m2, m19 ; t21[idct32] call .end_sumsub_write paddsw m8, m3, m18 ; t11[idct32] psubsw m9, m3, m18 ; t20[idct32] call .end_sumsub_write paddsw m8, m4, m17 ; 
t12[idct32] psubsw m9, m4, m17 ; t19[idct32] call .end_sumsub_write paddsw m8, m5, m16 ; t13[idct32] psubsw m9, m5, m16 ; t18[idct32] call .end_sumsub_write paddsw m8, m6, m15 ; t14[idct32] psubsw m9, m6, m15 ; t17[idct32] call .end_sumsub_write paddsw m8, m7, m14 ; t15[idct32] psubsw m9, m7, m14 ; t16[idct32] ; fall-through .end_sumsub_write: mova m10, [stkhiq] ; t63-n mova m12, [stkloq] ; t32+n psubsw m11, m8, m10 ; out63-n paddsw m8, m10 ; out0 +n psubsw m10, m9, m12 ; out32+n paddsw m9, m12 ; out32-n REPX {pmulhrsw x, m13}, m11, m8, m10, m9 paddw m8, [dstq] paddw m9, [dst2q] paddw m10, [dstq+stride32q] paddw m11, [dst2q+stride32q] REPX {pminsw x, m30}, m11, m8, m10, m9 REPX {pmaxsw x, m31}, m11, m8, m10, m9 mova [dstq ], m8 mova [dst2q ], m9 mova [dstq +stride32q], m10 mova [dst2q+stride32q], m11 add stkloq, mmsize sub stkhiq, mmsize add dstq, strideq sub dst2q, strideq ret .pass2_fast_start: lea r5, [o_base_8bpc] lea r4, [rsp+gprsize] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m21 mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m20 mova m3, m16 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m26 mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 mova m1, m28 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova [rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 .pass2_fast2_start: lea r5, [o_base_8bpc] lea r4, [rsp+gprsize] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m21 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m20 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 mova m0, m26 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast2 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m27 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast3 mova [rsp+64*32+gprsize], m14 mova [rsp+64*33+gprsize], m15 mova [rsp+64*34+gprsize], m16 mova [rsp+64*35+gprsize], m17 mova [rsp+64*36+gprsize], m18 mova [rsp+64*37+gprsize], m19 mova [rsp+64*38+gprsize], m20 mova [rsp+64*39+gprsize], m21 jmp m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast3 .dconly: DEFINE_ARGS dst, stride, c, eob imul r6d, [cq], 181 mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x16_10bpc).dconly3 .pass1_fast: pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 4] pmulld m2, m12, [cq+128* 8] pmulld m3, m12, [cq+128*12] mov r3d, 16*12 call m(idct_8x16_internal_10bpc).main_fast_rect2 pmulld m16, m12, [cq+128* 2] pmulld m17, m12, [cq+128* 6] pmulld m18, m12, [cq+128*10] pmulld m19, m12, [cq+128*14] call m(idct_16x16_internal_10bpc).main_fast_rect2 call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 jmp .pass1_end .pass1: pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 4] pmulld m2, m12, [cq+128* 8] pmulld m3, m12, [cq+128*12] pmulld m4, m12, [cq+128*16] pmulld m5, m12, [cq+128*20] pmulld m6, m12, [cq+128*24] pmulld m7, m12, [cq+128*28] call m(idct_8x16_internal_10bpc).main_rect2 pmulld m16, m12, [cq+128* 2] pmulld m17, m12, [cq+128* 6] pmulld m18, m12, [cq+128*10] pmulld m19, m12, [cq+128*14] pmulld m20, m12, [cq+128*18] pmulld m21, m12, [cq+128*22] pmulld m22, m12, [cq+128*26] pmulld m23, m12, [cq+128*30] call 
m(idct_16x16_internal_10bpc).main_rect2 call .pass1_load_spill pmulld m16, m12, [cq+128*17] pmulld m17, m12, [cq+128*19] pmulld m18, m12, [cq+128*21] pmulld m19, m12, [cq+128*23] pmulld m20, m12, [cq+128*25] pmulld m21, m12, [cq+128*27] pmulld m22, m12, [cq+128*29] pmulld m23, m12, [cq+128*31] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_rect2 .pass1_end: vpbroadcastd m11, [o(pd_1)] lea r4, [cq+128*8] call m(inv_txfm_add_dct_dct_32x16_10bpc).idct32_pass1_end punpcklqdq m27, m0, m20 ; 0 punpckhqdq m0, m20 ; 1 punpcklqdq m24, m5, m16 ; 10 punpckhqdq m16, m5, m16 ; 11 punpcklqdq m23, m3, m21 ; 6 punpckhqdq m21, m3, m21 ; 7 punpcklqdq m25, m7, m8 ; 14 punpckhqdq m3, m7, m8 ; 15 punpcklqdq m22, m15, m4 ; 2 punpckhqdq m26, m15, m4 ; 3 punpcklqdq m15, m6, m17 ; 12 punpckhqdq m17, m6, m17 ; 13 punpcklqdq m28, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpcklqdq m14, m2, m1 ; 4 punpckhqdq m20, m2, m1 ; 5 ret .pass1_load_spill: call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub mova [cq+128* 0], m0 pmulld m0, m12, [cq+128* 1] mova [cq+128* 1], m1 mova [cq+128* 2], m2 pmulld m1, m12, [cq+128* 3] pmulld m2, m12, [cq+128* 5] mova [cq+128* 3], m3 mova [cq+128* 4], m4 pmulld m3, m12, [cq+128* 7] pmulld m4, m12, [cq+128* 9] mova [cq+128* 5], m5 mova [cq+128* 6], m6 mova [cq+128* 7], m7 pmulld m5, m12, [cq+128*11] pmulld m6, m12, [cq+128*13] pmulld m7, m12, [cq+128*15] mova [cq+128* 8], m23 mova [cq+128* 9], m22 mova [cq+128*10], m21 mova [cq+128*11], m20 mova [cq+128*12], m19 mova [cq+128*13], m18 mova [cq+128*14], m17 mova [cq+128*15], m16 ret cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 7, 32, -64*32, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 36 jl .fast ; 8x8 cmp eobd, 151 jge .full ; 16x16 lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64] mova m0, [cq+64* 1] mova m3, [cq+64*15] call .main_part1_fast mova m0, [cq+64* 7] mova m3, [cq+64* 9] call .main_part1_fast mova m0, [cq+64* 5] mova m3, [cq+64*11] call .main_part1_fast mova m0, [cq+64* 3] mova m3, [cq+64*13] call .main_part1_fast call .main_part2 mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m16, [cq+64* 4] mova m17, [cq+64*12] call m(idct_8x16_internal_10bpc).main_fast2 call m(idct_16x16_internal_10bpc).main_fast2 call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 mov r6d, 12*8 jmp .idct64_end .full: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64] mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] call .main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] call .main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] call .main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] call .main_part1 call .main_part2 mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] mova m3, [cq+64*24] mova m16, [cq+64* 4] mova m17, [cq+64*12] mova m18, [cq+64*20] mova m19, [cq+64*28] call m(idct_8x16_internal_10bpc).main_fast call m(idct_16x16_internal_10bpc).main_fast call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub call .pass1_load_spill mova m4, [cq+64*18] mova m5, [cq+64*22] mova m6, [cq+64*26] mova m7, [cq+64*30] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast mov r6d, 28*8 jmp .idct64_end .dconly: imul r6d, [cq], 
181 mov [cq], eobd or r3d, 16 .dconly1: add r6d, 640 sar r6d, 10 .dconly2: vpbroadcastd m3, [o(dconly_10bpc)] imul r6d, 181 add r6d, 2176 sar r6d, 12 vpbroadcastw m2, r6d paddsw m2, m3 .dconly_loop: paddsw m0, m2, [dstq+64*0] paddsw m1, m2, [dstq+64*1] psubusw m0, m3 psubusw m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq dec r3d jg .dconly_loop ret .pass1_load_spill: mova [cq+64* 0], m0 mova m0, [cq+64* 2] mova [cq+64* 2], m1 mova m1, [cq+64* 6] mova [cq+64* 4], m2 mova [cq+64* 6], m3 mova m2, [cq+64*10] mova m3, [cq+64*14] mova [cq+64* 8], m4 mova [cq+64*10], m5 mova [cq+64*12], m6 mova [cq+64*14], m7 mova [cq+64* 1], m23 mova [cq+64* 3], m22 mova [cq+64* 5], m21 mova [cq+64* 7], m20 mova [cq+64* 9], m19 mova [cq+64*11], m18 mova [cq+64*13], m17 mova [cq+64*15], m16 ret ALIGN function_align .main_part1_fast_rect2: REPX {paddd x, m13}, m0, m3 REPX {psrad x, 12 }, m0, m3 .main_part1_fast: pmulld m7, m0, [r4+4*0]{bcstd} ; t63a pmulld m0, [r4+4*1]{bcstd} ; t32a pmulld m4, m3, [r4+4*6]{bcstd} ; t60a pmulld m3, [r4+4*7]{bcstd} ; t35a vpbroadcastd m10, [r4+4*8] vpbroadcastd m11, [r4+4*9] REPX {paddd x, m13}, m7, m0, m4, m3 REPX {psrad x, 12 }, m7, m0, m4, m3 mova m8, m0 mova m1, m7 mova m6, m3 mova m2, m4 jmp .main_part1b .main_part1_rect2: REPX {paddd x, m13}, m0, m1, m2, m3 REPX {psrad x, 12 }, m0, m1, m2, m3 .main_part1: ; idct64 steps 1-5 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a pmulld m7, m0, [r4+4*0]{bcstd} ; t63a pmulld m0, [r4+4*1]{bcstd} ; t32a pmulld m6, m1, [r4+4*2]{bcstd} ; t62a pmulld m1, [r4+4*3]{bcstd} ; t33a pmulld m5, m2, [r4+4*4]{bcstd} ; t61a pmulld m2, [r4+4*5]{bcstd} ; t34a pmulld m4, m3, [r4+4*6]{bcstd} ; t60a pmulld m3, [r4+4*7]{bcstd} ; t35a vpbroadcastd m10, [r4+4*8] vpbroadcastd m11, [r4+4*9] REPX {paddd x, m13}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 psubd m8, m0, m1 ; t33 paddd m0, m1 ; t32 psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m5 ; t61 paddd m4, m5 ; t60 .main_part1b: REPX {pmaxsd x, m14}, m8, m1, m6, m2 REPX {pminsd x, m15}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 13, 10, 11 ; t33a, t62a ITX_MULSUB_2D 2, 6, 5, 9, _, 13, 10, 11, 2 ; t61a, t34a REPX {pmaxsd x, m14}, m0, m3, m7, m4 REPX {pminsd x, m15}, m0, m3, m7, m4 vpbroadcastd m10, [r4+4*10] vpbroadcastd m11, [r4+4*11] psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m8, m2 ; t61 paddd m8, m2 ; t62 REPX {pmaxsd x, m14}, m5, m3, m4, m6 REPX {pminsd x, m15}, m5, m3, m4, m6 ITX_MULSUB_2D 3, 5, 2, 9, _, 13, 10, 11 ; t35, t60 ITX_MULSUB_2D 6, 4, 2, 9, _, 13, 10, 11 ; t34a, t61a REPX {pmaxsd x, m14}, m0, m7, m1, m8 REPX {pminsd x, m15}, m0, m7, m1, m8 add r4, 4*12 mova [r6-64*4], m0 mova [r6+64*3], m7 mova [r6-64*3], m1 mova [r6+64*2], m8 mova [r6-64*2], m6 mova [r6+64*1], m4 mova [r6-64*1], m3 mova [r6+64*0], m5 add r6, 64*8 ret .main_part2: ; idct64 steps 6-9 lea r4, [r6+64*3] sub r6, 64*4 vpbroadcastd m10, [pd_1567] vpbroadcastd m11, [pd_3784] .main_part2_loop: mova m0, [r6-64*32] ; t32a mova m1, [r4-64*24] ; t39a mova m2, [r4-64*32] ; t63a mova m3, [r6-64*24] ; t56a mova m4, [r6-64*16] ; t40a mova m5, [r4-64* 8] ; t47a mova m6, [r4-64*16] ; t55a mova m7, [r6-64* 8] ; t48a psubd m8, m0, m1 ; t39 paddd m0, m1 ; t32 psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 
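; (the remaining butterflies of this stage follow: each paddd/psubd pair is
; clamped to the 18-bit intermediate range via m14/m15, i.e. clip_18b_min/max,
; before the ITX_MULSUB_2D rotations below)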
psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 REPX {pmaxsd x, m14}, m8, m1, m3, m4 REPX {pminsd x, m15}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 13, 10, 11 ; t39a, t56a ITX_MULSUB_2D 4, 3, 6, 9, _, 13, 10, 11, 2 ; t55a, t40a REPX {pmaxsd x, m14}, m0, m2, m5, m7 REPX {pminsd x, m15}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a paddd m2, m7 ; t63a psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m8, m4 ; t55 paddd m8, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 REPX {pmaxsd x, m14}, m6, m7, m5, m4 REPX {pminsd x, m15}, m6, m7, m5, m4 REPX {pmulld x, m12}, m6, m7, m5, m4 REPX {pmaxsd x, m14}, m2, m0, m8, m1 REPX {pminsd x, m15}, m2, m0, m8, m1 paddd m6, m13 paddd m5, m13 psubd m3, m6, m7 ; t47 paddd m6, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m3, m6, m7, m5 mova [r4-64* 8], m2 mova [r6-64*32], m0 mova [r6-64* 8], m8 mova [r4-64*32], m1 mova [r4-64*24], m3 mova [r6-64*16], m6 mova [r6-64*24], m7 mova [r4-64*16], m5 add r6, 64 sub r4, 64 cmp r6, r4 jl .main_part2_loop ret .idct64_main_end: %macro IDCT64_PASS1_END 9 mova m%5, [%9+%1*128] ; t0+n [idct32] + idct64 rounding psubd m%6, m%5, m%2 ; out31-n [idct32] = t31-n [idct64] paddd m%5, m%2 ; out0+n [idct32] = t0+n [idct64] REPX {pmaxsd x, m14}, m%6, m%5 REPX {pminsd x, m15}, m%6, m%5 REPX {paddd x, m11}, m%6, m%5 mova m%2, [r3+%3*64] ; t32+n [idct64] mova m%7, [r3+%4*64] ; t63-n [idct64] psubd m%8, m%5, m%7 ; out63-n paddd m%5, m%7 ; out0+n psubd m%7, m%6, m%2 ; out32+n paddd m%6, m%2 ; out31-n REPX {vpsravd x, m11}, m%8, m%5, m%7, m%6 %endmacro %macro IDCT64_PASS1_ENDx4 1 %assign %%m1 %1 ; t32+n %assign %%m2 (7-%1) ; t39-n %assign %%m3 (8+%1) ; t40+n %assign %%m4 (15-%1) ; t47-n %assign %%m5 (16+%1) ; t48+n %assign %%m6 (23-%1) ; t55-n %assign %%m7 (24+%1) ; t56+n %assign %%m8 (31-%1) ; t63-n %assign %%r1 %1 ; t16+n %assign %%r2 (7-%1) ; t23-n %assign %%r3 (16+%1) ; t24-n %assign %%r4 (23-%1) ; t31-n %assign %%c1 (%1) ; t0/8+n %assign %%c2 (7-%1) ; t7/15-n IDCT64_PASS1_END %%c1, %%r4, %%m1, %%m8, 24, 25, 26, 27, cq ; out0/31/32/63 IDCT64_PASS1_END %%c1, %%r1, %%m4, %%m5, 28, 29, 30, 31, r4 ; out15/16/47/48 packssdw m %+ %%r1, m24, m29 packssdw m %+ %%r4, m28, m25 packssdw m26, m31 packssdw m30, m27 mova [r3+%%m5*mmsize], m26 mova [r3+%%m8*mmsize], m30 IDCT64_PASS1_END %%c2, %%r3, %%m2, %%m7, 24, 25, 26, 27, cq ; out7/24/39/56 IDCT64_PASS1_END %%c2, %%r2, %%m3, %%m6, 28, 29, 30, 31, r4 ; out8/23/40/55 packssdw m %+ %%r2, m24, m29 packssdw m %+ %%r3, m28, m25 packssdw m26, m31 packssdw m30, m27 mova [r3+%%m6*mmsize], m26 mova [r3+%%m7*mmsize], m30 %endmacro IDCT64_PASS1_ENDx4 0 IDCT64_PASS1_ENDx4 1 IDCT64_PASS1_ENDx4 2 IDCT64_PASS1_ENDx4 3 ret .idct64_end: vpbroadcastd m11, [o(pd_2)] lea r4, [cq+64] mov r3, rsp lea r5, [o_base_8bpc] call .idct64_main_end pxor m12, m12 .zero_loop: REPX {mova [cq+r6*8+64*x], m12}, 0, 1, 2, 3 sub r6d, 8*4 jge .zero_loop lea r3, [strideq*3] mov r4, dstq call .pass2 mova m0, [rsp+16*mmsize] mova m1, [rsp+17*mmsize] mova m2, [rsp+18*mmsize] mova m3, [rsp+19*mmsize] mova m4, [rsp+20*mmsize] mova m5, [rsp+21*mmsize] mova m6, [rsp+22*mmsize] mova m7, [rsp+23*mmsize] mova m16, [rsp+24*mmsize] mova m17, [rsp+25*mmsize] mova m18, [rsp+26*mmsize] mova m19, [rsp+27*mmsize] mova m20, [rsp+28*mmsize] mova m21, [rsp+29*mmsize] mova m22, [rsp+30*mmsize] mova m23, [rsp+31*mmsize] lea dstq, [r4+64] call .pass2 RET .pass2: psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 call 
m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 punpckhqdq m19, m5, m16 ; 11 punpcklqdq m5, m16 ; 10 punpckhqdq m16, m2, m1 ; 5 punpcklqdq m2, m1 ; 4 punpcklqdq m1, m15, m4 ; 2 punpckhqdq m15, m4 ; 3 punpcklqdq m4, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpckhqdq m14, m0, m20 ; 1 punpcklqdq m0, m20 ; 0 punpckhqdq m20, m6, m17 ; 13 punpcklqdq m6, m17 ; 12 punpckhqdq m17, m3, m21 ; 7 punpcklqdq m3, m21 ; 6 punpckhqdq m21, m7, m8 ; 15 punpcklqdq m7, m8 ; 14 call m(inv_txfm_add_dct_dct_32x8_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf .write: vpbroadcastd m11, [pw_2048] pxor m12, m12 vpbroadcastd m13, [pixel_10bpc_max] call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x8 pmulhrsw m0, m11, m14 pmulhrsw m1, m11, m15 pmulhrsw m2, m11, m16 pmulhrsw m3, m11, m17 call m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 pmulhrsw m0, m11, m18 pmulhrsw m1, m11, m19 pmulhrsw m2, m11, m20 pmulhrsw m3, m11, m21 jmp m(inv_txfm_add_dct_dct_32x8_10bpc).write_32x4 .fast: ; 8x8 packed movshdup m7, [o(permB)] mova ym0, [cq+64*1] mova ym2, [cq+64*5] mova ym3, [cq+64*3] mova ym1, [cq+64*7] vpermt2q m0, m7, m2 ; 1 5 vpermt2q m1, m7, m3 ; 7 3 call .main_oddhalf_packed mova [rsp+ 0*mmsize], m0 mova [rsp+ 1*mmsize], m1 mova [rsp+ 2*mmsize], m2 mova [rsp+ 3*mmsize], m3 mova [rsp+ 4*mmsize], m4 mova [rsp+ 5*mmsize], m5 mova [rsp+ 6*mmsize], m6 mova [rsp+ 7*mmsize], m7 mova [rsp+ 8*mmsize], m16 mova [rsp+ 9*mmsize], m17 mova [rsp+10*mmsize], m18 mova [rsp+11*mmsize], m19 mova [rsp+12*mmsize], m20 mova [rsp+13*mmsize], m21 mova [rsp+14*mmsize], m22 mova [rsp+15*mmsize], m23 movshdup m7, [o(permB)] mova ym0, [cq+64*0] mova ym4, [cq+64*4] mova ym16, [cq+64*2] mova ym5, [cq+64*6] vpermt2q m16, m7, m5 ; 2 6 vpermq m0, m7, m0 ; 0 0 vpermq m4, m7, m4 ; 4 4 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 ; m0-7,9,16-22 contain un-sumsub'ed dct32 output data ; zero input coefs pxor m12, m12 REPX {mova [cq+x*64], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 vpbroadcastd m11, [o(pd_2)] call .main_end lea r3, [strideq*3] mov r4, dstq call .pass2_fast mova m0, m24 mova m1, m25 mova m2, m26 mova m3, m27 mova m4, m28 mova m5, m29 mova m6, m30 mova m7, m31 lea dstq, [r4+64] lea r5, [o_base] call .pass2_fast RET .pass2_fast: call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 lea r5, [o_base_8bpc] punpckhqdq m14, m0, m2 ; 1 punpcklqdq m0, m2 ; 0 punpcklqdq m1, m3, m4 ; 2 punpckhqdq m15, m3, m4 ; 3 punpcklqdq m2, m5, m7 ; 4 punpckhqdq m16, m5, m7 ; 5 punpcklqdq m3, m6, m8 ; 6 punpckhqdq m17, m6, m8 ; 7 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast jmp .write .main_end: %macro IDCT64_PASS1_PACKED_END 7 psubd m%5, m%1, m%2 ; out31-n [idct32] = t31-n [idct64] paddd m%1, m%2 ; out0+n [idct32] = t0+n [idct64] REPX {pmaxsd x, m14}, m%5, m%1 REPX {pminsd x, m15}, m%5, m%1 REPX {paddd x, m11}, m%5, m%1 mova m%2, [rsp+%6*64+gprsize] ; t32+n [idct64] mova m%3, [rsp+%7*64+gprsize] ; t63-n [idct64] psubd m%4, m%1, m%3 ; out63-n paddd m%1, m%3 ; out0+n psubd m%3, m%5, m%2 ; out32+n paddd m%2, m%5 ; out31-n REPX {vpsravd x, m11}, m%4, m%1, m%3, m%2 %endmacro IDCT64_PASS1_PACKED_END 0, 22, 24, 10, 12, 0, 15 ; out0/1,31/30,32/33,63/62 IDCT64_PASS1_PACKED_END 7, 9, 31, 13, 12, 7, 8 ; out15/14,16/17,47/46,48/49 packssdw m0, m9 packssdw m7, m22 packssdw m24, m13 packssdw m31, m10 IDCT64_PASS1_PACKED_END 1, 21, 25, 10, 12, 1, 14 ; out3/2,28/29,35/34,60/61 IDCT64_PASS1_PACKED_END 6, 16, 30, 13, 12, 6, 9 ; out12/13,19/18,44/45,51/50 packssdw m1, m16 packssdw m6, m21 packssdw m25, m13 packssdw m30, m10 IDCT64_PASS1_PACKED_END 2, 20, 26, 
10, 12, 2, 13 ; out4/5,27/26,36/37,59/58 IDCT64_PASS1_PACKED_END 5, 17, 29, 13, 12, 5, 10 ; out11/10,20/21,43/42,52/53 packssdw m2, m17 packssdw m5, m20 packssdw m26, m13 packssdw m29, m10 IDCT64_PASS1_PACKED_END 3, 19, 27, 10, 12, 3, 12 ; out7/6,24/25,39/38,56/57 IDCT64_PASS1_PACKED_END 4, 18, 28, 13, 12, 4, 11 ; out8/9,23/22,40/41,55/54 packssdw m3, m18 packssdw m4, m19 packssdw m27, m13 packssdw m28, m10 ret .main_oddhalf_packed_rect2: REPX {paddd x, m13}, m0, m1 REPX {psrad x, 12 }, m0, m1 .main_oddhalf_packed: ; m0=in1 in5, m1=in7 in3 vbroadcasti32x4 m2, [o(pd_101_501)] vbroadcasti32x4 m3, [o(pd_m700_m301)] vbroadcasti32x4 m4, [o(pd_4095_4065)] vbroadcasti32x4 m5, [o(pd_4036_4085)] pmulld m2, m0 pmulld m3, m1 pmulld m0, m4 pmulld m1, m5 REPX {paddd x, m13}, m2, m3, m0, m1 REPX {psrad x, 12 }, m2, m3, m0, m1 ; m2=t32a t40a -> t32/33 t40/41, m3=t39a t47a -> t38/39 t46/47 ; m0=t63a t55a -> t62/63 t54/55, m1=t56a t48a -> t56/57 t48/49 ; end of step 1-2 vbroadcasti32x4 m10, [o(pd_401_1931)] vbroadcasti32x4 m11, [o(pd_4076_3612)] mova m4, m0 mova m5, m2 ITX_MULSUB_2D 4, 5, 8, 9, _, 13, 10, 11 vbroadcasti32x4 m10, [o(pd_3166_3920)] vbroadcasti32x4 m11, [o(pd_2598_1189)] mova m6, m3 mova m7, m1 ITX_MULSUB_2D 7, 6, 8, 9, _, 13, 10, 11, 2 ; m4=t33a t41a -> t41/42 t33/34, m5=t63a t54a -> t61/62 t53/54 ; m6=t38a t46a -> t37/38 t45/46, m7=t57a t49a -> t57/58 t49/50 ; and from earlier: ; m0=t63 t55 -> t60/63a t52/55a, m1=t56 t48 -> t56/59a t48/51a ; m2=t32 t40 -> t32/35a t40/43a, m3=t39 t47 -> t36/39a t44/47a ; end of step 3-4 punpcklqdq m22, m2, m4 ; t32a/33 or t35a/34 punpcklqdq m21, m3, m6 ; t36a/37 or t39a/38 punpckhqdq m18, m2, m4 ; t40a/41 or t43a/42 punpckhqdq m17, m3, m6 ; t44a/45 or t47a/46 punpckhqdq m6, m1, m7 ; t48a/49 or t51a/50 punpckhqdq m19, m0, m5 ; t52a/53 or t55a/54 punpcklqdq m8, m1, m7 ; t56a/57 or t59a/58 punpcklqdq m23, m0, m5 ; t60a/61 or t63a/62 mova m0, m22 mova m7, m21 mova m3, m18 mova m16, m17 mova m5, m6 mova m4, m19 mova m2, m8 mova m1, m23 ; m0/22/7/21,18/3/17/16,6/5/19/4,2/8/1/23: t32-63[a] ; step5 vpbroadcastd m10, [o(pd_799)] vpbroadcastd m11, [o(pd_4017)] ITX_MULSUB_2D 1, 22, 20, 9, _, 13, 10, 11 ; t35/34a, t60/61a ITX_MULSUB_2D 8, 7, 20, 9, _, 13, 10, 11, 2 ; t59/58a, t36/37a vpbroadcastd m10, [o(pd_3406)] vpbroadcastd m11, [o(pd_2276)] ITX_MULSUB_2D 19, 3, 20, 9, _, 13, 10, 11 ; t43/42a, t52/53a ITX_MULSUB_2D 5, 17, 20, 9, _, 13, 10, 11, 2 ; t51/50a, t44/45a ; m0-1/7/21: t32-39[a], m18-19/17-16: t40-47[a] ; m6-5/3-4: t48-55[a], m2/8/22-23: t56-63[a] ; step6 psubd m20, m0, m21 ; t39/38a paddd m0, m21 ; t32/33a psubd m21, m1, m7 ; t36a/37 paddd m1, m7 ; t35a/34 REPX {pmaxsd x, m14}, m20, m0, m21, m1 psubd m7, m16, m18 ; t40/41a paddd m16, m18 ; t47/46a REPX {pminsd x, m15}, m20, m0, m21, m1 psubd m18, m17, m19 ; t43a/42 paddd m17, m19 ; t44a/45 REPX {pmaxsd x, m14}, m7, m16, m18, m17 psubd m19, m6, m4 ; t55/54a paddd m6, m4 ; t48/49a REPX {pminsd x, m15}, m7, m16, m18, m17 psubd m4, m5, m3 ; t52a/53 paddd m5, m3 ; t51a/50 REPX {pmaxsd x, m14}, m19, m6, m4, m5 psubd m3, m23, m2 ; t56/57a paddd m23, m2 ; t63/62a REPX {pminsd x, m15}, m19, m6, m4, m5 psubd m2, m22, m8 ; t59a/58 paddd m22, m8 ; t60a/61 REPX {pmaxsd x, m14}, m3, m23, m2, m22 REPX {pminsd x, m15}, m3, m23, m2, m22 ; m0-1: t32-35[a], m17-16: t44-47[a], m6-5: t48-51[a], m22-23: t60-63[a] ; m21-20: t36-39[a], m7/18: t40-43[a], m4/19: t52-55[a], m3-2: t56-59[a] ; step7 vpbroadcastd m10, [o(pd_1567)] vpbroadcastd m11, [o(pd_3784)] ITX_MULSUB_2D 2, 21, 8, 9, _, 13, 10, 11 ; t36/37a, t59/58a ITX_MULSUB_2D 3, 
20, 8, 9, _, 13, 10, 11 ; t39a/38, t56a/57 ITX_MULSUB_2D 19, 7, 8, 9, _, 13, 10, 11, 2 ; t55a/54, t40a/41 ITX_MULSUB_2D 4, 18, 8, 9, _, 13, 10, 11, 2 ; t52/53a, t43/42a ; m0-3: t32-39[a], m7,18-16: t40-47[a], m6-4,19: t48-55[a], m20-23: t56-63[a] ; step8 psubd m8, m0, m16 ; t47a/46 paddd m0, m16 ; t32a/33 psubd m16, m1, m17 ; t44/45a paddd m1, m17 ; t35/34a REPX {pmaxsd x, m14}, m8, m0, m16, m1 psubd m17, m2, m18 ; t43a/42 paddd m2, m18 ; t36a/37 REPX {pminsd x, m15}, m8, m0, m16, m1 psubd m18, m3, m7 ; t40/41a paddd m3, m7 ; t39/38a REPX {pmaxsd x, m14}, m17, m2, m18, m3 psubd m7, m23, m6 ; t48a/49 paddd m23, m6 ; t63a/62 REPX {pminsd x, m15}, m17, m2, m18, m3 psubd m6, m22, m5 ; t51/50a paddd m22, m5 ; t60/61a REPX {pmaxsd x, m14}, m7, m23, m6, m22 psubd m5, m21, m4 ; t52a/53 paddd m21, m4 ; t59a/58 REPX {pminsd x, m15}, m7, m23, m6, m22 psubd m4, m20, m19 ; t55/54a paddd m20, m19 ; t56/57a REPX {pmaxsd x, m14}, m5, m21, m4, m20 REPX {pminsd x, m15}, m5, m21, m4, m20 ; m0-3=t32-39[a], m18-16,8: t40-47[a], m7-4=t48-55[a], m20-23=t56-63[a] ; step9 REPX {pmulld x, m12}, m4, m18, m5, m17, m6, m16, m7, m8 REPX {paddd x, m13}, m4, m5, m6, m7 paddd m19, m4, m18 ; t55a/54 psubd m4, m18 ; t40a/41 paddd m18, m5, m17 ; t52/53a psubd m5, m17 ; t43/42a paddd m17, m6, m16 ; t51a/50 psubd m6, m16 ; t44a/45 paddd m16, m7, m8 ; t48/49a psubd m7, m8 ; t47/46a REPX {psrad x, 12 }, m19, m4, m18, m5, m17, m6, m16, m7 ; m4-7=t40-47[a], m16-19=t48-55[a] ret cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 8, 32, -64*32, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 136 jl .fast add cq, 64 cmp eobd, 543 jge .full call .pass1_fast ; bottomright 16x16 zero mov r7d, 16*12 jmp .lefthalf .full: call .pass1 mov r7d, 16*28 .lefthalf: mova [cq+128* 0], m0 mova [cq+128* 1], m1 mova [cq+128* 2], m2 mova [cq+128* 3], m3 mova [cq+128* 4], m14 mova [cq+128* 5], m15 mova [cq+128* 6], m16 mova [cq+128* 7], m17 mova [cq+128* 8], m22 mova [cq+128* 9], m23 mova [cq+128*10], m24 mova [cq+128*11], m25 mova [cq+128*12], m26 mova [cq+128*13], m27 mova [cq+128*14], m28 mova [cq+128*15], m29 sub cq, 64 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] sub rsp, 16*64 call .pass1 add rsp, 16*64 lea r5, [o_base_8bpc] call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start mov r4, dstq pxor m12, m12 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end lea dstq, [r4+64] mova m0, [rsp+16*mmsize] mova m1, [rsp+17*mmsize] mova m2, [rsp+18*mmsize] mova m3, [rsp+19*mmsize] mova m4, [rsp+20*mmsize] mova m5, [rsp+21*mmsize] mova m6, [rsp+22*mmsize] mova m7, [rsp+23*mmsize] mova m16, [rsp+24*mmsize] mova m17, [rsp+25*mmsize] mova m18, [rsp+26*mmsize] mova m19, [rsp+27*mmsize] mova m20, [rsp+28*mmsize] mova m21, [rsp+29*mmsize] mova m22, [rsp+30*mmsize] mova m23, [rsp+31*mmsize] call .transpose mova [cq+128* 0+64], m0 mova [cq+128* 1+64], m1 mova [cq+128* 2+64], m2 mova [cq+128* 3+64], m3 mova [cq+128* 4+64], m14 mova [cq+128* 5+64], m15 mova [cq+128* 6+64], m16 mova [cq+128* 7+64], m17 mova [cq+128* 8+64], m22 mova [cq+128* 9+64], m23 mova [cq+128*10+64], m24 mova [cq+128*11+64], m25 mova [cq+128*12+64], m26 mova [cq+128*13+64], m27 mova [cq+128*14+64], m28 mova [cq+128*15+64], m29 mova m0, [rsp+ 0*mmsize] mova m1, [rsp+ 1*mmsize] mova m2, [rsp+ 2*mmsize] mova m3, 
[rsp+ 3*mmsize] mova m4, [rsp+ 4*mmsize] mova m5, [rsp+ 5*mmsize] mova m6, [rsp+ 6*mmsize] mova m7, [rsp+ 7*mmsize] mova m16, [rsp+ 8*mmsize] mova m17, [rsp+ 9*mmsize] mova m18, [rsp+10*mmsize] mova m19, [rsp+11*mmsize] mova m20, [rsp+12*mmsize] mova m21, [rsp+13*mmsize] mova m22, [rsp+14*mmsize] mova m23, [rsp+15*mmsize] call .transpose call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_start pxor m12, m12 .right_zero_loop: mova [cq+r7*8+64+128*3], m12 mova [cq+r7*8+64+128*2], m12 mova [cq+r7*8+64+128*1], m12 mova [cq+r7*8+64+128*0], m12 sub r7d, 16*4 jge .right_zero_loop mov r7d, 16*28 jmp .end .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast lea r5, [o_base_8bpc] call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start mov r4, dstq pxor m12, m12 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end lea dstq, [r4+64] mova m0, [rsp+16*mmsize] mova m1, [rsp+17*mmsize] mova m2, [rsp+18*mmsize] mova m3, [rsp+19*mmsize] mova m4, [rsp+20*mmsize] mova m5, [rsp+21*mmsize] mova m6, [rsp+22*mmsize] mova m7, [rsp+23*mmsize] mova m16, [rsp+24*mmsize] mova m17, [rsp+25*mmsize] mova m18, [rsp+26*mmsize] mova m19, [rsp+27*mmsize] mova m20, [rsp+28*mmsize] mova m21, [rsp+29*mmsize] mova m22, [rsp+30*mmsize] mova m23, [rsp+31*mmsize] call .transpose call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast_start mov r7d, 16*12 pxor m12, m12 jmp .end .fast2: ; topleft 8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym1, [cq+128*7] vpermt2q m0, m7, m2 ; 1 5 vpermt2q m1, m7, m3 ; 7 3 REPX {pmulld x, m12}, m0, m1 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed_rect2 mova [rsp+ 0*mmsize], m0 mova [rsp+ 1*mmsize], m1 mova [rsp+ 2*mmsize], m2 mova [rsp+ 3*mmsize], m3 mova [rsp+ 4*mmsize], m4 mova [rsp+ 5*mmsize], m5 mova [rsp+ 6*mmsize], m6 mova [rsp+ 7*mmsize], m7 mova [rsp+ 8*mmsize], m16 mova [rsp+ 9*mmsize], m17 mova [rsp+10*mmsize], m18 mova [rsp+11*mmsize], m19 mova [rsp+12*mmsize], m20 mova [rsp+13*mmsize], m21 mova [rsp+14*mmsize], m22 mova [rsp+15*mmsize], m23 movshdup m7, [o(permB)] pmulld ym0, ym12, [cq+128*0] pmulld ym4, ym12, [cq+128*4] mova ym16, [cq+128*2] mova ym5, [cq+128*6] REPX {paddd x, ym13}, ym0, ym4 REPX {psrad x, 12 }, ym0, ym4 vpermt2q m16, m7, m5 ; 2 6 vpermq m0, m7, m0 ; 0 0 vpermq m4, m7, m4 ; 4 4 pmulld m16, m12 paddd m16, m13 psrad m16, 12 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 vpbroadcastd m11, [o(pd_1)] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end mova [rsp+16*mmsize], m24 mova [rsp+17*mmsize], m25 mova [rsp+18*mmsize], m26 mova [rsp+19*mmsize], m27 mova [rsp+20*mmsize], m28 mova [rsp+21*mmsize], m29 mova [rsp+22*mmsize], m30 mova [rsp+23*mmsize], m31 vpbroadcastd m13, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start mov r7d, 16*4 mov r4, dstq pxor m12, m12 call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end lea dstq, [r4+64] mova m0, [rsp+16*mmsize] mova m1, [rsp+17*mmsize] mova m2, [rsp+18*mmsize] mova m3, [rsp+19*mmsize] mova m4, [rsp+20*mmsize] mova m5, [rsp+21*mmsize] mova m6, [rsp+22*mmsize] mova m7, [rsp+23*mmsize] lea r5, [o_base] vpbroadcastd m13, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_fast2_start pxor m12, m12 .end: call m(inv_txfm_add_dct_dct_32x32_10bpc).pass2_end .zero_loop: mova [cq+r7*8+128*3], m12 mova [cq+r7*8+128*2], m12 mova [cq+r7*8+128*1], m12 mova [cq+r7*8+128*0], m12 sub r7d, 16*4 jge .zero_loop RET .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 32 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 384 sar r6d, 9 jmp 
m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2 .pass1_fast: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] pmulld m0, m12, [cq+128* 1] pmulld m3, m12, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 pmulld m0, m12, [cq+128* 7] pmulld m3, m12, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 pmulld m0, m12, [cq+128* 5] pmulld m3, m12, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 pmulld m0, m12, [cq+128* 3] pmulld m3, m12, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast_rect2 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 8] pmulld m16, m12, [cq+128* 4] pmulld m17, m12, [cq+128*12] call m(idct_8x16_internal_10bpc).main_fast2_rect2 call m(idct_16x16_internal_10bpc).main_fast2_rect2 call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2_rect2 jmp .pass1_end .pass1: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] pmulld m0, m12, [cq+128* 1] pmulld m1, m12, [cq+128*31] pmulld m2, m12, [cq+128*17] pmulld m3, m12, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 pmulld m0, m12, [cq+128* 7] pmulld m1, m12, [cq+128*25] pmulld m2, m12, [cq+128*23] pmulld m3, m12, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 pmulld m0, m12, [cq+128* 5] pmulld m1, m12, [cq+128*27] pmulld m2, m12, [cq+128*21] pmulld m3, m12, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 pmulld m0, m12, [cq+128* 3] pmulld m1, m12, [cq+128*29] pmulld m2, m12, [cq+128*19] pmulld m3, m12, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_rect2 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 pmulld m0, m12, [cq+128* 0] pmulld m1, m12, [cq+128* 8] pmulld m2, m12, [cq+128*16] pmulld m3, m12, [cq+128*24] pmulld m16, m12, [cq+128* 4] pmulld m17, m12, [cq+128*12] pmulld m18, m12, [cq+128*20] pmulld m19, m12, [cq+128*28] call m(idct_8x16_internal_10bpc).main_fast_rect2 call m(idct_16x16_internal_10bpc).main_fast_rect2 call .pass1_load_spill pmulld m4, m12, [cq+128*18] pmulld m5, m12, [cq+128*22] pmulld m6, m12, [cq+128*26] pmulld m7, m12, [cq+128*30] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast_rect2 .pass1_end: vpbroadcastd m11, [o(pd_1)] lea r3, [rsp+gprsize] lea r4, [cq+8*128] call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end ; transpose one half immediately, we can transpose lower half later .transpose: ; transpose m0-7,16-23 psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 punpckhqdq m22, m0, m20 ; 1 punpcklqdq m0, m20 ; 0 punpckhqdq m24, m2, m1 ; 5 punpcklqdq m1, m2, m1 ; 4 punpcklqdq m2, m14, m18 ; 8 punpckhqdq m26, m14, m18 ; 9 punpcklqdq m14, m15, m4 ; 2 punpckhqdq m23, m15, m4 ; 3 punpckhqdq m25, m3, m21 ; 7 punpcklqdq m15, m3, m21 ; 6 punpckhqdq m28, m6, m17 ; 13 punpcklqdq m3, m6, m17 ; 12 punpckhqdq m27, m5, m16 ; 11 punpcklqdq m16, m5, m16 ; 10 punpckhqdq m29, m7, m8 ; 15 punpcklqdq m17, m7, m8 ; 14 ret .pass1_load_spill: call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub mova [cq+128* 0], m0 mova [cq+128* 1], m1 pmulld m0, m12, [cq+128* 2] pmulld m1, m12, [cq+128* 6] mova [cq+128* 2], m2 mova [cq+128* 3], m3 pmulld m2, m12, [cq+128*10] pmulld m3, m12, [cq+128*14] mova [cq+128* 4], m4 mova [cq+128* 5], m5 mova [cq+128* 6], m6 mova [cq+128* 7], m7 mova [cq+128* 8], m23 mova [cq+128* 9], m22 mova [cq+128*10], m21 mova [cq+128*11], m20 mova [cq+128*12], 
m19 mova [cq+128*13], m18 mova [cq+128*14], m17 mova [cq+128*15], m16 ret cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 4, 9, 32, -64*32, dst, stride, c, eob %undef cmp vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] cmp eobd, 136 jl .fast add cq, 64 cmp eobd, 543 jge .full call .pass1_fast ; bottomright 16x16 zero mov r7d, 16*12 jmp .lefthalf .full: call .pass1 mov r7d, 16*28 .lefthalf: mova [cq+128* 0], m27 mova [cq+128* 1], m14 mova [cq+128* 2], m28 mova [cq+128* 3], m15 mova [cq+128* 4], m22 mova [cq+128* 5], m23 mova [cq+128* 6], m24 mova [cq+128* 7], m25 mova [cq+128* 8], m0 mova [cq+128* 9], m26 mova [cq+128*10], m20 mova [cq+128*11], m21 mova [cq+128*12], m18 mova [cq+128*13], m16 mova [cq+128*14], m17 mova [cq+128*15], m3 sub cq, 64 vpbroadcastd m12, [o(pd_2896)] vpbroadcastd m13, [o(pd_2048)] vpbroadcastd m14, [o(clip_18b_min)] vpbroadcastd m15, [o(clip_18b_max)] sub rsp, 16*64 call .pass1 sub rsp, 24*64 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+56*mmsize] mova m1, [rsp+57*mmsize] mova m2, [rsp+58*mmsize] mova m3, [rsp+59*mmsize] mova m4, [rsp+60*mmsize] mova m5, [rsp+61*mmsize] mova m6, [rsp+62*mmsize] mova m7, [rsp+63*mmsize] mova m16, [rsp+64*mmsize] mova m17, [rsp+65*mmsize] mova m18, [rsp+66*mmsize] mova m19, [rsp+67*mmsize] mova m20, [rsp+68*mmsize] mova m21, [rsp+69*mmsize] mova m22, [rsp+70*mmsize] mova m23, [rsp+71*mmsize] call .transpose mova [cq+128* 0+64], m27 mova [cq+128* 1+64], m14 mova [cq+128* 2+64], m28 mova [cq+128* 3+64], m15 mova [cq+128* 4+64], m22 mova [cq+128* 5+64], m23 mova [cq+128* 6+64], m24 mova [cq+128* 7+64], m25 mova [cq+128* 8+64], m0 mova [cq+128* 9+64], m26 mova [cq+128*10+64], m20 mova [cq+128*11+64], m21 mova [cq+128*12+64], m18 mova [cq+128*13+64], m16 mova [cq+128*14+64], m17 mova [cq+128*15+64], m3 mova m0, [rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] mova m16, [rsp+48*mmsize] mova m17, [rsp+49*mmsize] mova m18, [rsp+50*mmsize] mova m19, [rsp+51*mmsize] mova m20, [rsp+52*mmsize] mova m21, [rsp+53*mmsize] mova m22, [rsp+54*mmsize] mova m23, [rsp+55*mmsize] add rsp, 32*64 call .transpose lea r5, [o_base] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_start .right_zero_loop: REPX {mova [cq+r7*8+64+128*x], m31}, 0, 1, 2, 3 sub r7d, 16*4 jge .right_zero_loop mov r7d, 16*28 jmp .end .fast: ; topleft 16x16 nonzero cmp eobd, 36 jl .fast2 call .pass1_fast sub rsp, 24*64 vpbroadcastd m10, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] mova m16, [rsp+48*mmsize] mova m17, [rsp+49*mmsize] mova m18, [rsp+50*mmsize] mova m19, [rsp+51*mmsize] mova m20, [rsp+52*mmsize] mova m21, [rsp+53*mmsize] mova m22, [rsp+54*mmsize] mova m23, [rsp+55*mmsize] add rsp, 16*64 call .transpose lea r5, [o_base] vpbroadcastd m10, [o(pd_2048)] call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast_start mov r7d, 16*12 jmp .end .fast2: ; topleft 
8x8 nonzero movshdup m7, [o(permB)] mova ym0, [cq+128*1] mova ym2, [cq+128*5] mova ym3, [cq+128*3] mova ym1, [cq+128*7] vpermt2q m0, m7, m2 ; 1 5 vpermt2q m1, m7, m3 ; 7 3 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_oddhalf_packed mova [rsp+ 0*mmsize], m0 mova [rsp+ 1*mmsize], m1 mova [rsp+ 2*mmsize], m2 mova [rsp+ 3*mmsize], m3 mova [rsp+ 4*mmsize], m4 mova [rsp+ 5*mmsize], m5 mova [rsp+ 6*mmsize], m6 mova [rsp+ 7*mmsize], m7 mova [rsp+ 8*mmsize], m16 mova [rsp+ 9*mmsize], m17 mova [rsp+10*mmsize], m18 mova [rsp+11*mmsize], m19 mova [rsp+12*mmsize], m20 mova [rsp+13*mmsize], m21 mova [rsp+14*mmsize], m22 mova [rsp+15*mmsize], m23 movshdup m7, [o(permB)] mova ym0, [cq+128*0] mova ym4, [cq+128*4] mova ym16, [cq+128*2] mova ym5, [cq+128*6] vpermt2q m16, m7, m5 ; 2 6 vpermq m0, m7, m0 ; 0 0 vpermq m4, m7, m4 ; 4 4 call m(inv_txfm_add_dct_dct_32x8_10bpc).main_fast3 vpbroadcastd m11, [o(pd_2)] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end sub rsp, 16*64 mova [rsp+40*mmsize], m24 mova [rsp+41*mmsize], m25 mova [rsp+42*mmsize], m26 mova [rsp+43*mmsize], m27 mova [rsp+44*mmsize], m28 mova [rsp+45*mmsize], m29 mova [rsp+46*mmsize], m30 mova [rsp+47*mmsize], m31 call .pass2_fast2_start mov r7d, 16*4 mov r8, dstq pxor m31, m31 call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end lea dstq, [r8+64] mova m0, [rsp+40*mmsize] mova m1, [rsp+41*mmsize] mova m2, [rsp+42*mmsize] mova m3, [rsp+43*mmsize] mova m4, [rsp+44*mmsize] mova m5, [rsp+45*mmsize] mova m6, [rsp+46*mmsize] mova m7, [rsp+47*mmsize] add rsp, 8*64 lea r5, [o_base] call .pass2_fast2_start .end: pxor m31, m31 .zero_loop: REPX {mova [cq+r7*8+128*x], m31}, 0, 1, 2, 3 sub r7d, 16*4 jge .zero_loop call m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_end add rsp, 8*64 ; FIXME adjust stack_size_padded instead? 
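; (the add rsp above releases the extra scratch space taken with sub rsp
; earlier in this function, so the epilogue emitted by RET finds rsp where
; PROLOGUE left it)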
RET .pass2_fast2_start: call m(inv_txfm_add_dct_dct_32x8_10bpc).transpose_8x32 punpcklqdq m27, m0, m2 ; 0 punpckhqdq m0, m2 ; 1 punpcklqdq m22, m3, m4 ; 2 punpckhqdq m26, m3, m4 ; 3 punpcklqdq m14, m5, m7 ; 4 punpckhqdq m20, m5, m7 ; 5 punpcklqdq m23, m6, m8 ; 6 punpckhqdq m21, m6, m8 ; 7 vpbroadcastd m10, [o(pd_2048)] jmp m(inv_txfm_add_dct_dct_32x64_10bpc).pass2_fast2_start .dconly: imul r6d, [cq], 181 mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly1 .pass1_fast: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] mova m0, [cq+128* 1] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 7] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 5] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast mova m0, [cq+128* 3] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1_fast call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m16, [cq+128* 4] mova m17, [cq+128*12] call m(idct_8x16_internal_10bpc).main_fast2 call m(idct_16x16_internal_10bpc).main_fast2 call .pass1_load_spill call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast2 jmp .pass1_end .pass1: lea r4, [idct64_mul_16bpc] lea r6, [rsp+4*64+gprsize] mova m0, [cq+128* 1] mova m1, [cq+128*31] mova m2, [cq+128*17] mova m3, [cq+128*15] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 7] mova m1, [cq+128*25] mova m2, [cq+128*23] mova m3, [cq+128* 9] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 5] mova m1, [cq+128*27] mova m2, [cq+128*21] mova m3, [cq+128*11] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 mova m0, [cq+128* 3] mova m1, [cq+128*29] mova m2, [cq+128*19] mova m3, [cq+128*13] call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_10bpc).main_part2 mova m0, [cq+128* 0] mova m1, [cq+128* 8] mova m2, [cq+128*16] mova m3, [cq+128*24] mova m16, [cq+128* 4] mova m17, [cq+128*12] mova m18, [cq+128*20] mova m19, [cq+128*28] call m(idct_8x16_internal_10bpc).main_fast call m(idct_16x16_internal_10bpc).main_fast call .pass1_load_spill mova m4, [cq+128*18] mova m5, [cq+128*22] mova m6, [cq+128*26] mova m7, [cq+128*30] call m(inv_txfm_add_dct_dct_32x16_10bpc).main_fast .pass1_end: vpbroadcastd m11, [o(pd_2)] lea r3, [rsp+gprsize] lea r4, [cq+8*128] call m(inv_txfm_add_dct_dct_64x16_10bpc).idct64_main_end ; transpose one half immediately, we can transpose lower half later .transpose: ; transpose m0-7,16-23 psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11 psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15 call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32 punpcklqdq m27, m0, m20 ; 0 punpckhqdq m0, m20 ; 1 punpcklqdq m24, m5, m16 ; 10 punpckhqdq m16, m5, m16 ; 11 punpcklqdq m23, m3, m21 ; 6 punpckhqdq m21, m3, m21 ; 7 punpcklqdq m25, m7, m8 ; 14 punpckhqdq m3, m7, m8 ; 15 punpcklqdq m22, m15, m4 ; 2 punpckhqdq m26, m15, m4 ; 3 punpcklqdq m15, m6, m17 ; 12 punpckhqdq m17, m6, m17 ; 13 punpcklqdq m28, m14, m18 ; 8 punpckhqdq m18, m14, m18 ; 9 punpcklqdq m14, m2, m1 ; 4 punpckhqdq m20, m2, m1 ; 5 ret .pass1_load_spill: call m(inv_txfm_add_dct_dct_32x16_10bpc).idct16_sumsub mova [cq+128* 0], m0 mova [cq+128* 1], m1 mova m0, [cq+128* 2] mova m1, [cq+128* 6] mova [cq+128* 2], m2 mova [cq+128* 3], m3 mova m2, [cq+128*10] mova m3, [cq+128*14] mova [cq+128* 4], m4 mova [cq+128* 5], m5 mova [cq+128* 6], m6 mova [cq+128* 7], m7 mova [cq+128* 8], m23 mova [cq+128* 9], m22 mova 
[cq+128*10], m21
    mova          [cq+128*11], m20
    mova          [cq+128*12], m19
    mova          [cq+128*13], m18
    mova          [cq+128*14], m17
    mova          [cq+128*15], m16
    ret
%endif ; ARCH_X86_64
rav1e-0.7.1/src/x86/itx16_sse.asm000064400000000000000000010562551046102023000144270ustar 00000000000000
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; Copyright © 2017-2021, The rav1e contributors
; Copyright © 2020, Nathan Egge
; Copyright © 2021, Matthias Dressel
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

%macro COEF 1-2
pd_%1: times 4 dd %1
%if %0 == 2
pd_m%1: times 4 dd -%1
%endif
%endmacro

COEF  201
COEF  401
COEF  601, 1
COEF  799
COEF  995
COEF 1189, 1
COEF 1380, 1
COEF 1567
COEF 1751
COEF 1931
COEF 2106, 1
COEF 2276, 1
COEF 2440
COEF 2598, 1
COEF 2751, 1
COEF 2896
COEF 3035
COEF 3166
COEF 3290
COEF 3406
COEF 3513
COEF 3612
COEF 3703
COEF 3784
COEF 3857
COEF 3920
COEF 3973
COEF 4017
COEF 4052
COEF 4076
COEF 4091

deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15

%if ARCH_X86_32
pd_1: times 4 dd 1
%endif
pd_2: times 4 dd 2
pw_5: times 8 dw 5
pd_1321: times 4 dd 1321
pd_2482: times 4 dd 2482
pd_m3344: times 4 dd -3344
pd_2048: times 4 dd 2048
pw_4x2048_4xm2048: times 4 dw 2048
                   times 4 dw -2048
pw_4xm2048_4x2048: times 4 dw -2048
                   times 4 dw 2048
pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pd_3803: times 4 dd 3803
pw_4096: times 8 dw 4096
pd_5793: times 4 dd 5793
pd_6144: times 4 dd 6144
pw_8192: times 8 dw 8192
pd_10240: times 4 dd 10240
pd_11586: times 4 dd 11586
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_1697x16: times 8 dw 1697*16
pw_16384: times 8 dw 16384
pixel_10bpc_max: times 8 dw 0x03ff
pw_1567_3784: times 4 dw 1567, 3784
pw_m3784_1567: times 4 dw -3784, 1567
pw_2896_2896: times 4 dw 2896, 2896
pw_m2896_2896: times 4 dw -2896, 2896
clip_18b_min: times 4 dd -0x20000
clip_18b_max: times 4 dd 0x1ffff

idct64_mul_16bpc: dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
                  dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
                  dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
                  dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406

cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
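; The 8bpc SSSE3 transform helpers declared above and below are provided by
; itx_sse.asm; the 16bpc functions in this file call into them for their second
; pass, after pass 1 has packed its output down to 16-bit with packssdw.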
cextern iadst_4x4_internal_8bpc_ssse3.main
cextern idct_4x8_internal_8bpc_ssse3.main
cextern iadst_4x8_internal_8bpc_ssse3.main
cextern idct_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main
cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x4_internal_8bpc_ssse3.main
cextern iadst_8x4_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.main
cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
cextern iadst_8x8_internal_8bpc_ssse3.main
cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main
cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
cextern idct_8x32_internal_8bpc_ssse3.main
cextern idct_8x32_internal_8bpc_ssse3.main_fast
cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
cextern idct_16x64_internal_8bpc_ssse3.main
cextern idct_16x64_internal_8bpc_ssse3.main_fast

tbl_4x16_2d: db 0, 13, 29, 45
tbl_4x16_h: db 0, 16, 32, 48
tbl_4x16_v: db 0, 4, 8, 12
tbl_8x16_2d: db 0, 14, 30, 46
tbl_8x16_v: db 0, 4, 8, 12
tbl_8x16_h: db 0, 32, 64, 96
tbl_16x16_2d: db 0, 10, 36, 78
tbl_16x16_v: db 0, 4, 8, 12
tbl_16x16_h: db 0, 64, 128, 192
tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406

tbl_Nx32_odd_offset: db 2*16, 2*23
                     db 2*20, 2*19
                     db 2*18, 2*21
                     db 2*22, 2*17
                     db 2*30, 2*25
                     db 2*26, 2*29
                     db 2*28, 2*27
                     db 2*24, 2*31

tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
                 db 2* 8, 2*40, 2*23, 2*38
                 db 2* 1, 2*36, 2*20, 2*42
                 db 2* 9, 2*44, 2*19, 2*34
                 db 2* 2, 2*60, 2*18, 2*50
                 db 2*10, 2*52, 2*21, 2*58
                 db 2* 3, 2*56, 2*22, 2*54
                 db 2*11, 2*48, 2*17, 2*62

SECTION .text

%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
%define m(x) m_suffix(x, SUFFIX)

; This refers to the first function in itx_sse i.e. the start of the text section
; which is needed as a base pointer for constants.
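; On x86-32 the o(x) define below expands to r6-$$+x, with r6 loaded through
; "LEA r6, $$" in INV_TXFM_FN, so constant references stay position-independent;
; on x86-64 o(x) is simply the plain address.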
%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) %if ARCH_X86_64 %define o(x) x %else %define o(x) r6-$$+x ; PIC %endif %macro IWHT4_1D 0 ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 paddd m0, m1 ; in0 += in1 psubd m4, m2, m3 ; tmp0 = in2 - in3 psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 psrad m5, 1 psubd m2, m5, m1 ; in2 = tmp1 - in1 psubd m5, m3 ; in1 = tmp1 - in3 psubd m0, m5 ; in0 -= in1 paddd m4, m2 ; in3 = tmp0 + in2 ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 ; m4 = out3, m5 = out1 %endmacro INIT_XMM sse2 cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax mova m0, [cq+16*0] mova m1, [cq+16*1] mova m2, [cq+16*2] mova m3, [cq+16*3] REPX {psrad x, 2}, m0, m1, m2, m3 IWHT4_1D punpckldq m1, m0, m5 punpckhdq m3, m0, m5 punpckldq m5, m2, m4 punpckhdq m2, m4 punpcklqdq m0, m1, m5 punpckhqdq m1, m5 punpcklqdq m4, m3, m2 punpckhqdq m3, m2 mova m2, m4 IWHT4_1D packssdw m0, m4 ; low: out3, high: out0 packssdw m2, m5 ; low: out2, high: out1 pxor m4, m4 mova [cq+16*0], m4 mova [cq+16*1], m4 mova [cq+16*2], m4 mova [cq+16*3], m4 lea r2, [dstq+strideq*2] movq m1, [dstq+strideq*0] movhps m1, [r2 +strideq*1] movq m3, [r2 +strideq*0] movhps m3, [dstq+strideq*1] movd m5, bdmaxm pshuflw m5, m5, q0000 ; broadcast punpcklqdq m5, m5 ; broadcast paddsw m0, m1 paddsw m2, m3 pmaxsw m0, m4 pmaxsw m2, m4 pminsw m0, m5 pminsw m2, m5 movhps [r2 +strideq*1], m0 ; write out0 movhps [dstq+strideq*1], m2 ; write out1 movq [r2 +strideq*0], m2 ; write out2 movq [dstq+strideq*0], m0 ; write out3 RET ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 ; flags: 2 = inv_dst1, 4 = inv_dst2 ; skip round/shift if rnd is not a number %macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags ; %1 dst/src[1] ; %2 dst/src[2] ; %3 tmp[1] ; %4 tmp[2] ; %5 tmp[3] ; %6 rnd ; %7 coef[1] ; %8 coef[2] ; %9 flags %ifnidn %7,%8 ; optimize when coef1 == coef2 %if %8 < 32 pmulld m%4, m%1, m%8 pmulld m%3, m%2, m%8 %else mova m%3, [o(pd_%8)] pmulld m%4, m%1, m%3 pmulld m%3, m%2 %endif %endif %if %7 < 32 pmulld m%1, m%7 pmulld m%2, m%7 %else mova m%5, [o(pd_%7)] pmulld m%1, m%5 pmulld m%2, m%5 %endif %if %9 & 4 ; invert dst2 paddd m%4, m%2 psubd m%2, m%6, m%4 %else %ifnum %6 %ifnidn %7,%8 paddd m%4, m%6 %else paddd m%1, m%6 %endif %endif %ifnidn %7,%8 paddd m%2, m%4 %else mova m%3, m%2 paddd m%2, m%1 %endif %endif %if %9 & 2 ; invert dst1 psubd m%3, m%1 paddd m%1, m%3, m%6 %else %ifnum %6 %ifnidn %7,%8 paddd m%1, m%6 %endif %endif psubd m%1, m%3 %endif %ifnum %6 psrad m%2, 12 psrad m%1, 12 %endif %endmacro %macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%4_internal_16bpc) %if ARCH_X86_32 LEA r6, $$ %endif %if has_epilogue %ifidn %1_%2, dct_dct test eobd, eobd jz %%end %endif lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] %ifnum %3 %if %3 add eobd, %3 %endif %else lea r5, [o(%3)] %endif call %%p1 RET %%end: %else ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
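; tx2q is loaded with the second-pass entry point below; each pass-1 routine
; ends with "jmp tx2q", which is the indirect jump to the 2nd txfm function
; mentioned above.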
lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] %ifnum %3 %if %3 add eobd, %3 %endif %else lea r5, [o(%3)] %endif %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 0, 4x4 %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 4 .dconly: add r5d, 128 sar r5d, 8 .dconly2: imul r5d, 2896 mova m2, [o(pixel_10bpc_max)] add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 pxor m3, m3 punpcklqdq m0, m0 .dconly_loop: movq m1, [dstq+strideq*0] movhps m1, [dstq+strideq*1] paddw m1, m0 pminsw m1, m2 pmaxsw m1, m3 movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %endif %endmacro %macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd ; butterfly rotation ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 ; Hadamard rotation psubd m%5, m%1, m%2 paddd m%2, m%1 paddd m%1, m%3, m%4 psubd m%3, m%4 ; %1 (src1) = out0 ; %2 (src2) = out1 ; %3 (src3) = out3 ; $5 (tmp1) = out2 %endmacro INIT_XMM sse4 INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, identity INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] mova m2, [cq+16*2] mova m3, [cq+16*3] mova m5, [o(pd_2048)] call .pass1_main packssdw m0, m1 ; out0 out1 packssdw m4, m2 ; out2 out3 ; transpose punpckhwd m2, m0, m4 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 jmp tx2q .pass1_main: IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 ret .pass2: ; m0 = in0 in1 ; m1 = in2 in3 ; m5 = pd_2048 punpckhwd m2, m1, m0 punpcklwd m1, m0 pmaddwd m4, m2, [o(pw_m3784_1567)] pmaddwd m2, [o(pw_1567_3784)] pmaddwd m0, m1, [o(pw_m2896_2896)] pmaddwd m1, [o(pw_2896_2896)] REPX {paddd x, m5}, m4, m2, m0, m1 packssdw m5, m5 ; pw_2048 REPX {psrad x, 12}, m4, m2, m0, m1 packssdw m2, m4 ; t3 t2 packssdw m1, m0 ; t0 t1 paddsw m0, m1, m2 ; out0 out1 psubsw m1, m2 ; out3 out2 pmulhrsw m0, m5 pmulhrsw m1, m5 movq m2, [dstq+strideq*0] movhps m2, [dstq+strideq*1] lea r5, [dstq+strideq*2] movq m3, [r5 +strideq*1] movhps m3, [r5 +strideq*0] mova m5, [o(pixel_10bpc_max)] pxor m4, m4 mova [cq+16*0], m4 mova [cq+16*1], m4 mova [cq+16*2], m4 mova [cq+16*3], m4 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movhps [r5 +strideq*0], m1 movq [r5 +strideq*1], m1 RET INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .main packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 ; transpose punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 jmp tx2q .pass2: ; m0 = in0 in1 ; m1 = in2 in3 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main .end: mova m4, [o(pw_2048)] movq m2, [dstq+strideq*0] movhps m2, [dstq+strideq*1] lea r5, [dstq+strideq*2] movq m3, [r5 +strideq*0] movhps m3, [r5 +strideq*1] mova m5, [o(pixel_10bpc_max)] pmulhrsw m0, m4 pmulhrsw m1, m4 pxor m4, m4 mova [cq+16*0], m4 mova [cq+16*1], m4 mova [cq+16*2], m4 mova [cq+16*3], m4 paddw m0, 
m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [r5 +strideq*0], m1 movhps [r5 +strideq*1], m1 RET ALIGN function_align .main: mova m1, [cq+16*2] mova m3, [cq+16*3] mova m5, [cq+16*0] lea r3, [cq+16*1] .main2: mova m0, [o(pd_1321)] ; SINPI_1_9 mova m2, [o(pd_2482)] ; SINPI_2_9 mova m6, [o(pd_3803)] ; SINPI_4_9 pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] psubd m1, m3 ; T[2] - T[3] pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] paddd m0, m6 ; s[0] += s[3] paddd m0, m3 ; s[0] += s[5] mova m3, [o(pd_m3344)] ; -SINPI_3_9 psubd m2, m4 ; s[1] -= s[4] psubd m2, m7 ; s[1] -= s[6] psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 paddd m4, m0, m2 ; x[3] = s[0] + s[1] psubd m2, m3 ; x[1] = s[1] + s[3] psubd m0, m3 ; x[0] = s[0] + s[3] paddd m4, m3 ; x[3] -= s[3] paddd m2, m5 ; x[1] + 2048 REPX {psrad x, 12}, m0, m2, m1, m4 ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x4_internal_16bpc).main packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 ; transpose punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 jmp tx2q .pass2: ; m0 = in0 in1 ; m1 = in2 in3 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main mova m4, [o(pw_2048)] movq m3, [dstq+strideq*1] movhps m3, [dstq+strideq*0] lea r5, [dstq+strideq*2] movq m2, [r5 +strideq*1] movhps m2, [r5 +strideq*0] mova m5, [o(pixel_10bpc_max)] pmulhrsw m0, m4 pmulhrsw m1, m4 pxor m4, m4 mova [cq+16*0], m4 mova [cq+16*1], m4 mova [cq+16*2], m4 mova [cq+16*3], m4 paddw m0, m2 paddw m1, m3 pmaxsw m0, m4 pmaxsw m1, m4 pminsw m0, m5 pminsw m1, m5 movhps [dstq+strideq*0], m1 movq [dstq+strideq*1], m1 movhps [r5 +strideq*0], m0 movq [r5 +strideq*1], m0 RET INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m3, [o(pd_5793)] pmulld m0, m3, [cq+16*0] pmulld m1, m3, [cq+16*1] pmulld m2, m3, [cq+16*2] pmulld m3, [cq+16*3] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 ; transpose punpckhwd m3, m0, m2 punpcklwd m0, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 ; m0 = out0 out1 ; m1 = out2 out3 ; m5 = pd_2048 jmp tx2q .pass2: ; m0 = in0 in1 ; m1 = in2 in3 ; m5 = pd_2048 mova m4, [o(pw_1697x8)] movq m2, [dstq+strideq*0] movhps m2, [dstq+strideq*1] lea r5, [dstq+strideq*2] pmulhrsw m3, m4, m0 pmulhrsw m4, m1 paddsw m0, m3 paddsw m1, m4 movq m3, [r5 +strideq*0] movhps m3, [r5 +strideq*1] mova m4, [o(pixel_10bpc_max)] packssdw m5, m5 ; pw_2048 pmulhrsw m0, m5 pmulhrsw m1, m5 pxor m5, m5 mova [cq+16*0], m5 mova [cq+16*1], m5 mova [cq+16*2], m5 mova [cq+16*3], m5 paddw m0, m2 paddw m1, m3 pmaxsw m0, m5 pmaxsw m1, m5 pminsw m0, m4 pminsw m1, m4 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [r5 +strideq*0], m1 movhps [r5 +strideq*1], m1 RET %macro INV_TXFM_4X8_FN 2-3 0 ; 
type1, type2, eob_offset INV_TXFM_FN %1, %2, %3, 4x8 %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 add r5d, 128 sar r5d, 8 imul r5d, 181 jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly %endif %endmacro INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, identity, 9 INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp mova m5, [o(pd_2048)] %if ARCH_X86_64 xor r5d, r5d cmp eobd, 13 setge r5b %else mov r5d, 1 cmp eobd, 13 sbb r5d, 0 %endif shl r5d, 4 .loop_pass1: mova m3, [o(pd_2896)] pmulld m0, m3, [cq+32*0+r5] pmulld m1, m3, [cq+32*1+r5] pmulld m2, m3, [cq+32*2+r5] pmulld m3, [cq+32*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 call m(idct_4x4_internal_16bpc).pass1_main packssdw m0, m1 ; out0 out1 packssdw m4, m2 ; out2 out3 test r5d, r5d jz .end_pass1 mova [cq+32*0+16], m0 mova [cq+32*1+16], m4 xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m2, m0, m4 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova m2, [cq+32*0+16] mova m6, [cq+32*1+16] punpckhwd m4, m2, m6 punpcklwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_4x8_internal_8bpc, _ssse3).main ; m0-3 is now out0/1,3/2,4/5,7/6 mova m4, [o(pw_2048)] shufps m1, m1, q1032 shufps m3, m3, q1032 .end: REPX {pmulhrsw x, m4}, m0, m1, m2, m3 pxor m4, m4 REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 mova m7, [o(pixel_10bpc_max)] lea r2, [strideq*3] movq m5, [dstq+strideq*0] movq m6, [dstq+strideq*2] movhps m5, [dstq+strideq*1] movhps m6, [dstq+r2] lea r4, [dstq+strideq*4] paddw m0, m5 paddw m1, m6 movq m5, [r4+strideq*0] movq m6, [r4+strideq*2] movhps m5, [r4+strideq*1] movhps m6, [r4+r2] paddw m2, m5 paddw m3, m6 REPX {pminsw x, m7}, m0, m1, m2, m3 REPX {pmaxsw x, m4}, m0, m1, m2, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r2 ], m1 movq [r4 +strideq*0], m2 movhps [r4 +strideq*1], m2 movq [r4 +strideq*2], m3 movhps [r4 +r2 ], m3 RET INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity, 9 cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call .pass1_main punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 mova m2, [cq+32*2+16] mova m6, [cq+32*3+16] punpckhwd m4, m2, m6 punpcklwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass1_main: %undef cmp %if ARCH_X86_64 xor r5d, r5d cmp eobd, 13 setge r5b %else mov r5d, 1 cmp eobd, 13 sbb r5d, 0 %endif shl r5d, 4 lea r3, [cq+32*1+16] .loop_pass1: mova m0, [o(pd_2048)] mova m3, [o(pd_2896)] pmulld m5, m3, [cq+32*0+r5] pmulld m2, m3, [cq+32*1+r5] pmulld m1, m3, [cq+32*2+r5] pmulld m3, [cq+32*3+r5] REPX {paddd x, m0}, m5, m2, m1, m3 REPX {psrad x, 12}, m5, m2, m1, m3 mova [r3], m2 call m(iadst_4x4_internal_16bpc).main2 packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 test r5d, r5d jz .end_pass1 mova [cq+32*2+16], m0 mova [cq+32*3+16], m1 xor r5d, r5d jmp .loop_pass1 .end_pass1: ret .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main mova m4, [o(pw_4x2048_4xm2048)] jmp m(idct_4x8_internal_16bpc).end INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity, 9 cglobal 
iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 call m(iadst_4x8_internal_16bpc).pass1_main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 mova m6, [cq+32*2+16] mova m2, [cq+32*3+16] punpcklwd m4, m2, m6 punpckhwd m2, m6 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main mova m4, m0 mova m5, m1 pshufd m0, m3, q1032 pshufd m1, m2, q1032 pshufd m2, m5, q1032 pshufd m3, m4, q1032 mova m4, [o(pw_4xm2048_4x2048)] jmp m(idct_4x8_internal_16bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity, 3 cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp mova m5, [o(pd_2048)] mova m4, [o(pd_2896)] mova m6, [o(pd_5793)] ; clear m7 in case we skip the bottom square pxor m7, m7 %if ARCH_X86_64 xor r5d, r5d cmp eobd, 16 setge r5b %else mov r5d, 1 cmp eobd, 16 sbb r5d, 0 %endif shl r5d, 4 .loop_pass1: pmulld m0, m4, [cq+32*0+r5] pmulld m1, m4, [cq+32*1+r5] pmulld m2, m4, [cq+32*2+r5] pmulld m3, m4, [cq+32*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 REPX {pmulld x, m6}, m0, m1, m2, m3 REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 12}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 test r5d, r5d jz .end_pass1 mova [cq+32*0+16], m0 mova m7, m2 xor r5d, r5d jmp .loop_pass1 .end_pass1: punpckhwd m4, m0, m2 punpcklwd m0, m2 punpckhwd m1, m0, m4 punpcklwd m0, m4 mova m2, [cq+32*0+16] punpckhwd m4, m2, m7 punpcklwd m2, m7 punpckhwd m3, m2, m4 punpcklwd m2, m4 ; m0-3 = packed & transposed output jmp tx2q .pass2: mova m4, [o(pw_4096)] jmp m(idct_4x8_internal_16bpc).end %macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 add r5d, 384 sar r5d, 9 jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, identity, v INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif mova m5, [o(pd_2048)] .loop_pass1: mova m0, [cq+64*0+r5] mova m1, [cq+64*1+r5] mova m2, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(idct_4x4_internal_16bpc).pass1_main pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m1, m4, m2 REPX {psrad x, 1}, m0, m1, m4, m2 packssdw m0, m1 ; out0 out1 packssdw m4, m2 ; out2 out3 punpckhwd m2, m0, m4 punpcklwd m0, m4 punpckhwd m1, m0, m2 punpcklwd m0, m2 test r5d, r5d jz .end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .end_pass1: mova m2, [cq+64*0+16] mova m3, [cq+64*1+16] mova m4, [cq+64*0+32] mova m5, [cq+64*1+32] mova m6, [cq+64*0+48] mova m7, [cq+64*1+48] ; m0-7 = packed & transposed output jmp tx2q .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_16x4_internal_8bpc, _ssse3).main ; m0-6 is out0-13 [with odd registers having inversed output] ; [coeffq+16*7] has out15/14 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [cq+16*7] REPX {shufps x, x, q1032}, m1, m3, m5, m7 mova [cq+16*0], m4 mova [cq+16*1], m5 mova [cq+16*2], m6 mova 
[cq+16*3], m7 .end: pxor m4, m4 REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 mova m7, [o(pixel_10bpc_max)] mov r5d, 2 lea r3, [strideq*3] .loop: movq m5, [dstq+strideq*0] movq m6, [dstq+strideq*2] movhps m5, [dstq+strideq*1] movhps m6, [dstq+r3] lea r4, [dstq+strideq*4] paddw m0, m5 paddw m1, m6 movq m5, [r4+strideq*0] movq m6, [r4+strideq*2] movhps m5, [r4+strideq*1] movhps m6, [r4+r3] paddw m2, m5 paddw m3, m6 REPX {pminsw x, m7}, m0, m1, m2, m3 REPX {pmaxsw x, m4}, m0, m1, m2, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+r3 ], m1 movq [r4 +strideq*0], m2 movhps [r4 +strideq*1], m2 movq [r4 +strideq*2], m3 movhps [r4 +r3 ], m3 dec r5d jz .end2 lea dstq, [dstq+strideq*8] mova m0, [cq+0*16] mova m1, [cq+1*16] mova m2, [cq+2*16] mova m3, [cq+3*16] REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 jmp .loop .end2: RET INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity, v cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r6+r5] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] mova m1, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(iadst_4x4_internal_16bpc).main2 pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m2, m1, m4 REPX {psrad x, 1}, m0, m2, m1, m4 packssdw m0, m2 ; out0 out1 packssdw m1, m4 ; out2 out3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 mova m1, [o(pw_4x2048_4xm2048)] REPX {pmulhrsw x, m1}, m7, m2, m0 pshufd m6, m1, q1032 ; 4x-2048,4x2048 pmulhrsw m1, [cq+16*7] REPX {pmulhrsw x, m6}, m5, m4, m3 pmulhrsw m6, [cq+16*6] ; m7/5/2/4 = out4/11,5/10,6/9,7/8 ; m0/3/6/1 = out0/15,3/12,1/14,2/13 ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 movhps [cq+0*8], m4 movhps [cq+1*8], m2 movhps [cq+2*8], m5 movhps [cq+3*8], m7 movhps [cq+4*8], m3 movhps [cq+5*8], m1 movhps [cq+6*8], m6 movhps [cq+7*8], m0 punpcklqdq m0, m6 punpcklqdq m1, m3 punpcklqdq m3, m2, m4 punpcklqdq m2, m7, m5 jmp m(idct_4x16_internal_16bpc).end INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity, v cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif .loop_pass1: mova m5, [cq+64*0+r5] lea r3, [cq+64*1+r5] mova m1, [cq+64*2+r5] mova m3, [cq+64*3+r5] call m(iadst_4x4_internal_16bpc).main2 pcmpeqd m3, m3 REPX {psubd x, m3}, m0, m2, m1, m4 REPX {psrad x, 1}, m0, m2, m1, m4 packssdw m0, m2 ; out3 out2 packssdw m1, m4 ; out1 out0 punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call 
m_suffix(iadst_16x4_internal_8bpc, _ssse3).main call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 mova m1, [o(pw_4x2048_4xm2048)] REPX {pmulhrsw x, m1}, m7, m2, m0 pshufd m6, m1, q1032 ; 4x-2048,4x2048 pmulhrsw m1, [cq+16*7] REPX {pmulhrsw x, m6}, m5, m4, m3 pmulhrsw m6, [cq+16*6] ; m7/5/2/4 = out11/4,10/5,9/6,8/7 ; m0/3/6/1 = out15/0,12/3,14/1,13/2 ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 movq [cq+0*8], m4 movq [cq+1*8], m2 movq [cq+2*8], m5 movq [cq+3*8], m7 movq [cq+4*8], m3 movq [cq+5*8], m1 movq [cq+6*8], m6 movq [cq+7*8], m0 punpckhqdq m0, m6 punpckhqdq m1, m3 punpckhqdq m3, m2, m4 punpckhqdq m2, m7, m5 jmp m(idct_4x16_internal_16bpc).end INV_TXFM_4X16_FN identity, dct, h INV_TXFM_4X16_FN identity, adst, h INV_TXFM_4X16_FN identity, flipadst, h INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %undef cmp %if ARCH_X86_32 mov r5m, r6d %endif mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, r5m %endif mova m5, [o(pd_6144)] mova m4, [o(pd_5793)] .loop_pass1: pmulld m0, m4, [cq+64*0+r5] pmulld m1, m4, [cq+64*1+r5] pmulld m2, m4, [cq+64*2+r5] pmulld m3, m4, [cq+64*3+r5] REPX {paddd x, m5}, m0, m1, m2, m3 REPX {psrad x, 13}, m0, m1, m2, m3 packssdw m0, m1 packssdw m2, m3 punpckhwd m3, m0, m2 punpcklwd m0, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 test r5d, r5d jz m(idct_4x16_internal_16bpc).end_pass1 mova [cq+64*0+r5], m0 mova [cq+64*1+r5], m1 sub r5d, 16 jmp .loop_pass1 .pass2: mova [cq+16*4], m0 mova [cq+16*5], m1 mova [cq+16*6], m2 mova [cq+16*7], m7 mova m0, [o(pw_1697x16)] mova m7, [o(pw_2048)] pmulhrsw m1, m0, m4 pmulhrsw m2, m0, m5 REPX {paddsw x, x}, m4, m5 paddsw m4, m1 paddsw m5, m2 REPX {pmulhrsw x, m7}, m4, m5 mova [cq+16*0], m4 mova [cq+16*1], m5 mova m4, [cq+16*7] pmulhrsw m1, m0, m6 pmulhrsw m2, m0, m4 REPX {paddsw x, x}, m6, m4 paddsw m6, m1 paddsw m4, m2 REPX {pmulhrsw x, m7}, m6, m4 mova [cq+16*2], m6 mova [cq+16*3], m4 mova m4, [cq+16*4] mova m1, [cq+16*5] mova m2, [cq+16*6] pmulhrsw m5, m0, m2 pmulhrsw m6, m0, m3 REPX {paddsw x, x}, m2, m3 paddsw m2, m5 paddsw m3, m6 pmulhrsw m6, m0, m1 pmulhrsw m0, m4 REPX {paddsw x, x}, m1, m4 paddsw m1, m6 paddsw m0, m4 REPX {pmulhrsw x, m7}, m2, m3, m1, m0 jmp m(idct_4x16_internal_16bpc).end %macro INV_TXFM_8X4_FN 2 ; type1, type2 %if ARCH_X86_64 INV_TXFM_FN %1, %2, 0, 8x4, 15 %else INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 128 sar r5d, 8 imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 lea r2, [strideq*3] mova m1, [dstq+strideq*0] mova m2, [dstq+strideq*1] mova m3, [dstq+strideq*2] mova m4, [dstq+r2] REPX {paddw x, m0}, m1, m2, m3, m4 REPX {pmaxsw x, m5}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 mova [dstq+strideq*2], m3 mova [dstq+r2 ], m4 RET %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, identity INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 lea r5, [o(.main)] .pass1_entry: %if ARCH_X86_32 lea r3, [rsp+gprsize] %else mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, 
[cq+0*16] mova m1, [cq+1*16] mova m2, [cq+2*16] mova m3, [cq+3*16] mova m4, [cq+4*16] mova m5, [cq+5*16] mova m6, [cq+6*16] mova m7, [cq+7*16] call .rect2_mul call r5 call .transpose4x8packed ; m0-3 = packed & transposed output jmp tx2q .transpose4x8packed: ; transpose punpcklwd m1, m2, m6 punpckhwd m2, m6 punpckhwd m6, m0, m4 punpcklwd m0, m4 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m4, m6, m2 punpcklwd m6, m2 punpcklwd m2, m3, m4 punpckhwd m3, m4 punpckhwd m1, m0, m6 punpcklwd m0, m6 ret .main: call .main_pass1 call .round packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 ret .rect2_mul: %if ARCH_X86_64 REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 %else mova [r3], m7 mova m7, [o(pd_2896)] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulld m7, [r3] mova [r3], m7 mova m7, [o(pd_2048)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3] %endif REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 ret %if ARCH_X86_64 .main_pass1_fast: pmulld m5, m3, [o(pd_m2276)] pmulld m3, [o(pd_3406)] pmulld m7, m1, [o(pd_4017)] pmulld m1, [o(pd_799)] pmulld m6, m2, [o(pd_3784)] pmulld m2, [o(pd_1567)] pmulld m0, m14 pxor m4, m4 jmp .main_pass1_fast2 .main_pass1: ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 REPX {pmulld x, m14}, m0, m4 .main_pass1_fast2: REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7 REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7 paddd m8, m1, m5 ; t4 psubd m1, m5 ; t5a paddd m9, m7, m3 ; t7 psubd m7, m3 ; t6a REPX {pmaxsd x, m12}, m1, m8, m7, m9 REPX {pminsd x, m13}, m1, m8, m7, m9 REPX {pmulld x, m14}, m7, m1 paddd m0, m11 paddd m7, m11 psubd m5, m0, m4 paddd m0, m4 psubd m4, m7, m1 paddd m7, m1 REPX {psrad x, 12 }, m5, m0, m4, m7 psubd m3, m0, m6 ; dct4 out3 paddd m0, m6 ; dct4 out0 paddd m6, m5, m2 ; dct4 out1 psubd m5, m2 ; dct4 out2 REPX {pmaxsd x, m12}, m0, m6, m5, m3 REPX {pminsd x, m13}, m0, m6, m5, m3 ret .round: paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 psubd m7, m0, m9 ; out7 paddd m0, m9 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 psubd m4, m3, m8 ; out4 paddd m3, m8 ; out3 %else .main_pass1_fast: pmulld m5, m3, [o(pd_m2276)] pmulld m3, [o(pd_3406)] pmulld m7, m1, [o(pd_4017)] pmulld m1, [o(pd_799)] pmulld m6, m2, [o(pd_3784)] pmulld m2, [o(pd_1567)] mova m4, [o(pd_2048)] mova [r3+0*16], m2 REPX {paddd x, m4}, m5, m3, m7, m1 REPX {psrad x, 12}, m5, m3, m7, m1 paddd m2, m1, m5 ; t4 psubd m1, m5 ; t5a pmulld m5, m0, [o(pd_2896)] mova m0, m4 paddd m4, m7, m3 ; t7 psubd m7, m3 ; t6a mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3 }, m1, m2, m7, m4 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3 }, m1, m2, m7, m4 mova [r3+3*16], m2 mova [r3+1*16], m4 pxor m4, m4 mova m2, [r3+0*16] mova m3, [o(pd_2896)] jmp .main_pass1_fast2 .main_pass1: mova [r3+0*16], m0 mova [r3+1*16], m2 mova [r3+2*16], m4 mova [r3+3*16], m6 mova m0, [o(pd_2048)] ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a paddd m2, m1, m5 ; t4 psubd m1, m5 ; t5a paddd m4, m7, m3 ; t7 psubd m7, m3 ; t6a mova m6, [o(clip_18b_min)] REPX {pmaxsd x, m6 }, m1, m2, m7, m4 mova m6, [o(clip_18b_max)] REPX {pminsd x, m6 }, m1, m2, m7, m4 mova m6, [r3+3*16] mova [r3+3*16], m2 mova m2, [r3+1*16] mova [r3+1*16], m4 ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3 mova m3, [o(pd_2896)] mova m5, [r3+0*16] mova m4, [r3+2*16] REPX {pmulld x, m3 }, m5, m4 .main_pass1_fast2: REPX 
{paddd x, m0 }, m2, m6 REPX {psrad x, 12 }, m2, m6 REPX {pmulld x, m3 }, m7, m1 paddd m7, m0 paddd m0, m5 psubd m5, m0, m4 paddd m0, m4 psubd m4, m7, m1 paddd m7, m1 REPX {psrad x, 12 }, m5, m0, m4, m7 psubd m3, m0, m6 ; dct4 out3 paddd m0, m6 ; dct4 out0 paddd m6, m5, m2 ; dct4 out1 psubd m5, m2 ; dct4 out2 mova m1, [o(clip_18b_min)] REPX {pmaxsd x, m1 }, m0, m6, m5, m3 mova m1, [o(clip_18b_max)] REPX {pminsd x, m1 }, m0, m6, m5, m3 ret .round: paddd m1, m6, m7 ; out1 psubd m6, m7 ; out6 mova [r3+0*16], m6 mova m6, [r3+1*16] psubd m7, m0, m6 ; out7 paddd m0, m6 ; out0 paddd m2, m5, m4 ; out2 psubd m5, m4 ; out5 mova m6, [r3+3*16] psubd m4, m3, m6 ; out4 paddd m3, m6 ; out3 mova m6, [r3+0*16] %endif ret .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_8x4_internal_8bpc, _ssse3).main .end: lea r3, [strideq*3] call .round2_and_write_8x4 REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 RET .round2_and_write_8x4: pxor m6, m6 mova m5, [o(pixel_10bpc_max)] mova m4, [o(pw_2048)] .round1_and_write_8x4: REPX {pmulhrsw x, m4}, m0, m1, m2, m3 .write_8x4: paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] paddw m2, [dstq+strideq*2] paddw m3, [dstq+r3] REPX {pminsw x, m5}, m0, m1, m2, m3 REPX {pmaxsw x, m6}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r3 ], m3 ret INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 lea r5, [o(.main)] jmp m(idct_8x4_internal_16bpc).pass1_entry .main: call .main_pass1 call .round packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 ret .main_pass1: %if ARCH_X86_64 ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a psubd m8, m2, m6 ; t6 paddd m2, m6 ; t2 psubd m6, m0, m4 ; t4 paddd m0, m4 ; t0 psubd m4, m5, m1 ; t7 paddd m5, m1 ; t3 psubd m1, m7, m3 ; t5 paddd m7, m3 ; t1 REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a psubd m9, m6, m8 ; t7 paddd m6, m8 ; out6 mova m8, [o(pd_2896)] psubd m3, m7, m5 ; t3 paddd m7, m5 ; -out7 psubd m5, m0, m2 ; t2 paddd m0, m2 ; out0 psubd m2, m1, m4 ; t6 paddd m1, m4 ; -out1 REPX {pmaxsd x, m12}, m5, m3, m2, m9 REPX {pminsd x, m13}, m5, m3, m2, m9 REPX {pmulld x, m14}, m5, m3, m2, m9 psubd m4, m5, m3 ; (t2 - t3) * 2896 paddd m3, m5 ; (t2 + t3) * 2896 psubd m5, m2, m9 ; (t6 - t7) * 2896 paddd m2, m9 ; (t6 + t7) * 2896 ret .round: ; m0=out0,m1=-out1,m6=out6,m7=-out7 pcmpeqd m8, m8 REPX {pxor x, m8 }, m1, m7, m3, m5 REPX {psubd x, m8 }, m1, m7 REPX {paddd x, m11}, m2, m3, m4, m5 REPX {psrad x, 12 }, m2, m3, m4, m5 %else mova [r3+0*16], m2 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m5 mova m5, [o(pd_2048)] ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a mova m2, [r3+0*16] mova m3, [r3+1*16] mova m4, [r3+2*16] mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+2*16], m6 mova m1, [r3+3*16] mova [r3+3*16], m7 ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a mova m0, [r3+0*16] mova m6, [r3+2*16] psubd m7, m2, m6 ; t6 paddd m2, m6 ; t2 psubd m6, m0, m4 ; t4 paddd 
m0, m4 ; t0 mova [r3+0*16], m7 mova m5, [r3+1*16] mova m7, [r3+3*16] psubd m4, m1, m5 ; t7 paddd m5, m1 ; t3 psubd m1, m7, m3 ; t5 paddd m7, m3 ; t1 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 mova [r3+1*16], m7 mova m7, [o(clip_18b_max)] pmaxsd m3, [r3+0*16] REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 pminsd m7, [r3+1*16] mova [r3+0*16], m0 mova [r3+1*16], m2 mova [r3+2*16], m5 mova [r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a mova m5, [r3+2*16] mova m7, [r3+3*16] psubd m2, m6, m3 ; t7 paddd m6, m3 ; out6 mova [r3+3*16], m6 mova m0, [r3+0*16] mova m6, [r3+1*16] psubd m3, m7, m5 ; t3 paddd m7, m5 ; -out7 psubd m5, m0, m6 ; t2 paddd m0, m6 ; out0 psubd m6, m1, m4 ; t6 paddd m1, m4 ; -out1 mova m4, [o(clip_18b_min)] REPX {pmaxsd x, m4 }, m5, m3, m6, m2 mova m4, [o(clip_18b_max)] REPX {pminsd x, m4 }, m5, m3, m6, m2 mova m4, [o(pd_2896)] REPX {pmulld x, m4 }, m5, m3, m6, m2 psubd m4, m5, m3 ; (t2 - t3) * 2896 paddd m3, m5 ; (t2 + t3) * 2896 psubd m5, m6, m2 ; (t6 - t7) * 2896 paddd m2, m6 ; (t6 + t7) * 2896 ret .round: mova [r3+2*16], m0 pcmpeqd m0, m0 mova m6, [o(pd_2048)] REPX {pxor x, m0 }, m1, m7, m3, m5 REPX {psubd x, m0 }, m1, m7 REPX {paddd x, m6 }, m2, m3, m4, m5 REPX {psrad x, 12 }, m2, m3, m4, m5 mova m6, [r3+3*16] mova m0, [r3+2*16] %endif ret .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main jmp m(idct_8x4_internal_16bpc).end INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 lea r5, [o(.main)] jmp m(idct_8x4_internal_16bpc).pass1_entry .main: call m(iadst_8x4_internal_16bpc).main_pass1 call m(iadst_8x4_internal_16bpc).round packssdw m7, m6 packssdw m5, m4 packssdw m3, m2 packssdw m1, m0 mova m0, m7 mova m2, m5 mova m4, m3 mova m6, m1 ret .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main lea r3, [strideq*3] add dstq, r3 neg strideq jmp m(idct_8x4_internal_16bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 lea r5, [o(.main)] jmp m(idct_8x4_internal_16bpc).pass1_entry .main: REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 ret .pass2: mova m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(idct_8x4_internal_16bpc).end %macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset %if ARCH_X86_64 INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16 %else INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 2 .end: add r5d, 384 sar r5d, 9 .end2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 lea r2, [strideq*3] .loop: mova m1, [dstq+strideq*0] mova m2, [dstq+strideq*1] mova m3, [dstq+strideq*2] mova m4, [dstq+r2] REPX {paddw x, m0}, m1, m2, m3, m4 REPX {pmaxsw x, m5}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 mova [dstq+strideq*0], m1 mova [dstq+strideq*1], m2 mova [dstq+strideq*2], m3 mova [dstq+r2 ], m4 lea dstq, [dstq+strideq*4] dec 
r3d jg .loop RET %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, identity, 6 INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 DECLARE_REG_TMP 1 mov [rsp+4*16+1*gprsize], r1 %else DECLARE_REG_TMP 6 %endif lea t0, [o(.pass1_main)] .pass1_full: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif %undef cmp %if ARCH_X86_64 xor r5d, r5d cmp eobd, 10 setge r5b %else mov r5d, 1 cmp eobd, 10 sbb r5d, 0 %endif shl r5d, 4 %if ARCH_X86_32 lea r3, [rsp+gprsize] %endif .loop_pass1: mova m0, [cq+0*32+r5] mova m1, [cq+1*32+r5] mova m2, [cq+2*32+r5] mova m3, [cq+3*32+r5] mova m4, [cq+4*32+r5] mova m5, [cq+5*32+r5] mova m6, [cq+6*32+r5] mova m7, [cq+7*32+r5] call t0 test r5d, r5d jz .end_pass1 mova [cq+0*32+16], m0 mova [cq+1*32+16], m1 mova [cq+2*32+16], m2 mova [cq+3*32+16], m3 sub r5d, 16 jmp .loop_pass1 .end_pass1: mova m4, [cq+0*32+16] mova m5, [cq+1*32+16] mova m6, [cq+2*32+16] mova m7, [cq+3*32+16] %if ARCH_X86_32 mov r1, [rsp+4*16+1*gprsize] %endif jmp tx2q .pass1_main: call m(idct_8x4_internal_16bpc).main_pass1 pcmpeqd m1, m1 REPX {psubd x, m1}, m0, m6, m5, m3 call m(idct_8x4_internal_16bpc).round REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 .pack_and_transpose: packssdw m2, m3 packssdw m6, m7 packssdw m0, m1 packssdw m4, m5 jmp m(idct_8x4_internal_16bpc).transpose4x8packed .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_8x8_internal_8bpc, _ssse3).main lea r3, [strideq*3] %if ARCH_X86_64 mova m10, [o(pixel_10bpc_max)] pxor m9, m9 %endif call .round3_and_write_8x8 .zero: %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 %undef mzero RET ; round (rounded right-shift by 5) before writing ; data in m0-7 ; on x86-64, pw_2048 is in m8 ; .round1 is for m0-7 ; .round2 is for m0-6 & [rsp+gprsize*2] ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7 %if ARCH_X86_32 .round1_and_write_8x8: mova [rsp+gprsize*2], m7 .round2_and_write_8x8: %endif .round3_and_write_8x8: mova m7, [o(pw_2048)] %if ARCH_X86_32 .round4_and_write_8x8: %endif REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [rsp+gprsize*2] %if ARCH_X86_64 jmp .write_8x8 .round2_and_write_8x8: mova m7, [rsp+gprsize*2] .round1_and_write_8x8: REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 %endif ; m0-7 have to-be-written data [pre-rounded] ; on x86-64, m9-10 contain a zero/pixel_max ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch ; r0,1,3 contain dstq/strideq/stride3q ; r5 is a scratch register .write_8x8: lea r5, [dstq+strideq*4] paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] paddw m2, [dstq+strideq*2] paddw m3, [dstq+r3] paddw m4, [r5 +strideq*0] paddw m5, [r5 +strideq*1] paddw m6, [r5 +strideq*2] paddw m7, [r5 +r3] %if ARCH_X86_64 REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 %else mova [rsp+gprsize*2], m7 pxor m7, m7 REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmaxsw m7, [rsp+gprsize*2] mova [rsp+gprsize*2], m7 mova m7, [o(pixel_10bpc_max)] REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pminsw m7, [rsp+gprsize*2] %endif mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova 
[dstq+r3 ], m3 mova [r5 +strideq*0], m4 mova [r5 +strideq*1], m5 mova [r5 +strideq*2], m6 mova [r5 +r3 ], m7 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity, 6 cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 mov [rsp+4*16+1*gprsize], r1 %endif lea t0, [o(.pass1_main)] jmp m(idct_8x8_internal_16bpc).pass1_full .pass1_main: call m(iadst_8x4_internal_16bpc).main_pass1 call .round jmp m(idct_8x8_internal_16bpc).pack_and_transpose .round: %if ARCH_X86_64 pcmpeqd m8, m8 ; -1 REPX {psubd x, m8 }, m0, m6 REPX {pxor x, m8 }, m1, m7, m3, m5 REPX {psrad x, 1 }, m0, m1, m6, m7 REPX {psubd x, m8 }, m1, m7 mova m8, [o(pd_6144)] REPX {paddd x, m8 }, m2, m3, m4, m5 REPX {psrad x, 13 }, m2, m3, m4, m5 %else mova [r3+2*16], m0 pcmpeqd m0, m0 ; -1 mova m6, [o(pd_6144)] REPX {pxor x, m0 }, m1, m7, m3, m5 REPX {psrad x, 1 }, m1, m7 REPX {psubd x, m0 }, m1, m7 REPX {paddd x, m6 }, m2, m3, m4, m5 REPX {psrad x, 13 }, m2, m3, m4, m5 mova m0, [r3+2*16] psrld m6, 12 ; +1 paddd m0, m6 paddd m6, [r3+3*16] REPX {psrad x, 1 }, m0, m6 %endif ret .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end lea r3, [strideq*3] %if ARCH_X86_64 mova m10, [o(pixel_10bpc_max)] pxor m9, m9 %endif call .round3_and_write_8x8 jmp m(idct_8x8_internal_16bpc).zero ; round (rounded right-shift by 5) before writing; odd registers are negated ; data in m0-7 ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11 ; .round1 is for m0-7 ; .round2 is for m0-6 & [rsp+gprsize*2] ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) %if ARCH_X86_64 .round2_and_write_8x8: mova m7, [rsp+gprsize*2] .round1_and_write_8x8: REPX {pmulhrsw x, m8 }, m0, m2, m4, m6 REPX {pmulhrsw x, m11}, m1, m3, m5, m7 jmp m(idct_8x8_internal_16bpc).write_8x8 %else .round1_and_write_8x8: mova [rsp+gprsize*2], m7 .round2_and_write_8x8: %endif .round3_and_write_8x8: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova m7, [o(pw_m2048)] REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize*2] jmp m(idct_8x8_internal_16bpc).write_8x8 INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity, 6 cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 mov [rsp+4*16+1*gprsize], r1 %endif lea t0, [o(.pass1_main)] jmp m(idct_8x8_internal_16bpc).pass1_full .pass1_main: call m(iadst_8x4_internal_16bpc).main_pass1 call m(iadst_8x8_internal_16bpc).round ; invert registers packssdw m7, m6 packssdw m5, m4 packssdw m3, m2 packssdw m1, m0 mova m0, m7 mova m2, m5 mova m4, m3 mova m6, m1 jmp m(idct_8x4_internal_16bpc).transpose4x8packed .pass2: lea dstq, [dstq+strideq*8] sub dstq, strideq neg strideq jmp m(iadst_8x8_internal_16bpc).pass2 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 mova m0, [cq+0*32] mova m1, [cq+1*32] mova m2, [cq+2*32] mova m3, [cq+3*32] mova m4, [cq+4*32] mova m5, [cq+5*32] mova m6, [cq+6*32] mova m7, [cq+7*32] packssdw m0, [cq+0*32+16] packssdw m1, [cq+1*32+16] packssdw m2, [cq+2*32+16] packssdw m3, [cq+3*32+16] packssdw m4, [cq+4*32+16] packssdw m5, [cq+5*32+16] packssdw m6, [cq+6*32+16] packssdw m7, [cq+7*32+16] mova [rsp+gprsize+16*1], m6 jmp 
m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif lea r3, [strideq*3] %if ARCH_X86_64 mova m10, [o(pixel_10bpc_max)] pxor m9, m9 mova m8, [o(pw_4096)] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 %else mova [rsp+gprsize], m7 mova m7, [o(pw_4096)] call m(idct_8x8_internal_16bpc).round4_and_write_8x8 %endif jmp m(idct_8x8_internal_16bpc).zero %macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix %if ARCH_X86_64 INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16 %else INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 add r5d, 128 sar r5d, 8 imul r5d, 181 mov r3d, 4 %if stack_size_padded > 0 ; adjust to caller's stack allocation add rsp, (12+ARCH_X86_64)*16 %endif jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, identity, v INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst %if ARCH_X86_64 DECLARE_REG_TMP 7 %endif cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 PUSH r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] .pass1_full: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif %undef cmp mov r6d, 4 .zero_loop: dec r6d cmp eobb, byte [r5+r6] jl .zero_loop mov r5d, r6d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, [rsp+16*16+2*gprsize] ; setup stack pointer lea r3, [rsp+gprsize] %endif .loop_pass1: mova m0, [cq+0*64+r5] mova m1, [cq+1*64+r5] mova m2, [cq+2*64+r5] mova m3, [cq+3*64+r5] mova m4, [cq+4*64+r5] mova m5, [cq+5*64+r5] mova m6, [cq+6*64+r5] mova m7, [cq+7*64+r5] call m(idct_8x4_internal_16bpc).rect2_mul call t0 mova [cq+0*64+r5], m0 mova [cq+1*64+r5], m1 mova [cq+2*64+r5], m2 mova [cq+3*64+r5], m3 sub r5d, 16 jge .loop_pass1 %if WIN64 POP r7 %elif ARCH_X86_32 mov r1, [rsp+16*16+1*gprsize] %endif jmp tx2q .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 ; some are still pre-loaded from the final loop iteration in pass=1 mova m1, m2 mova m2, [cq+ 1*16] mova m3, [cq+ 9*16] mova m4, [cq+ 2*16] mova m5, [cq+10*16] mova m6, [cq+ 3*16] mova m7, [cq+11*16] call m_suffix(idct_8x8_internal_8bpc, _ssse3).main mova [rsp+gprsize+3*16], m0 mova [rsp+gprsize+4*16], m1 mova [rsp+gprsize+5*16], m2 mova [rsp+gprsize+6*16], m3 mova [rsp+gprsize+7*16], m4 mova [rsp+gprsize+8*16], m5 mova [rsp+gprsize+9*16], m6 ; m7 is already stored in [rsp+gprsize+0*16] mova m0, [cq+ 4*16] mova m1, [cq+12*16] mova m2, [cq+ 5*16] mova m3, [cq+13*16] mova m4, [cq+ 6*16] mova m5, [cq+14*16] mova m6, [cq+ 7*16] mova m7, [cq+15*16] call m_suffix(idct_16x8_internal_8bpc, _ssse3).main ; out0-7 is in rsp+gprsize+3-10*mmsize ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize %if ARCH_X86_64 mova m8, [o(pw_2048)] mova m10, [o(pixel_10bpc_max)] pxor m9, m9 mov r6, dstq %else mov [rsp+16*16+gprsize*1], dstq %endif lea r3, [strideq*3] lea dstq, [dstq+strideq*8] call m(idct_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 %undef mzero mova m0, [rsp+gprsize+ 3*16] mova m1, [rsp+gprsize+ 4*16] mova m2, [rsp+gprsize+ 5*16] mova m3, 
[rsp+gprsize+ 6*16] mova m4, [rsp+gprsize+ 7*16] mova m5, [rsp+gprsize+ 8*16] mova m6, [rsp+gprsize+ 9*16] mova m7, [rsp+gprsize+10*16] %if ARCH_X86_64 mov dstq, r6 %else mov dstq, [rsp+16*16+gprsize*1] %endif call m(idct_8x8_internal_16bpc).round1_and_write_8x8 RET INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity, v cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 PUSH r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] jmp m(idct_8x16_internal_16bpc).pass1_full .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m4, [cq+ 9*16] mova m5, [cq+13*16] mova [rsp+gprsize+7*16], m0 mova [rsp+gprsize+8*16], m1 mova [rsp+gprsize+5*16], m4 mova [rsp+gprsize+6*16], m5 mova m0, m2 mova m1, m3 mova m2, [cq+ 1*16] mova m3, [cq+ 5*16] mova m4, [cq+ 2*16] mova m5, [cq+ 6*16] mova m6, [cq+11*16] mova m7, [cq+15*16] mova [rsp+gprsize+ 3*16], m4 mova [rsp+gprsize+ 4*16], m5 mova [rsp+gprsize+ 9*16], m6 mova [rsp+gprsize+10*16], m7 mova m4, [cq+10*16] mova m5, [cq+14*16] mova m6, [cq+ 3*16] mova m7, [cq+ 7*16] call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end %if ARCH_X86_64 mova m11, [o(pw_m2048)] mova m8, [o(pw_2048)] mova m10, [o(pixel_10bpc_max)] pxor m9, m9 mov r6, dstq %else mov [rsp+16*16+gprsize*1], dstq %endif lea r3, [strideq*3] lea dstq, [dstq+strideq*8] call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 %undef mzero mova m0, [rsp+gprsize+ 3*16] mova m1, [rsp+gprsize+ 4*16] mova m2, [rsp+gprsize+ 5*16] mova m3, [rsp+gprsize+ 6*16] mova m4, [rsp+gprsize+ 7*16] mova m5, [rsp+gprsize+ 8*16] mova m6, [rsp+gprsize+ 9*16] mova m7, [rsp+gprsize+10*16] %if ARCH_X86_64 mov dstq, r6 %else mov dstq, [rsp+16*16+gprsize*1] %endif call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 RET INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity, v cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 PUSH r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] jmp m(idct_8x16_internal_16bpc).pass1_full .pass2: lea r3, [strideq*3] lea r3, [r3*5] add dstq, r3 neg strideq jmp m(iadst_8x16_internal_16bpc).pass2 INV_TXFM_8X16_FN identity, dct, h INV_TXFM_8X16_FN identity, adst, h INV_TXFM_8X16_FN identity, flipadst, h INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 PUSH r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] jmp m(idct_8x16_internal_16bpc).pass1_full .pass2: %if ARCH_X86_64 mova m4, [o(pw_2048)] mova m5, [o(pixel_10bpc_max)] pxor m6, m6 mova m7, [o(pw_1697x16)] %endif mov r5d, 4 lea r3, [strideq*3] .pass2_loop: call .main %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).round1_and_write_8x4 %else call m(idct_8x4_internal_16bpc).round2_and_write_8x4 %endif REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28 dec r5d jle .end add cq, 16 lea dstq, [dstq+strideq*4] mova m0, 
[cq+ 0*16] mova m1, [cq+ 4*16] mova m2, [cq+ 8*16] mova m3, [cq+12*16] jmp .pass2_loop .end: RET .main: ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y) %if ARCH_X86_32 mova m7, [o(pw_1697x16)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 %else pmulhrsw m8, m7, m0 pmulhrsw m9, m7, m1 pmulhrsw m10, m7, m2 pmulhrsw m11, m7, m3 %endif REPX {paddsw x, x}, m0, m1, m2, m3 %if ARCH_X86_64 paddsw m0, m8 paddsw m1, m9 paddsw m2, m10 paddsw m3, m11 %else paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 %endif ret %macro INV_TXFM_16X4_FN 2 ; type1, type2 %if ARCH_X86_64 INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16 %else INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 4 .dconly: add r5d, 384 sar r5d, 9 .dconly2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m3, [o(pixel_10bpc_max)] pxor m4, m4 .loop: mova m1, [dstq+ 0] mova m2, [dstq+16] REPX {paddw x, m0}, m1, m2 REPX {pminsw x, m3}, m1, m2 REPX {pmaxsw x, m4}, m1, m2 mova [dstq+ 0], m1 mova [dstq+16], m2 add dstq, strideq dec r3d jg .loop RET %endif %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, identity INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif ; setup stack pointer lea r3, [rsp+gprsize] mova m0, [cq+ 1*16] mova m1, [cq+ 3*16] mova m2, [cq+ 5*16] mova m3, [cq+ 7*16] mova m4, [cq+ 9*16] mova m5, [cq+11*16] mova m6, [cq+13*16] mova m7, [cq+15*16] call .main_oddhalf mova m0, [cq+ 0*16] mova m1, [cq+ 2*16] mova m2, [cq+ 4*16] mova m3, [cq+ 6*16] mova m4, [cq+ 8*16] mova m5, [cq+10*16] mova m6, [cq+12*16] mova m7, [cq+14*16] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round ; t0-7 is in m0-7 call .round %if ARCH_X86_64 .pack_transpose: ; transpose in two parts packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 .transpose: call m(idct_8x4_internal_16bpc).transpose4x8packed call .transpose4x8packed_hi %else call m(idct_8x4_internal_16bpc).transpose4x8packed mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+2*16], m2 mova [r3+3*16], m3 mova m0, [r3+ 8*16] mova m2, [r3+ 9*16] mova m4, [r3+10*16] mova m6, [r3+11*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif jmp tx2q %if ARCH_X86_64 .transpose4x8packed_hi: punpcklwd m9, m10, m14 punpckhwd m10, m14 punpckhwd m14, m8, m12 punpcklwd m8, m12 punpckhwd m11, m8, m9 punpcklwd m8, m9 punpckhwd m12, m14, m10 punpcklwd m14, m10 punpcklwd m10, m11, m12 punpckhwd m11, m12 punpckhwd m9, m8, m14 punpcklwd m8, m14 ret %endif .main_oddhalf_fast: ; lower half zero pmulld m7, m0, [o(pd_4076)] pmulld m0, [o(pd_401)] pmulld m6, m1, [o(pd_m1189)] pmulld m1, [o(pd_3920)] %if ARCH_X86_32 mova m4, [o(pd_2048)] REPX {paddd x, m4}, m1, m6 REPX {psrad x, 12}, m1, m6 mova [r3+1*16], m1 %endif pmulld m5, m2, [o(pd_3612)] pmulld m2, [o(pd_1931)] %if ARCH_X86_32 pmulld m1, m3, [o(pd_m2598)] %else pmulld m4, m3, [o(pd_m2598)] %endif pmulld m3, [o(pd_3166)] jmp .main_oddhalf_fast2 .main_oddhalf: %if ARCH_X86_64 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a 
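; The four ITX_MULSUB_2D rotations above form the first butterfly stage of the
; odd half of the 16-point inverse DCT (t8a-t15a). Each coefficient pair holds
; the 12-bit fixed-point values round(4096*sin(k*pi/32)) and
; round(4096*cos(k*pi/32)) for k = 1, 3, 5, 7 (401/4076, 3920/1189, 1931/3612,
; 3166/2598); the matching +2048 bias and >>12 of these products are applied by
; the first two REPX statements of .main_oddhalf_fast2 below.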
.main_oddhalf_fast2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 psubd m8, m0, m4 ; t9 paddd m0, m4 ; t8 psubd m4, m6, m2 ; t10 paddd m2, m6 ; t11 psubd m6, m1, m5 ; t13 paddd m5, m1 ; t12 psubd m1, m7, m3 ; t14 paddd m7, m3 ; t15 REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7 REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4 psubd m3, m1, m4 ; t10 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a paddd m0, m2 ; t8a psubd m2, m8, m6 ; t13 paddd m6, m8 ; t14 psubd m8, m7, m5 ; t12a paddd m7, m5 ; t15a REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7 REPX {pmulld x, m14}, m2, m8, m3, m4 paddd m2, m11 paddd m8, m11 paddd m5, m2, m3 ; t13a psubd m2, m3 ; t10a psubd m3, m8, m4 ; t11 paddd m4, m8 ; t12 REPX {psrad x, 12}, m5, m2, m3, m4 mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+2*16], m2 mova [r3+3*16], m3 mova [r3+4*16], m4 mova [r3+5*16], m5 mova [r3+6*16], m6 mova [r3+7*16], m7 %else mova [r3+0*16], m2 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m5 mova m4, [o(pd_2048)] ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a mova m2, [r3+0*16] mova m3, [r3+1*16] mova [r3+0*16], m0 mova [r3+1*16], m1 mova m1, [r3+2*16] mova m5, [r3+3*16] mova [r3+2*16], m6 mova [r3+3*16], m7 ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a mova m0, [r3+0*16] mova m6, [r3+2*16] mova m7, [r3+3*16] .main_oddhalf_fast2: REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3 REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3 psubd m4, m0, m1 ; t9 paddd m0, m1 ; t8 mova m1, [r3+1*16] mova [r3+0*16], m4 psubd m4, m6, m2 ; t10 paddd m2, m6 ; t11 psubd m6, m1, m5 ; t13 paddd m5, m1 ; t12 psubd m1, m7, m3 ; t14 paddd m7, m3 ; t15 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7 pmaxsd m3, [r3+0*16] mova [r3+0*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7 pminsd m3, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m2 mova [r3+2*16], m5 mova [r3+3*16], m7 mova m7, [o(pd_2048)] ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784 ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4 mova m0, [r3+0*16] mova m2, [r3+1*16] psubd m5, m1, m4 ; t10 mova [r3+1*16], m5 paddd m1, m4 ; t9 psubd m4, m0, m2 ; t11a paddd m0, m2 ; t8a mova m5, [r3+2*16] mova m7, [r3+3*16] psubd m2, m3, m6 ; t13 paddd m6, m3 ; t14 paddd m3, m7, m5 ; t15a psubd m7, m5 ; t12a mova [r3+0*16], m3 mova m3, [r3+1*16] mova m5, [o(clip_18b_min)] REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6 pmaxsd m5, [r3+0*16] mova [r3+0*16], m5 mova m5, [o(clip_18b_max)] REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6 pminsd m5, [r3+0*16] mova [r3+0*16], m5 mova m5, [o(pd_2896)] REPX {pmulld x, m5}, m2, m7, m3, m4 mova m5, [o(pd_2048)] REPX {paddd x, m5}, m2, m7 paddd m5, m2, m3 ; t13a psubd m2, m3 ; t10a psubd m3, m7, m4 ; t11 paddd m4, m7 ; t12 REPX {psrad x, 12}, m5, m2, m3, m4 mova m7, [r3+0*16] mova [r3+11*16], m0 mova [r3+10*16], m1 mova [r3+9*16], m2 mova [r3+8*16], m3 mova [r3+7*16], m4 mova [r3+6*16], m5 mova [r3+5*16], m6 mova [r3+4*16], m7 %endif ret .round: %if ARCH_X86_64 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 pcmpeqd m8, m8 REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] mova m9, 
[r3+2*16] mova m10, [r3+3*16] mova m11, [r3+4*16] mova m12, [r3+5*16] mova m13, [r3+6*16] mova m14, [r3+7*16] psubd m15, m0, m14 ; out15 paddd m0, m14 ; out0 psubd m14, m1, m13 ; out14 paddd m1, m13 ; out1 psubd m13, m2, m12 ; out13 paddd m2, m12 ; out2 psubd m12, m3, m11 ; out12 paddd m3, m11 ; out3 psubd m11, m4, m10 ; out11 paddd m4, m10 ; out4 psubd m10, m5, m9 ; out10 paddd m5, m9 ; out5 psubd m9, m6, m8 ; out9 paddd m6, m8 ; out6 psubd m8, m7, [r3+0*16] ; out8 paddd m7, [r3+0*16] ; out7 REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 pmaxsd m0, [r3+ 0*16] mova [r3+ 0*16], m7 mova m7, [o(clip_18b_max)] REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 pminsd m7, [r3+ 0*16] mova [r3+ 0*16], m0 pcmpeqd m0, m0 REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7 mova [r3+ 1*16], m1 mova [r3+ 2*16], m2 mova m1, [r3+ 0*16] psubd m1, m0 mova [r3+ 0*16], m1 mova m1, [r3+11*16] mova m2, [r3+10*16] psubd m0, m7, m1 paddd m7, m1 psubd m1, m6, m2 paddd m6, m2 REPX {psrad x, 1}, m0, m1, m6, m7 packssdw m0, m1 ; out8-9 packssdw m6, m7 ; out6-7 mova [r3+11*16], m6 mova m1, [r3+9*16] mova m7, [r3+8*16] psubd m2, m5, m1 paddd m5, m1 psubd m1, m4, m7 paddd m4, m7 REPX {psrad x, 1}, m2, m1, m4, m5 packssdw m2, m1 ; out10-11 packssdw m4, m5 ; out4-5 mova m1, [r3+2*16] mova [r3+10*16], m4 mova m6, [r3+7*16] mova m7, [r3+6*16] psubd m4, m3, m6 paddd m3, m6 psubd m6, m1, m7 paddd m1, m7 REPX {psrad x, 1}, m4, m6, m1, m3 packssdw m4, m6 ; out12-13 packssdw m1, m3 ; out2-3 mova m3, [r3+1*16] mova [r3+9*16], m1 mova m1, [r3+0*16] mova m5, [r3+5*16] mova m7, [r3+4*16] psubd m6, m3, m5 paddd m3, m5 psubd m5, m1, m7 paddd m1, m7 REPX {psrad x, 1}, m6, m5, m1, m3 packssdw m6, m5 ; out14-15 packssdw m1, m3 ; out0-1 mova [r3+8*16], m1 %endif ret .pass2: lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)] .pass2_loop: lea r3, [strideq*3] %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call r4 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 %if ARCH_X86_64 mova m0, m8 mova m1, m9 mova m2, m10 mova m3, m11 %else mova m0, [rsp+gprsize+0*16] mova m1, [rsp+gprsize+1*16] mova m2, [rsp+gprsize+2*16] mova m3, [rsp+gprsize+3*16] %endif add dstq, 16 %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call r4 call m(idct_8x4_internal_16bpc).round2_and_write_8x4 RET INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 ; setup stack pointer lea r3, [rsp+gprsize] call .main %if ARCH_X86_64 jmp m(idct_16x4_internal_16bpc).pack_transpose %else call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+gprsize+0*16], m0 mova [rsp+gprsize+1*16], m1 mova [rsp+gprsize+2*16], m2 mova [rsp+gprsize+3*16], m3 mova m0, [rsp+gprsize+ 8*16] mova m2, [rsp+gprsize+ 9*16] mova m4, [rsp+gprsize+10*16] mova m6, [rsp+gprsize+11*16] call m(idct_8x4_internal_16bpc).transpose4x8packed jmp tx2q %endif .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*16] mova m1, [cq+13*16] mova m2, [cq+ 6*16] mova m3, [cq+ 9*16] mova m4, [cq+10*16] mova m5, [cq+ 5*16] mova m6, [cq+14*16] mova m7, [cq+ 1*16] call .main_part1 mova m0, [cq+ 0*16] mova m1, [cq+15*16] mova m2, [cq+ 4*16] mova m3, 
[cq+11*16] mova m4, [cq+ 8*16] mova m5, [cq+ 7*16] mova m6, [cq+12*16] mova m7, [cq+ 3*16] call .main_part2 .round: %if ARCH_X86_64 mova m15, [o(pd_6144)] psrld m14, 11 ; pd_1 pcmpeqd m8, m8 ; -1 psubd m13, m15, m14 ; pd_6143 REPX {paddd x, m14}, m0, m2 REPX {paddd x, m15}, m4, m6 REPX {pxor x, m8 }, m1, m3, m5, m7 REPX {psrad x, 1 }, m1, m3 REPX {paddd x, m15}, m5, m7 REPX {psubd x, m8 }, m1, m3 paddd m8, m15, m9 psubd m9, m13, m10 paddd m10, m15, m11 psubd m11, m13, m12 paddd m12, m14, [r3+3*16] psubd m13, m14, [r3+2*16] psubd m15, m14, [r3+0*16] paddd m14, [r3+1*16] REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15 REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 %else mova [r3+8*16], m1 mova [r3+9*16], m3 mova m3, [o(pd_6144)] pcmpeqd m1, m1 REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m4, m5, m6, m7 REPX {psrad x, 13}, m4, m5, m6, m7 packssdw m4, m5 packssdw m6, m7 mova [r3+10*16], m4 mova [r3+11*16], m6 mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {pxor x, m1}, m5, m7 REPX {psubd x, m1}, m4, m6 REPX {psrad x, 1 }, m4, m5, m6, m7 REPX {psubd x, m1}, m5, m7 packssdw m4, m5 packssdw m6, m7 mova m5, [r3+8*16] mova m7, [r3+9*16] mova [r3+8*16], m4 mova [r3+9*16], m6 REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m0, m5, m2, m7 REPX {psrad x, 13}, m0, m5, m2, m7 packssdw m0, m5 packssdw m2, m7 mova m4, [r3+0*16] mova m5, [r3+1*16] mova m6, [r3+2*16] mova m7, [r3+3*16] REPX {psubd x, m1}, m4, m6 REPX {pxor x, m1}, m5, m7 REPX {psrad x, 1 }, m4, m5, m6, m7 REPX {psubd x, m1}, m5, m7 packssdw m4, m5 packssdw m6, m7 %endif ret .main_part2: %if ARCH_X86_64 ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380 psubd m8, m0, m4 ; t8a paddd m0, m4 ; t0a psubd m4, m1, m5 ; t9a paddd m1, m5 ; t1a psubd m5, m2, m6 ; t12a paddd m2, m6 ; t4a psubd m6, m3, m7 ; t13a paddd m7, m3 ; t5a REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 mova m15, [o(pd_4017)] mova m10, [o(pd_799)] ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 psubd m3, m0, m2 ; t4 paddd m0, m2 ; t0 psubd m2, m1, m7 ; t5 paddd m1, m7 ; t1 psubd m7, m4, m6 ; t12a paddd m4, m6 ; t8a psubd m6, m8, m5 ; t13a paddd m5, m8 ; t9a REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5 REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15 ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15 mova m10, [r3+0*16] ; t2 mova m8, [r3+1*16] ; t3 psubd m9, m0, m10 ; t2a paddd m0, m10 ; out0 psubd m10, m1, m8 ; t3a paddd m1, m8 ; -out15 mova [r3+0*16], m1 mova m15, [r3+3*16] ; t7a mova m1, [r3+2*16] ; t6a psubd m8, m3, m15 ; t7 paddd m15, m3 ; out12 paddd m3, m2, m1 ; -out3 psubd m2, m1 ; t6 mova [r3+3*16], m15 mova [r3+1*16], m2 mova m1, [r3+7*16] ; t15 mova m2, [r3+6*16] ; t14 paddd m15, m7, m1 ; -out13 psubd m7, m1 ; t15a psubd m11, m6, m2 ; t14a paddd m2, m6 ; out2 mova [r3+2*16], m15 mova m1, [r3+4*16] ; t10a mova m15, [r3+5*16] ; t11a psubd m6, m4, m1 ; t10 paddd m1, m4 ; -out1 psubd m4, m5, m15 ; t11 paddd m5, m15 ; out14 REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8 pmaxsd m12, [r3+1*16] ; t6 mova [r3+1*16], m5 REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8 REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8 paddd m5, m11, m7 ; -out5 (unshifted) psubd m11, m7 ; out10 (unshifted) paddd m7, m9, m10 ; -out7 (unshifted) 
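; The values tagged "(unshifted)" above and below are still scaled by 2896
; (1/sqrt(2) in Q12). The +6144 bias and >>13 applied to them in .round are
; essentially the Q12 rounding (+2048, >>12) merged with the rounded pass-1
; downshift ((x+1)>>1), so no shift is performed at this point.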
psubd m9, m10 ; out8 (unshifted) psubd m10, m6, m4 ; -out9 (unshifted) paddd m6, m4 ; out6 (unshifted) paddd m4, m12, m8 ; out4 (unshifted) psubd m12, m8 ; -out11 (unshifted) %else mova [r3+8*16], m0 mova [r3+9*16], m1 mova [r3+10*16], m2 mova [r3+11*16], m3 mova m3, [o(pd_2048)] ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751 ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380 mova m0, [r3+8*16] mova m1, [r3+9*16] mova [r3+8*16], m4 mova m4, [r3+10*16] mova [r3+9*16], m5 mova [r3+10*16], m6 mova m5, [r3+11*16] mova [r3+11*16], m7 ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091 ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703 mova m2, [r3+8*16] mova m6, [r3+9*16] psubd m3, m0, m2 ; t8a paddd m0, m2 ; t0a mova [r3+8*16], m3 psubd m2, m1, m6 ; t9a paddd m1, m6 ; t1a mova m3, [r3+10*16] psubd m6, m4, m3 ; t12a paddd m4, m3 ; t4a mova m3, [r3+11*16] psubd m7, m5, m3 ; t13a paddd m5, m3 ; t5a mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5 pmaxsd m3, [r3+8*16] mova [r3+8*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5 pminsd m3, [r3+8*16] mova [r3+8*16], m3 psubd m3, m0, m4 ; t4 paddd m0, m4 ; t0 psubd m4, m1, m5 ; t5 paddd m1, m5 ; t1 mova m5, [o(pd_2048)] mova [r3+9*16], m1 mova [r3+10*16], m4 mova [r3+11*16], m3 mova m3, [r3+8*16] mova [r3+8*16], m0 ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017 ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4 psubd m5, m2, m7 ; t12a paddd m2, m7 ; t8a psubd m7, m3, m6 ; t13a paddd m6, m3 ; t9a mova m0, [r3+8*16] mova m1, [r3+9*16] mova m4, [r3+10*16] mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6 pmaxsd m3, [r3+11*16] mova [r3+8*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6 pminsd m3, [r3+8*16] mova [r3+8*16], m0 mova [r3+9*16], m1 mova [r3+10*16], m2 mova [r3+11*16], m6 mova m0, [o(pd_2048)] ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784 ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784 mova m0, [r3+7*16] ; t7a mova m2, [r3+6*16] ; t6a psubd m1, m3, m0 ; t7 paddd m0, m3 ; out12 paddd m3, m4, m2 ; -out3 psubd m4, m2 ; t6 mova [r3+7*16], m3 mova m3, [r3+3*16] ; t15 mova m2, [r3+2*16] ; t14 paddd m6, m5, m3 ; -out13 psubd m5, m3 ; t15a psubd m3, m7, m2 ; t14a paddd m2, m7 ; out2 mova [r3+6*16], m2 mova m7, [r3+0*16] ; t10a mova m2, [r3+1*16] ; t11a mova [r3+0*16], m0 mova [r3+1*16], m6 mova m6, [r3+11*16] psubd m0, m6, m2 ; t11 paddd m6, m2 ; out14 mova [r3+2*16], m6 mova m2, [r3+10*16] psubd m6, m2, m7 ; t10 paddd m2, m7 ; -out1 mova m7, [r3+5*16] ; t3 mova [r3+5*16], m2 mova [r3+10*16], m1 mova m1, [r3+9*16] psubd m2, m1, m7 ; t3a paddd m1, m7 ; -out15 mova [r3+3*16], m1 mova m1, [r3+4*16] ; t2 mova m7, [r3+8*16] psubd m7, m1 ; t2a paddd m1, [r3+8*16] ; out0 mova [r3+4*16], m1 mova m1, [o(clip_18b_min)] REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7 pmaxsd m1, [r3+10*16] mova [r3+10*16], m1 mova m1, [o(clip_18b_max)] REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7 pminsd m1, [r3+10*16] mova [r3+10*16], m1 mova m1, [o(pd_2896)] REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7 pmulld m1, [r3+10*16] mova [r3+11*16], m3 psubd m3, m4, m1 ; -out11 (unshifted) paddd m4, m1 ; out4 (unshifted) psubd m1, m6, m0 ; -out9 (unshifted) paddd m6, m0 ; out6 (unshifted) psubd m0, m7, m2 ; out8 (unshifted) paddd m7, m2 ; -out7 (unshifted) mova m2, [r3+11*16] mova [r3+11*16], m5 paddd m5, m2 ; -out5 (unshifted) psubd m2, [r3+11*16] ; out10 (unshifted) ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted) ; r[-4,3] contain out0-3 and out12-15 %endif ret .main_part1: %if ARCH_X86_64 
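; .main_part1 computes half of the 16-point inverse ADST butterfly network,
; the lanes whose first-stage rotation constants are round(4096*sin(k*pi/64))
; and round(4096*cos(k*pi/64)) for k = 5, 13, 21, 29 (995/3973, 2440/3290,
; 3513/2106, 4052/601), and leaves its intermediates on the stack at r3;
; .main_part2 handles k = 1, 9, 17, 25 (201/4091, 1751/3703, 3035/2751,
; 3857/1380) and merges both halves into the outputs consumed by .round.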
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973 ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290 ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106 ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601 psubd m8, m0, m4 ; t10a paddd m0, m4 ; t2a psubd m4, m1, m5 ; t11a paddd m1, m5 ; t3a psubd m5, m2, m6 ; t14a paddd m2, m6 ; t6a psubd m6, m3, m7 ; t15a paddd m7, m3 ; t7a REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7 REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7 mova m15, [o(pd_2276)] mova m10, [o(pd_3406)] ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10 psubd m3, m0, m2 ; t6 paddd m0, m2 ; t2 psubd m2, m1, m7 ; t7 paddd m1, m7 ; t3 psubd m7, m4, m6 ; t14a paddd m4, m6 ; t10a psubd m6, m8, m5 ; t15a paddd m5, m8 ; t11a REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5 REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5 mova m15, [o(pd_1567)] mova m10, [o(pd_3784)] ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15 ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15 mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+4*16], m4 mova [r3+5*16], m5 mova [r3+2*16], m2 mova [r3+3*16], m3 mova [r3+6*16], m6 mova [r3+7*16], m7 %else mova [r3+4*16], m0 mova [r3+5*16], m1 mova [r3+6*16], m2 mova [r3+7*16], m3 mova m3, [o(pd_2048)] ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106 ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601 mova [r3+0*16], m4 mova [r3+1*16], m5 mova [r3+2*16], m6 mova [r3+3*16], m7 mova m0, [r3+4*16] mova m1, [r3+5*16] mova m2, [r3+6*16] mova m7, [r3+7*16] ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973 ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290 mova m4, [r3+0*16] mova m5, [r3+1*16] psubd m6, m0, m4 ; t10a paddd m0, m4 ; t2a mova [r3+4*16], m6 mova m6, [r3+2*16] mova m3, [r3+3*16] psubd m4, m1, m5 ; t11a paddd m1, m5 ; t3a psubd m5, m2, m6 ; t14a paddd m2, m6 ; t6a psubd m6, m7, m3 ; t15a paddd m7, m3 ; t7a mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7 pmaxsd m3, [r3+4*16] mova [r3+4*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7 pminsd m3, [r3+4*16] mova [r3+4*16], m3 psubd m3, m0, m2 ; t6 paddd m0, m2 ; t2 psubd m2, m1, m7 ; t7 paddd m1, m7 ; t3 mova [r3+5*16], m1 mova [r3+6*16], m3 mova [r3+7*16], m2 mova m1, [r3+4*16] mova [r3+4*16], m0 mova m3, [o(pd_2048)] ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276 ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2 psubd m7, m4, m6 ; t14a paddd m4, m6 ; t10a psubd m6, m1, m5 ; t15a paddd m5, m1 ; t11a mova m1, [r3+5*16] mova m3, [r3+6*16] mova m2, [r3+7*16] mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5 pmaxsd m0, [r3+4*16] mova [r3+4*16], m0 mova m0, [o(clip_18b_max)] REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5 pminsd m0, [r3+4*16] mova [r3+4*16], m0 mova [r3+5*16], m1 mova [r3+0*16], m4 mova [r3+1*16], m5 mova m0, [o(pd_2048)] ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567 ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567 mova [r3+6*16], m2 mova [r3+7*16], m3 mova [r3+2*16], m6 mova [r3+3*16], m7 %endif ret .pass2: lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] jmp m(idct_16x4_internal_16bpc).pass2_loop INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 lea r3, [rsp+gprsize] call m(iadst_16x4_internal_16bpc).main %if ARCH_X86_64 packssdw m1, m0 packssdw m3, m2 packssdw m5, m4 packssdw m7, m6 packssdw m9, m8 packssdw m11, m10 packssdw m13, m12 packssdw m15, m14 mova m0, m15 mova m2, m13 mova m4, m11 mova m6, m9 mova m8, m7 
mova m10, m5 mova m12, m3 mova m14, m1 jmp m(idct_16x4_internal_16bpc).transpose %else mova [rsp+gprsize+4*16], m0 mova [rsp+gprsize+5*16], m2 mova [rsp+gprsize+6*16], m4 mova [rsp+gprsize+7*16], m6 pshufd m6, [rsp+gprsize+ 8*16], q1032 pshufd m4, [rsp+gprsize+ 9*16], q1032 pshufd m2, [rsp+gprsize+10*16], q1032 pshufd m0, [rsp+gprsize+11*16], q1032 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+gprsize+0*16], m0 mova [rsp+gprsize+1*16], m1 mova [rsp+gprsize+2*16], m2 mova [rsp+gprsize+3*16], m3 pshufd m6, [rsp+gprsize+ 4*16], q1032 pshufd m4, [rsp+gprsize+ 5*16], q1032 pshufd m2, [rsp+gprsize+ 6*16], q1032 pshufd m0, [rsp+gprsize+ 7*16], q1032 call m(idct_8x4_internal_16bpc).transpose4x8packed jmp tx2q %endif .pass2: lea r3, [strideq*3] lea dstq, [dstq+r3] neg strideq lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)] jmp m(idct_16x4_internal_16bpc).pass2_loop INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_64 mova m15, [o(pd_11586)] pmulld m0, m15, [cq+ 0*16] pmulld m1, m15, [cq+ 1*16] pmulld m2, m15, [cq+ 2*16] pmulld m3, m15, [cq+ 3*16] pmulld m4, m15, [cq+ 4*16] pmulld m5, m15, [cq+ 5*16] pmulld m6, m15, [cq+ 6*16] pmulld m7, m15, [cq+ 7*16] pmulld m8, m15, [cq+ 8*16] pmulld m9, m15, [cq+ 9*16] pmulld m10, m15, [cq+10*16] pmulld m11, m15, [cq+11*16] pmulld m12, m15, [cq+12*16] pmulld m13, m15, [cq+13*16] pmulld m14, m15, [cq+14*16] pmulld m15, [cq+15*16] mova [cq+ 0*16], m15 mova m15, [o(pd_6144)] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [cq+ 0*16] REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 jmp m(idct_16x4_internal_16bpc).pack_transpose %else add cq, 8*16 mov r5d, 2 .loop_pass1: mova m7, [o(pd_11586)] pmulld m0, m7, [cq+0*16] pmulld m1, m7, [cq+1*16] pmulld m2, m7, [cq+2*16] pmulld m3, m7, [cq+3*16] pmulld m4, m7, [cq+4*16] pmulld m5, m7, [cq+5*16] pmulld m6, m7, [cq+6*16] pmulld m7, [cq+7*16] mova [cq+7*16], m7 mova m7, [o(pd_6144)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [cq+7*16] REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed dec r5d jz .end_pass1 mova [rsp+gprsize+0*16], m0 mova [rsp+gprsize+1*16], m1 mova [rsp+gprsize+2*16], m2 mova [rsp+gprsize+3*16], m3 sub cq, 8*16 jmp .loop_pass1 .end_pass1: jmp tx2q %endif .pass2: %if ARCH_X86_64 mova m12, [o(pw_1697x8)] %endif lea r4, [o(.main)] jmp m(idct_16x4_internal_16bpc).pass2_loop .main: %if ARCH_X86_64 pmulhrsw m4, m0, m12 pmulhrsw m5, m1, m12 pmulhrsw m6, m2, m12 pmulhrsw m7, m3, m12 %else mova m7, [o(pw_1697x8)] pmulhrsw m4, m0, m7 pmulhrsw m5, m1, m7 pmulhrsw m6, m2, m7 pmulhrsw m7, m3 %endif paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 ret %macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset %if ARCH_X86_64 INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16 %else INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 add r5d, 128 sar r5d, 8 imul r5d, 181 %if ARCH_X86_32 add rsp, 1*16 %endif jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly %endif %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, identity, 6 INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, 
tx2 %if ARCH_X86_64 DECLARE_REG_TMP 6, 4, 6 %else mov [rsp+gprsize+12*16], r1 DECLARE_REG_TMP 1, 4, 3 %endif lea t0, [o(.main)] .loop_main: %undef cmp %if ARCH_X86_64 xor r5d, r5d cmp eobd, 10 setge r5b %else mov r5d, 1 cmp eobd, 10 sbb r5d, 0 %endif shl r5d, 4 lea r3, [rsp+gprsize] .loop_pass1: call t0 %if ARCH_X86_64 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+4*32+r5], m8 mova [cq+5*32+r5], m9 mova [cq+6*32+r5], m10 mova [cq+7*32+r5], m11 %else call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+4*32+r5], m0 mova [cq+5*32+r5], m1 mova [cq+6*32+r5], m2 mova [cq+7*32+r5], m3 mova m0, [rsp+gprsize+ 8*16] mova m2, [rsp+gprsize+ 9*16] mova m4, [rsp+gprsize+10*16] mova m6, [rsp+gprsize+11*16] %endif call m(idct_8x4_internal_16bpc).transpose4x8packed pxor m7, m7 REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15 test r5d, r5d jz .end mova [cq+0*32+r5], m0 mova [cq+1*32+r5], m1 mova [cq+2*32+r5], m2 mova [cq+3*32+r5], m3 xor r5d, r5d jmp .loop_pass1 .end: jmp tx2q .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*32+r5] mova m1, [cq+ 3*32+r5] mova m2, [cq+ 5*32+r5] mova m3, [cq+ 7*32+r5] mova m4, [cq+ 9*32+r5] mova m5, [cq+11*32+r5] mova m6, [cq+13*32+r5] mova m7, [cq+15*32+r5] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+ 0*32+r5] mova m1, [cq+ 2*32+r5] mova m2, [cq+ 4*32+r5] mova m3, [cq+ 6*32+r5] mova m4, [cq+ 8*32+r5] mova m5, [cq+10*32+r5] mova m6, [cq+12*32+r5] mova m7, [cq+14*32+r5] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round call m(idct_16x4_internal_16bpc).round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif ret .pass2: %if ARCH_X86_32 mov strideq, [rsp+gprsize+12*16] %endif mov r4d, 2 .pass2_main: %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %endif lea r3, [strideq*3] jmp .loop_pass2_entry .loop_pass2: mova m0, [cq+0*32+ 0] mova m1, [cq+1*32+ 0] mova m2, [cq+2*32+ 0] mova m3, [cq+3*32+ 0] .loop_pass2_entry: mova m4, [cq+0*32+16] mova m5, [cq+1*32+16] mova m6, [cq+2*32+16] mova m7, [cq+3*32+16] %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(idct_8x8_internal_8bpc, _ssse3).main call m(idct_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, 16 add cq, 4*32 dec r4d jg .loop_pass2 RET INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity, 6 cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 mov [rsp+gprsize+12*16], r1 %endif lea t0, [o(.main)] jmp m(idct_16x8_internal_16bpc).loop_main .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*32+r5] mova m1, [cq+13*32+r5] mova m2, [cq+ 6*32+r5] mova m3, [cq+ 9*32+r5] mova m4, [cq+10*32+r5] mova m5, [cq+ 5*32+r5] mova m6, [cq+14*32+r5] mova m7, [cq+ 1*32+r5] call m(idct_8x4_internal_16bpc).rect2_mul call m(iadst_16x4_internal_16bpc).main_part1 mova m0, [cq+ 0*32+r5] mova m1, [cq+15*32+r5] mova m2, [cq+ 4*32+r5] mova m3, [cq+11*32+r5] mova m4, [cq+ 8*32+r5] mova m5, [cq+ 7*32+r5] mova m6, [cq+12*32+r5] mova m7, 
[cq+ 3*32+r5] %if ARCH_X86_32 add r3, 8*16 %endif call m(idct_8x4_internal_16bpc).rect2_mul %if ARCH_X86_32 sub r3, 8*16 %endif call m(iadst_16x4_internal_16bpc).main_part2 call m(iadst_16x4_internal_16bpc).round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif ret .pass2: %if ARCH_X86_32 mov strideq, [rsp+gprsize+12*16] %endif mov r4d, 2 %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] mova m11, [o(pw_m2048)] %endif lea r3, [strideq*3] jmp .loop_pass2_entry .loop_pass2: mova m0, [cq+0*32+ 0] mova m1, [cq+1*32+ 0] mova m2, [cq+2*32+ 0] mova m3, [cq+3*32+ 0] .loop_pass2_entry: mova m4, [cq+0*32+16] mova m5, [cq+1*32+16] mova m6, [cq+2*32+16] mova m7, [cq+3*32+16] %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, 16 add cq, 4*32 dec r4d jg .loop_pass2 RET INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity, 6 cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 mov [rsp+gprsize+12*16], r1 %endif lea t0, [o(.main)] jmp m(idct_16x8_internal_16bpc).loop_main .main: call m(iadst_16x8_internal_16bpc).main %if ARCH_X86_64 pshufd m1, m0, q1032 pshufd m3, m2, q1032 pshufd m5, m4, q1032 pshufd m7, m6, q1032 pshufd m0, m14, q1032 pshufd m2, m12, q1032 pshufd m4, m10, q1032 pshufd m6, m8, q1032 mova m14, m1 mova m12, m3 mova m10, m5 mova m8, m7 %else pshufd m1, m0, q1032 pshufd m3, m2, q1032 pshufd m5, m4, q1032 pshufd m7, m6, q1032 pshufd m0, [r3+11*16], q1032 pshufd m2, [r3+10*16], q1032 pshufd m4, [r3+9*16], q1032 pshufd m6, [r3+8*16], q1032 mova [r3+8*16], m7 mova [r3+9*16], m5 mova [r3+10*16], m3 mova [r3+11*16], m1 %endif ret .pass2: %if ARCH_X86_32 mov strideq, [rsp+gprsize+12*16] %endif lea dstq, [dstq+strideq*8] neg strideq add dstq, strideq %if ARCH_X86_32 mov [rsp+gprsize+12*16], strideq %endif jmp m(iadst_16x8_internal_16bpc).pass2 INV_TXFM_16X8_FN identity, dct, -54 INV_TXFM_16X8_FN identity, adst, -54 INV_TXFM_16X8_FN identity, flipadst, -54 INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_32 mov [rsp+gprsize+12*16], r1 %endif lea t0, [o(.main)] jmp m(idct_16x8_internal_16bpc).loop_main .main: %if ARCH_X86_64 mova m15, [o(pd_2896)] pmulld m0, m15, [cq+ 0*32+r5] pmulld m1, m15, [cq+ 1*32+r5] pmulld m2, m15, [cq+ 2*32+r5] pmulld m3, m15, [cq+ 3*32+r5] pmulld m4, m15, [cq+ 4*32+r5] pmulld m5, m15, [cq+ 5*32+r5] pmulld m6, m15, [cq+ 6*32+r5] pmulld m7, m15, [cq+ 7*32+r5] pmulld m8, m15, [cq+ 8*32+r5] pmulld m9, m15, [cq+ 9*32+r5] pmulld m10, m15, [cq+10*32+r5] pmulld m11, m15, [cq+11*32+r5] pmulld m12, m15, [cq+12*32+r5] pmulld m13, m15, [cq+13*32+r5] pmulld m14, m15, [cq+14*32+r5] pmulld m15, [cq+15*32+r5] mova [r3], m15 mova m15, [o(pd_2048)] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [r3] REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 mova [r3], m15 mova m15, [o(pd_11586)] REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 pmulld 
m15, [r3] mova [r3], m15 mova m15, [o(pd_6144)] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [r3] REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %else mova m0, [cq+ 0*32+r5] mova m1, [cq+ 1*32+r5] mova m2, [cq+ 2*32+r5] mova m3, [cq+ 3*32+r5] mova m4, [cq+ 4*32+r5] mova m5, [cq+ 5*32+r5] mova m6, [cq+ 6*32+r5] mova m7, [cq+ 7*32+r5] call m(idct_8x4_internal_16bpc).rect2_mul mova [r3], m7 mova m7, [o(pd_11586)] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulld m7, [r3] mova [r3], m7 mova m7, [o(pd_6144)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3] REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 mova [r3+ 8*16], m0 mova [r3+ 9*16], m2 mova [r3+10*16], m4 mova [r3+11*16], m6 mova m0, [cq+ 8*32+r5] mova m1, [cq+ 9*32+r5] mova m2, [cq+10*32+r5] mova m3, [cq+11*32+r5] mova m4, [cq+12*32+r5] mova m5, [cq+13*32+r5] mova m6, [cq+14*32+r5] mova m7, [cq+15*32+r5] call m(idct_8x4_internal_16bpc).rect2_mul mova [r3], m7 mova m7, [o(pd_11586)] REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulld m7, [r3] mova [r3], m7 mova m7, [o(pd_6144)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3] REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 %endif ret .pass2: %if ARCH_X86_32 mov strideq, [rsp+gprsize+12*16] %endif mov r4d, 2 %if ARCH_X86_64 mova m8, [o(pw_4096)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %endif lea r3, [strideq*3] jmp .loop_pass2_entry .loop_pass2: mova m0, [cq+0*32+ 0] mova m1, [cq+1*32+ 0] mova m2, [cq+2*32+ 0] mova m3, [cq+3*32+ 0] .loop_pass2_entry: mova m4, [cq+0*32+16] mova m5, [cq+1*32+16] mova m6, [cq+2*32+16] mova m7, [cq+3*32+16] %if ARCH_X86_64 call m(idct_8x8_internal_16bpc).round1_and_write_8x8 %else mova [rsp+gprsize], m7 mova m7, [o(pw_4096)] call m(idct_8x8_internal_16bpc).round4_and_write_8x8 %endif %if ARCH_X86_64 %define mzero m9 %else %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add dstq, 16 add cq, 4*32 dec r4d jg .loop_pass2 RET %macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix %if ARCH_X86_64 INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16 %else INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16 %endif %ifidn %1_%2, dct_dct imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 add r5d, 640 sar r5d, 10 add rsp, (5+ARCH_X86_64*3+WIN64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, identity, v INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if ARCH_X86_64 DECLARE_REG_TMP 6, 7 %if WIN64 mov [rsp+16*16+gprsize], r7 %endif %elif ARCH_X86_32 DECLARE_REG_TMP 1, 6 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(.main)] .pass1_full: %undef cmp mov t1d, 4 .zero_loop: dec t1d cmp eobb, byte [r5+t1] jb .zero_loop mov r5d, t1d shl r5d, 4 %if ARCH_X86_32 ; restore pic-ptr mov r6, [rsp+16*16+2*gprsize] %endif ; setup stack pointer lea r3, [rsp+gprsize] .loop_pass1: call t0 %if ARCH_X86_64 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+4*64+r5], m8 mova [cq+5*64+r5], m9 mova [cq+6*64+r5], m10 mova [cq+7*64+r5], m11 %else call 
m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+4*64+r5], m0 mova [cq+5*64+r5], m1 mova [cq+6*64+r5], m2 mova [cq+7*64+r5], m3 mova m0, [rsp+gprsize+ 8*16] mova m2, [rsp+gprsize+ 9*16] mova m4, [rsp+gprsize+10*16] mova m6, [rsp+gprsize+11*16] %endif call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+0*64+r5], m0 mova [cq+1*64+r5], m1 mova [cq+2*64+r5], m2 mova [cq+3*64+r5], m3 pxor m0, m0 REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15 sub r5d, 16 jge .loop_pass1 %if ARCH_X86_32 ; restore pic-ptr mov r1, [rsp+16*16+1*gprsize] %endif jmp tx2q .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*64+r5] mova m1, [cq+ 3*64+r5] mova m2, [cq+ 5*64+r5] mova m3, [cq+ 7*64+r5] mova m4, [cq+ 9*64+r5] mova m5, [cq+11*64+r5] mova m6, [cq+13*64+r5] mova m7, [cq+15*64+r5] call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+ 0*64+r5] mova m1, [cq+ 2*64+r5] mova m2, [cq+ 4*64+r5] mova m3, [cq+ 6*64+r5] mova m4, [cq+ 8*64+r5] mova m5, [cq+10*64+r5] mova m6, [cq+12*64+r5] mova m7, [cq+14*64+r5] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round call .round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif ret .round: %if ARCH_X86_64 REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 psrld m8, m11, 10 ; 2 REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 mova m8, [r3+1*16] mova m9, [r3+2*16] mova m10, [r3+3*16] mova m11, [r3+4*16] mova m12, [r3+5*16] mova m13, [r3+6*16] mova m14, [r3+7*16] psubd m15, m0, m14 ; out15 paddd m0, m14 ; out0 psubd m14, m1, m13 ; out14 paddd m1, m13 ; out1 psubd m13, m2, m12 ; out13 paddd m2, m12 ; out2 psubd m12, m3, m11 ; out12 paddd m3, m11 ; out3 psubd m11, m4, m10 ; out11 paddd m4, m10 ; out4 psubd m10, m5, m9 ; out10 paddd m5, m9 ; out5 psubd m9, m6, m8 ; out9 paddd m6, m8 ; out6 psubd m8, m7, [r3+0*16] ; out8 paddd m7, [r3+0*16] ; out7 REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 ; and out0-15 is now in m0-15 %else mova [r3+ 0*16], m0 mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7 pmaxsd m0, [r3+ 0*16] mova [r3+ 0*16], m7 mova m7, [o(clip_18b_max)] REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6 pminsd m7, [r3+ 0*16] mova [r3+ 0*16], m0 mova m0, [o(pd_2)] REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7 paddd m0, [r3+ 0*16] mova [r3+ 0*16], m0 mova [r3+ 1*16], m1 mova [r3+ 2*16], m2 mova m1, [r3+11*16] mova m2, [r3+10*16] psubd m0, m7, m1 paddd m7, m1 psubd m1, m6, m2 paddd m6, m2 REPX {psrad x, 2}, m0, m1, m6, m7 packssdw m0, m1 ; out8-9 packssdw m6, m7 ; out6-7 mova [r3+11*16], m6 mova m1, [r3+9*16] mova m7, [r3+8*16] psubd m2, m5, m1 paddd m5, m1 psubd m1, m4, m7 paddd m4, m7 REPX {psrad x, 2}, m2, m1, m4, m5 packssdw m2, m1 ; out10-11 packssdw m4, m5 ; out4-5 mova m1, [r3+2*16] mova [r3+10*16], m4 mova m6, [r3+7*16] mova m7, [r3+6*16] psubd m4, m3, m6 paddd m3, m6 psubd m6, m1, m7 paddd m1, m7 REPX {psrad x, 2}, m4, m6, m1, m3 packssdw m4, m6 ; out12-13 packssdw m1, m3 ; out2-3 mova m3, [r3+1*16] mova [r3+9*16], m1 mova m1, [r3+0*16] mova m5, [r3+5*16] mova m7, [r3+4*16] psubd m6, m3, m5 paddd m3, m5 psubd m5, m1, m7 paddd m1, m7 REPX {psrad x, 2}, m6, m5, m1, m3 packssdw m6, m5 ; out14-15 packssdw m1, m3 ; out0-1 mova [r3+8*16], m1 %endif ret .pass2: %if ARCH_X86_64 mova m8, 
[o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 2 .loop_pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m0, [cq+0*64+ 0] mova m1, [cq+2*64+ 0] mova m2, [cq+0*64+16] mova m3, [cq+2*64+16] mova m4, [cq+0*64+32] mova m5, [cq+2*64+32] mova m6, [cq+0*64+48] mova m7, [cq+2*64+48] call m_suffix(idct_8x8_internal_8bpc, _ssse3).main mova [rsp+gprsize+3*16], m0 mova [rsp+gprsize+4*16], m1 mova [rsp+gprsize+5*16], m2 mova [rsp+gprsize+6*16], m3 mova [rsp+gprsize+7*16], m4 mova [rsp+gprsize+8*16], m5 mova [rsp+gprsize+9*16], m6 ; m7 is already stored in [rsp+gprsize+0*16] mova m0, [cq+1*64+ 0] mova m1, [cq+3*64+ 0] mova m2, [cq+1*64+16] mova m3, [cq+3*64+16] mova m4, [cq+1*64+32] mova m5, [cq+3*64+32] mova m6, [cq+1*64+48] mova m7, [cq+3*64+48] call m_suffix(idct_16x8_internal_8bpc, _ssse3).main ; out0-7 is in rsp+gprsize+3-10*mmsize ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize %if ARCH_X86_64 lea dstq, [r7+strideq*8] %else mov dstq, [rsp+2*gprsize+16*16] lea dstq, [dstq+strideq*8] %endif call m(idct_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 mov dstq, r7 %else mov dstq, [rsp+2*gprsize+16*16] %endif mova m0, [rsp+gprsize+ 3*16] mova m1, [rsp+gprsize+ 4*16] mova m2, [rsp+gprsize+ 5*16] mova m3, [rsp+gprsize+ 6*16] mova m4, [rsp+gprsize+ 7*16] mova m5, [rsp+gprsize+ 8*16] mova m6, [rsp+gprsize+ 9*16] mova m7, [rsp+gprsize+10*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 %if ARCH_X86_64 add r7, 16 %define mzero m9 %else add dword [rsp+2*gprsize+16*16], 16 %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add cq, 64*4 REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 %undef mzero dec r4d jg .loop_pass2 %if WIN64 mov r7, [rsp+16*16+gprsize] %endif RET INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 mov [rsp+16*16+gprsize], r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(.main)] jmp m(idct_16x16_internal_16bpc).pass1_full .main: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 2*64+r5] mova m1, [cq+13*64+r5] mova m2, [cq+ 6*64+r5] mova m3, [cq+ 9*64+r5] mova m4, [cq+10*64+r5] mova m5, [cq+ 5*64+r5] mova m6, [cq+14*64+r5] mova m7, [cq+ 1*64+r5] call m(iadst_16x4_internal_16bpc).main_part1 mova m0, [cq+ 0*64+r5] mova m1, [cq+15*64+r5] mova m2, [cq+ 4*64+r5] mova m3, [cq+11*64+r5] mova m4, [cq+ 8*64+r5] mova m5, [cq+ 7*64+r5] mova m6, [cq+12*64+r5] mova m7, [cq+ 3*64+r5] call m(iadst_16x4_internal_16bpc).main_part2 call .round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif ret .round: %if ARCH_X86_64 pcmpeqd m8, m8 ; -1 mova m15, [o(pd_10240)] psrld m14, 10 ; +2 psubd m13, m14, m8 ; +3 REPX {pxor x, m8 }, m1, m3, m5, m7 REPX {paddd x, m14}, m0, m2 REPX {paddd x, m13}, m1, m3 REPX {paddd x, m15}, m4, m5, m6, m7 paddd m13, m15, m8 ; +10239 paddd m8, m15, m9 psubd m9, m13, m10 paddd m10, m15, m11 psubd m11, m13, m12 paddd m12, m14, [r3+3*16] psubd m13, m14, [r3+2*16] psubd m15, m14, [r3+0*16] paddd m14, [r3+1*16] REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 %else mova [r3+8*16], 
m1 mova [r3+9*16], m3 mova m3, [o(pd_10240)] pcmpeqd m1, m1 REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m4, m5, m6, m7 REPX {psrad x, 14}, m4, m5, m6, m7 packssdw m4, m5 packssdw m6, m7 mova [r3+10*16], m4 mova [r3+11*16], m6 mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] mova m3, [o(pd_2)] REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m4, m6 psubd m3, m1 REPX {paddd x, m3}, m5, m7 REPX {psrad x, 2 }, m4, m5, m6, m7 packssdw m4, m5 packssdw m6, m7 mova m5, [r3+8*16] mova m7, [r3+9*16] mova [r3+8*16], m4 mova [r3+9*16], m6 mova m3, [o(pd_10240)] REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m0, m5, m2, m7 REPX {psrad x, 14}, m0, m5, m2, m7 packssdw m0, m5 packssdw m2, m7 mova m4, [r3+0*16] mova m5, [r3+1*16] mova m6, [r3+2*16] mova m7, [r3+3*16] mova m3, [o(pd_2)] REPX {pxor x, m1}, m5, m7 REPX {paddd x, m3}, m4, m6 psubd m3, m1 REPX {paddd x, m3}, m5, m7 REPX {psrad x, 2 }, m4, m5, m6, m7 packssdw m4, m5 packssdw m6, m7 %endif ret .pass2: %if ARCH_X86_64 mova m8, [o(pw_2048)] mova m11, [o(pw_m2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 2 .loop_pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m0, [cq+0*64+32] mova m1, [cq+1*64+32] mova m2, [cq+2*64+16] mova m3, [cq+3*64+16] mova m4, [cq+0*64+ 0] mova m5, [cq+1*64+ 0] mova m6, [cq+2*64+48] mova m7, [cq+3*64+48] mova [rsp+gprsize+3*16], m0 mova [rsp+gprsize+4*16], m1 mova [rsp+gprsize+5*16], m2 mova [rsp+gprsize+6*16], m3 mova [rsp+gprsize+7*16], m4 mova [rsp+gprsize+8*16], m5 mova [rsp+gprsize+9*16], m6 mova [rsp+gprsize+10*16], m7 mova m0, [cq+2*64+ 0] mova m1, [cq+3*64+ 0] mova m2, [cq+0*64+16] mova m3, [cq+1*64+16] mova m4, [cq+2*64+32] mova m5, [cq+3*64+32] mova m6, [cq+0*64+48] mova m7, [cq+1*64+48] call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end ; out0-7 is in rsp+gprsize+3-10*mmsize ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize %if ARCH_X86_64 lea dstq, [r7+strideq*8] %else mov dstq, [rsp+2*gprsize+16*16] lea dstq, [dstq+strideq*8] %endif call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 %if ARCH_X86_64 mov dstq, r7 %else mov dstq, [rsp+2*gprsize+16*16] %endif mova m0, [rsp+gprsize+ 3*16] mova m1, [rsp+gprsize+ 4*16] mova m2, [rsp+gprsize+ 5*16] mova m3, [rsp+gprsize+ 6*16] mova m4, [rsp+gprsize+ 7*16] mova m5, [rsp+gprsize+ 8*16] mova m6, [rsp+gprsize+ 9*16] mova m7, [rsp+gprsize+10*16] call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 %if ARCH_X86_64 add r7, 16 %define mzero m9 %else add dword [rsp+2*gprsize+16*16], 16 %define mzero m7 pxor m7, m7 %endif REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 add cq, 64*4 REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 %undef mzero dec r4d jg .loop_pass2 %if WIN64 mov r7, [rsp+16*16+gprsize] %endif RET INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 mov [rsp+16*16+gprsize], r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(.main)] jmp m(idct_16x16_internal_16bpc).pass1_full .main: call m(iadst_16x16_internal_16bpc).main %if ARCH_X86_64 mova m1, m0 mova m3, m2 mova m5, m4 mova m7, m6 pshufd m0, m14, q1032 pshufd m2, m12, q1032 pshufd m4, m10, q1032 pshufd m6, m8, q1032 pshufd m8, m7, q1032 pshufd m10, m5, q1032 pshufd m12, m3, q1032 pshufd m14, m1, q1032 
%else pshufd m1, m0, q1032 pshufd m3, m2, q1032 pshufd m5, m4, q1032 pshufd m7, m6, q1032 pshufd m0, [r3+11*16], q1032 pshufd m2, [r3+10*16], q1032 pshufd m4, [r3+9*16], q1032 pshufd m6, [r3+8*16], q1032 mova [r3+11*16], m1 mova [r3+10*16], m3 mova [r3+ 9*16], m5 mova [r3+ 8*16], m7 %endif ret .pass2: lea r3, [strideq*3] lea r3, [r3*5] add dstq, r3 neg strideq jmp m(iadst_16x16_internal_16bpc).pass2 INV_TXFM_16X16_FN identity, dct, h INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 %if WIN64 mov [rsp+16*16+gprsize], r7 %elif ARCH_X86_32 mov [rsp+16*16+gprsize*1], r1 mov [rsp+16*16+gprsize*2], r6 %endif lea t0, [o(.main)] jmp m(idct_16x16_internal_16bpc).pass1_full .main: %if ARCH_X86_64 mova m15, [o(pd_11586)] pmulld m0, m15, [cq+ 0*64+r5] pmulld m1, m15, [cq+ 1*64+r5] pmulld m2, m15, [cq+ 2*64+r5] pmulld m3, m15, [cq+ 3*64+r5] pmulld m4, m15, [cq+ 4*64+r5] pmulld m5, m15, [cq+ 5*64+r5] pmulld m6, m15, [cq+ 6*64+r5] pmulld m7, m15, [cq+ 7*64+r5] pmulld m8, m15, [cq+ 8*64+r5] pmulld m9, m15, [cq+ 9*64+r5] pmulld m10, m15, [cq+10*64+r5] pmulld m11, m15, [cq+11*64+r5] pmulld m12, m15, [cq+12*64+r5] pmulld m13, m15, [cq+13*64+r5] pmulld m14, m15, [cq+14*64+r5] pmulld m15, [cq+15*64+r5] mova [r3], m15 mova m15, [o(pd_10240)] REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14 paddd m15, [r3] REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m12, m13, m14, m15 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %else mova m7, [o(pd_11586)] pmulld m0, m7, [cq+ 0*64+r5] pmulld m1, m7, [cq+ 1*64+r5] pmulld m2, m7, [cq+ 2*64+r5] pmulld m3, m7, [cq+ 3*64+r5] pmulld m4, m7, [cq+ 4*64+r5] pmulld m5, m7, [cq+ 5*64+r5] pmulld m6, m7, [cq+ 6*64+r5] pmulld m7, [cq+ 7*64+r5] mova [r3], m7 mova m7, [o(pd_10240)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3] REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 mova [r3+8*16], m0 mova [r3+9*16], m2 mova [r3+10*16], m4 mova [r3+11*16], m6 mova m7, [o(pd_11586)] pmulld m0, m7, [cq+ 8*64+r5] pmulld m1, m7, [cq+ 9*64+r5] pmulld m2, m7, [cq+10*64+r5] pmulld m3, m7, [cq+11*64+r5] pmulld m4, m7, [cq+12*64+r5] pmulld m5, m7, [cq+13*64+r5] pmulld m6, m7, [cq+14*64+r5] pmulld m7, [cq+15*64+r5] mova [r3], m7 mova m7, [o(pd_10240)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3] REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 %endif ret .pass2: %if ARCH_X86_64 mova m4, [o(pw_2048)] mova m5, [o(pixel_10bpc_max)] pxor m6, m6 mova m7, [o(pw_1697x16)] mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif mov r5d, 4 lea r3, [strideq*3] .pass2_loop: mova m0, [cq+0*64+0] mova m1, [cq+1*64+0] mova m2, [cq+2*64+0] mova m3, [cq+3*64+0] call m(iidentity_8x16_internal_16bpc).main %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).round1_and_write_8x4 %else call m(idct_8x4_internal_16bpc).round2_and_write_8x4 %endif REPX {mova [cq+x*16], m6}, 0, 4, 8, 12 add cq, 16 lea dstq, [dstq+strideq*4] dec r5w jg .pass2_loop add cq, 64*3 btc r5d, 16 jc .end %if ARCH_X86_64 lea dstq, [r7+16] %else mov dstq, [rsp+2*gprsize+16*16] add dstq, 16 %endif add r5d, 4 jmp .pass2_loop .end: %if WIN64 mov r7, [rsp+16*16+gprsize] %endif RET cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %endif 
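; Identity-identity 8x32 add (16 bpc): identity transforms need no butterflies,
; so both passes collapse into the single pack/round/add loop below. As a rough
; scalar sketch of one coefficient (a reading aid, not a spec):
;   out = clamp(dst + ((sat16(coef) + 5) >> 3), 0, pixel_10bpc_max)
; where sat16() is the packssdw saturation of the 32-bit coefficient, and the
; 4x4 word transpose is handled by the punpck*/punpck*qdq sequence in .main.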
mova m5, [o(pw_5)] mova m7, [o(pixel_10bpc_max)] pxor m6, m6 mov r5d, eobd add eobb, 21 cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192 lea r4, [strideq*3] .loop: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {paddsw x, m5}, m0, m1, m2, m3 REPX {psraw x, 3 }, m0, m1, m2, m3 call .main_zero add cq, 16 lea dstq, [dstq+strideq*4] btc eobd, 16 jnc .loop sub eobd, 64 jge .loop RET ALIGN function_align .main_zero: REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 .main: punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m4 punpcklwd m0, m4 punpckhwd m4, m2, m1 punpcklwd m2, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 paddw m0, [dstq+strideq*0] paddw m1, [dstq+strideq*1] paddw m2, [dstq+strideq*2] paddw m3, [dstq+r4 ] REPX {pmaxsw x, m6}, m0, m1, m2, m3 REPX {pminsw x, m7}, m0, m1, m2, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+r4 ], m3 ret cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %endif mova m5, [o(pw_4096)] mova m7, [o(pixel_10bpc_max)] pxor m6, m6 mov r4d, eobd add eobb, 21 cmovc eobd, r4d lea r4, [strideq*3] mov r5, dstq .loop: mova m0, [cq+32*0] packssdw m0, [cq+32*1] mova m1, [cq+32*2] packssdw m1, [cq+32*3] mova m2, [cq+32*4] packssdw m2, [cq+32*5] mova m3, [cq+32*6] packssdw m3, [cq+32*7] REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmulhrsw x, m5}, m0, m1, m2, m3 call m(inv_txfm_add_identity_identity_8x32_16bpc).main lea dstq, [dstq+strideq*4] add cq, 16 btc eobd, 16 jnc .loop add cq, 32*8-32 add r5, 16 mov dstq, r5 sub eobd, 64 jge .loop RET cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %else mova m8, [o(pw_2896x8)] mova m9, [o(pw_1697x16)] mova m11, [o(pw_8192)] %endif mova m7, [o(pixel_10bpc_max)] lea r4, [strideq*3] pxor m6, m6 %if ARCH_X86_64 paddw m10, m11, m11 ; pw_16384 %endif mov r5, dstq call .main sub eobd, 36 jl .ret add cq, 128*8-32 lea dstq, [r5+16] call .main sub cq, 128*8 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 107 ; eob < 143 jl .ret add cq, 128*8-32 lea dstq, [r5+16] call .main sub cq, 128*8 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 128 ; eob < 271 jl .ret add cq, 128*8-32 lea dstq, [r5+16] call .main sub cq, 128*8 lea dstq, [r5+strideq*8] mov r5, dstq call .main sub eobd, 128 ; eob < 399 jl .ret add cq, 128*8-32 lea dstq, [r5+16] call .main .ret: RET ALIGN function_align .main: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] %if ARCH_X86_64 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 pmulhrsw m4, m9, m0 pmulhrsw m5, m9, m1 REPX {pmulhrsw x, m10}, m4, m5 %else mova m6, [o(pw_2896x8)] REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 mova m5, [o(pw_1697x16)] pmulhrsw m4, m5, m0 pmulhrsw m5, m1 mova m6, [o(pw_16384)] REPX {pmulhrsw x, m6 }, m4, m5 %endif paddsw m0, m4 paddsw m1, m5 %if ARCH_X86_64 pmulhrsw m4, m9, m2 pmulhrsw m5, m9, m3 REPX {pmulhrsw x, m10}, m4, m5 %else mova m5, [o(pw_1697x16)] pmulhrsw m4, m5, m2 pmulhrsw m5, m3 REPX {pmulhrsw x, m6 }, m4, m5 %endif paddsw m2, m4 paddsw m3, m5 %if ARCH_X86_64 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 %else psrlw m6, 1 ; pw_8192 REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 pxor m6, m6 
%endif call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero lea dstq, [dstq+strideq*4] add cq, 16 btc eobd, 16 jnc .main ret cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %else mova m8, [o(pw_2896x8)] mova m9, [o(pw_1697x16)] mova m10, [o(pw_2048)] %endif mova m7, [o(pixel_10bpc_max)] lea r4, [strideq*3] pxor m6, m6 mov r5, dstq call .main sub eobd, 36 jl .ret call .main add cq, 64*8-64 lea dstq, [r5+16*1] call .main sub eobd, 107 ; eob < 143 jl .ret call .main add cq, 64*8-64 lea dstq, [r5+16*2] call .main sub eobd, 128 ; eob < 271 jl .ret call .main add cq, 64*8-64 lea dstq, [r5+16*3] call .main sub eobd, 128 ; eob < 399 jl .ret call .main .ret: RET ALIGN function_align .main: mova m0, [cq+64*0] packssdw m0, [cq+64*1] mova m1, [cq+64*2] packssdw m1, [cq+64*3] mova m2, [cq+64*4] packssdw m2, [cq+64*5] mova m3, [cq+64*6] packssdw m3, [cq+64*7] %if ARCH_X86_64 REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 %else mova m6, [o(pw_2896x8)] REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 %endif REPX {paddsw x, x }, m0, m1, m2, m3 %if ARCH_X86_64 pmulhrsw m4, m9, m0 pmulhrsw m5, m9, m1 %else mova m6, [o(pw_1697x16)] pmulhrsw m4, m6, m0 pmulhrsw m5, m6, m1 %endif REPX {paddsw x, x }, m0, m1 paddsw m0, m4 paddsw m1, m5 %if ARCH_X86_64 pmulhrsw m4, m9, m2 pmulhrsw m5, m9, m3 %else pmulhrsw m4, m6, m2 pmulhrsw m6, m3 %endif REPX {paddsw x, x }, m2, m3 paddsw m2, m4 %if ARCH_X86_64 paddsw m3, m5 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 %else paddsw m3, m6 mova m6, [o(pw_2048)] REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 pxor m6, m6 %endif REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 call m(inv_txfm_add_identity_identity_8x32_16bpc).main lea dstq, [dstq+strideq*4] add cq, 16 btc eobd, 16 jnc .main ret cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob %undef cmp %if ARCH_X86_32 LEA r6, $$ %endif mova m5, [o(pw_8192)] mova m7, [o(pixel_10bpc_max)] pxor m6, m6 lea r4, [strideq*3] mov r5, dstq call .main ; 0 cmp eobd, 36 jl .ret add cq, 128*8-32 ; 0 1 lea dstq, [r5+16] ; 1 call .main call .main2 cmp eobd, 136 jl .ret add cq, 128*16-64 ; 0 1 2 lea dstq, [r5+16*2] ; 1 2 call .main ; 2 call .main2 call .main2 cmp eobd, 300 jl .ret add cq, 128*24-96 ; 0 1 2 3 add r5, 16*3 ; 1 2 3 mov dstq, r5 ; 2 3 call .main ; 3 call .main2 call .main2 call .main2 cmp eobd, 535 jl .ret add cq, 128*24-96 ; 0 1 2 3 lea dstq, [r5+strideq*8] ; 1 2 3 4 mov r5, dstq ; 2 3 4 call .main ; 3 4 call .main2 call .main2 cmp eobd, 755 jl .ret add cq, 128*16-64 ; 0 1 2 3 lea dstq, [r5+strideq*8] ; 1 2 3 4 mov r5, dstq ; 2 3 4 5 call .main ; 3 4 5 call .main2 cmp eobd, 911 jl .ret add cq, 128*8-32 ; 0 1 2 3 lea dstq, [r5+strideq*8] ; 1 2 3 4 call .main ; 2 3 4 5 .ret: ; 3 4 5 6 RET ALIGN function_align .main2: sub cq, 128*8 sub dstq, 16 .main: mova m0, [cq+128*0] packssdw m0, [cq+128*1] mova m1, [cq+128*2] packssdw m1, [cq+128*3] mova m2, [cq+128*4] packssdw m2, [cq+128*5] mova m3, [cq+128*6] packssdw m3, [cq+128*7] REPX {pmulhrsw x, m5}, m0, m1, m2, m3 call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero lea dstq, [dstq+strideq*4] add cq, 16 btc eobd, 16 jnc .main ret cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \ dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %define base $$ DECLARE_REG_TMP 0, 4 %else lea r6, [tbl_Nx32_odd_offset] %define base tbl_Nx32_odd_offset DECLARE_REG_TMP 4, 7 %if WIN64 mov [rsp+gprsize*1+35*16], r7 %endif %endif %define o2(x) r6-base+x test eobd, eobd jz .dconly %if ARCH_X86_32 mov [rsp+gprsize*1+35*16], r0 
%endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_8x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 mova [rsp+ 3*16+r5*8], m0 mova [rsp+11*16+r5*8], m0 mova [rsp+ 3*16+t0*8], m0 mova [rsp+ 3*16+t1*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_8x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+35*16], eobd mov r3, rsp .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+0*128+r5*8] mova m1, [cq+1*128+r5*8] mova m2, [cq+2*128+r5*8] mova m3, [cq+3*128+r5*8] mova m4, [cq+4*128+r5*8] mova m5, [cq+5*128+r5*8] mova m6, [cq+6*128+r5*8] mova m7, [cq+7*128+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 mova m1, [o(pd_2)] REPX {paddd x, m1}, m0, m6, m5, m3 call m(idct_8x4_internal_16bpc).round REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 mova [r3+ 3*16+r5*8], m0 mova [r3+11*16+r5*8], m2 mova [r3+ 3*16+t1*8], m1 mova [r3+ 3*16+t0*8], m3 pxor m7, m7 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7 sub r5d, 2 jge .loop_pass1 ; pass 2 code starts here ; m0 is already loaded from last iteration of first pass %if ARCH_X86_32 mov r0, [rsp+gprsize*1+35*16] %endif mov eobd, [rsp+gprsize*0+35*16] cmp eobd, 43 jl .load_veryfast cmp eobd, 107 jl .load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: call .pass2 %if WIN64 mov r7, [rsp+gprsize*1+35*16] %endif RET .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m1, [rsp+gprsize+16* 4] mova m2, [rsp+gprsize+16* 5] mova m3, [rsp+gprsize+16* 6] mova m4, [rsp+gprsize+16* 7] mova m5, [rsp+gprsize+16* 8] mova m6, [rsp+gprsize+16* 9] mova m7, [rsp+gprsize+16*10] call m_suffix(idct_8x8_internal_8bpc, _ssse3).main mova [rsp+gprsize+ 3*16], m0 mova [rsp+gprsize+ 4*16], m1 mova [rsp+gprsize+ 5*16], m2 mova [rsp+gprsize+ 6*16], m3 mova [rsp+gprsize+ 7*16], m4 mova [rsp+gprsize+ 8*16], m5 mova [rsp+gprsize+ 9*16], m6 mova m0, [rsp+gprsize+11*16] mova m1, [rsp+gprsize+12*16] mova m2, [rsp+gprsize+13*16] mova m3, [rsp+gprsize+14*16] mova m4, [rsp+gprsize+15*16] mova m5, [rsp+gprsize+16*16] mova m6, [rsp+gprsize+17*16] mova m7, [rsp+gprsize+18*16] call m_suffix(idct_16x8_internal_8bpc, _ssse3).main mova m7, [rsp+gprsize+ 0*16] mova [rsp+gprsize+11*16], m0 mova [rsp+gprsize+12*16], m1 mova [rsp+gprsize+13*16], m2 mova [rsp+gprsize+14*16], m3 mova [rsp+gprsize+15*16], m4 mova [rsp+gprsize+16*16], m5 mova [rsp+gprsize+17*16], m6 mova [rsp+gprsize+18*16], m7 call r4 %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %endif lea r3, [strideq*3] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 lea dstq, [dstq+strideq*8] mova m0, [rsp+gprsize+11*16] mova m1, [rsp+gprsize+12*16] mova m2, [rsp+gprsize+13*16] mova m3, [rsp+gprsize+14*16] mova m4, [rsp+gprsize+15*16] mova m5, [rsp+gprsize+16*16] mova m6, [rsp+gprsize+17*16] mova m7, [rsp+gprsize+18*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 lea dstq, [dstq+strideq*8] mova m0, [rsp+gprsize+19*16] 
mova m1, [rsp+gprsize+20*16] mova m2, [rsp+gprsize+21*16] mova m3, [rsp+gprsize+22*16] mova m4, [rsp+gprsize+23*16] mova m5, [rsp+gprsize+24*16] mova m6, [rsp+gprsize+25*16] mova m7, [rsp+gprsize+26*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 lea dstq, [dstq+strideq*8] mova m0, [rsp+gprsize+27*16] mova m1, [rsp+gprsize+28*16] mova m2, [rsp+gprsize+29*16] mova m3, [rsp+gprsize+30*16] mova m4, [rsp+gprsize+31*16] mova m5, [rsp+gprsize+32*16] mova m6, [rsp+gprsize+33*16] mova m7, [rsp+gprsize+34*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 add r5d, 640 sar r5d, 10 add rsp, (31+2*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2 cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 mov [rsp+gprsize*1+76*16], r0 %elif WIN64 mov [rsp+gprsize*1+76*16], r7 %endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 mova [rsp+12*16+r5*8], m0 mova [rsp+20*16+r5*8], m0 mova [rsp+12*16+t0*8], m0 mova [rsp+12*16+t1*8], m0 mova [rsp+44*16+r5*8], m0 mova [rsp+52*16+r5*8], m0 mova [rsp+44*16+t0*8], m0 mova [rsp+44*16+t1*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+76*16], eobd mov r3, rsp .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*128+r5*8] mova m1, [cq+ 3*128+r5*8] mova m2, [cq+ 5*128+r5*8] mova m3, [cq+ 7*128+r5*8] mova m4, [cq+ 9*128+r5*8] mova m5, [cq+11*128+r5*8] mova m6, [cq+13*128+r5*8] mova m7, [cq+15*128+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+ 0*128+r5*8] mova m1, [cq+ 2*128+r5*8] mova m2, [cq+ 4*128+r5*8] mova m3, [cq+ 6*128+r5*8] mova m4, [cq+ 8*128+r5*8] mova m5, [cq+10*128+r5*8] mova m6, [cq+12*128+r5*8] mova m7, [cq+14*128+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round call m(idct_16x4_internal_16bpc).round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif call m(idct_8x4_internal_16bpc).transpose4x8packed movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 %if ARCH_X86_64 mova [rsp+12*16+r5*8], m0 mova [rsp+20*16+r5*8], m2 mova [rsp+12*16+t1*8], m1 mova [rsp+12*16+t0*8], m3 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+44*16+r5*8], m8 mova [rsp+52*16+r5*8], m10 mova [rsp+44*16+t1*8], m9 mova [rsp+44*16+t0*8], m11 %else mova [rsp+44*16+r5*8], m0 mova [rsp+52*16+r5*8], m2 mova [rsp+44*16+t1*8], m1 mova [rsp+44*16+t0*8], m3 mova m0, [r3+ 8*16] mova m2, [r3+ 9*16] mova m4, [r3+10*16] mova m6, [r3+11*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+12*16+r5*8], m0 mova [rsp+20*16+r5*8], m2 mova [rsp+12*16+t1*8], m1 mova [rsp+12*16+t0*8], m3 %endif pxor m7, m7 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 sub r5d, 2 jge .loop_pass1 ; pass=2 add rsp, 9*16 %if ARCH_X86_64 mov r6, dstq %else mov dstq, [rsp+gprsize*1+67*16] %endif mov eobd, [rsp+gprsize*0+67*16] cmp eobd, 44 jl .load_veryfast cmp eobd, 151 jl 
.load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: %if ARCH_X86_64 lea r2, [dstq+32] mov r7, -4 %else lea r2, [rsp+67*16] mov dword [r2+0*gprsize], 2 %endif jmp .loop_pass2_entry .loop_pass2: mova m0, [rsp+16* 3] .loop_pass2_entry: %if ARCH_X86_32 mov dstq, [r2+1*gprsize] %endif call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2 add rsp, 32*16 %if ARCH_X86_64 add r7, 2 lea dstq, [r2+r7*8] jl .loop_pass2 %if WIN64 mov r7, [rsp+gprsize*1+3*16] %endif %else add dword [r2+1*gprsize], 16 dec dword [r2+0*gprsize] jg .loop_pass2 %endif %assign stack_size (stack_size-73*16) %if STACK_ALIGNMENT >= 16 %assign stack_size_padded (stack_size_padded-73*16) %assign stack_offset (stack_offset-73*16) %else %xdefine rstkm [rsp + stack_size] %endif RET .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add r5d, 128 sar r5d, 8 imul r5d, 181 add rsp, (65+4*ARCH_X86_64)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ dst, stride, c, eob %if ARCH_X86_32 LEA r6, $$ %endif test eobd, eobd jz .dconly ; remove entirely-zero iterations %undef cmp %if ARCH_X86_64 xor r5d, r5d cmp eobd, 10 setge r5b %else mov r5d, 1 cmp eobd, 10 sbb r5d, 0 %endif add r5d, r5d ; actual first pass after skipping all-zero data .loop_pass1: mova m0, [cq+32* 1+r5*8] mova m1, [cq+32* 7+r5*8] mova m2, [cq+32* 9+r5*8] mova m3, [cq+32*15+r5*8] mova m4, [cq+32*17+r5*8] mova m5, [cq+32*23+r5*8] mova m6, [cq+32*25+r5*8] mova m7, [cq+32*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp call .main_oddhalf_part1 mova m0, [cq+32* 3+r5*8] mova m1, [cq+32* 5+r5*8] mova m2, [cq+32*11+r5*8] mova m3, [cq+32*13+r5*8] mova m4, [cq+32*19+r5*8] mova m5, [cq+32*21+r5*8] mova m6, [cq+32*27+r5*8] mova m7, [cq+32*29+r5*8] call .main_oddhalf_part2 mova m0, [cq+32* 2+r5*8] mova m1, [cq+32* 6+r5*8] mova m2, [cq+32*10+r5*8] mova m3, [cq+32*14+r5*8] mova m4, [cq+32*18+r5*8] mova m5, [cq+32*22+r5*8] mova m6, [cq+32*26+r5*8] mova m7, [cq+32*30+r5*8] add r3, 16*(16+4*ARCH_X86_32) call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+32* 0+r5*8] mova m1, [cq+32* 4+r5*8] mova m2, [cq+32* 8+r5*8] mova m3, [cq+32*12+r5*8] mova m4, [cq+32*16+r5*8] mova m5, [cq+32*20+r5*8] mova m6, [cq+32*24+r5*8] mova m7, [cq+32*28+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call .round_dct32 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32* 8+r5*8], m8 mova [cq+32* 9+r5*8], m9 mova [cq+32*10+r5*8], m10 mova [cq+32*11+r5*8], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32* 4+r5*8], m8 mova [cq+32* 5+r5*8], m9 mova [cq+32* 6+r5*8], m10 mova [cq+32* 7+r5*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+32*12+r5*8], m8 mova [cq+32*13+r5*8], m9 mova [cq+32*14+r5*8], m10 mova [cq+32*15+r5*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, 
[r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32* 4+r5*8], m0 mova [cq+32* 5+r5*8], m1 mova [cq+32* 6+r5*8], m2 mova [cq+32* 7+r5*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32* 8+r5*8], m0 mova [cq+32* 9+r5*8], m1 mova [cq+32*10+r5*8], m2 mova [cq+32*11+r5*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+32*12+r5*8], m0 mova [cq+32*13+r5*8], m1 mova [cq+32*14+r5*8], m2 mova [cq+32*15+r5*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif pxor m7, m7 ; clear lower half of [cq] REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 test r5d, r5d jz .end_pass1 mova [cq+32* 0+r5*8], m0 mova [cq+32* 1+r5*8], m1 mova [cq+32* 2+r5*8], m2 mova [cq+32* 3+r5*8], m3 sub r5d, 2 jmp .loop_pass1 .end_pass1: ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code mov r4d, 4 call m(idct_16x8_internal_16bpc).pass2_main RET .main_oddhalf_part1_fast: ; lower half zero pmulld m7, m0, [o(pd_4091)] pmulld m0, [o(pd_201)] pmulld m4, m3, [o(pd_m2751)] %if ARCH_X86_32 pmulld m3, [o(pd_3035)] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m7 REPX {psrad x, 12}, m0, m7 mova [r3+3*16], m7 mova m7, m3 mova m3, m5 %else pmulld m3, [o(pd_3035)] %endif pmulld m6, m1, [o(pd_m1380)] pmulld m1, [o(pd_3857)] pmulld m5, m2, [o(pd_3703)] pmulld m2, [o(pd_1751)] jmp .main_oddhalf_part1_fast2 .main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 %if ARCH_X86_64 ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a .main_oddhalf_part1_fast2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 psubd m8, m0, m4 ; t17 paddd m0, m4 ; t16 psubd m4, m6, m2 ; t18 paddd m6, m2 ; t19 psubd m2, m1, m5 ; t29 paddd m1, m5 ; t28 psubd m5, m7, m3 ; t30 paddd m7, m3 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 mova m15, [o(pd_4017)] mova m10, [o(pd_799)] ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m1 ; t28a paddd m7, m1 ; t31a psubd m1, m5, m4 ; t18 paddd m5, m4 ; t17 psubd m4, m8, m2 ; t29 paddd m8, m2 ; t30 REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 mova [r3+16*0], m0 mova [r3+16*1], m5 mova [r3+16*2], m4 mova [r3+16*3], m6 mova [r3+16*4], m3 mova [r3+16*5], m1 mova [r3+16*6], m8 mova [r3+16*7], m7 %else mova [r3+0*16], 
m2 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m5 mova m3, [o(pd_2048)] ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a mova m4, [r3+2*16] mova m5, [r3+3*16] mova [r3+2*16], m6 mova [r3+3*16], m7 mova m2, [r3+0*16] mova m7, [r3+1*16] mova [r3+0*16], m0 mova [r3+1*16], m1 ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a mova m0, [r3+0*16] mova m1, [r3+1*16] mova m6, [r3+2*16] .main_oddhalf_part1_fast2: REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7 REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7 psubd m3, m0, m4 ; t17 mova [r3+0*16], m3 mova m3, [r3+3*16] paddd m0, m4 ; t16 psubd m4, m6, m2 ; t18 paddd m6, m2 ; t19 psubd m2, m1, m5 ; t29 paddd m1, m5 ; t28 psubd m5, m3, m7 ; t30 paddd m7, m3 ; t31 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+0*16] mova [r3+0*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m1 mova [r3+2*16], m6 mova [r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a psubd m1, m5, m4 ; t18 paddd m5, m4 ; t17 psubd m4, m3, m2 ; t29 paddd m3, m2 ; t30 mova m0, [r3+0*16] mova m2, [r3+1*16] mova m6, [r3+2*16] mova m7, [r3+3*16] mova [r3+0*16], m3 psubd m3, m0, m6 ; t19a paddd m0, m6 ; t16a psubd m6, m7, m2 ; t28a paddd m7, m2 ; t31a mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+0*16] mova [r3+0*16], m2 mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+0*16] mova [r3+16*0], m0 mova [r3+16*1], m5 mova [r3+16*6], m2 mova [r3+16*7], m7 mova m7, [o(pd_2048)] ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28 mova [r3+16*2], m4 mova [r3+16*3], m6 mova [r3+16*4], m3 mova [r3+16*5], m1 %endif ret .main_oddhalf_part2_fast: ; lower half zero pmulld m7, m0, [o(pd_m601)] pmulld m0, [o(pd_4052)] pmulld m4, m3, [o(pd_3290)] %if ARCH_X86_32 pmulld m3, [o(pd_2440)] mova m5, [o(pd_2048)] REPX {paddd x, m5}, m0, m7 REPX {psrad x, 12}, m0, m7 mova [r3+11*16], m7 mova m7, m3 mova m3, m5 %else pmulld m3, [o(pd_2440)] %endif pmulld m6, m1, [o(pd_3973)] pmulld m1, [o(pd_995)] pmulld m5, m2, [o(pd_m2106)] pmulld m2, [o(pd_3513)] jmp .main_oddhalf_part2_fast2 .main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 %if ARCH_X86_64 ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a .main_oddhalf_part2_fast2: REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 psubd m8, m0, m4 ; t25 paddd m0, m4 ; t24 psubd m4, m6, m2 ; t26 paddd m6, m2 ; t27 psubd m2, m1, m5 ; t21 paddd m1, m5 ; t20 psubd m5, m7, m3 ; t22 paddd m7, m3 ; t23 REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 mova m15, [o(pd_2276)] mova m10, [o(pd_3406)] ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m1 ; t20a paddd m7, m1 ; t23a psubd m1, m5, m4 ; t21 paddd m5, m4 ; t22 psubd m4, m8, m2 ; t26 paddd m8, m2 ; t25 REPX {pmaxsd x, m12}, m3, m6, m1, 
m4, m0, m7, m5, m8 REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 mova m15, [o(pd_3784)] mova m10, [o(pd_1567)] ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20 mova m9, [r3+16*0] ; t16a mova m10, [r3+16*1] ; t17 psubd m2, m9, m7 ; t23 paddd m9, m7 ; t16 psubd m7, m10, m5 ; t22a paddd m10, m5 ; t17a REPX {pmaxsd x, m12}, m9, m10, m2, m7 REPX {pminsd x, m13}, m9, m10, m2, m7 mova [r3+16*0], m9 mova [r3+16*1], m10 mova m9, [r3+16*2] ; t18a mova m10, [r3+16*3] ; t19 psubd m5, m9, m1 ; t21 paddd m9, m1 ; t18 psubd m1, m10, m6 ; t20a paddd m10, m6 ; t19a REPX {pmaxsd x, m12}, m9, m10, m5, m1 REPX {pminsd x, m13}, m9, m10, m5, m1 mova [r3+16*2], m9 mova [r3+16*3], m10 mova m9, [r3+16*4] ; t28 mova m10, [r3+16*5] ; t29a psubd m6, m9, m3 ; t27a paddd m9, m3 ; t28a psubd m3, m10, m4 ; t26 paddd m10, m4 ; t29 REPX {pmaxsd x, m12}, m9, m10, m6, m3 REPX {pminsd x, m13}, m9, m10, m6, m3 REPX {pmulld x, m14}, m6, m3, m1, m5 paddd m6, m11 paddd m3, m11 psubd m4, m6, m1 ; t20 paddd m6, m1 ; t27 psubd m1, m3, m5 ; t21a paddd m3, m5 ; t26a REPX {psrad x, 12 }, m4, m1, m3, m6 mova [r3+16*4], m4 mova [r3+16*5], m1 mova m4, [r3+16*6] ; t30 mova m1, [r3+16*7] ; t31a psubd m5, m4, m8 ; t25a paddd m4, m8 ; t30a psubd m8, m1, m0 ; t24 paddd m1, m0 ; t31 REPX {pmaxsd x, m12}, m8, m5, m4, m1 REPX {pminsd x, m13}, m8, m5, m4, m1 REPX {pmulld x, m14}, m5, m8, m7, m2 paddd m5, m11 paddd m8, m11 psubd m0, m5, m7 ; t22 paddd m5, m7 ; t25 psubd m7, m8, m2 ; t23a paddd m2, m8 ; t24a REPX {psrad x, 12 }, m0, m7, m2, m5 mova [r3+16*6], m0 mova [r3+16*7], m7 mova [r3+16*8], m2 mova [r3+16*9], m5 mova [r3+16*10], m3 mova [r3+16*11], m6 mova [r3+16*12], m9 mova [r3+16*13], m10 mova [r3+16*14], m4 mova [r3+16*15], m1 %else mova [r3+ 8*16], m2 mova [r3+ 9*16], m3 mova [r3+10*16], m4 mova [r3+11*16], m5 mova m3, [o(pd_2048)] ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a mova m2, [r3+ 8*16] mova m4, [r3+10*16] mova m5, [r3+11*16] mova [r3+ 8*16], m0 mova [r3+10*16], m6 mova [r3+11*16], m7 mova m7, [r3+ 9*16] mova [r3+ 9*16], m1 ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a mova m0, [r3+ 8*16] mova m1, [r3+ 9*16] mova m6, [r3+10*16] .main_oddhalf_part2_fast2: REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6 REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6 psubd m3, m0, m4 ; t25 mova [r3+ 8*16], m3 mova m3, [r3+11*16] paddd m0, m4 ; t24 psubd m4, m6, m2 ; t26 paddd m6, m2 ; t27 psubd m2, m1, m5 ; t21 paddd m1, m5 ; t20 psubd m5, m3, m7 ; t22 paddd m7, m3 ; t23 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pmaxsd m3, [r3+ 8*16] mova [r3+ 8*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7 pminsd m3, [r3+ 8*16] mova [r3+ 8*16], m0 mova [r3+ 9*16], m1 mova [r3+10*16], m6 mova [r3+11*16], m7 mova m7, [o(pd_2048)] ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a psubd m1, m5, m4 ; t21 paddd m5, m4 ; t22 psubd m4, m3, m2 ; t26 paddd m3, m2 ; t25 mova m0, [r3+ 8*16] mova m2, [r3+ 9*16] mova m6, [r3+10*16] mova m7, [r3+11*16] mova [r3+ 8*16], m3 psubd m3, m0, m6 ; t27a paddd m0, m6 ; t24a psubd m6, m7, m2 ; t20a paddd m7, m2 ; t23a mova m2, [o(clip_18b_min)] REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pmaxsd m2, [r3+ 8*16] mova [r3+ 8*16], m2 mova m2, [o(clip_18b_max)] REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5 pminsd m2, [r3+ 
8*16] mova [r3+ 8*16], m0 mova [r3+ 9*16], m2 mova [r3+14*16], m5 mova [r3+15*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20 mova [r3+10*16], m3 mova m0, [o(clip_18b_min)] mova m2, [o(clip_18b_max)] mova m5, [r3+16*2] ; t18a mova m7, [r3+16*3] ; t19 psubd m3, m5, m1 ; t21 paddd m5, m1 ; t18 psubd m1, m7, m6 ; t20a paddd m7, m6 ; t19a REPX {pmaxsd x, m0}, m5, m7, m3, m1 REPX {pminsd x, m2}, m5, m7, m3, m1 mova [r3+16*2], m5 mova [r3+16*3], m7 mova [r3+11*16], m3 mova m3, [r3+10*16] mova m5, [r3+16*4] ; t28 mova m7, [r3+16*5] ; t29a psubd m6, m5, m3 ; t27a paddd m5, m3 ; t28a psubd m3, m7, m4 ; t26 paddd m7, m4 ; t29 REPX {pmaxsd x, m0}, m5, m7, m6, m3 REPX {pminsd x, m2}, m5, m7, m6, m3 mova [r3+16*12], m5 mova [r3+16*13], m7 mova m5, [o(pd_2048)] mova m7, [o(pd_2896)] mova m4, [r3+11*16] REPX {pmulld x, m7}, m6, m3, m1, m4 paddd m6, m5 paddd m3, m5 psubd m5, m6, m1 ; t20 paddd m6, m1 ; t27 psubd m1, m3, m4 ; t21a paddd m3, m4 ; t26a REPX {psrad x, 12}, m5, m1, m3, m6 mova [r3+16*4], m5 mova [r3+16*5], m1 mova [r3+16*10], m3 mova [r3+16*11], m6 mova m5, [r3+14*16] mova m6, [r3+15*16] mova m3, [r3+16*0] ; t16a mova m4, [r3+16*1] ; t17 psubd m1, m3, m6 ; t23 paddd m3, m6 ; t16 psubd m6, m4, m5 ; t22a paddd m4, m5 ; t17a REPX {pmaxsd x, m0}, m3, m4, m1, m6 REPX {pminsd x, m2}, m3, m4, m1, m6 mova [r3+16*0], m3 mova [r3+16*1], m4 mova m5, [r3+ 8*16] mova m3, [r3+ 9*16] mova [r3+ 8*16], m1 mova [r3+ 9*16], m6 mova m4, [r3+16*6] ; t30 mova m1, [r3+16*7] ; t31a psubd m6, m1, m5 ; t24 paddd m1, m5 ; t31 psubd m5, m4, m3 ; t25a paddd m4, m3 ; t30a REPX {pmaxsd x, m0}, m6, m5, m4, m1 REPX {pminsd x, m2}, m6, m5, m4, m1 mova [r3+16*14], m4 mova [r3+16*15], m1 mova m4, [o(pd_2048)] mova m1, [r3+ 9*16] mova m2, [r3+ 8*16] REPX {pmulld x, m7}, m5, m6, m1, m2 paddd m5, m4 paddd m6, m4 psubd m0, m5, m1 ; t22 paddd m5, m1 ; t25 psubd m1, m6, m2 ; t23a paddd m2, m6 ; t24a REPX {psrad x, 12}, m0, m1, m2, m5 mova [r3+16*6], m0 mova [r3+16*7], m1 mova [r3+16*8], m2 mova [r3+16*9], m5 %endif ret ; final sumsub for idct16 as well as idct32, plus final downshift %macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx mova m%4, [r3+16*(23-%1)] pmaxsd m%1, m12 pminsd m%1, m13 psubd m%3, m%1, m%4 ; idct16 out15 - n paddd m%1, m%4 ; idct16 out0 + n pmaxsd m%1, m12 pmaxsd m%3, m12 pminsd m%1, m13 pminsd m%3, m13 paddd m%1, m11 paddd m%3, m11 mova m%5, [r3+16*( 0+%1)] mova m%2, [r3+16*(15-%1)] psubd m%4, m%1, m%2 ; out31 - n paddd m%1, m%2 ; out0 + n paddd m%2, m%3, m%5 ; out15 - n psubd m%3, m%5 ; out16 + n REPX {psrad x, %6}, m%1, m%3, m%2, m%4 %endmacro .round_dct32: %if ARCH_X86_64 psrld m11, 10 ; pd_2 IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31 mova [r3+ 0*16], m6 mova [r3+23*16], m7 IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30 packssdw m0, m1 ; 0 1 packssdw m14, m15 ; 14 15 packssdw m8, m6 ; 16 17 packssdw m7, m9 ; 30 31 mova [r3+16*15], m14 mova [r3+16*14], m7 IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29 IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28 packssdw m2, m3 ; 2 3 packssdw m14, m15 ; 12 13 packssdw m10, m1 ; 18 19 packssdw m9, m7 ; 28 29 mova [r3+16*13], m14 mova [r3+16*12], m9 IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27 IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26 packssdw m4, m5 ; 4 5 packssdw m14, m15 ; 10 11 packssdw m1, m3 ; 20 21 packssdw m9, m7 ; 26 27 mova [r3+16*11], m14 mova [r3+16*10], m9 mova m6, [r3+ 0*16] mova m7, [r3+23*16] IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25 IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24 
packssdw m6, m7 ; 6 7 packssdw m11, m15 ; 8 9 packssdw m14, m3 ; 22 23 packssdw m9, m5 ; 24 25 mova [r3+16*9], m11 mova [r3+16*8], m9 mova m12, m1 ret %else mova [r3+16*16], m0 mova [r3+17*16], m1 mova [r3+18*16], m2 mova [r3+19*16], m3 mova [r3+20*16], m4 mova [r3+21*16], m5 mova [r3+22*16], m6 mova [r3+23*16], m7 mova m1, [o(pd_2)] mova m2, [o(clip_18b_min)] mova m3, [o(clip_18b_max)] mov r4, 15*16 .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] pmaxsd m0, m2 pminsd m0, m3 psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 pmaxsd m5, m2 pminsd m0, m3 pminsd m5, m3 paddd m0, m1 paddd m5, m1 mova m7, [r3] mova m4, [r3+r4] psubd m6, m0, m4 ; out31 - n paddd m0, m4 ; out0 + n paddd m4, m5, m7 ; out15 - n psubd m5, m7 ; out16 + n REPX {psrad x, 2}, m0, m5, m4, m6 mova [r3], m0 mova [r3+r4], m4 mova [r3+16*16], m5 mova [r3+24*16], m6 add r3, 16 sub r4, 32 jg .loop_dct32_end ret %endif .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 8 .dconly1: add r5d, 640 sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 .dconly_loop: mova m1, [dstq+16*0] mova m2, [dstq+16*1] mova m3, [dstq+16*2] mova m4, [dstq+16*3] REPX {paddw x, m0}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 REPX {pmaxsw x, m5}, m1, m2, m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m2 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, strideq dec r3d jg .dconly_loop RET cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %undef cmp mov r5d, 8 .zero_loop: sub r5d, 2 cmp eobw, word [o2(tbl_32x16_2d)+r5] jl .zero_loop ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+64* 1+r5*8] mova m1, [cq+64* 7+r5*8] mova m2, [cq+64* 9+r5*8] mova m3, [cq+64*15+r5*8] mova m4, [cq+64*17+r5*8] mova m5, [cq+64*23+r5*8] mova m6, [cq+64*25+r5*8] mova m7, [cq+64*31+r5*8] mov r3, rsp call m(idct_8x4_internal_16bpc).rect2_mul call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+64* 3+r5*8] mova m1, [cq+64* 5+r5*8] mova m2, [cq+64*11+r5*8] mova m3, [cq+64*13+r5*8] mova m4, [cq+64*19+r5*8] mova m5, [cq+64*21+r5*8] mova m6, [cq+64*27+r5*8] mova m7, [cq+64*29+r5*8] %if ARCH_X86_32 add r3, 16*8 %endif call m(idct_8x4_internal_16bpc).rect2_mul %if ARCH_X86_32 sub r3, 16*8 %endif call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 add r3, 16*(16+4*ARCH_X86_32) mova m0, [cq+64* 2+r5*8] mova m1, [cq+64* 6+r5*8] mova m2, [cq+64*10+r5*8] mova m3, [cq+64*14+r5*8] mova m4, [cq+64*18+r5*8] mova m5, [cq+64*22+r5*8] mova m6, [cq+64*26+r5*8] mova m7, [cq+64*30+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+64* 0+r5*8] mova m1, [cq+64* 4+r5*8] mova m2, [cq+64* 8+r5*8] mova m3, [cq+64*12+r5*8] mova m4, [cq+64*16+r5*8] mova m5, [cq+64*20+r5*8] mova m6, [cq+64*24+r5*8] mova m7, [cq+64*28+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call .round_dct32 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+64* 8+r5*8], m8 mova [cq+64* 9+r5*8], m9 mova [cq+64*10+r5*8], m10 mova [cq+64*11+r5*8], m11 
mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+64* 4+r5*8], m8 mova [cq+64* 5+r5*8], m9 mova [cq+64* 6+r5*8], m10 mova [cq+64* 7+r5*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [cq+64*12+r5*8], m8 mova [cq+64*13+r5*8], m9 mova [cq+64*14+r5*8], m10 mova [cq+64*15+r5*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64* 4+r5*8], m0 mova [cq+64* 5+r5*8], m1 mova [cq+64* 6+r5*8], m2 mova [cq+64* 7+r5*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64* 8+r5*8], m0 mova [cq+64* 9+r5*8], m1 mova [cq+64*10+r5*8], m2 mova [cq+64*11+r5*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [cq+64*12+r5*8], m0 mova [cq+64*13+r5*8], m1 mova [cq+64*14+r5*8], m2 mova [cq+64*15+r5*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [cq+64* 0+r5*8], m0 mova [cq+64* 1+r5*8], m1 mova [cq+64* 2+r5*8], m2 mova [cq+64* 3+r5*8], m3 pxor m0, m0 REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 sub r5d, 2 jge .loop_pass1 ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code call .pass2 RET .pass2: %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %if WIN64 mov [rsp+16*16+gprsize], r7 %endif mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 4 jmp m(idct_16x16_internal_16bpc).loop_pass2 .round_dct32: %if ARCH_X86_64 psrld m11, 11 ; pd_1 IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31 mova [r3+ 0*16], m6 mova [r3+23*16], m7 IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30 packssdw m0, m1 ; 0 1 packssdw m14, m15 ; 14 15 packssdw m8, m6 ; 16 17 packssdw m7, m9 ; 30 31 mova [r3+16*15], m14 mova [r3+16*14], m7 IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29 IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28 packssdw m2, m3 ; 2 3 packssdw m14, m15 ; 12 13 packssdw m10, m1 ; 18 19 packssdw m9, m7 ; 28 29 mova [r3+16*13], m14 mova [r3+16*12], m9 IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27 IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26 packssdw m4, m5 ; 4 5 packssdw m14, m15 ; 10 11 packssdw m1, m3 ; 20 21 packssdw m9, m7 ; 26 27 mova [r3+16*11], m14 mova [r3+16*10], m9 mova m6, [r3+ 0*16] mova m7, [r3+23*16] IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25 IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24 packssdw m6, m7 ; 6 7 packssdw m11, m15 ; 8 9 packssdw m14, m3 ; 22 23 packssdw m9, m5 ; 24 25 mova [r3+16*9], m11 mova [r3+16*8], m9 mova m12, m1 ret %else mova [r3+16*16], m0 mova [r3+17*16], m1 mova [r3+18*16], m2 mova [r3+19*16], m3 mova [r3+20*16], m4 mova [r3+21*16], m5 
mova [r3+22*16], m6 mova [r3+23*16], m7 pcmpeqd m1, m1 ; -1 mova m2, [o(clip_18b_min)] mova m3, [o(clip_18b_max)] mov r4, 15*16 .loop_dct32_end: mova m0, [r3+16*16] mova m6, [r3+16*24] psubd m5, m0, m6 ; idct16 out15 - n paddd m0, m6 ; idct16 out0 + n pmaxsd m0, m2 pmaxsd m5, m2 pminsd m0, m3 pminsd m5, m3 psubd m0, m1 psubd m5, m1 mova m7, [r3] mova m4, [r3+r4] psubd m6, m0, m4 ; out31 - n paddd m0, m4 ; out0 + n paddd m4, m5, m7 ; out15 - n psubd m5, m7 ; out16 + n REPX {psrad x, 1}, m0, m5, m4, m6 mova [r3], m0 mova [r3+r4], m4 mova [r3+16*16], m5 mova [r3+24*16], m6 add r3, 16 sub r4, 32 jg .loop_dct32_end ret %endif .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %if ARCH_X86_32 mov [rsp+5*32*16+1*gprsize], dstq %elif WIN64 mov [rsp+5*32*16+1*gprsize], r7 %endif %undef cmp mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 mova [rsp+32*16+r5*8+0*32*16], m0 mova [rsp+40*16+r5*8+0*32*16], m0 mova [rsp+32*16+t0*8+0*32*16], m0 mova [rsp+32*16+t1*8+0*32*16], m0 mova [rsp+32*16+r5*8+1*32*16], m0 mova [rsp+40*16+r5*8+1*32*16], m0 mova [rsp+32*16+t0*8+1*32*16], m0 mova [rsp+32*16+t1*8+1*32*16], m0 mova [rsp+32*16+r5*8+2*32*16], m0 mova [rsp+40*16+r5*8+2*32*16], m0 mova [rsp+32*16+t0*8+2*32*16], m0 mova [rsp+32*16+t1*8+2*32*16], m0 mova [rsp+32*16+r5*8+3*32*16], m0 mova [rsp+40*16+r5*8+3*32*16], m0 mova [rsp+32*16+t0*8+3*32*16], m0 mova [rsp+32*16+t1*8+3*32*16], m0 sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+5*32*16], eobd .loop_pass1: mova m0, [cq+128* 1+r5*8] mova m1, [cq+128* 7+r5*8] mova m2, [cq+128* 9+r5*8] mova m3, [cq+128*15+r5*8] mova m4, [cq+128*17+r5*8] mova m5, [cq+128*23+r5*8] mova m6, [cq+128*25+r5*8] mova m7, [cq+128*31+r5*8] %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128* 5+r5*8] mova m2, [cq+128*11+r5*8] mova m3, [cq+128*13+r5*8] mova m4, [cq+128*19+r5*8] mova m5, [cq+128*21+r5*8] mova m6, [cq+128*27+r5*8] mova m7, [cq+128*29+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128* 6+r5*8] mova m2, [cq+128*10+r5*8] mova m3, [cq+128*14+r5*8] mova m4, [cq+128*18+r5*8] mova m5, [cq+128*22+r5*8] mova m6, [cq+128*26+r5*8] mova m7, [cq+128*30+r5*8] add r3, 16*(16+4*ARCH_X86_32) call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 4+r5*8] mova m2, [cq+128* 8+r5*8] mova m3, [cq+128*12+r5*8] mova m4, [cq+128*16+r5*8] mova m5, [cq+128*20+r5*8] mova m6, [cq+128*24+r5*8] mova m7, [cq+128*28+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32 movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+2*32*16], m8 mova [rsp+40*16+r5*8+2*32*16], m10 mova 
[rsp+32*16+t1*8+2*32*16], m9 mova [rsp+32*16+t0*8+2*32*16], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+1*32*16], m8 mova [rsp+40*16+r5*8+1*32*16], m10 mova [rsp+32*16+t1*8+1*32*16], m9 mova [rsp+32*16+t0*8+1*32*16], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+32*16+r5*8+3*32*16], m8 mova [rsp+40*16+r5*8+3*32*16], m10 mova [rsp+32*16+t1*8+3*32*16], m9 mova [rsp+32*16+t0*8+3*32*16], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+1*32*16], m0 mova [rsp+40*16+r5*8+1*32*16], m2 mova [rsp+32*16+t1*8+1*32*16], m1 mova [rsp+32*16+t0*8+1*32*16], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+2*32*16], m0 mova [rsp+40*16+r5*8+2*32*16], m2 mova [rsp+32*16+t1*8+2*32*16], m1 mova [rsp+32*16+t0*8+2*32*16], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+32*16+r5*8+3*32*16], m0 mova [rsp+40*16+r5*8+3*32*16], m2 mova [rsp+32*16+t1*8+3*32*16], m1 mova [rsp+32*16+t0*8+3*32*16], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif pxor m7, m7 ; clear lower half of [cq] REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 mova [rsp+32*16+r5*8+0*32*16], m0 mova [rsp+40*16+r5*8+0*32*16], m2 mova [rsp+32*16+t1*8+0*32*16], m1 mova [rsp+32*16+t0*8+0*32*16], m3 sub r5d, 2 jge .loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+5*32*16] add rsp, 29*16 cmp eobd, 36 jl .load_veryfast cmp eobd, 136 jl .load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: %if ARCH_X86_64 lea r2, [dstq+64] mov r7, -8 %else lea r2, [rsp+(4*32+3)*16] mov dword [r2+0*gprsize], 4 %endif jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add rsp, (5*32+1-(24+8*ARCH_X86_32))*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1 cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0 mov [rsp+gprsize*1+(64*2+12)*16], r0 mov [rsp+gprsize*2+(64*2+12)*16], r1 mov [rsp+gprsize*3+(64*2+12)*16], r2 %else DECLARE_REG_TMP 8, 9, 4, 7 mov [rsp+gprsize*1+(64*2+12)*16], r9 %if WIN64 mov [rsp+gprsize*2+(64*2+12)*16], r7 mov 
[rsp+gprsize*3+(64*2+12)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 mova [rsp+12*16+t0*8], m0 mova [rsp+12*16+t1*8], m0 mova [rsp+12*16+t2*8], m0 mova [rsp+12*16+t3*8], m0 mova [rsp+76*16+t0*8], m0 mova [rsp+76*16+t1*8], m0 mova [rsp+76*16+t2*8], m0 mova [rsp+76*16+t3*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_16x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov [rsp+gprsize*0+(64*2+12)*16], eobd mov r3, rsp %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 6, 0 mov r2, [rsp+gprsize*3+(64*2+12)*16] mov [rsp+gprsize*3+(64*2+12)*16], r6 %endif .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+ 1*128+r5*8] mova m1, [cq+ 3*128+r5*8] mova m2, [cq+ 5*128+r5*8] mova m3, [cq+ 7*128+r5*8] mova m4, [cq+ 9*128+r5*8] mova m5, [cq+11*128+r5*8] mova m6, [cq+13*128+r5*8] mova m7, [cq+15*128+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+ 0*128+r5*8] mova m1, [cq+ 2*128+r5*8] mova m2, [cq+ 4*128+r5*8] mova m3, [cq+ 6*128+r5*8] mova m4, [cq+ 8*128+r5*8] mova m5, [cq+10*128+r5*8] mova m6, [cq+12*128+r5*8] mova m7, [cq+14*128+r5*8] call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round call m(idct_16x16_internal_16bpc).round %if ARCH_X86_64 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 packssdw m8, m9 packssdw m10, m11 packssdw m12, m13 packssdw m14, m15 %endif call m(idct_8x4_internal_16bpc).transpose4x8packed movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 %if ARCH_X86_64 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+76*16+t0*8], m8 mova [rsp+76*16+t1*8], m9 mova [rsp+76*16+t2*8], m10 mova [rsp+76*16+t3*8], m11 %else mova [rsp+76*16+t0*8], m0 mova [rsp+76*16+t1*8], m1 mova [rsp+76*16+t2*8], m2 mova [rsp+76*16+t3*8], m3 mova m0, [rsp+ 8*16] mova m2, [rsp+ 9*16] mova m4, [rsp+10*16] mova m6, [rsp+11*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [rsp+12*16+t0*8], m0 mova [rsp+12*16+t1*8], m1 mova [rsp+12*16+t2*8], m2 mova [rsp+12*16+t3*8], m3 %if ARCH_X86_32 mov r6, [rsp+gprsize*3+(64*2+12)*16] %endif pxor m7, m7 REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 sub r5d, 2 jge .loop_pass1 ; pass=2 mov eobd, [rsp+gprsize*0+(64*2+12)*16] cmp eobd, 151 jl .fast ; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: add rsp, 9*16 %if ARCH_X86_64 lea r2, [dstq+32] mov r7, -4 %else lea r2, [rsp+(64*2+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 2 %endif .loop_pass2: %if ARCH_X86_32 mov dstq, [r2+1*gprsize] %endif call .pass2 add rsp, 64*16 %if ARCH_X86_64 add r7, 2 lea dstq, [r2+r7*8] jl .loop_pass2 %else add dword [r2+1*gprsize], 16 dec dword [r2+0*gprsize] jg .loop_pass2 %endif %assign stack_size (stack_size-(64*2+9)*16) %if 
STACK_ALIGNMENT >= 16 %assign stack_size_padded (stack_size_padded-(64*2+9)*16) %assign stack_offset (stack_offset-(64*2+9)*16) %else %xdefine rstkm [rsp + stack_size] %endif %if ARCH_X86_64 mov r9, [rsp+gprsize*1+3*16] %if WIN64 mov r7, [rsp+gprsize*2+3*16] mov r8, [rsp+gprsize*3+3*16] %endif %endif RET .pass2: %if ARCH_X86_32 lea r5, [o(itx8_start)] %endif mova m0, [rsp+gprsize+16* 3] mova m1, [rsp+gprsize+16* 4] mova m2, [rsp+gprsize+16* 5] mova m3, [rsp+gprsize+16* 6] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m_suffix(idct_8x8_internal_8bpc, _ssse3).main mova [rsp+gprsize+ 3*16], m0 mova [rsp+gprsize+ 4*16], m1 mova [rsp+gprsize+ 5*16], m2 mova [rsp+gprsize+ 6*16], m3 mova [rsp+gprsize+ 7*16], m4 mova [rsp+gprsize+ 8*16], m5 mova [rsp+gprsize+ 9*16], m6 mova [rsp+gprsize+10*16], m7 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] mova m2, [rsp+gprsize+16*13] mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m_suffix(idct_16x8_internal_8bpc, _ssse3).main mova m7, [rsp+gprsize+ 0*16] mova [rsp+gprsize+11*16], m0 mova [rsp+gprsize+12*16], m1 mova [rsp+gprsize+13*16], m2 mova [rsp+gprsize+14*16], m3 mova [rsp+gprsize+15*16], m4 mova [rsp+gprsize+16*16], m5 mova [rsp+gprsize+17*16], m6 mova [rsp+gprsize+18*16], m7 %if ARCH_X86_64 call r8 %else call [r2+4*gprsize] %endif mova [rsp+gprsize+ 3*16], m0 mova [rsp+gprsize+ 5*16], m2 mova [rsp+gprsize+ 8*16], m5 mova [rsp+gprsize+10*16], m7 %if ARCH_X86_64 call r9 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %else call [r2+5*gprsize] %endif lea r3, [strideq*3] lea r4, [rsp+gprsize+ 3*16] %if ARCH_X86_64 mov r6d, 8 %else mov dword [r2+2*gprsize], 8 %endif .loop_write: mova m0, [r4+0*16] mova m1, [r4+1*16] mova m2, [r4+2*16] mova m3, [r4+3*16] mova m4, [r4+4*16] mova m5, [r4+5*16] mova m6, [r4+6*16] mova m7, [r4+7*16] call m(idct_8x8_internal_16bpc).round1_and_write_8x8 lea dstq, [dstq+strideq*8] add r4, 8*16 %if ARCH_X86_64 dec r6d %else dec dword [r2+2*gprsize] %endif jg .loop_write ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add r5d, 640 sar r5d, 10 add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0 mov [rsp+gprsize*1+(64*4+32)*16], r0 mov [rsp+gprsize*2+(64*4+32)*16], r1 mov [rsp+gprsize*3+(64*4+32)*16], r2 %else DECLARE_REG_TMP 8, 9, 4, 7 mov [rsp+gprsize*1+(64*4+32)*16], r9 %if WIN64 mov [rsp+gprsize*2+(64*4+32)*16], r7 mov [rsp+gprsize*3+(64*4+32)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 7*2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 mova [rsp+ 32*16+t0*8], m0 mova [rsp+ 32*16+t1*8], m0 mova [rsp+ 32*16+t2*8], m0 mova [rsp+ 32*16+t3*8], m0 mova [rsp+ 96*16+t0*8], m0 mova [rsp+ 96*16+t1*8], m0 mova [rsp+ 96*16+t2*8], m0 mova [rsp+ 96*16+t3*8], m0 mova [rsp+160*16+t0*8], m0 mova [rsp+160*16+t1*8], m0 mova [rsp+160*16+t2*8], m0 mova [rsp+160*16+t3*8], m0 mova [rsp+224*16+t0*8], m0 mova [rsp+224*16+t1*8], m0 mova [rsp+224*16+t2*8], m0 mova [rsp+224*16+t3*8], m0 sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: ; actual first pass after skipping all-zero data mov 
[rsp+gprsize*0+(64*4+32)*16], eobd mov r3, rsp %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 6, 0 mov r2, [rsp+gprsize*3+(64*4+32)*16] mov [rsp+gprsize*3+(64*4+32)*16], r6 %endif .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mova m0, [cq+128* 1+r5*8] mova m1, [cq+128* 7+r5*8] mova m2, [cq+128* 9+r5*8] mova m3, [cq+128*15+r5*8] mova m4, [cq+128*17+r5*8] mova m5, [cq+128*23+r5*8] mova m6, [cq+128*25+r5*8] mova m7, [cq+128*31+r5*8] mov r3, rsp call m(idct_8x4_internal_16bpc).rect2_mul call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128* 5+r5*8] mova m2, [cq+128*11+r5*8] mova m3, [cq+128*13+r5*8] mova m4, [cq+128*19+r5*8] mova m5, [cq+128*21+r5*8] mova m6, [cq+128*27+r5*8] mova m7, [cq+128*29+r5*8] %if ARCH_X86_32 add r3, 16*8 %endif call m(idct_8x4_internal_16bpc).rect2_mul %if ARCH_X86_32 sub r3, 16*8 %endif call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2 add r3, 16*(16+4*ARCH_X86_32) mova m0, [cq+128* 2+r5*8] mova m1, [cq+128* 6+r5*8] mova m2, [cq+128*10+r5*8] mova m3, [cq+128*14+r5*8] mova m4, [cq+128*18+r5*8] mova m5, [cq+128*22+r5*8] mova m6, [cq+128*26+r5*8] mova m7, [cq+128*30+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_16x4_internal_16bpc).main_oddhalf mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 4+r5*8] mova m2, [cq+128* 8+r5*8] mova m3, [cq+128*12+r5*8] mova m4, [cq+128*16+r5*8] mova m5, [cq+128*20+r5*8] mova m6, [cq+128*24+r5*8] mova m7, [cq+128*28+r5*8] call m(idct_8x4_internal_16bpc).rect2_mul call m(idct_8x4_internal_16bpc).main_pass1 call m(idct_8x4_internal_16bpc).round sub r3, 16*(16+4*ARCH_X86_32) call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32 movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 %if ARCH_X86_64 call m(idct_8x4_internal_16bpc).transpose4x8packed call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+160*16+t0*8], m8 mova [rsp+160*16+t1*8], m9 mova [rsp+160*16+t2*8], m10 mova [rsp+160*16+t3*8], m11 mova m8, [r3+16* 9] ; 8 9 mova m10, [r3+16*11] ; 10 11 mova m12, [r3+16*13] ; 12 13 mova m14, [r3+16*15] ; 14 15 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+ 96*16+t0*8], m8 mova [rsp+ 96*16+t1*8], m9 mova [rsp+ 96*16+t2*8], m10 mova [rsp+ 96*16+t3*8], m11 mova m8, [r3+16* 8] ; 24 25 mova m10, [r3+16*10] ; 26 27 mova m12, [r3+16*12] ; 28 29 mova m14, [r3+16*14] ; 30 31 call m(idct_16x4_internal_16bpc).transpose4x8packed_hi mova [rsp+224*16+t0*8], m8 mova [rsp+224*16+t1*8], m9 mova [rsp+224*16+t2*8], m10 mova [rsp+224*16+t3*8], m11 %else sub r3, 8*16 mova m0, [r3+ 8*16] mova m2, [r3+10*16] mova m4, [r3+12*16] mova m6, [r3+14*16] packssdw m0, [r3+ 9*16] packssdw m2, [r3+11*16] packssdw m4, [r3+13*16] packssdw m6, [r3+15*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+ 96*16+t0*8], m0 mova [rsp+ 96*16+t1*8], m1 mova [rsp+ 96*16+t2*8], m2 mova [rsp+ 96*16+t3*8], m3 mova m0, [r3+16*16] mova m2, [r3+18*16] mova m4, [r3+20*16] mova m6, [r3+22*16] packssdw m0, [r3+17*16] packssdw m2, [r3+19*16] packssdw m4, [r3+21*16] packssdw m6, [r3+23*16] call m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+160*16+t0*8], m0 mova [rsp+160*16+t1*8], m1 mova [rsp+160*16+t2*8], m2 mova [rsp+160*16+t3*8], m3 mova m0, [r3+31*16] mova m2, [r3+29*16] mova m4, [r3+27*16] mova m6, [r3+25*16] packssdw m0, [r3+30*16] packssdw m2, [r3+28*16] packssdw m4, [r3+26*16] packssdw m6, [r3+24*16] call 
m(idct_8x4_internal_16bpc).transpose4x8packed mova [rsp+224*16+t0*8], m0 mova [rsp+224*16+t1*8], m1 mova [rsp+224*16+t2*8], m2 mova [rsp+224*16+t3*8], m3 mova m0, [r3+ 0*16] mova m2, [r3+ 2*16] mova m4, [r3+ 4*16] mova m6, [r3+ 6*16] packssdw m0, [r3+ 1*16] packssdw m2, [r3+ 3*16] packssdw m4, [r3+ 5*16] packssdw m6, [r3+ 7*16] call m(idct_8x4_internal_16bpc).transpose4x8packed %endif mova [rsp+ 32*16+t0*8], m0 mova [rsp+ 32*16+t1*8], m1 mova [rsp+ 32*16+t2*8], m2 mova [rsp+ 32*16+t3*8], m3 pxor m0, m0 REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 %if ARCH_X86_32 mov r6, [rsp+gprsize*3+(64*4+32)*16] %endif sub r5d, 2 jge .loop_pass1 ; pass=2 mov eobd, [rsp+gprsize*0+(64*4+32)*16] cmp eobd, 136 jl .fast ; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: add rsp, 29*16 %if ARCH_X86_64 lea r2, [dstq+64] mov r7, -8 %else lea r2, [rsp+(64*4+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 4 %endif jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly ; remove entirely-zero iterations %undef cmp mov r5d, 8 .zero_loop: sub r5d, 2 cmp eobw, word [o2(tbl_32x16_2d)+r5] jl .zero_loop ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+64* 1+r5*8] mova m1, [cq+64*31+r5*8] mova m2, [cq+64*17+r5*8] mova m3, [cq+64*15+r5*8] call .main_part1 mova m0, [cq+64* 7+r5*8] mova m1, [cq+64*25+r5*8] mova m2, [cq+64*23+r5*8] mova m3, [cq+64* 9+r5*8] call .main_part1 mova m0, [cq+64* 5+r5*8] mova m1, [cq+64*27+r5*8] mova m2, [cq+64*21+r5*8] mova m3, [cq+64*11+r5*8] call .main_part1 mova m0, [cq+64* 3+r5*8] mova m1, [cq+64*29+r5*8] mova m2, [cq+64*19+r5*8] mova m3, [cq+64*13+r5*8] call .main_part1 call .main_part2 mova m0, [cq+64* 2+r5*8] mova m1, [cq+64*14+r5*8] mova m2, [cq+64*18+r5*8] mova m3, [cq+64*30+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+64* 6+r5*8] mova m1, [cq+64*10+r5*8] mova m2, [cq+64*22+r5*8] mova m3, [cq+64*26+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+64* 4+r5*8] mova m1, [cq+64*12+r5*8] mova m2, [cq+64*20+r5*8] mova m3, [cq+64*28+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+64* 0+r5*8] mova m1, [cq+64* 8+r5*8] mova m2, [cq+64*16+r5*8] mova m3, [cq+64*24+r5*8] call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova 
[r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 10 ; pd_2 %else mova m7, [o(pd_2)] %endif call .main_end_loop_start lea r3, [rsp+56*16] lea r4, [cq+r5*8+64*28] call .shift_transpose sub r5d, 2 jge .loop_pass1 ; pass=2, we need to call this otherwise the stack pointer has ; the wrong offset in the 8-bit code call .pass2 RET .pass2: %if ARCH_X86_64 mova m8, [o(pw_2048)] pxor m9, m9 mova m10, [o(pixel_10bpc_max)] %if WIN64 mov [rsp+16*16+gprsize], r7 %endif mov r7, dstq %else mov [rsp+2*gprsize+16*16], dstq %endif lea r3, [strideq*3] mov r4d, 8 jmp m(idct_16x16_internal_16bpc).loop_pass2 .main_part1: ; idct64 steps 1-5 ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a %if ARCH_X86_64 movd m7, [r4+4*0] movd m8, [r4+4*1] movd m6, [r4+4*2] movd m9, [r4+4*3] movd m5, [r4+4*4] movd m10, [r4+4*5] movd m4, [r4+4*6] movd m15, [r4+4*7] REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 pmulld m7, m0 ; t63a pmulld m0, m8 ; t32a pmulld m6, m1 ; t62a pmulld m1, m9 ; t33a pmulld m5, m2 ; t61a pmulld m2, m10 ; t34a pmulld m4, m3 ; t60a pmulld m3, m15 ; t35a movd m10, [r4+4*8] movd m15, [r4+4*9] REPX {pshufd x, x, q0000}, m10, m15 REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 psubd m8, m0, m1 ; t33 paddd m0, m1 ; t32 psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m5 ; t61 paddd m4, m5 ; t60 REPX {pmaxsd x, m12}, m8, m1, m6, m2 REPX {pminsd x, m13}, m8, m1, m6, m2 ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a REPX {pmaxsd x, m12}, m0, m3, m7, m4 REPX {pminsd x, m13}, m0, m3, m7, m4 movd m10, [r4+4*10] movd m15, [r4+4*11] REPX {pshufd x, x, q0000}, m10, m15 psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m8, m2 ; t61 paddd m8, m2 ; t62 REPX {pmaxsd x, m12}, m5, m3, m4, m6 REPX {pminsd x, m13}, m5, m3, m4, m6 ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60 ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a REPX {pmaxsd x, m12}, m0, m7, m1, m8 REPX {pminsd x, m13}, m0, m7, m1, m8 add r4, 4*12 mova [r3+16*0], m0 mova [r3+16*7], m7 mova [r3+16*1], m1 mova [r3+16*6], m8 mova [r3+16*2], m6 mova [r3+16*5], m4 mova [r3+16*3], m3 mova [r3+16*4], m5 %else movd m7, [r4+4*0] movd m6, [r4+4*2] movd m5, [r4+4*4] movd m4, [r4+4*6] REPX {pshufd x, x, q0000}, m7, m6, m5, m4 pmulld m7, m0 ; t63a pmulld m6, m1 ; t62a pmulld m5, m2 ; t61a pmulld m4, m3 ; t60a mova [r3+0*16], m6 mova [r3+1*16], m7 movd m6, [r4+4*1] movd m7, [r4+4*3] REPX {pshufd x, x, q0000}, m7, m6 pmulld m0, m6 ; t32a pmulld m1, m7 ; t33a movd m6, [r4+4*5] movd m7, [r4+4*7] REPX {pshufd x, x, q0000}, m7, m6 pmulld m2, m6 ; t34a pmulld m3, m7 ; t35a mova m6, [r3+0*16] mova m7, [o(pd_2048)] REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 paddd m7, [r3+1*16] REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4 mova [r3+0*16], m5 psubd m5, m0, m1 ; t33 paddd m0, m1 ; t32 mova [r3+1*16], m0 mova m0, [r3+0*16] psubd m1, m7, m6 ; t62 paddd m7, m6 ; t63 psubd m6, m3, m2 ; t34 paddd m3, m2 ; t35 psubd m2, m4, m0 ; t61 paddd m4, m0 ; t60 mova m0, [o(clip_18b_min)] REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pmaxsd m0, [r3+1*16] mova [r3+0*16], m0 mova m0, [o(clip_18b_max)] REPX 
{pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4 pminsd m0, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m3 mova [r3+2*16], m4 mova [r3+3*16], m7 mova m0, [o(pd_2048)] movd m3, [r4+4*8] movd m4, [r4+4*9] REPX {pshufd x, x, q0000}, m3, m4 mova [r3+4*16], m2 ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a mova m2, [r3+4*16] mova [r3+4*16], m5 ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a mova m0, [r3+0*16] mova m3, [r3+1*16] mova m4, [r3+2*16] mova m7, [r3+3*16] psubd m5, m0, m3 ; t35a paddd m0, m3 ; t32a mova [r3+0*16], m5 mova m5, [r3+4*16] psubd m3, m7, m4 ; t60a paddd m7, m4 ; t63a psubd m4, m1, m6 ; t34 paddd m1, m6 ; t33 psubd m6, m5, m2 ; t61 paddd m2, m5 ; t62 mova m5, [o(clip_18b_min)] REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pmaxsd m5, [r3+0*16] mova [r3+0*16], m5 mova m5, [o(clip_18b_max)] REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2 pminsd m5, [r3+0*16] mova [r3+16*0], m0 mova [r3+16*7], m7 mova [r3+16*1], m1 mova [r3+16*6], m2 mova [r3+16*2], m4 mova m7, [o(pd_2048)] movd m0, [r4+4*10] movd m1, [r4+4*11] REPX {pshufd x, x, q0000}, m0, m1 ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60 mova [r3+16*3], m3 mova [r3+16*4], m5 mova m4, [r3+2*16] ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a add r4, 4*12 mova [r3+16*2], m6 mova [r3+16*5], m4 %endif add r3, 16*8 ret .main_part2: ; idct64 steps 6-9 lea r4, [r3+16*7] %if ARCH_X86_64 mova m10, [o(pd_1567)] mova m15, [o(pd_3784)] .main_part2_loop: mova m0, [r3-16*32] ; t32a mova m1, [r4-16*24] ; t39a mova m2, [r4-16*32] ; t63a mova m3, [r3-16*24] ; t56a mova m4, [r3-16*16] ; t40a mova m5, [r4-16* 8] ; t47a mova m6, [r4-16*16] ; t55a mova m7, [r3-16* 8] ; t48a psubd m8, m0, m1 ; t39 paddd m0, m1 ; t32 psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 REPX {pmaxsd x, m12}, m8, m1, m3, m4 REPX {pminsd x, m13}, m8, m1, m3, m4 ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a REPX {pmaxsd x, m12}, m0, m2, m5, m7 REPX {pminsd x, m13}, m0, m5, m2, m7 psubd m6, m2, m7 ; t48a paddd m2, m7 ; t63a psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m8, m4 ; t55 paddd m8, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 REPX {pmaxsd x, m12}, m6, m7, m5, m4 REPX {pminsd x, m13}, m6, m7, m5, m4 REPX {pmulld x, m14}, m6, m7, m5, m4 REPX {pmaxsd x, m12}, m2, m0, m8, m1 REPX {pminsd x, m13}, m2, m0, m8, m1 paddd m6, m11 paddd m5, m11 psubd m3, m6, m7 ; t47 paddd m6, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m3, m6, m7, m5 mova [r4-16* 8], m2 mova [r3-16*32], m0 mova [r3-16* 8], m8 mova [r4-16*32], m1 mova [r4-16*24], m3 mova [r3-16*16], m6 mova [r3-16*24], m7 mova [r4-16*16], m5 %else .main_part2_loop: mova m0, [r3-16*32] ; t32a mova m1, [r4-16*24] ; t39a mova m2, [r4-16*32] ; t63a mova m3, [r3-16*24] ; t56a mova m4, [r3-16*16] ; t40a mova m5, [r4-16* 8] ; t47a mova m6, [r4-16*16] ; t55a psubd m7, m0, m1 ; t39 paddd m0, m1 ; t32 mova [r3+0*16], m7 mova m7, [r3-16* 8] ; t48a psubd m1, m2, m3 ; t56 paddd m2, m3 ; t63 psubd m3, m5, m4 ; t40 paddd m5, m4 ; t47 psubd m4, m7, m6 ; t55 paddd m7, m6 ; t48 mova m6, [o(clip_18b_min)] REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pmaxsd m6, [r3+0*16] mova [r3+0*16], m6 mova m6, [o(clip_18b_max)] REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7 pminsd m6, [r3+0*16] mova [r3+0*16], m0 mova [r3+1*16], m2 mova [r3+2*16], m5 mova [r3+3*16], m7 mova m0, [o(pd_2048)] ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a ITX_MULSUB_2D 4, 
3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a mova m2, [r3+1*16] mova m7, [r3+3*16] psubd m5, m2, m7 ; t48a paddd m2, m7 ; t63a mova [r3+1*16], m5 mova m0, [r3+0*16] mova m5, [r3+2*16] psubd m7, m0, m5 ; t47a paddd m0, m5 ; t32a psubd m5, m6, m4 ; t55 paddd m6, m4 ; t56 psubd m4, m1, m3 ; t40 paddd m1, m3 ; t39 mova m3, [o(clip_18b_min)] REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pmaxsd m3, [r3+1*16] mova [r3+0*16], m3 mova m3, [o(clip_18b_max)] REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1 pminsd m3, [r3+0*16] mova [r4-16* 8], m2 mova [r3-16*32], m0 mova [r3-16* 8], m6 mova [r4-16*32], m1 mova m0, [o(pd_2896)] mova m1, [o(pd_2048)] REPX {pmulld x, m0}, m3, m7, m5, m4 REPX {paddd x, m1}, m3, m5 psubd m6, m3, m7 ; t47 paddd m3, m7 ; t48 psubd m7, m5, m4 ; t40a paddd m5, m4 ; t55a REPX {psrad x, 12}, m6, m3, m7, m5 mova [r4-16*24], m6 mova [r3-16*16], m3 mova [r3-16*24], m7 mova [r4-16*16], m5 %endif add r3, 16 sub r4, 16 cmp r3, r4 jl .main_part2_loop sub r3, 4*16 ret .main_end_loop: mova m0, [r3+16*28] ; idct8 0 + n .main_end_loop_start: mova m2, [r3+16*12] ; idct32 16 + n mova m3, [r4+16*12] ; idct32 31 - n %if ARCH_X86_64 mova m1, [r4+16*28] ; idct16 15 - n mova m4, [r4-16* 4] ; idct64 63 - n mova m5, [r3-16* 4] ; idct64 48 + n mova m6, [r4-16*20] ; idct64 47 - n mova m7, [r3-16*20] ; idct64 32 + n pmaxsd m0, m12 pminsd m0, m13 paddd m8, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m12}, m8, m0 REPX {pminsd x, m13}, m8, m0 paddd m1, m8, m3 ; idct32 out0 + n psubd m8, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, m12}, m1, m8, m3, m0 REPX {pminsd x, m13}, m1, m3, m8, m0 REPX {paddd x, m15}, m1, m3, m0, m8 paddd m2, m1, m4 ; idct64 out0 + n (unshifted) psubd m1, m4 ; idct64 out63 - n (unshifted) paddd m4, m3, m5 ; idct64 out15 - n (unshifted) psubd m3, m5 ; idct64 out48 + n (unshifted) paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m8, m7 ; idct64 out31 - n (unshifted) psubd m8, m7 ; idct64 out32 + n (unshifted) mova [r3-16*20], m2 mova [r4+16*28], m1 mova [r4-16*20], m4 mova [r3+16*28], m3 mova [r3-16* 4], m5 mova [r4+16*12], m0 mova [r4-16* 4], m6 mova [r3+16*12], m8 %else mova m5, [o(clip_18b_min)] mova m6, [o(clip_18b_max)] mova m1, [r3+16*44] ; idct16 15 - n pmaxsd m0, m5 pminsd m0, m6 paddd m4, m0, m1 ; idct16 out0 + n psubd m0, m1 ; idct16 out15 - n REPX {pmaxsd x, m5}, m4, m0 REPX {pminsd x, m6}, m4, m0 paddd m1, m4, m3 ; idct32 out0 + n psubd m4, m3 ; idct32 out31 - n paddd m3, m0, m2 ; idct32 out15 - n psubd m0, m2 ; idct32 out16 + n REPX {pmaxsd x, m5}, m1, m4, m3, m0 REPX {pminsd x, m6}, m1, m3, m4, m0 REPX {paddd x, m7}, m1, m3, m0, m4 mova m5, [r4-16* 4] ; idct64 63 - n mova m6, [r3-16* 4] ; idct64 48 + n paddd m2, m1, m5 ; idct64 out0 + n (unshifted) psubd m1, m5 ; idct64 out63 - n (unshifted) paddd m5, m3, m6 ; idct64 out15 - n (unshifted) psubd m3, m6 ; idct64 out48 + n (unshifted) mova [r4+16*28], m1 mova [r3+16*28], m3 mova m6, [r4-16*20] ; idct64 47 - n mova m1, [r3-16*20] ; idct64 32 + n mova [r3-16*20], m2 mova [r4-16*20], m5 paddd m5, m0, m6 ; idct64 out16 + n (unshifted) psubd m0, m6 ; idct64 out47 - n (unshifted) paddd m6, m4, m1 ; idct64 out31 - n (unshifted) psubd m4, m1 ; idct64 out32 + n (unshifted) mova [r3-16* 4], m5 mova [r4+16*12], m0 mova [r4-16* 4], m6 mova [r3+16*12], m4 %endif sub r4, 16 add r3, 16 cmp r3, r4 jl .main_end_loop ret .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, 
[r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [r4+0*64], m0 mova [r4+1*64], m1 mova [r4+2*64], m2 mova [r4+3*64], m3 sub r4, 4*64 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 16 .dconly1: add r5d, 640 sar r5d, 10 .dconly2: imul r5d, 2896 add r5d, 34816 movd m0, r5d pshuflw m0, m0, q1111 punpcklqdq m0, m0 mova m6, [o(pixel_10bpc_max)] pxor m5, m5 .dconly_loop: paddw m1, m0, [dstq+16*0] paddw m2, m0, [dstq+16*1] paddw m3, m0, [dstq+16*2] paddw m4, m0, [dstq+16*3] REPX {pmaxsw x, m5}, m1, m2, m3, m4 REPX {pminsw x, m6}, m1, m2, m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m2 mova [dstq+16*2], m3 mova [dstq+16*3], m4 add dstq, 64 btc r3d, 16 jnc .dconly_loop lea dstq, [dstq+strideq-128] dec r3d jg .dconly_loop RET cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 0, 4, 1 mov [rsp+(8*32+64+8)*16+1*gprsize], dstq mov [rsp+(8*32+64+8)*16+2*gprsize], strideq %else DECLARE_REG_TMP 4, 7, 8 %if WIN64 mov [rsp+(8*32+64+1)*16+1*gprsize], r7 mov [rsp+64*16+0*gprsize], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 lea t2, [rsp+7*32*16] .zero_loop_inner: mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0 mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0 sub t2, 32*16 cmp t2, rsp jge .zero_loop_inner sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+128* 1+r5*8] mova m1, [cq+128*31+r5*8] mova m2, [cq+128*17+r5*8] mova m3, [cq+128*15+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 7+r5*8] mova m1, [cq+128*25+r5*8] mova m2, [cq+128*23+r5*8] mova m3, [cq+128* 9+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 5+r5*8] mova m1, [cq+128*27+r5*8] mova m2, [cq+128*21+r5*8] mova m3, [cq+128*11+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128*29+r5*8] mova m2, [cq+128*19+r5*8] mova m3, [cq+128*13+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128*14+r5*8] mova m2, [cq+128*18+r5*8] mova m3, [cq+128*30+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6+r5*8] mova m1, [cq+128*10+r5*8] mova m2, [cq+128*22+r5*8] mova m3, [cq+128*26+r5*8] call .rect2_mul_fast call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+128* 4+r5*8] mova m1, [cq+128*12+r5*8] mova m2, [cq+128*20+r5*8] mova m3, [cq+128*28+r5*8] call .rect2_mul_fast call 
m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 8+r5*8] mova m2, [cq+128*16+r5*8] mova m3, [cq+128*24+r5*8] call .rect2_mul_fast call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova [r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 11 ; pd_1 %else mova m7, [o(pd_1)] %endif call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start lea r3, [rsp+56*16] lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16] movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5] movzx t1d, t0b shr t0d, 8 call .shift_transpose ; zero cq pxor m7, m7 lea r4, [cq+30*128+r5*8] .zero_cq_loop: REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 sub r4, 4*128 cmp r4, cq jg .zero_cq_loop sub r5d, 2 jge .loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16] %if ARCH_X86_32 mov strideq, [rsp+gprsize*2+(8*32+64+8)*16] %elif WIN64 mov r8, [rsp+gprsize*0+64*16] %endif add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16 cmp eobd, 36 jl .load_veryfast cmp eobd, 136 jl .load_fast ; load normal lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)] jmp .run .load_fast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)] jmp .run .load_veryfast: lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] ; fall-through .run: %if ARCH_X86_64 lea r2, [dstq+128] mov r7, -16 %else lea r2, [rsp+(8*32+3)*16] mov dword [r2+0*gprsize], 8 %endif jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry .rect2_mul_fast: %if ARCH_X86_64 REPX {pmulld x, m14}, m0, m1, m2, m3 REPX {paddd x, m11}, m0, m1, m2, m3 %else mova m4, [o(pd_2896)] mova m5, [o(pd_2048)] REPX {pmulld x, m4 }, m0, m1, m2, m3 REPX {paddd x, m5 }, m0, m1, m2, m3 %endif REPX {psrad x, 12 }, m0, m1, m2, m3 ret .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, [r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [t2+0*16+r5*8], m0 mova [t2+8*16+r5*8], m2 mova [t2+0*16+t0*8], m3 mova [t2+0*16+t1*8], m1 sub t2, 16*32 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 32 add r5d, 128 sar r5d, 8 imul r5d, 181 add r5d, 384 sar r5d, 9 add rsp, (1+8*32+1*WIN64)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2 cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \ dst, stride, c, eob LEA r6, base test eobd, eobd jz .dconly %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0, 6 mov [rsp+gprsize*1+(64*9+8)*16], r0 mov [rsp+gprsize*2+(64*9+8)*16], r1 mov [rsp+gprsize*3+(64*9+8)*16], r2 mov [rsp+gprsize*4+(64*9+8)*16], r6 %else DECLARE_REG_TMP 8, 9, 4, 7, 0 mov [rsp+gprsize*1+(64*9+1)*16], r9 mov [rsp+gprsize*0+64*16], r0 %if WIN64 mov [rsp+gprsize*2+(64*9+1)*16], r7 mov [rsp+gprsize*3+(64*9+1)*16], r8 %endif %endif %undef cmp ; remove entirely-zero iterations mov r5d, 14 cmp eobw, word [o2(tbl_32x32_2d)+r5] jge .end_zero_loop pxor m0, m0 .zero_loop: movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr 
t1d, 8 shr t3d, 8 lea t4, [rsp+7*64*16] .zero_loop_inner: mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0 mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0 sub t4, 64*16 cmp t4, rsp jge .zero_loop_inner %if ARCH_X86_32 mov r6, [rsp+gprsize*4+(64*9+8)*16] %endif sub r5d, 2 cmp eobw, word [o2(tbl_32x32_2d)+r5] jl .zero_loop .end_zero_loop: mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd %if ARCH_X86_32 mov cq, [rsp+gprsize*3+(64*9+8)*16] %endif ; actual first pass after skipping all-zero data .loop_pass1: %if ARCH_X86_64 mova m11, [o(pd_2048)] mova m12, [o(clip_18b_min)] mova m13, [o(clip_18b_max)] mova m14, [o(pd_2896)] %endif mov r3, rsp lea r4, [o(idct64_mul_16bpc)] mova m0, [cq+128* 1+r5*8] mova m1, [cq+128*31+r5*8] mova m2, [cq+128*17+r5*8] mova m3, [cq+128*15+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 7+r5*8] mova m1, [cq+128*25+r5*8] mova m2, [cq+128*23+r5*8] mova m3, [cq+128* 9+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 5+r5*8] mova m1, [cq+128*27+r5*8] mova m2, [cq+128*21+r5*8] mova m3, [cq+128*11+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 mova m0, [cq+128* 3+r5*8] mova m1, [cq+128*29+r5*8] mova m2, [cq+128*19+r5*8] mova m3, [cq+128*13+r5*8] call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1 call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2 mova m0, [cq+128* 2+r5*8] mova m1, [cq+128*14+r5*8] mova m2, [cq+128*18+r5*8] mova m3, [cq+128*30+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast mova m0, [cq+128* 6+r5*8] mova m1, [cq+128*10+r5*8] mova m2, [cq+128*22+r5*8] mova m3, [cq+128*26+r5*8] call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast add r3, 16*(24+4*ARCH_X86_32) mova m0, [cq+128* 4+r5*8] mova m1, [cq+128*12+r5*8] mova m2, [cq+128*20+r5*8] mova m3, [cq+128*28+r5*8] call m(idct_16x4_internal_16bpc).main_oddhalf_fast mova m0, [cq+128* 0+r5*8] mova m1, [cq+128* 8+r5*8] mova m2, [cq+128*16+r5*8] mova m3, [cq+128*24+r5*8] call m(idct_8x4_internal_16bpc).main_pass1_fast call m(idct_8x4_internal_16bpc).round mova [r3-(7+4*ARCH_X86_32)*16], m1 mova [r3-(6+4*ARCH_X86_32)*16], m2 mova [r3-(5+4*ARCH_X86_32)*16], m3 mova [r3-(4+4*ARCH_X86_32)*16], m4 mova [r3-(3+4*ARCH_X86_32)*16], m5 mova [r3-(2+4*ARCH_X86_32)*16], m6 mova [r3-(1+4*ARCH_X86_32)*16], m7 sub r3, 16*(40+4*ARCH_X86_32-4) %if ARCH_X86_64 psrld m15, m11, 10 ; pd_2 %else mova m7, [o(pd_2)] %endif call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start lea r3, [rsp+56*16] movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0] movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2] movzx t0d, t1b movzx t2d, t3b shr t1d, 8 shr t3d, 8 lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16] call .shift_transpose ; zero cq pxor m7, m7 %if ARCH_X86_32 mov cq, [rsp+gprsize*3+(64*9+8)*16] %endif lea r4, [cq+30*128+r5*8] .zero_cq_loop: REPX {mova [r4+x*128], m7}, -2, -1, 0, 1 sub r4, 4*128 cmp r4, cq jg .zero_cq_loop %if ARCH_X86_32 mov r6, [rsp+gprsize*4+(64*9+8)*16] %endif sub r5d, 2 jge .loop_pass1 ; pass=2 code starts here mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16] %if ARCH_X86_32 mov strideq, [rsp+gprsize*2+(9*64+8)*16] %else mov r0, [rsp+gprsize*0+64*16] %endif add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16 cmp eobd, 151 jl .fast ; fall-through %if ARCH_X86_64 DECLARE_REG_TMP 8, 9 %else DECLARE_REG_TMP 1, 5 %endif lea t0, [o(m_suffix(idct_8x32_internal_8bpc, 
_ssse3).main_fast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)] jmp .run .fast: lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)] lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)] .run: %if ARCH_X86_64 lea r2, [dstq+128] mov r7, -16 %else lea r2, [rsp+(64*8+3)*16] mov [r2+4*gprsize], t0 mov [r2+5*gprsize], t1 mov r1, [r2+2*gprsize] mov dword [r2+0*gprsize], 8 %endif jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2 ; copy of pass=1 tmp-regs %if ARCH_X86_32 DECLARE_REG_TMP 4, 1, 2, 0, 6 %else DECLARE_REG_TMP 8, 9, 4, 7, 0 %endif .shift_transpose: mova m0, [r3+0*16] mova m1, [r3+1*16] mova m2, [r3+2*16] mova m3, [r3+3*16] mova m4, [r3+4*16] mova m5, [r3+5*16] mova m6, [r3+6*16] mova m7, [r3+7*16] REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7 packssdw m0, m1 packssdw m2, m3 packssdw m4, m5 packssdw m6, m7 call m(idct_8x4_internal_16bpc).transpose4x8packed mova [t4+t0*8], m0 mova [t4+t1*8], m1 mova [t4+t2*8], m2 mova [t4+t3*8], m3 sub t4, 16*64 sub r3, 8*16 cmp r3, rsp jg .shift_transpose ret .dconly: imul r5d, [cq], 181 mov [cq], eobd ; 0 mov r3d, 64 add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16 jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1 rav1e-0.7.1/src/x86/itx_avx2.asm000064400000000000000000006252471046102023000143510ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 16 ; Note: The order of (at least some of) those constants matter! 
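; Background on the constant tables that follow: these are the AV1 inverse
; transform kernels in 12-bit fixed point.  The DCT pairs are rounded
; cosines, e.g. 2896 ~= 4096/sqrt(2), 1567 ~= 4096*cos(3*pi/8) and
; 3784 ~= 4096*sin(3*pi/8); the 1321/2482/3344/3803 values marked "ADST-only"
; are the matching ADST4 constants.  COEF_PAIR c1, c2 emits the word pairs
; (c1, c2) and (-c2, c1), the two pmaddwd operands of a single butterfly
; rotation
;   dst1 = (src1*c1 - src2*c2 + 2048) >> 12
;   dst2 = (src1*c2 + src2*c1 + 2048) >> 12
; as spelled out in the ITX_MULSUB_2W comment further down.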
const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 %macro COEF_PAIR 2 pw_%1_%2: dw %1, %2 pw_m%2_%1: dw -%2, %1 %endmacro ; ADST-only pw_3803_1321: dw 3803, 1321 pw_m1321_2482: dw -1321, 2482 pw_2482_3344: dw 2482, 3344 pw_m3344_3344: dw -3344, 3344 pw_m3803_3344: dw -3803, 3344 pw_m3803_m6688: dw -3803, -6688 pw_2896_m2896: dw 2896, -2896 const pw_5, times 2 dw 5 const pw_2048, times 2 dw 2048 const pw_4096, times 2 dw 4096 const pw_8192, times 2 dw 8192 const pw_16384, times 2 dw 16384 const pw_1697x16, times 2 dw 1697*16 const pw_1697x8, times 2 dw 1697*8 const pw_2896x8, times 2 dw 2896*8 const pd_2048, dd 2048 const pw_2896_2896, dw 2896, 2896 const pw_m2896_2896, dw -2896, 2896 const pw_1567_3784, dw 1567, 3784 const pw_m3784_1567, dw -3784, 1567 COEF_PAIR 3784, 1567 COEF_PAIR 201, 4091 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 2440, 3290 COEF_PAIR 3035, 2751 COEF_PAIR 3513, 2106 COEF_PAIR 3857, 1380 COEF_PAIR 4052, 601 COEF_PAIR 401, 4076 COEF_PAIR 1931, 3612 COEF_PAIR 3166, 2598 COEF_PAIR 3920, 1189 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 pw_m799_m4017: dw -799, -4017 const pw_m1567_m3784, dw -1567, -3784 pw_m3406_m2276: dw -3406, -2276 pw_m401_m4076: dw -401, -4076 pw_m3166_m2598: dw -3166, -2598 pw_m1931_m3612: dw -1931, -3612 pw_m3920_m1189: dw -3920, -1189 COEF_PAIR 2276, 3406 COEF_PAIR 4017, 799 %macro COEF_X8 1-* %rep %0 dw %1*8, %1*8 %rotate 1 %endrep %endmacro pw_3703x8: COEF_X8 3703 pw_1751x8: COEF_X8 1751 pw_m1380x8: COEF_X8 -1380 pw_3857x8: COEF_X8 3857 pw_3973x8: COEF_X8 3973 pw_995x8: COEF_X8 995 pw_m2106x8: COEF_X8 -2106 pw_3513x8: COEF_X8 3513 pw_3290x8: COEF_X8 3290 pw_2440x8: COEF_X8 2440 pw_m601x8: COEF_X8 -601 pw_4052x8: COEF_X8 4052 const idct64_mul COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 pw_201_4091x8: dw 201*8, 4091*8 pw_m601_4052x8: dw -601*8, 4052*8 pw_995_3973x8: dw 995*8, 3973*8 pw_m1380_3857x8: dw -1380*8, 3857*8 pw_1751_3703x8: dw 1751*8, 3703*8 pw_m2106_3513x8: dw -2106*8, 3513*8 pw_2440_3290x8: dw 2440*8, 3290*8 pw_m2751_3035x8: dw -2751*8, 3035*8 %define o_idct64_offset idct64_mul - (o_base) - 8 SECTION .text ; Code size reduction trickery: Instead of using rip-relative loads with ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. 
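; Concretely, with the o() helper defined just below: after "lea r6, [o_base]"
; a load such as
;   vpbroadcastd m10, [o(pd_2048)]
; assembles to [r6 - (deint_shuf+128) + pd_2048], i.e. base register plus a
; small constant displacement that fits in a signed byte as long as pd_2048
; lies within +-128 bytes of deint_shuf+128, instead of a 4-byte rip-relative
; offset for every load.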
%define o_base deint_shuf + 128 %define o(x) (r6 - (o_base) + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; flags: 1 = swap, 2 = interleave, 4: coef_regs %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags %if %7 & 4 pmaddwd m%2, m%5, m%1 pmaddwd m%1, m%6 %else %if %7 & 1 vpbroadcastd m%2, [o(pw_%5_%6)] vpbroadcastd m%3, [o(pw_m%6_%5)] %else vpbroadcastd m%2, [o(pw_m%6_%5)] vpbroadcastd m%3, [o(pw_%5_%6)] %endif pmaddwd m%2, m%1 pmaddwd m%1, m%3 %endif paddd m%2, m%4 paddd m%1, m%4 %if %7 & 2 pslld m%2, 4 psrld m%1, 12 pblendw m%1, m%2, 0xaa %else psrad m%2, 12 psrad m%1, 12 packssdw m%1, m%2 %endif %endmacro ; flags: 1 = swap, 2 = interleave, 4 = coef_regs %macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags %if %10 & 1 vpbroadcastd m%3, [o(pw_%8_%9)] vpbroadcastd m%4, [o(pw_m%9_%8)] vpbroadcastd xm%2, [o(pw_%6_%7)] vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_m%7_%6)] %else vpbroadcastd m%3, [o(pw_m%9_%8)] vpbroadcastd m%4, [o(pw_%8_%9)] vpbroadcastd xm%2, [o(pw_m%7_%6)] vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_%6_%7)] %endif vpblendd m%3, m%4, 0xf0 ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) %endmacro ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 punpckhwd m%3, m%2, m%1 punpcklwd m%2, m%1 %if %7 < 32 pmaddwd m%1, m%7, m%2 pmaddwd m%4, m%7, m%3 %else vpbroadcastd m%1, [o(pw_m%7_%6)] pmaddwd m%4, m%3, m%1 pmaddwd m%1, m%2 %endif paddd m%4, m%5 paddd m%1, m%5 psrad m%4, 12 psrad m%1, 12 packssdw m%1, m%4 %if %7 < 32 pmaddwd m%3, m%6 pmaddwd m%2, m%6 %else vpbroadcastd m%4, [o(pw_%6_%7)] pmaddwd m%3, m%4 pmaddwd m%2, m%4 %endif paddd m%3, m%5 paddd m%2, m%5 psrad m%3, 12 psrad m%2, 12 %if %0 == 8 packssdw m%8, m%2, m%3 %else packssdw m%2, m%3 %endif %endmacro %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 psubsw m%3, m%1, m%2 paddsw m%2, m%1 paddsw m%1, m%4, m%5 psubsw m%4, m%5 %endmacro %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 paddsw m%9, m%2, m%6 ; t4 psubsw m%2, m%6 ; t5a paddsw m%10, m%8, m%4 ; t7 psubsw m%8, m%4 ; t6a ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 psubsw m%6, m%1, m%3 ; dct4 out2 paddsw m%3, m%1 ; dct4 out1 paddsw m%1, m%5, m%7 ; dct4 out0 psubsw m%5, m%7 ; dct4 out3 psubsw m%7, m%3, m%2 ; out6 paddsw m%2, m%3 ; out1 paddsw m%3, m%6, m%8 ; out2 psubsw m%6, m%8 ; out5 psubsw m%8, m%1, m%10 ; out7 paddsw m%1, m%10 ; out0 paddsw m%4, m%5, m%9 ; out3 psubsw m%5, m%9 ; out4 %endmacro ; in1 = %1, in3 = %2, in5 = %3, in7 = %4 ; in9 = %5, in11 = %6, in13 = %7, in15 = %8 %macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a psubsw m%9, m%2, m%6 ; t13 paddsw m%6, m%2 ; t12 psubsw m%2, m%8, m%4 ; t14 paddsw m%8, m%4 ; t15 psubsw m%4, m%7, m%3 ; t10 paddsw m%3, m%7 ; t11 psubsw m%7, m%1, m%5 ; t9 paddsw m%1, m%5 ; t8 ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a ITX_MULSUB_2W %9, %4, 
%5, %10, %11, m3784, 1567 ; t10a, t13a psubsw m%5, m%1, m%3 ; t11a paddsw m%1, m%3 ; t8a psubsw m%3, m%7, m%4 ; t13 paddsw m%7, m%4 ; t14 psubsw m%4, m%8, m%6 ; t12a paddsw m%8, m%6 ; t15a psubsw m%6, m%2, m%9 ; t10 paddsw m%2, m%9 ; t9 ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 %endmacro %macro WRAP_XMM 1+ INIT_XMM cpuname %1 INIT_YMM cpuname %endmacro %macro ITX4_END 4-5 2048 ; row[1-4], rnd %if %5 vpbroadcastd m2, [o(pw_%5)] pmulhrsw m0, m2 pmulhrsw m1, m2 %endif lea r2, [dstq+strideq*2] %assign %%i 1 %rep 4 %if %1 & 2 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) %else CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) %endif %assign %%i %%i + 1 %rotate 1 %endrep movd m2, [%%row_adr1] pinsrd m2, [%%row_adr2], 1 movd m3, [%%row_adr3] pinsrd m3, [%%row_adr4], 1 pmovzxbw m2, m2 pmovzxbw m3, m3 paddw m0, m2 paddw m1, m3 packuswb m0, m1 movd [%%row_adr1], m0 pextrd [%%row_adr2], m0, 1 pextrd [%%row_adr3], m0, 2 pextrd [%%row_adr4], m0, 3 ret %endmacro %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ; in1 in3 punpcklqdq m0, m1 ; in0 in2 psubw m2, m0, m3 paddw m0, m3 punpckhqdq m2, m2 ; t2 t2 punpcklqdq m0, m0 ; t0 t0 psubw m1, m0, m2 psraw m1, 1 psubw m1, m3 ; t1 t3 psubw m0, m1 ; ____ out0 paddw m2, m1 ; out3 ____ %endmacro INIT_XMM avx2 cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c mova m0, [cq+16*0] mova m1, [cq+16*1] pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 psraw m0, 2 psraw m1, 2 IWHT4_1D_PACKED punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED vpblendd m0, m2, 0x03 ITX4_END 3, 0, 2, 1, 0 %macro INV_TXFM_FN 3 ; type1, type2, size cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 %define %%p1 m(i%1_%3_internal_8bpc) lea r6, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
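; In other words, every entry point generated by this macro just sets up r6
; (the constant base) and tx2q (the second-pass label) and then runs the
; first-pass routine, which finishes with "jmp tx2q".  For the dct_dct
; combination there is an extra shortcut: when eob is 0, at most the DC
; coefficient is nonzero, so the test below falls through into the small
; DC-only block that each INV_TXFM_*x*_FN wrapper places right after this
; macro.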
lea tx2q, [m(i%2_%3_internal_8bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x4 %ifidn %1_%2, dct_dct vpbroadcastw m0, [cq] vpbroadcastd m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [cq], eobd ; 0 pmulhrsw m0, m1 mova m1, m0 jmp m(iadst_4x4_internal_8bpc).end2 %endif %endmacro %macro IDCT4_1D_PACKED 0 vpbroadcastd m4, [o(pd_2048)] punpckhwd m2, m1, m0 punpcklwd m1, m0 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 paddsw m0, m1, m2 ; out0 out1 psubsw m1, m2 ; out3 out2 %endmacro %macro IADST4_1D_PACKED 0 punpcklwd m2, m1, m0 punpckhwd m3, m1, m0 vpbroadcastd m5, [o(pw_m3344_3344)] vpbroadcastd m0, [o(pw_3803_1321)] vpbroadcastd m4, [o(pw_m1321_2482)] pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 psrld m5, 16 pmaddwd m0, m2 pmaddwd m2, m4 pmaddwd m5, m3 ; 3344*in0 paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 vpbroadcastd m4, [o(pw_2482_3344)] vpbroadcastd m5, [o(pw_m3803_3344)] pmaddwd m4, m3 pmaddwd m5, m3 paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 vpbroadcastd m0, [o(pw_m3803_m6688)] pmaddwd m3, m0 vpbroadcastd m0, [o(pd_2048)] paddd m2, m0 paddd m1, m0 paddd m0, m4 paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 paddd m2, m4 paddd m2, m3 REPX {psrad x, 12}, m1, m2, m0, m5 packssdw m0, m5 ; out0 out1 packssdw m1, m2 ; out2 out3 %endmacro INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] IDCT4_1D_PACKED mova m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: IDCT4_1D_PACKED pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 ITX4_END 0, 1, 3, 2 INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align cglobal_label .main IADST4_1D_PACKED ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 mova [cq+16*0], m2 mova [cq+16*1], m2 .end2: ITX4_END 3, 2, 1, 0 INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x4_internal_8bpc).end %macro WRITE_4X8 2 ; coefs[1-2] movd xm4, 
[dstq+strideq*0] pinsrd xm4, [dstq+strideq*1], 1 movd xm5, [dstq+strideq*2] pinsrd xm5, [dstq+r3 ], 1 pinsrd xm4, [r2 +strideq*0], 2 pinsrd xm4, [r2 +strideq*1], 3 pinsrd xm5, [r2 +strideq*2], 2 pinsrd xm5, [r2 +r3 ], 3 pmovzxbw m4, xm4 pmovzxbw m5, xm5 paddw m4, m%1 paddw m5, m%2 packuswb m4, m5 vextracti128 xm5, m4, 1 movd [dstq+strideq*0], xm4 pextrd [dstq+strideq*1], xm4, 1 pextrd [dstq+strideq*2], xm4, 2 pextrd [dstq+r3 ], xm4, 3 movd [r2 +strideq*0], xm5 pextrd [r2 +strideq*1], xm5, 1 pextrd [r2 +strideq*2], xm5, 2 pextrd [r2 +r3 ], xm5, 3 %endmacro %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_2048)] mov [cq], eobd pmulhrsw xm0, xm1 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 jmp m(iadst_4x8_internal_8bpc).end3 %endif %endmacro %macro IDCT8_1D_PACKED 0 vpbroadcastd m6, [o(pd_2048)] punpckhwd m5, m3, m0 ; in7 in1 punpckhwd m4, m1, m2 ; in3 in5 punpcklwd m3, m1 ; in6 in2 punpcklwd m2, m0 ; in4 in0 ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 psubsw m0, m5, m4 ; t5a t6a (interleaved) paddsw m4, m5 ; t4 t7 (interleaved) ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 vpbroadcastd m1, [o(pw_m2896_2896)] ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 %if mmsize > 16 vbroadcasti128 m1, [o(deint_shuf)] pshufb m4, m1 %else pshufb m4, [o(deint_shuf)] %endif psubsw m1, m2, m3 ; tmp3 tmp2 paddsw m3, m2 ; tmp0 tmp1 shufps m2, m4, m0, q1032 ; t7 t6 vpblendd m4, m0, 0xcc ; t4 t5 paddsw m0, m3, m2 ; out0 out1 psubsw m3, m2 ; out7 out6 psubsw m2, m1, m4 ; out4 out5 paddsw m1, m4 ; out3 out2 %endmacro %macro IADST8_1D_PACKED 1 ; pass vpbroadcastd m6, [o(pd_2048)] punpckhwd m0, m4, m3 ; 0 7 punpckhwd m1, m5, m2 ; 2 5 punpcklwd m2, m5 ; 4 3 punpcklwd m3, m4 ; 6 1 %if %1 == 1 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a psubsw m4, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a %if mmsize > 16 vbroadcasti128 m2, [o(deint_shuf)] %else mova m2, [o(deint_shuf)] %endif pshuflw m1, m1, q2301 pshufhw m1, m1, q2301 psubsw m3, m0, m1 ; t3 t2 paddsw m0, m1 ; -out7 out0 psubsw m1, m4, m5 ; t7 t6 paddsw m4, m5 ; out6 -out1 pshufb m0, m2 pshufb m4, m2 vpbroadcastd m5, [o(pw_m2896_2896)] pmaddwd m2, m5, m3 pmaddwd m5, m1 paddd m2, m6 paddd m5, m6 psrad m2, 12 psrad m5, 12 packssdw m2, m5 ; out4 -out5 vpbroadcastd m5, [o(pw_2896_2896)] pmaddwd m3, m5 pmaddwd m1, m5 paddd m3, m6 paddd m1, m6 psrad m3, 12 psrad m1, 12 packssdw m1, m3 ; out2 -out3 punpcklqdq m3, m4, m0 ; out6 -out7 punpckhqdq m0, m4 ; out0 -out1 %else ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a psubsw m4, m0, m2 ; t4 t5 paddsw m0, m2 ; t0 t1 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 shufps m2, m5, m4, q1032 punpckhwd m4, m2 punpcklwd m5, m2 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a psubsw m2, m0, m1 ; t2 t3 paddsw m0, m1 ; out0 -out7 psubsw m1, m4, m5 ; t7 t6 paddsw m4, m5 ; out6 -out1 vpbroadcastd m5, [o(pw_2896x8)] vpblendd m3, m0, 
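; Reminder for the DC-only paths: pmulhrsw computes (a*b + 0x4000) >> 15 per
; word, so this chain is plain fixed-point scaling of the lone DC value:
; pw_2896x8 (2896*8/32768 ~= 1/sqrt(2)) is, in effect, applied once per 1-D
; pass, pw_16384 halves, and pw_2048 supplies the final rounding/shift before
; the result is broadcast and added to the destination pixels.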
m4, 0x33 ; out6 -out7 vpblendd m0, m4, 0xcc ; out0 -out1 shufps m4, m2, m1, q1032 ; t3 t7 vpblendd m1, m2, 0x33 ; t2 t6 psubsw m2, m1, m4 ; t2-t3 t6-t7 paddsw m1, m4 ; t2+t3 t6+t7 pmulhrsw m2, m5 ; out4 -out5 pshufd m1, m1, q1032 pmulhrsw m1, m5 ; out2 -out3 %endif %endmacro INIT_YMM avx2 INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 IDCT4_1D_PACKED vbroadcasti128 m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 call .main vpbroadcastd m4, [o(pw_2048)] vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pshufd m1, m1, q1032 jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT8_1D_PACKED ret INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal_8bpc).main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call .main_pass2 vpbroadcastd m4, [o(pw_2048)] vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 pxor m5, m5 psubw m5, m4 .end: vpblendd m4, m5, 0xcc .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 WIN64_RESTORE_XMM pxor m2, m2 mova [cq+32*0], m2 mova [cq+32*1], m2 .end3: lea r2, [dstq+strideq*4] lea r3, [strideq*3] WRITE_4X8 0, 1 RET ALIGN function_align .main_pass1: WRAP_XMM IADST8_1D_PACKED 1 ret ALIGN function_align cglobal_label .main_pass2 WRAP_XMM IADST8_1D_PACKED 2 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal_8bpc).main punpcklwd m3, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m3 punpckhwd m1, m3 jmp tx2q .pass2: vextracti128 xm2, m0, 1 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call m(iadst_4x8_internal_8bpc).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 pxor m4, m4 psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m2, [cq+32*0], q3120 vpermq m0, [cq+32*1], q3120 vpbroadcastd m3, [o(pw_2896x8)] vpbroadcastd m4, [o(pw_1697x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 pmulhrsw m1, m3 pmulhrsw m2, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 pmulhrsw m2, m4, m0 pmulhrsw m4, m1 paddsw m0, m2 paddsw m1, m4 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] movd xm3, [o(pw_2048)] mov [cq], eobd pmulhrsw xm0, xm2 
pmulhrsw xm0, xm1 pmulhrsw xm0, xm3 vpbroadcastw m0, xm0 mova m1, m0 mova m2, m0 mova m3, m0 jmp m(iadst_4x16_internal_8bpc).end3 %endif %endmacro %macro IDCT16_1D_PACKED 0 vpbroadcastd m10, [o(pd_2048)] .main2: punpckhwd m8, m7, m0 ; dct16 in15 in1 punpcklwd m9, m4, m0 ; dct4 in2 in0 punpckhwd m0, m3, m4 ; dct16 in7 in9 punpcklwd m7, m1 ; dct8 in7 in1 punpckhwd m1, m6 ; dct16 in3 in13 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 psubsw m2, m8, m0 ; t9 t14 paddsw m8, m0 ; t8 t15 psubsw m0, m1, m5 ; t10 t13 paddsw m1, m5 ; t11 t12 vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a psubsw m4, m8, m1 ; t11a t12a paddsw m8, m1 ; t8a t15a psubsw m1, m7, m3 ; t5a t6a paddsw m7, m3 ; t4 t7 paddsw m3, m2, m0 ; t9 t14 psubsw m2, m0 ; t10 t13 %if mmsize > 16 vbroadcasti128 m0, [o(deint_shuf)] %else mova m0, [o(deint_shuf)] %endif pshufb m8, m0 pshufb m7, m0 pshufb m3, m0 ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 vpbroadcastd m0, [o(pw_m2896_2896)] ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 vpbroadcastd m5, [o(pw_2896_2896)] ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 vpbroadcastd m0, [o(pw_m2896_2896)] ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a punpckhqdq m0, m8, m3 ; t15a t14 punpcklqdq m8, m3 ; t8a t9 shufps m5, m4, m2, q1032 ; t12 t13a vpblendd m4, m2, 0xcc ; t11 t10a shufps m2, m7, m1, q1032 ; t7 t6 vpblendd m7, m1, 0xcc ; t4 t5 psubsw m1, m9, m6 ; dct4 out3 out2 paddsw m9, m6 ; dct4 out0 out1 psubsw m3, m9, m2 ; dct8 out7 out6 paddsw m9, m2 ; dct8 out0 out1 psubsw m2, m1, m7 ; dct8 out4 out5 paddsw m1, m7 ; dct8 out3 out2 psubsw m7, m9, m0 ; out15 out14 paddsw m0, m9 ; out0 out1 psubsw m6, m1, m5 ; out12 out13 paddsw m1, m5 ; out3 out2 psubsw m5, m2, m4 ; out11 out10 paddsw m2, m4 ; out4 out5 psubsw m4, m3, m8 ; out8 out9 paddsw m3, m8 ; out7 out6 %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] call m(idct_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 REPX {pmulhrsw x, m5}, m0, m4, m2, m3 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vextracti128 xm4, m0, 1 vextracti128 xm5, m1, 1 vextracti128 xm6, m2, 1 vextracti128 xm7, m3, 1 call .main vinserti128 m0, xm4, 1 vinserti128 m1, xm5, 1 vpbroadcastd m5, [o(pw_2048)] vinserti128 m2, xm6, 1 vinserti128 m3, xm7, 1 pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp m(iadst_4x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main WRAP_XMM IDCT16_1D_PACKED ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] call 
m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 REPX {pmulhrsw x, m5}, m4, m2, m3, m0 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: call .main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m5, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m1, m0, 0x33 vpblendd m0, m2, 0x33 vpblendd m2, m3, 0x33 vpblendd m3, m1, 0x33 vpermq m0, m0, q2031 vpermq m1, m2, q1302 vpermq m2, m3, q3120 vpermq m3, m4, q0213 psubw m6, m7, m5 .end: vpblendd m5, m6, 0xcc .end2: REPX {pmulhrsw x, m5}, m0, m1, m2, m3 WIN64_RESTORE_XMM pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 .end3: lea r2, [dstq+strideq*8] lea r3, [strideq*3] WRITE_4X8 0, 1 lea dstq, [dstq+strideq*4] lea r2, [r2 +strideq*4] WRITE_4X8 2, 3 RET ALIGN function_align .main: vpblendd m4, m1, m0, 0xcc vpblendd m1, m0, 0x33 vpblendd m5, m2, m3, 0xcc vpblendd m2, m3, 0x33 vperm2i128 m3, m5, m2, 0x31 vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 vperm2i128 m4, m1, m4, 0x31 vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 pshufd m3, m3, q1032 ; in15 in12 in13 in14 pshufd m2, m4, q1032 ; in11 in8 in9 in10 cglobal_label .main2 vpbroadcastd m8, [o(pd_2048)] pxor m7, m7 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 punpcklwd m0, m3 ; in0 in15 in2 in13 punpckhwd m3, m2, m1 ; in8 in7 in10 in5 punpcklwd m1, m2 ; in4 in11 in6 in9 ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 psubsw m2, m0, m3 ; t9a t8a t11a t10a paddsw m0, m3 ; t1a t0a t3a t2a psubsw m3, m1, m4 ; t13a t12a t15a t14a paddsw m1, m4 ; t5a t4a t7a t6a ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 psubw m6, m7, m5 ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 vpbroadcastd m6, [o(pw_m3784_1567)] vpbroadcastd m5, [o(pw_1567_3784)] psubsw m4, m0, m1 ; t5 t4 t7 t6 paddsw m0, m1 ; t1 t0 t3 t2 psubsw m1, m2, m3 ; t13a t12a t15a t14a paddsw m2, m3 ; t9a t8a t11a t10a psubw m3, m7, m6 ; pw_3784_m1567 vpblendd m6, m3, 0xf0 ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 vbroadcasti128 m5, [o(deint_shuf)] pshufb m0, m5 pshufb m2, m5 vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 pshufd m2, m2, q1032 ; t6a t7a t14 t15 psubsw m1, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 paddsw m3, m4, m2 ; -out3 out12 out2 -out13 psubsw m4, m2 ; t6 t7 t14a t15a shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a ret ALIGN function_align .main_pass1_end: vpbroadcastd m5, [o(pw_m2896_2896)] vpbroadcastd m6, [o(pw_2896_2896)] punpcklwd m1, m4, m2 punpckhwd m4, m2 pmaddwd m2, m5, m4 pmaddwd m4, m6 pmaddwd m5, m1 pmaddwd m1, m6 REPX {paddd x, m8}, m5, m1, m2, m4 REPX {psrad x, 12}, m5, m2, m1, m4 packssdw m2, m5 ; -out11 out8 out10 -out9 packssdw m1, m4 ; -out7 out4 out6 -out5 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, 
[cq+32*3] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m1, m0 punpckhwd m1, m0 punpcklwd m0, m3, m2 punpckhwd m3, m2 REPX {pmulhrsw x, m5}, m4, m1, m0, m3 punpckldq m2, m3, m1 punpckhdq m3, m1 punpckhdq m1, m0, m4 punpckldq m0, m4 jmp tx2q .pass2: call m(iadst_4x16_internal_8bpc).main vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m6, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m0, m2, 0x33 vpblendd m0, m1, 0xcc vpblendd m1, m3, 0xcc vpblendd m2, m3, 0x33 vpermq m0, m0, q3120 vpermq m1, m1, q0213 vpermq m2, m2, q2031 vpermq m3, m4, q1302 psubw m5, m7, m6 jmp m(iadst_4x16_internal_8bpc).end INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova m3, [cq+32*0] mova m2, [cq+32*1] mova m4, [cq+32*2] mova m5, [cq+32*3] vpbroadcastd m8, [o(pw_1697x8)] pcmpeqw m0, m0 ; -1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m5 punpckhwd m4, m5 pmulhrsw m5, m8, m1 pmulhrsw m6, m8, m2 pmulhrsw m7, m8, m3 pmulhrsw m8, m4 pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is pxor m1, m9 ; unsigned. as long as both signs are equal pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the pxor m2, m9 ; pmulhrsw result will become 0 which causes pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless pxor m3, m9 ; we explicitly deal with that case here. pcmpeqw m0, m4 pxor m4, m0 pavgw m1, m5 pavgw m2, m6 pavgw m3, m7 pavgw m4, m8 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_1697x16)] vpbroadcastd m5, [o(pw_2048)] pmulhrsw m4, m8, m0 pmulhrsw m6, m8, m1 pmulhrsw m7, m8, m2 pmulhrsw m8, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m6 paddsw m2, m7 paddsw m3, m8 jmp m(iadst_4x16_internal_8bpc).end2 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] movhps xm%3, [dstq+%5] movq xm%4, [dstq+%6] movhps xm%4, [dstq+%7] pmovzxbw m%3, xm%3 pmovzxbw m%4, xm%4 %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vextracti128 xm%4, m%3, 1 movq [dstq ], xm%3 movhps [dstq+%6], xm%3 movq [dstq+%5], xm%4 movhps [dstq+%7], xm%4 %endmacro %macro INV_TXFM_8X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x4 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] mov [cq], eobd pmulhrsw xm0, xm1 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] pmulhrsw xm0, xm3, [cq+16*0] pmulhrsw xm1, xm3, [cq+16*1] pmulhrsw xm2, xm3, [cq+16*2] pmulhrsw xm3, [cq+16*3] call m(idct_4x8_internal_8bpc).main vbroadcasti128 m4, [o(deint_shuf)] vinserti128 m3, m1, xm3, 1 vinserti128 m1, m0, xm2, 1 shufps m0, m1, m3, q0220 shufps m1, m3, q1331 pshufb m0, m4 pshufb m1, m4 jmp tx2q .pass2: IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, 
[o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m0, xm2, 1 vinserti128 m1, xm3, 1 punpckhwd m2, m0, m1 punpcklwd m0, m1 pxor m3, m3 psubsw m3, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: vpermq m0, m0, q3120 vpermq m1, m1, q3120 .end2: vpbroadcastd m2, [o(pw_2048)] pmulhrsw m0, m2 pmulhrsw m1, m2 WIN64_RESTORE_XMM .end3: pxor m2, m2 mova [cq+32*0], m2 mova [cq+32*1], m2 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 5 RET ALIGN function_align cglobal_label .main IADST4_1D_PACKED ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal_8bpc).main_pass1 vinserti128 m3, xm1, 1 vinserti128 m2, xm0, 1 punpckhwd m1, m3, m2 punpcklwd m3, m2 pxor m0, m0 psubsw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm0, [cq+16*1] vinserti128 m2, [cq+16*2], 1 vinserti128 m0, [cq+16*3], 1 vpbroadcastd m3, [o(pw_2896x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 pmulhrsw m1, m3 pmulhrsw m2, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 paddsw m0, m0 paddsw m1, m1 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd or r3d, 8 .dconly: pmulhrsw xm0, xm2 .dconly2: movd xm2, [pw_2048] pmulhrsw xm0, xm1 lea r2, [strideq*3] pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 .dconly_loop: WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 lea dstq, [dstq+strideq*4] sub r3d, 4 jg .dconly_loop RET %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m1, [cq+32*1], q3120 ; 2 3 call .main shufps m4, m0, m1, q0220 shufps m5, m0, m1, q1331 shufps m1, m2, m3, q0220 shufps m3, m2, m3, q1331 vbroadcasti128 m0, [o(deint_shuf)] vpbroadcastd m2, [o(pw_16384)] REPX {pshufb x, m0}, m4, m5, m1, m3 REPX {pmulhrsw x, m2}, m4, m5, m1, m3 vinserti128 m0, m4, xm1, 1 vperm2i128 m2, m4, m1, 0x31 vinserti128 m1, m5, xm3, 1 vperm2i128 m3, m5, m3, 0x31 jmp tx2q .pass2: call .main vpbroadcastd m4, [o(pw_2048)] vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, 
stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call .main_pass1 vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 pxor m3, m3 psubw m3, m5 ; negate odd elements during rounding pmulhrsw m4, m5 pmulhrsw m0, m3 pmulhrsw m1, m5 pmulhrsw m2, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 vperm2i128 m2, m3, m0, 0x31 vinserti128 m0, m3, xm0, 1 vperm2i128 m3, m4, m1, 0x31 vinserti128 m1, m4, xm1, 1 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call .main_pass2 vpbroadcastd m5, [o(pw_2048)] vpbroadcastd xm4, [o(pw_4096)] psubw m4, m5 ; lower half = 2048, upper half = -2048 .end: REPX {vpermq x, x, q3120}, m0, m1, m2, m3 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 .end3: pmulhrsw m2, m4 pmulhrsw m3, m4 WIN64_RESTORE_XMM .end4: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 5 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 4, 5 RET ALIGN function_align .main_pass1: IADST8_1D_PACKED 1 ret ALIGN function_align cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call m(iadst_8x8_internal_8bpc).main_pass1 vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 punpckhwd m2, m1, m0 punpcklwd m1, m0 pxor m0, m0 psubw m0, m5 pmulhrsw m4, m0 pmulhrsw m3, m5 pmulhrsw m2, m0 pmulhrsw m1, m5 punpckhwd m0, m4, m3 punpcklwd m4, m3 punpckhwd m3, m2, m1 punpcklwd m2, m1 vinserti128 m1, m0, xm3, 1 vperm2i128 m3, m0, m3, 0x31 vinserti128 m0, m4, xm2, 1 vperm2i128 m2, m4, m2, 0x31 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 ; lower half = -2048, upper half = 2048 vpermq m5, m3, q2031 vpermq m3, m0, q2031 vpermq m0, m2, q2031 vpermq m2, m1, q2031 pmulhrsw m1, m0, m4 pmulhrsw m0, m5, m4 jmp m(iadst_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*1] vinserti128 m3, [cq+16*4], 1 vinserti128 m2, [cq+16*5], 1 mova xm4, [cq+16*2] mova xm0, [cq+16*3] vinserti128 m4, [cq+16*6], 1 vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal_8bpc).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 16 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly %endif %endmacro %macro ITX_8X16_LOAD_COEFS 0 vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m0, m4, [cq+32*0] add cq, 32*4 pmulhrsw m7, m4, [cq+32*3] pmulhrsw m1, m4, [cq-32*3] pmulhrsw m6, m4, [cq+32*2] pmulhrsw m2, m4, [cq-32*2] pmulhrsw m5, m4, [cq+32*1] pmulhrsw m3, m4, [cq-32*1] pmulhrsw m4, 
[cq+32*0] %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] .pass1_end: vperm2i128 m9, m3, m7, 0x31 vinserti128 m3, xm7, 1 vperm2i128 m8, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m0, m4, 0x31 vinserti128 m0, xm4, 1 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 .pass1_end2: punpckhwd m7, m5, m6 punpcklwd m5, m6 punpcklwd m6, m8, m9 punpckhwd m8, m9 REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpckldq m6, m7, m8 punpckhdq m7, m8 jmp tx2q .pass2: call .main REPX {vpermq x, x, q3120}, m0, m2, m4, m6 REPX {vpermq x, x, q2031}, m1, m3, m5, m7 .end: vpbroadcastd m8, [o(pw_2048)] .end2: REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 .end3: pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 lea r3, [strideq*3] WRITE_8X4 0, 1, 8, 9 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 0, 1 lea dstq, [dstq+strideq*4] WRITE_8X4 4, 5, 0, 1 lea dstq, [dstq+strideq*4] WRITE_8X4 6, 7, 0, 1 RET ALIGN function_align cglobal_label .main IDCT16_1D_PACKED ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m10, [o(pw_16384)] pslld m9, m10, 17 psubw m10, m9 ; 16384, -16384 jmp m(idct_8x16_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main call .main_pass2_end vpbroadcastd m9, [o(pw_2048)] vpbroadcastd xm8, [o(pw_4096)] psubw m8, m9 REPX {vpermq x, x, q2031}, m0, m1, m2, m3 REPX {vpermq x, x, q3120}, m4, m5, m6, m7 jmp m(idct_8x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main REPX {pshufd x, x, q1032}, m7, m1, m5, m3 .main2: vpbroadcastd m10, [o(pd_2048)] punpckhwd m8, m7, m0 ; in14 in1 punpcklwd m0, m7 ; in0 in15 punpcklwd m7, m6, m1 ; in12 in3 punpckhwd m1, m6 ; in2 in13 punpckhwd m6, m5, m2 ; in10 in5 punpcklwd m2, m5 ; in4 in11 punpcklwd m5, m4, m3 ; in8 in7 punpckhwd m3, m4 ; in6 in9 ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 psubsw m4, m0, m5 ; t9a t8a paddsw m0, m5 ; t1a t0a psubsw m5, m1, m6 ; t11a t10a paddsw m1, m6 ; t3a t2a psubsw m6, m2, m7 ; t13a t12a paddsw m2, m7 ; t5a t4a psubsw m7, m3, m8 ; t15a t14a paddsw m3, m8 ; t7a t6a vpbroadcastd m11, [o(pw_m4017_799)] vpbroadcastd m12, [o(pw_799_4017)] pxor m9, m9 ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 psubw m8, m9, m11 ; pw_4017_m799 ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 vpbroadcastd m11, [o(pw_m2276_3406)] vpbroadcastd m12, [o(pw_3406_2276)] ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 psubw m8, m9, m11 ; pw_2276_m3406 ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 psubsw m8, m1, m3 ; t7 t6 paddsw m1, m3 ; t3 t2 psubsw m3, m0, m2 ; t5 t4 paddsw 
m0, m2 ; t1 t0 psubsw m2, m5, m7 ; t14a t15a paddsw m7, m5 ; t10a t11a psubsw m5, m4, m6 ; t12a t13a paddsw m4, m6 ; t8a t9a vpbroadcastd m11, [o(pw_m3784_1567)] vpbroadcastd m12, [o(pw_1567_3784)] ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a psubw m6, m9, m11 ; pw_3784_m1567 ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a vpbroadcastd m11, [o(pw_m1567_3784)] vpbroadcastd m12, [o(pw_3784_1567)] ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 psubw m6, m9, m11 ; pw_1567_m3784 ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 vbroadcasti128 m12, [o(deint_shuf)] paddsw m6, m4, m7 ; -out1 out14 psubsw m4, m7 ; t10 t11 psubsw m11, m3, m8 ; t7 t6 paddsw m8, m3 ; out12 -out3 psubsw m3, m0, m1 ; t3a t2a paddsw m0, m1 ; -out15 out0 paddsw m1, m2, m5 ; -out13 out2 psubsw m5, m2 ; t15a t14a pshufb m0, m12 pshufb m6, m12 pshufb m8, m12 pshufb m1, m12 shufps m7, m6, m0, q1032 ; out14 -out15 vpblendd m0, m6, 0x33 ; -out1 out0 punpcklqdq m6, m8, m1 ; out12 -out13 punpckhqdq m1, m8, m1 ; -out3 out2 ret ALIGN function_align .main_pass1_end: vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] pmaddwd m9, m8, m11 ; -out11 pmaddwd m2, m12, m5 ; -out5 pmaddwd m5, m8 ; out10 pmaddwd m11, m12 ; out4 REPX {paddd x, m10}, m9, m5, m2, m11 REPX {psrad x, 12 }, m9, m5, m2, m11 packssdw m5, m9 ; out10 -out11 packssdw m2, m11 ; -out5 out4 pmaddwd m11, m8, m3 ; out8 vpbroadcastd m8, [o(pw_2896_m2896)] pmaddwd m3, m12 ; -out7 pmaddwd m8, m4 ; -out9 pmaddwd m4, m12 ; out6 REPX {paddd x, m10}, m11, m3, m8, m4 REPX {psrad x, 12 }, m11, m3, m8, m4 packssdw m3, m4 ; -out7 out6 packssdw m4, m11, m8 ; out8 -out9 vpbroadcastd m10, [o(pw_16384)] pxor m9, m9 ret ALIGN function_align cglobal_label .main_pass2_end vpbroadcastd m8, [o(pw_2896x8)] pshufb m2, m11, m12 pshufb m5, m12 pshufb m3, m12 pshufb m4, m12 punpcklqdq m11, m5, m2 ; t15a t7 punpckhqdq m5, m2 ; t14a t6 shufps m2, m3, m4, q1032 ; t2a t10 vpblendd m3, m4, 0xcc ; t3a t11 psubsw m4, m2, m3 ; out8 -out9 paddsw m3, m2 ; -out7 out6 paddsw m2, m5, m11 ; -out5 out4 psubsw m5, m11 ; out10 -out11 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 ret INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end vpbroadcastd m9, [o(pw_16384)] pslld m10, m9, 17 psubw m10, m9 ; -16384, 16384 vperm2i128 m9, m4, m0, 0x31 vinserti128 m0, m4, xm0, 1 vperm2i128 m8, m5, m1, 0x31 vinserti128 m4, m5, xm1, 1 vperm2i128 m5, m7, m3, 0x31 vinserti128 m3, m7, xm3, 1 vinserti128 m1, m6, xm2, 1 vperm2i128 m6, m6, m2, 0x31 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpcklwd m0, m3, m1 punpckhwd m3, m1 jmp m(idct_8x16_internal_8bpc).pass1_end2 .pass2: call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end vpbroadcastd m8, [o(pw_2048)] vpbroadcastd xm9, [o(pw_4096)] psubw m8, m9 vpermq m9, m0, q3120 vpermq m0, m7, q2031 vpermq m7, m1, q3120 vpermq m1, m6, q2031 vpermq m6, m2, q3120 vpermq m2, m5, q2031 vpermq m5, m3, q3120 vpermq m3, m4, q2031 pmulhrsw m0, m8 pmulhrsw m1, m8 pmulhrsw m2, m8 pmulhrsw m3, m8 pmulhrsw m4, m5, m8 pmulhrsw m5, m6, m8 pmulhrsw m6, m7, m8 pmulhrsw m7, m9, m8 jmp m(idct_8x16_internal_8bpc).end3 INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] pmulhrsw 
m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 pmulhrsw m%2, m%4 %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*2] add cq, 16*8 vinserti128 m3, [cq+16*0], 1 vinserti128 m2, [cq+16*2], 1 vpbroadcastd m9, [o(pw_2896x8)] mova xm4, [cq-16*4] mova xm5, [cq-16*2] vinserti128 m4, [cq+16*4], 1 vinserti128 m5, [cq+16*6], 1 mova xm7, [cq-16*7] mova xm6, [cq-16*5] vinserti128 m7, [cq+16*1], 1 vinserti128 m6, [cq+16*3], 1 mova xm8, [cq-16*3] mova xm0, [cq-16*1] vinserti128 m8, [cq+16*5], 1 vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m5 punpckhwd m4, m5 punpcklwd m5, m7, m6 punpckhwd m7, m6 punpcklwd m6, m8, m0 punpckhwd m8, m0 REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpckldq m6, m7, m8 punpckhdq m7, m8 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_1697x16)] REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 jmp m(idct_8x16_internal_8bpc).end %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] pmovzxbw m%3, [dstq+%5] %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif pmovzxbw m%4, [dstq+%6] %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vpermq m%3, m%3, q3120 mova [dstq+%5], xm%3 vextracti128 [dstq+%6], m%3, 1 %endmacro %macro INV_TXFM_16X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x4 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd or r3d, 4 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: mova xm1, [dstq+strideq*0] vinserti128 m1, [dstq+strideq*1], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 mova [dstq+strideq*0], xm1 vextracti128 [dstq+strideq*1], m1, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET %endif %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] mova xm3, [cq+16*3] mova xm4, [cq+16*4] mova xm5, [cq+16*5] mova xm6, [cq+16*6] mova xm7, [cq+16*7] call m(idct_4x16_internal_8bpc).main vinserti128 m6, m2, xm6, 1 vinserti128 m2, m0, xm4, 1 vinserti128 m0, m1, xm5, 1 vinserti128 m1, m3, xm7, 1 punpcklwd m3, m2, m6 punpckhwd m2, m6 vpbroadcastd m6, [o(pw_16384)] punpckhwd m4, m0, m1 punpcklwd m0, m1 mova m1, m6 jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: call .main jmp m(iadst_16x4_internal_8bpc).end ALIGN function_align cglobal_label .main vpbroadcastd m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal_8bpc).main2 call m(iadst_4x16_internal_8bpc).main_pass1_end punpcklwd m4, m3, m1 punpcklwd m5, m2, m0 punpckhwd m0, m1 punpckhwd m2, m3 vpbroadcastd m1, [o(pw_16384)] vinserti128 m3, m0, xm2, 1 vperm2i128 m2, m0, m2, 0x31 vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m6, m7, m1 
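; m7 is still zero from .main2, so m6 = 0 - 16384 = -16384. .pass1_end below
; scales every row by 1/2 via pmulhrsw with 16384 or -16384; using the negated
; constant for some rows folds the required sign flips into the rounding step
; (the flipadst variant simply swaps which of the two constants is negated).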
.pass1_end: pmulhrsw m3, m1 pmulhrsw m2, m6 pmulhrsw m4, m1 pmulhrsw m0, m6 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: call .main .end: vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 WIN64_RESTORE_XMM .end2: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 .end3: WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 lea dstq, [dstq+strideq*2] WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 RET ALIGN function_align cglobal_label .main vpbroadcastd m6, [o(pw_m3344_3344)] vpbroadcastd m7, [o(pw_3803_1321)] vpbroadcastd m8, [o(pw_m1321_2482)] vpbroadcastd m9, [o(pw_2482_3344)] punpcklwd m4, m2, m0 ; in2 in0 l punpckhwd m2, m0 ; in2 in0 h psrld m5, m6, 16 pmaddwd m10, m6, m4 ; t2:02 l pmaddwd m6, m2 ; t2:02 h pmaddwd m0, m7, m4 ; t0:02 l pmaddwd m7, m2 ; t0:02 h pmaddwd m4, m8 ; t1:02 l pmaddwd m8, m2 ; t1:02 h punpckhwd m2, m3, m1 ; in3 in1 h punpcklwd m3, m1 ; in3 in1 l pmaddwd m1, m5, m2 ; t2:3 h pmaddwd m5, m3 ; t2:3 l paddd m6, m1 vpbroadcastd m1, [o(pd_2048)] paddd m10, m5 pmaddwd m5, m9, m3 pmaddwd m9, m2 paddd m0, m1 paddd m7, m1 paddd m0, m5 ; t0 + t3 + 2048 l paddd m7, m9 ; t0 + t3 + 2048 h vpbroadcastd m9, [o(pw_m3803_3344)] pmaddwd m5, m9, m2 pmaddwd m9, m3 paddd m10, m1 ; t2 + 2048 l paddd m6, m1 ; t2 + 2048 h paddd m5, m1 ; t1:13 + 2048 h paddd m1, m9 ; t1:13 + 2048 l vpbroadcastd m9, [o(pw_m3803_m6688)] pmaddwd m2, m9 pmaddwd m3, m9 paddd m5, m8 ; t1 + t3 + 2048 h paddd m1, m4 ; t1 + t3 + 2048 l paddd m8, m7 paddd m4, m0 paddd m2, m8 ; t0 + t1 - t3 + 2048 h paddd m3, m4 ; t0 + t1 - t3 + 2048 l REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 packssdw m0, m7 packssdw m1, m5 packssdw m3, m2 packssdw m2, m10, m6 ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q1230 vpermq m3, [cq+32*3], q2103 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal_8bpc).main2 call m(iadst_4x16_internal_8bpc).main_pass1_end punpckhwd m4, m3, m2 punpckhwd m5, m1, m0 punpcklwd m0, m2 punpcklwd m1, m3 vpbroadcastd m6, [o(pw_16384)] vinserti128 m3, m0, xm1, 1 vperm2i128 m2, m0, m1, 0x31 vinserti128 m0, m4, xm5, 1 vperm2i128 m4, m4, m5, 0x31 psubw m1, m7, m6 jmp m(iadst_16x4_internal_8bpc).pass1_end ALIGN function_align .pass2: call m(iadst_16x4_internal_8bpc).main vpbroadcastd m4, [o(pw_2048)] REPX {pmulhrsw x, m4}, m3, m2, m1, m0 pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 lea dstq, [dstq+strideq*2] WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 RET INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm4, [cq+16*1] vinserti128 m2, [cq+16*4], 1 vinserti128 m4, [cq+16*5], 1 mova xm0, [cq+16*2] mova xm1, [cq+16*3] vinserti128 m0, [cq+16*6], 1 vinserti128 m1, [cq+16*7], 1 vpbroadcastd m7, [o(pw_1697x16)] vpbroadcastd m8, [o(pw_16384)] punpcklwd m3, m2, m4 punpckhwd m2, m4 punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 pmulhrsw m0, m7, m1 pmulhrsw m5, m7, m2 pmulhrsw m6, m7, m3 pmulhrsw m7, m4 REPX 
{pmulhrsw x, m8}, m0, m5, m6, m7 paddsw m1, m0 paddsw m2, m5 paddsw m3, m6 paddsw m4, m7 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 jmp tx2q .pass2: vpbroadcastd m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_16x4_internal_8bpc).end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 8 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd vpbroadcastd m8, [o(pw_2896x8)] vpermq m0, [cq+32*0], q3120 add cq, 32*4 vpermq m7, [cq+32*3], q%1 vpermq m1, [cq-32*3], q%1 vpermq m6, [cq+32*2], q3120 vpermq m2, [cq-32*2], q3120 vpermq m5, [cq+32*1], q%1 vpermq m3, [cq-32*1], q%1 vpermq m4, [cq+32*0], q3120 REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 3120 call m(idct_8x16_internal_8bpc).main vpbroadcastd m10, [o(pw_16384)] punpckhwd m8, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m9, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 REPX {pmulhrsw x, m10}, m8, m1, m4, m6 .pass1_end: REPX {pmulhrsw x, m10}, m0, m2, m9, m5 punpckhwd m3, m0, m8 punpcklwd m0, m8 punpckhwd m8, m2, m1 punpcklwd m2, m1 punpcklwd m7, m9, m4 punpckhwd m9, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m8 punpckhdq m3, m8 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m8, m9, m5 punpckhdq m9, m5 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m8, 0x31 vinserti128 m2, xm8, 1 vperm2i128 m7, m3, m9, 0x31 vinserti128 m3, xm9, 1 jmp tx2q .pass2: call .main vpbroadcastd m8, [o(pw_2048)] .end: REPX {pmulhrsw x, m8}, m0, m2, m4, m6 .end2: REPX {pmulhrsw x, m8}, m1, m3, m5, m7 lea r3, [strideq*3] WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 .end3: pxor m0, m0 REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 .end4: lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 RET ALIGN function_align cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal_8bpc).main2 call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m11, m9, m10 punpcklwd m8, m0, m2 punpckhwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m9, m4, m6 punpckhwd m4, m6 punpckhwd m6, m5, m7 punpcklwd m5, m7 REPX {pmulhrsw x, m11}, m8, m1, m4, m6 jmp m(idct_16x8_internal_8bpc).pass1_end ALIGN function_align .pass2: call .main call .main_pass2_end pxor m8, m8 psubw m8, m9 REPX {pmulhrsw x, m9}, m0, m2, m4, m6 jmp m(idct_16x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m10, [o(pd_2048)] ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a 
ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a psubsw m8, m2, m6 ; t6 paddsw m2, m6 ; t2 psubsw m6, m0, m4 ; t4 paddsw m0, m4 ; t0 psubsw m4, m5, m1 ; t7 paddsw m5, m1 ; t3 psubsw m1, m7, m3 ; t5 paddsw m7, m3 ; t1 ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a psubsw m9, m6, m8 ; t7 paddsw m6, m8 ; out6 psubsw m3, m7, m5 ; t3 paddsw m7, m5 ; -out7 psubsw m5, m0, m2 ; t2 paddsw m0, m2 ; out0 psubsw m2, m1, m4 ; t6 paddsw m1, m4 ; -out1 ret ALIGN function_align .main_pass1_end: vpbroadcastd m11, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] punpckhwd m4, m3, m5 punpcklwd m3, m5 pmaddwd m5, m11, m4 pmaddwd m4, m12 pmaddwd m8, m11, m3 pmaddwd m3, m12 REPX {paddd x, m10}, m5, m4, m8, m3 REPX {psrad x, 12 }, m5, m8, m4, m3 packssdw m3, m4 ; -out3 packssdw m4, m8, m5 ; out4 punpcklwd m5, m9, m2 punpckhwd m9, m2 pmaddwd m2, m12, m5 pmaddwd m5, m11 pmaddwd m12, m9 pmaddwd m11, m9 REPX {paddd x, m10}, m2, m5, m12, m11 REPX {psrad x, 12 }, m2, m12, m5, m11 packssdw m2, m12 ; out2 packssdw m5, m11 ; -out5 ret ALIGN function_align cglobal_label .main_pass2_end vpbroadcastd m8, [o(pw_2896x8)] psubsw m4, m5, m3 paddsw m3, m5 psubsw m5, m2, m9 paddsw m2, m9 pmulhrsw m2, m8 ; out2 pmulhrsw m3, m8 ; -out3 pmulhrsw m4, m8 ; out4 pmulhrsw m5, m8 ; -out5 vpbroadcastd m9, [o(pw_2048)] ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal_8bpc).main2 call m(iadst_8x16_internal_8bpc).main_pass1_end psubw m9, m10 punpcklwd m8, m6, m4 punpckhwd m6, m4 punpcklwd m4, m7, m5 punpckhwd m7, m5 punpckhwd m5, m3, m1 punpcklwd m3, m1 punpckhwd m1, m2, m0 punpcklwd m2, m0 REPX {pmulhrsw x, m10}, m8, m4, m5, m1 REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 punpcklwd m0, m7, m4 punpckhwd m7, m4 punpckhwd m4, m6, m8 punpcklwd m6, m8 punpckhwd m8, m3, m5 punpcklwd m3, m5 punpcklwd m5, m2, m1 punpckhwd m2, m1 punpckhdq m1, m0, m6 punpckldq m0, m6 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckhdq m4, m3, m5 punpckldq m3, m5 punpckldq m5, m8, m2 punpckhdq m8, m2 vinserti128 m2, m6, xm5, 1 vperm2i128 m6, m5, 0x31 vperm2i128 m5, m1, m4, 0x31 vinserti128 m1, xm4, 1 vperm2i128 m4, m0, m3, 0x31 vinserti128 m0, xm3, 1 vinserti128 m3, m7, xm8, 1 vperm2i128 m7, m8, 0x31 jmp tx2q .pass2: call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end pxor m8, m8 psubw m8, m9 pmulhrsw m10, m7, m8 pmulhrsw m7, m0, m9 pmulhrsw m0, m6, m9 pmulhrsw m6, m1, m8 pmulhrsw m1, m5, m8 pmulhrsw m5, m2, m9 pmulhrsw m2, m4, m9 pmulhrsw m4, m3, m8 lea r3, [strideq*3] WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 WRITE_16X2 1, 2, 0, 1, strideq*2, r3 jmp m(idct_16x8_internal_8bpc).end3 INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 mova xm7, [cq+16*0] mova xm2, [cq+16*1] add cq, 16*8 vpbroadcastd m3, [o(pw_2896x8)] vinserti128 m7, [cq+16*0], 1 vinserti128 m2, [cq+16*1], 1 mova xm6, [cq-16*6] mova xm4, [cq-16*5] vinserti128 m6, [cq+16*2], 1 vinserti128 m4, [cq+16*3], 1 mova xm8, [cq-16*4] mova xm5, [cq-16*3] vinserti128 m8, [cq+16*4], 1 vinserti128 m5, [cq+16*5], 1 mova xm0, [cq-16*2] mova xm1, [cq-16*1] vinserti128 m0, [cq+16*6], 1 vinserti128 m1, [cq+16*7], 1 vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd 
m11, [o(pw_16384)] REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 punpcklwd m3, m7, m2 punpckhwd m7, m2 punpcklwd m2, m6, m4 punpckhwd m6, m4 punpcklwd m4, m8, m5 punpckhwd m8, m5 punpcklwd m5, m0, m1 punpckhwd m0, m1 punpckldq m1, m3, m2 punpckhdq m3, m2 punpckldq m2, m4, m5 punpckhdq m4, m5 punpckldq m5, m7, m6 punpckhdq m7, m6 punpckldq m6, m8, m0 punpckhdq m8, m0 REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m6 punpckhqdq m5, m6 punpcklqdq m6, m7, m8 punpckhqdq m7, m8 jmp tx2q .pass2: vpbroadcastd m8, [o(pw_4096)] jmp m(idct_16x8_internal_8bpc).end %define o_base pw_5 + 128 %macro INV_TXFM_16X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 16 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly %endif %endmacro %macro ITX_16X16_LOAD_COEFS 0 mova m0, [cq+32*0] mova m1, [cq+32*1] mova m2, [cq+32*2] mova m3, [cq+32*3] add cq, 32*8 mova m4, [cq-32*4] mova m5, [cq-32*3] mova m6, [cq-32*2] mova m7, [cq-32*1] mova m8, [cq+32*0] mova m9, [cq+32*1] mova m10, [cq+32*2] mova m11, [cq+32*3] mova m12, [cq+32*4] mova m13, [cq+32*5] mova m14, [cq+32*6] mova m15, [cq+32*7] mova [rsp], m15 %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main .pass1_end: vpbroadcastd m1, [o(pw_8192)] REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 vextracti128 [rsp+16*5], m8, 1 mova [rsp+16*1], xm8 .pass1_end2: vextracti128 [rsp+16*4], m0, 1 mova [rsp+16*0], xm0 REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 pmulhrsw m1, [rsp+32*1] vperm2i128 m8, m1, m9, 0x31 vinserti128 m1, xm9, 1 vperm2i128 m9, m2, m10, 0x31 vinserti128 m2, xm10, 1 vperm2i128 m10, m3, m11, 0x31 vinserti128 m3, xm11, 1 vperm2i128 m11, m4, m12, 0x31 vinserti128 m4, xm12, 1 vperm2i128 m12, m5, m13, 0x31 vinserti128 m5, xm13, 1 vperm2i128 m13, m6, m14, 0x31 vinserti128 m6, xm14, 1 vperm2i128 m14, m7, m15, 0x31 vinserti128 m7, xm15, 1 mova m15, [rsp+32*2] .pass1_end3: punpcklwd m0, m9, m10 punpckhwd m9, m10 punpcklwd m10, m15, m8 punpckhwd m15, m8 punpckhwd m8, m11, m12 punpcklwd m11, m12 punpckhwd m12, m13, m14 punpcklwd m13, m14 punpckhdq m14, m11, m13 punpckldq m11, m13 punpckldq m13, m15, m9 punpckhdq m15, m9 punpckldq m9, m10, m0 punpckhdq m10, m0 punpckhdq m0, m8, m12 punpckldq m8, m12 punpcklqdq m12, m13, m8 punpckhqdq m13, m8 punpcklqdq m8, m9, m11 punpckhqdq m9, m11 punpckhqdq m11, m10, m14 punpcklqdq m10, m14 punpcklqdq m14, m15, m0 punpckhqdq m15, m0 mova m0, [rsp] mova [rsp], m15 punpckhwd m15, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m15, m1 punpckhdq m15, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m15 punpcklqdq m6, m15 jmp tx2q .pass2: call .main .end: vpbroadcastd m1, [o(pw_2048)] REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 mova [rsp], m6 .end2: REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 pmulhrsw m1, [rsp+32*1] lea r3, [strideq*3] WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 WRITE_16X2 2, 
3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 .end3: pxor m2, m2 REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 lea dstq, [dstq+strideq*4] WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 WRITE_16X2 10, 11, 0, 1, strideq*2, r3 REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 lea dstq, [dstq+strideq*4] WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 WRITE_16X2 14, 15, 0, 1, strideq*2, r3 RET ALIGN function_align cglobal_label .main vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m1 mova [rsp+gprsize+32*2], m9 IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 mova m1, [rsp+gprsize+32*2] ; in9 mova [rsp+gprsize+32*2], m14 ; tmp7 mova m9, [rsp+gprsize+32*1] ; in1 mova [rsp+gprsize+32*1], m10 ; tmp5 mova m14, [rsp+gprsize+32*0] ; in15 mova [rsp+gprsize+32*0], m6 ; tmp3 IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 mova m6, [rsp+gprsize+32*1] ; tmp5 psubsw m15, m0, m14 ; out15 paddsw m0, m14 ; out0 psubsw m14, m2, m13 ; out14 paddsw m2, m13 ; out1 mova [rsp+gprsize+32*1], m2 psubsw m13, m4, m11 ; out13 paddsw m2, m4, m11 ; out2 psubsw m11, m8, m7 ; out11 paddsw m4, m8, m7 ; out4 mova m7, [rsp+gprsize+32*2] ; tmp7 psubsw m10, m6, m5 ; out10 paddsw m5, m6 ; out5 psubsw m8, m7, m9 ; out8 paddsw m7, m9 ; out7 psubsw m9, m12, m3 ; out9 paddsw m6, m12, m3 ; out6 mova m3, [rsp+gprsize+32*0] ; tmp3 psubsw m12, m3, m1 ; out12 paddsw m3, m1 ; out3 ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main call .main_pass1_end pmulhrsw m0, m1, [cq+32*0] pmulhrsw m2, m1, [cq+32*1] REPX {pmulhrsw x, m1}, m4, m6, m8, m10 pmulhrsw m12, m1, [cq+32*2] pmulhrsw m14, m1, [cq+32*3] vextracti128 [rsp+16*5], m8, 1 mova [rsp+16*1], xm8 pxor m8, m8 psubw m1, m8, m1 jmp m(idct_16x16_internal_8bpc).pass1_end2 ALIGN function_align .pass2: call .main call .main_pass2_end REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 mova [rsp+32*0], m6 pxor m6, m6 psubw m1, m6, m1 jmp m(idct_16x16_internal_8bpc).end2 ALIGN function_align cglobal_label .main vpbroadcastd m15, [o(pd_2048)] mova [rsp+gprsize+32*1], m0 mova [rsp+gprsize+32*2], m4 ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 psubsw m0, m2, m10 ; t10a paddsw m2, m10 ; t2a psubsw m10, m13, m5 ; t11a paddsw m13, m5 ; t3a psubsw m5, m6, m14 ; t14a paddsw m6, m14 ; t6a psubsw m14, m9, m1 ; t15a paddsw m9, m1 ; t7a ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 psubsw m1, m10, m14 ; t14a paddsw m10, m14 ; t10a psubsw m14, m0, m5 ; t15a paddsw m0, m5 ; t11a psubsw m5, m2, m6 ; t6 paddsw m2, m6 ; t2 psubsw m6, m13, m9 ; t7 paddsw m13, m9 ; t3 ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 mova m9, [rsp+gprsize+32*0] ; in15 mova [rsp+gprsize+32*0], m10 ; t10a mova m4, [rsp+gprsize+32*1] ; in0 mova [rsp+gprsize+32*1], m6 ; t6a mova m6, [rsp+gprsize+32*2] ; in4 mova [rsp+gprsize+32*2], m2 ; t2 ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 psubsw m10, m4, m8 ; t8a paddsw m8, m4 ; t0a psubsw m4, m9, m7 ; 
t9a paddsw m9, m7 ; t1a psubsw m7, m6, m12 ; t12a paddsw m6, m12 ; t4a psubsw m12, m11, m3 ; t13a paddsw m11, m3 ; t5a ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 psubsw m3, m9, m11 ; t5 paddsw m9, m11 ; t1 psubsw m11, m4, m12 ; t12a paddsw m4, m12 ; t8a paddsw m12, m8, m6 ; t0 psubsw m8, m6 ; t4 paddsw m6, m10, m7 ; t9a psubsw m10, m7 ; t13a ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 mova m7, [rsp+gprsize+32*0] ; t10a mova m2, [rsp+gprsize+32*1] ; t6a paddsw m15, m9, m13 ; -out15 psubsw m9, m13 ; t3a paddsw m13, m11, m1 ; -out13 psubsw m11, m1 ; t15a psubsw m1, m4, m7 ; t10 paddsw m7, m4 ; -out1 psubsw m4, m3, m2 ; t6 paddsw m3, m2 ; -out3 paddsw m2, m10, m14 ; out2 psubsw m10, m14 ; t14a paddsw m14, m6, m0 ; out14 psubsw m6, m0 ; t11 mova m0, [rsp+gprsize+32*2] ; t2 mova [rsp+gprsize+32*1], m7 psubsw m7, m12, m0 ; t2a paddsw m0, m12 ; out0 paddsw m12, m8, m5 ; out12 psubsw m8, m5 ; t7 ret ALIGN function_align .main_pass1_end: mova [cq+32*0], m0 mova [cq+32*1], m2 mova [cq+32*2], m12 mova [cq+32*3], m14 vpbroadcastd m14, [pw_m2896_2896] vpbroadcastd m12, [pw_2896_2896] vpbroadcastd m2, [pd_2048] punpcklwd m5, m11, m10 punpckhwd m11, m10 pmaddwd m10, m14, m5 pmaddwd m0, m14, m11 pmaddwd m5, m12 pmaddwd m11, m12 REPX {paddd x, m2}, m10, m0, m5, m11 REPX {psrad x, 12}, m10, m0, m5, m11 packssdw m10, m0 ; out10 packssdw m5, m11 ; -out5 punpcklwd m11, m8, m4 punpckhwd m8, m4 pmaddwd m4, m12, m11 pmaddwd m0, m12, m8 pmaddwd m11, m14 pmaddwd m8, m14 REPX {paddd x, m2}, m4, m0, m11, m8 REPX {psrad x, 12}, m4, m0, m11, m8 packssdw m4, m0 ; out4 packssdw m11, m8 ; -out11 punpcklwd m8, m9, m7 punpckhwd m9, m7 pmaddwd m7, m12, m8 pmaddwd m0, m12, m9 pmaddwd m8, m14 pmaddwd m9, m14 REPX {paddd x, m2}, m7, m0, m8, m9 REPX {psrad x, 12}, m7, m0, m8, m9 packssdw m7, m0 ; -out7 packssdw m8, m9 ; out8 punpckhwd m0, m6, m1 punpcklwd m6, m1 pmaddwd m1, m14, m0 pmaddwd m9, m14, m6 pmaddwd m0, m12 pmaddwd m6, m12 REPX {paddd x, m2}, m1, m9, m0, m6 REPX {psrad x, 12}, m1, m9, m0, m6 packssdw m9, m1 ; -out7 packssdw m6, m0 ; out8 vpbroadcastd m1, [o(pw_8192)] ret ALIGN function_align cglobal_label .main_pass2_end ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to ; 16-bit here will produce the same result as using 32-bit intermediates. 
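; out4-out11 below are therefore formed with saturating 16-bit adds/subs and a
; single pw_2896x8 pmulhrsw, i.e. (x*2896 + 2048) >> 12, whereas .main_pass1_end
; keeps 32-bit intermediates (pmaddwd, paddd pd_2048, psrad 12) because its
; results still have to go through a second transform pass.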
paddsw m5, m10, m11 ; -out5 psubsw m10, m11 ; out10 psubsw m11, m4, m8 ; -out11 paddsw m4, m8 ; out4 psubsw m8, m7, m9 ; out8 paddsw m7, m9 ; -out7 psubsw m9, m1, m6 ; -out9 paddsw m6, m1 ; out6 vpbroadcastd m1, [o(pw_2896x8)] REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 vpbroadcastd m1, [o(pw_2048)] ret INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass1_end pmulhrsw m6, m1 pmulhrsw m2, m1, m8 mova [rsp+32*2], m6 pmulhrsw m6, m1, m4 pmulhrsw m4, m1, m10 pmulhrsw m8, m1, [cq+32*3] pmulhrsw m10, m1, [cq+32*2] pmulhrsw m12, m1, [cq+32*1] pmulhrsw m14, m1, [cq+32*0] pxor m0, m0 psubw m0, m1 REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 pmulhrsw m1, m0, m9 pmulhrsw m9, m0, m13 pmulhrsw m0, [rsp+32*1] mova [rsp+16*0], xm15 mova [rsp+16*1], xm7 vperm2i128 m15, m15, m7, 0x31 vinserti128 m7, m2, xm14, 1 vperm2i128 m14, m2, m14, 0x31 vinserti128 m2, m9, xm5, 1 vperm2i128 m9, m9, m5, 0x31 vinserti128 m5, m4, xm12, 1 vperm2i128 m12, m4, m12, 0x31 vinserti128 m4, m11, xm3, 1 vperm2i128 m11, m11, m3, 0x31 vinserti128 m3, m10, xm6, 1 vperm2i128 m10, m10, m6, 0x31 vinserti128 m6, m1, xm0, 1 vperm2i128 m13, m1, m0, 0x31 vinserti128 m1, m8, [rsp+32*2], 1 vperm2i128 m8, m8, [rsp+32*2], 0x31 jmp m(idct_16x16_internal_8bpc).pass1_end3 .pass2: call m(iadst_16x16_internal_8bpc).main call m(iadst_16x16_internal_8bpc).main_pass2_end pmulhrsw m0, m1 pmulhrsw m8, m1 mova [rsp+32*0], m0 mova [rsp+32*2], m8 pxor m0, m0 psubw m0, m1 pmulhrsw m8, m0, m7 pmulhrsw m7, m0, m9 pmulhrsw m9, m1, m6 pmulhrsw m6, m1, m10 pmulhrsw m10, m0, m5 pmulhrsw m5, m0, m11 pmulhrsw m11, m1, m4 pmulhrsw m4, m1, m12 pmulhrsw m12, m0, m3 pmulhrsw m3, m0, m13 pmulhrsw m13, m1, m2 pmulhrsw m1, m14 pmulhrsw m14, m0, [rsp+32*1] pmulhrsw m0, m15 lea r3, [strideq*3] WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 mova m15, [rsp+32*0] WRITE_16X2 3, 4, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 jmp m(idct_16x16_internal_8bpc).end3 %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 psraw m%2, 1 pavgw m%1, m%2 ; signs are guaranteed to be equal %endmacro INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 vpbroadcastd m7, [o(pw_1697x16)] mova xm0, [cq+16* 0] vinserti128 m0, [cq+16*16], 1 mova xm15, [cq+16* 1] vinserti128 m15, [cq+16*17], 1 mova xm1, [cq+16* 2] vinserti128 m1, [cq+16*18], 1 mova xm8, [cq+16* 3] vinserti128 m8, [cq+16*19], 1 mova xm2, [cq+16* 4] vinserti128 m2, [cq+16*20], 1 mova xm9, [cq+16* 5] vinserti128 m9, [cq+16*21], 1 mova xm3, [cq+16* 6] vinserti128 m3, [cq+16*22], 1 mova xm10, [cq+16* 7] add cq, 16*16 vinserti128 m10, [cq+16* 7], 1 mova xm4, [cq-16* 8] vinserti128 m4, [cq+16* 8], 1 mova xm11, [cq-16* 7] vinserti128 m11, [cq+16* 9], 1 mova xm5, [cq-16* 6] vinserti128 m5, [cq+16*10], 1 mova xm12, [cq-16* 5] vinserti128 m12, [cq+16*11], 1 mova xm13, [cq-16* 3] vinserti128 m13, [cq+16*13], 1 mova xm14, [cq-16* 1] vinserti128 m14, [cq+16*15], 1 REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ 10, 4, 11, 5, 12, 13, 14 mova xm6, [cq-16* 4] vinserti128 m6, [cq+16*12], 1 mova [rsp], m0 IDTX16B 6, 0, 7 mova xm0, [cq-16* 2] vinserti128 m0, [cq+16*14], 1 pmulhrsw m7, m0 psraw m7, 1 pavgw m7, m0 jmp 
m(idct_16x16_internal_8bpc).pass1_end3 ALIGN function_align .pass2: vpbroadcastd m15, [o(pw_1697x16)] mova [rsp+32*1], m0 REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14 mova m0, [rsp+32*1] mova [rsp+32*1], m1 IDTX16 0, 1, 15 mova m1, [rsp+32*0] pmulhrsw m15, m1 paddsw m1, m1 paddsw m15, m1 jmp m(idct_16x16_internal_8bpc).end %define o_base deint_shuf + 128 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 %if %3 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [%1+%2*0] pmulhrsw m1, m15, [%1+%2*1] pmulhrsw m2, m15, [%1+%2*2] pmulhrsw m3, m15, [%1+%2*3] pmulhrsw m4, m15, [%1+%2*4] pmulhrsw m5, m15, [%1+%2*5] pmulhrsw m6, m15, [%1+%2*6] pmulhrsw m7, m15, [%1+%2*7] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] mova m7, [%1+%2*7] %endif %endmacro %macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 %if %3 %if %3 == 1 vpbroadcastd m15, [o(pw_2896x8)] %endif pmulhrsw m8, m15, [%1+%2*0] pmulhrsw m9, m15, [%1+%2*1] pmulhrsw m10, m15, [%1+%2*2] pmulhrsw m11, m15, [%1+%2*3] pmulhrsw m12, m15, [%1+%2*4] pmulhrsw m13, m15, [%1+%2*5] pmulhrsw m14, m15, [%1+%2*6] pmulhrsw m15, [%1+%2*7] %else mova m8, [%1+%2*0] mova m9, [%1+%2*1] mova m10, [%1+%2*2] mova m11, [%1+%2*3] mova m12, [%1+%2*4] mova m13, [%1+%2*5] mova m14, [%1+%2*6] mova m15, [%1+%2*7] %endif %endmacro %macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] punpcklwd m%1, m%2, m%2 pmulhrsw m%1, m%3 vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] punpckhwd m%2, m%2 pmulhrsw m%2, m%3 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob %undef cmp cmp eobd, 106 jle .fast LOAD_8ROWS cq+32*1, 32*2 call m(idct_16x8_internal_8bpc).main vperm2i128 m11, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 vinserti128 m3, xm7, 1 pxor m7, m7 REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpcklwd m3, m11, m4 punpckhwd m11, m4 punpckhwd m4, m5, m6 punpcklwd m5, m6 punpckhdq m6, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m5 punpckhdq m3, m5 punpckhdq m5, m11, m4 punpckldq m11, m4 punpckldq m4, m7, m1 punpckhdq m7, m1 punpckhqdq m12, m6, m0 punpcklqdq m0, m6 ; out4 punpckhqdq m13, m7, m4 punpcklqdq m4, m7 ; out5 punpckhqdq m14, m3, m2 punpcklqdq m2, m3 ; out6 punpckhqdq m15, m5, m11 punpcklqdq m11, m5 ; out7 mova [rsp+32*0], m0 mova [rsp+32*1], m4 mova [rsp+32*2], m2 .fast: LOAD_8ROWS cq+32*0, 32*2 call m(idct_16x8_internal_8bpc).main vperm2i128 m8, m0, m4, 0x31 vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 vinserti128 m3, xm7, 1 vpbroadcastd m9, [o(pw_8192)] pxor m7, m7 REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 punpckhwd m7, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhwd m3, m8, m4 punpcklwd m8, m4 punpckhwd m4, m5, m6 punpcklwd m5, m6 punpckhdq m6, m0, m2 punpckldq m0, m2 punpckldq m2, m8, m5 punpckhdq m8, m5 punpckhdq m5, m3, m4 punpckldq m3, m4 punpckhdq m4, m7, m1 punpckldq m7, m1 punpcklqdq m1, m7, m4 punpckhqdq m7, m4 ; out9 punpckhqdq m4, m2, m8 ; out10 punpcklqdq m2, m8 punpckhqdq m8, m3, m5 punpcklqdq m3, m5 punpckhqdq m5, m0, m6 ; out8 
punpcklqdq m0, m6 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 cmp eobd, 106 jg .full mova [rsp+32*0], m5 mova [rsp+32*1], m7 mova [rsp+32*2], m4 pmulhrsw m11, m9, m8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call .main_fast jmp .pass2 .dconly: movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly .full: REPX {pmulhrsw x, m9}, m12, m13, m14, m15 pmulhrsw m6, m9, [rsp+32*2] mova [rsp+32*2], m4 pmulhrsw m4, m9, [rsp+32*0] mova [rsp+32*0], m5 pmulhrsw m5, m9, [rsp+32*1] mova [rsp+32*1], m7 pmulhrsw m7, m9, m11 pmulhrsw m11, m9, m8 call .main .pass2: vpbroadcastd m12, [o(pw_2048)] REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ m8, m9, m10, m11, m13, m14, m15 pmulhrsw m12, [rsp] REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 mova [rsp+32*0], m4 mova [rsp+32*1], m6 lea r3, [strideq*3] WRITE_8X4 0, 1, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 [rsp+32*0], 5, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 [rsp+32*1], 7, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 8, 9, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 10, 11, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 12, 13, 4, 6 lea dstq, [dstq+strideq*4] WRITE_8X4 14, 15, 4, 6 RET ALIGN function_align cglobal_label .main_fast ; bottom half is zero call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 lea r5, [r6-(o_base)+pw_201_4091x8] ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main2 ALIGN function_align cglobal_label .main call m(idct_8x16_internal_8bpc).main mova m8, [rsp+gprsize+0*32] mova [rsp+gprsize+0*32], m0 mova m9, [rsp+gprsize+1*32] mova [rsp+gprsize+1*32], m1 mova m0, [rsp+gprsize+2*32] mova [rsp+gprsize+2*32], m6 punpcklwd m1, m15, m8 ; in31 in1 punpckhwd m8, m15 ; in3 in29 punpcklwd m15, m14, m9 ; in27 in5 punpckhwd m9, m14 ; in7 in25 punpcklwd m14, m13, m0 ; in23 in9 punpckhwd m0, m13 ; in11 in21 punpcklwd m13, m12, m11 ; in19 in13 punpckhwd m11, m12 ; in15 in17 ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a .main2: psubsw m6, m1, m11 ; t17 t30 paddsw m1, m11 ; t16 t31 psubsw m11, m9, m14 ; t18 t29 paddsw m9, m14 ; t19 t28 psubsw m14, m15, m0 ; t21 t26 paddsw m15, m0 ; t20 t27 psubsw m0, m8, m13 ; t22 t25 paddsw m8, m13 ; t23 t24 ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a psubsw m13, m1, m9 ; t19a t28a paddsw m1, m9 ; t16a t31a psubsw m9, m8, m15 ; t20a t27a paddsw m8, m15 ; t23a t24a psubsw m15, m6, m11 ; t18 t29 paddsw m6, m11 ; t17 
t30 psubsw m11, m0, m14 ; t21 t26 paddsw m0, m14 ; t22 t25 ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a vbroadcasti128 m12, [o(deint_shuf)] psubsw m14, m1, m8 ; t23 t24 paddsw m1, m8 ; t16 t31 psubsw m8, m6, m0 ; t22a t25a paddsw m6, m0 ; t17a t30a psubsw m0, m15, m11 ; t21 t26 paddsw m15, m11 ; t18 t29 psubsw m11, m13, m9 ; t20a t27a paddsw m13, m9 ; t19a t28a REPX {pshufb x, m12}, m1, m6, m15, m13 ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a vpbroadcastd m9, [o(pw_m2896_2896)] ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 vpbroadcastd m12, [o(pw_2896_2896)] ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a vpbroadcastd m12, [o(pw_2896_2896)] ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 shufps m9, m14, m8, q1032 ; t23a t22 vpblendd m14, m8, 0xcc ; t24a t25 shufps m8, m11, m0, q1032 ; t20 t21a vpblendd m11, m0, 0xcc ; t27 t26a punpcklqdq m0, m1, m6 ; t16 t17a punpckhqdq m1, m6 ; t31 t30a psubsw m10, m5, m8 ; out20 out21 paddsw m5, m8 ; out11 out10 psubsw m6, m3, m14 ; out24 out25 paddsw m3, m14 ; out7 out6 psubsw m8, m7, m0 ; out16 out17 paddsw m7, m0 ; out15 out14 mova m0, [rsp+gprsize+0*32] punpcklqdq m12, m13, m15 ; t19a t18 punpckhqdq m13, m15 ; t28a t29 psubsw m15, m0, m1 ; out31 out30 paddsw m0, m1 ; out0 out1 mova m1, [rsp+gprsize+1*32] mova [rsp+gprsize+0*32], m6 mova m6, [rsp+gprsize+2*32] psubsw m14, m1, m13 ; out28 out29 paddsw m1, m13 ; out3 out2 psubsw m13, m2, m11 ; out27 out26 paddsw m2, m11 ; out4 out5 psubsw m11, m4, m9 ; out23 out22 paddsw m4, m9 ; out8 out9 psubsw m9, m6, m12 ; out19 out18 paddsw m6, m12 ; out12 out13 ret %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] vbroadcasti128 m%1, [cq+16*%3] vbroadcasti128 m%2, [cq+16*%4] shufpd m%1, m%2, 0x0c %endmacro cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 8 .dconly: pmulhrsw xm0, xm2 movd xm2, [pw_2048] ; intentionally rip-relative pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m3, m3 .dconly_loop: mova m1, [dstq] punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 paddw m1, m0 packuswb m1, m2 mova [dstq], m1 add dstq, strideq dec r3d jg .dconly_loop RET .normal: PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob %undef cmp LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 add cq, 16*16 LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 mova [rsp+32*0], m4 mova [rsp+32*1], m5 mova [rsp+32*2], m6 cmp eobd, 106 jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast jmp .pass2 .full: LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 add cq, 16*8 LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 call 
m(inv_txfm_add_dct_dct_8x32_8bpc).main .pass2: vpbroadcastd m12, [o(pw_8192)] REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 mova [rsp+32*1], m9 mova [rsp+32*2], m10 punpckhwd m9, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m3 punpcklwd m1, m3 punpcklwd m10, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 punpckhwd m3, m0, m9 punpcklwd m0, m9 punpckhwd m9, m2, m1 punpcklwd m2, m1 punpcklwd m7, m10, m4 punpckhwd m10, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m9 punpckhdq m3, m9 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m9, m10, m5 punpckhdq m10, m5 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 pmulhrsw m12, [rsp+32*0] mova [rsp+32*0], m8 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m10, 0x31 vinserti128 m3, xm10, 1 call m(idct_16x8_internal_8bpc).main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r2 lea r3, [dstq+strideq*4] %define dstq r3 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r2 mova m0, [rsp+32*0] mova m1, [rsp+32*1] mova m2, [rsp+32*2] punpckhwd m7, m0, m2 punpcklwd m0, m2 punpckhwd m2, m1, m11 punpcklwd m1, m11 punpckhwd m4, m12, m14 punpcklwd m12, m14 punpckhwd m5, m13, m15 punpcklwd m13, m15 punpckhwd m3, m0, m7 punpcklwd m0, m7 punpckhwd m9, m2, m1 punpcklwd m2, m1 punpcklwd m7, m12, m4 punpckhwd m12, m4 punpcklwd m4, m5, m13 punpckhwd m5, m13 punpckhdq m1, m0, m2 punpckldq m0, m2 punpckldq m2, m3, m9 punpckhdq m3, m9 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m9, m12, m5 punpckhdq m12, m5 vperm2i128 m4, m0, m6, 0x31 vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m12, 0x31 vinserti128 m3, xm12, 1 call m(idct_16x8_internal_8bpc).main2 vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 add r0, 16 add r3, 16 %define dstq r0 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r2 %define dstq r3 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r2 RET cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob vpbroadcastd m9, [pw_5] lea r4, [strideq*3] sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) .loop: mova xm0,[cq+16* 0] mova xm1, [cq+16* 4] vinserti128 m0, [cq+16* 1], 1 vinserti128 m1, [cq+16* 5], 1 pxor m8, m8 mova [cq+32*0], m8 mova [cq+32*2], m8 add cq, 16*16 mova xm2, [cq-16* 8] mova xm3, [cq-16* 4] vinserti128 m2, [cq-16* 7], 1 vinserti128 m3, [cq-16* 3], 1 mova xm4, [cq+16* 0] mova xm5, [cq+16* 4] vinserti128 m4, [cq+16* 1], 1 vinserti128 m5, [cq+16* 5], 1 mova xm6, [cq+16* 8] mova xm7, [cq+16*12] vinserti128 m6, [cq+16* 9], 1 vinserti128 m7, [cq+16*13], 1 REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose8x8 REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 add dstq, strideq sub cq, 16*16-32 lea dstq, [dstq+r4*4] add eobd, 0x80000000 jnc .loop RET ALIGN function_align 
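; descriptive note: in-register 8x8 word transpose of m0-m7 (per 128-bit lane, three punpck stages), m8 used as scratch; reused by the larger identity and dct paths below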
.transpose8x8: punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m8, m1 punpckhdq m8, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 ret cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob add cq, 16*8 vpbroadcastd m9, [pw_4096] lea r4, [strideq*3] lea r5, [dstq+strideq*4] sub eobd, 107 .loop: mova xm0, [cq-16*8] mova xm1, [cq-16*7] vinserti128 m0, [cq+16*0], 1 vinserti128 m1, [cq+16*1], 1 mova xm2, [cq-16*6] mova xm3, [cq-16*5] vinserti128 m2, [cq+16*2], 1 vinserti128 m3, [cq+16*3], 1 mova xm4, [cq-16*4] mova xm5, [cq-16*3] vinserti128 m4, [cq+16*4], 1 vinserti128 m5, [cq+16*5], 1 mova xm6, [cq-16*2] mova xm7, [cq-16*1] vinserti128 m6, [cq+16*6], 1 vinserti128 m7, [cq+16*7], 1 pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 %define dstq r5 WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r4 add cq, 16*16 add r0, 16 add r5, 16 add eobd, 0x80000000 jnc .loop RET %define o_base pw_5 + 128 %macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs %if %3 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [%1+%2* 0] pmulhrsw m1, m15, [%1+%2* 1] pmulhrsw m2, m15, [%1+%2* 2] pmulhrsw m3, m15, [%1+%2* 3] pmulhrsw m4, m15, [%1+%2* 4] pmulhrsw m5, m15, [%1+%2* 5] pmulhrsw m6, m15, [%1+%2* 6] pmulhrsw m7, m15, [%1+%2* 7] pmulhrsw m8, m15, [%1+%2* 8] pmulhrsw m9, m15, [%1+%2* 9] pmulhrsw m10, m15, [%1+%2*10] pmulhrsw m11, m15, [%1+%2*11] pmulhrsw m12, m15, [%1+%2*12] pmulhrsw m13, m15, [%1+%2*13] pmulhrsw m14, m15, [%1+%2*14] pmulhrsw m15, [%1+%2*15] %else mova m0, [%1+%2* 0] mova m1, [%1+%2* 1] mova m2, [%1+%2* 2] mova m3, [%1+%2* 3] mova m4, [%1+%2* 4] mova m5, [%1+%2* 5] mova m6, [%1+%2* 6] mova m7, [%1+%2* 7] mova m8, [%1+%2* 8] mova m9, [%1+%2* 9] mova m10, [%1+%2*10] mova m11, [%1+%2*11] mova m12, [%1+%2*12] mova m13, [%1+%2*13] mova m14, [%1+%2*14] mova m15, [%1+%2*15] %endif mova [rsp], m15 %if %4 pxor m15, m15 REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15 %endif %endmacro %macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] mova m%4, [%2] paddsw m%3, m%1, m%4 psubsw m%1, m%4 pmovzxbw m%4, [dstq+%6] pmulhrsw m%3, m%5 pmulhrsw m%1, m%5 paddw m%3, m%4 pmovzxbw m%4, [r2+%7] paddw m%1, m%4 packuswb m%3, m%1 vpermq m%3, m%3, q3120 mova [dstq+%6], xm%3 vextracti128 [r2+%7], m%3, 1 %endmacro cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3 %undef cmp LOAD_16ROWS cq, 64, 1 call m(idct_16x16_internal_8bpc).main lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] lea tmp3q, [tmp1q+32*16] mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_16384)] call .transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp3q-32*4+ 0], xm0 vextracti128 [tmp3q+32*0+ 0], m0, 1 mova [tmp3q-32*3+ 0], xm2 vextracti128 [tmp3q+32*1+ 0], m2, 1 mova [tmp3q-32*2+ 0], xm4 vextracti128 [tmp3q+32*2+ 0], m4, 1 mova 
[tmp3q-32*1+ 0], xm6 vextracti128 [tmp3q+32*3+ 0], m6, 1 mova [tmp3q-32*4+16], xm8 vextracti128 [tmp3q+32*0+16], m8, 1 mova [tmp3q-32*3+16], xm10 vextracti128 [tmp3q+32*1+16], m10, 1 mova [tmp3q-32*2+16], xm12 vextracti128 [tmp3q+32*2+16], m12, 1 mova [tmp3q-32*1+16], xm14 vextracti128 [tmp3q+32*3+16], m14, 1 cmp eobd, 150 jg .full vinserti128 m0, m1, xm9, 1 vperm2i128 m4, m1, m9, 0x31 vinserti128 m2, m5, xm13, 1 vperm2i128 m6, m5, m13, 0x31 vinserti128 m1, m3, xm11, 1 vperm2i128 m5, m3, m11, 0x31 vinserti128 m3, m7, xm15, 1 vperm2i128 m7, m7, m15, 0x31 call .main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp .idct16 .dconly: movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .full: mova [tmp1q-32*4], m1 mova [tmp1q-32*3], m3 mova [tmp1q-32*2], m5 mova [tmp1q-32*1], m7 mova [tmp1q+32*0], m9 mova [tmp1q+32*1], m11 mova [tmp1q+32*2], m13 mova [tmp1q+32*3], m15 LOAD_16ROWS cq+32, 64, 1 call m(idct_16x16_internal_8bpc).main lea r2, [tmp3q+32*8] mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_16384)] call .transpose_2x8x8_round mova m15, [rsp+32*0] mova [r2-32*4+ 0], xm0 vextracti128 [r2+32*0+ 0], m0, 1 mova [r2-32*3+ 0], xm2 vextracti128 [r2+32*1+ 0], m2, 1 mova [r2-32*2+ 0], xm4 vextracti128 [r2+32*2+ 0], m4, 1 mova [r2-32*1+ 0], xm6 vextracti128 [r2+32*3+ 0], m6, 1 mova [r2-32*4+16], xm8 vextracti128 [r2+32*0+16], m8, 1 mova [r2-32*3+16], xm10 vextracti128 [r2+32*1+16], m10, 1 mova [r2-32*2+16], xm12 vextracti128 [r2+32*2+16], m12, 1 mova [r2-32*1+16], xm14 vextracti128 [r2+32*3+16], m14, 1 vinserti128 m8, m1, xm9, 1 vperm2i128 m12, m1, m9, 0x31 mova xm0, [tmp1q-32*4] mova xm1, [tmp1q-32*3] vinserti128 m0, [tmp1q+32*0], 1 vinserti128 m1, [tmp1q+32*1], 1 vinserti128 m10, m5, xm13, 1 vperm2i128 m14, m5, m13, 0x31 mova xm4, [tmp1q-32*4+16] mova xm5, [tmp1q-32*3+16] vinserti128 m4, [tmp1q+32*0+16], 1 vinserti128 m5, [tmp1q+32*1+16], 1 vinserti128 m9, m3, xm11, 1 vperm2i128 m13, m3, m11, 0x31 mova xm2, [tmp1q-32*2] mova xm3, [tmp1q-32*1] vinserti128 m2, [tmp1q+32*2], 1 vinserti128 m3, [tmp1q+32*3], 1 vinserti128 m11, m7, xm15, 1 vperm2i128 m15, m7, m15, 0x31 mova xm6, [tmp1q-32*2+16] mova xm7, [tmp1q-32*1+16] vinserti128 m6, [tmp1q+32*2+16], 1 vinserti128 m7, [tmp1q+32*3+16], 1 call .main_oddhalf LOAD_8ROWS_H r2-32*4, 32 .idct16: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq call .pass2_end RET ALIGN function_align cglobal_label .main_oddhalf_fast ; lower half is zero mova [rsp+gprsize+32*1], m7 pxor m7, m7 mova [rsp+gprsize+32*0], m7 mova [rsp+gprsize+32*2], m7 vpbroadcastd m11, [o(pw_3703x8)] vpbroadcastd m7, [o(pw_1751x8)] vpbroadcastd m12, [o(pw_m1380x8)] vpbroadcastd m8, [o(pw_3857x8)] vpbroadcastd m13, [o(pw_3973x8)] vpbroadcastd m15, [o(pw_995x8)] pmulhrsw m11, m4 ; t29a pmulhrsw m4, m7 ; t18a pmulhrsw m12, m3 ; t19a pmulhrsw m3, m8 ; t28a pmulhrsw m13, m2 ; t27a pmulhrsw m2, m15 ; t20a vpbroadcastd m10, [o(pw_m2106x8)] vpbroadcastd m7, [o(pw_3513x8)] vpbroadcastd m9, [o(pw_3290x8)] vpbroadcastd m8, [o(pw_2440x8)] vpbroadcastd m14, [o(pw_m601x8)] vpbroadcastd m15, [o(pw_4052x8)] pmulhrsw m10, m5 ; t21a pmulhrsw m5, m7 ; t26a pmulhrsw m9, m6 ; t25a pmulhrsw m6, m8 ; t22a pmulhrsw m14, m1 ; t23a pmulhrsw m1, m15 ; t24a vpbroadcastd m15, [o(pd_2048)] jmp .main2 ALIGN function_align cglobal_label .main_oddhalf mova 
[rsp+gprsize+32*0], m15 mova [rsp+gprsize+32*1], m7 mova [rsp+gprsize+32*2], m8 vpbroadcastd m15, [o(pd_2048)] ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a .main2: psubsw m7, m12, m4 ; t18 paddsw m12, m4 ; t19 psubsw m4, m2, m10 ; t21 paddsw m2, m10 ; t20 psubsw m10, m14, m6 ; t22 paddsw m14, m6 ; t23 psubsw m6, m1, m9 ; t25 paddsw m1, m9 ; t24 psubsw m9, m13, m5 ; t26 paddsw m13, m5 ; t27 psubsw m5, m3, m11 ; t29 paddsw m3, m11 ; t28 ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a psubsw m8, m14, m2 ; t20a paddsw m14, m2 ; t23a psubsw m2, m1, m13 ; t27a paddsw m1, m13 ; t24a psubsw m13, m6, m9 ; t21 paddsw m6, m9 ; t22 psubsw m9, m10, m4 ; t26 paddsw m10, m4 ; t25 ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a mova m4, [rsp+gprsize+32*0] ; in31 mova [rsp+gprsize+32*0], m6 ; t22 mova m6, [rsp+gprsize+32*1] ; in15 mova [rsp+gprsize+32*1], m14 ; t23a mova m14, [rsp+gprsize+32*2] ; in17 mova [rsp+gprsize+32*2], m1 ; t24a ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a psubsw m1, m0, m14 ; t17 paddsw m0, m14 ; t16 psubsw m14, m4, m6 ; t30 paddsw m4, m6 ; t31 ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a psubsw m6, m0, m12 ; t19a paddsw m0, m12 ; t16a psubsw m12, m4, m3 ; t28a paddsw m4, m3 ; t31a psubsw m3, m14, m5 ; t18 paddsw m14, m5 ; t17 psubsw m5, m1, m7 ; t29 paddsw m1, m7 ; t30 ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 psubsw m7, m1, m10 ; t25a paddsw m1, m10 ; t30a psubsw m10, m5, m9 ; t21 paddsw m5, m9 ; t18 psubsw m9, m12, m2 ; t20a paddsw m12, m2 ; t19a psubsw m2, m3, m13 ; t26 paddsw m3, m13 ; t29 psubsw m13, m6, m8 ; t27a paddsw m6, m8 ; t28a mova [tmp1q-32*2], m5 mova [tmp1q-32*1], m12 mova [tmp2q+32*0], m6 mova [tmp2q+32*1], m3 mova [tmp2q+32*2], m1 mova m5, [rsp+gprsize+32*0] ; t22 mova m6, [rsp+gprsize+32*1] ; t23 mova m3, [rsp+gprsize+32*2] ; t24a psubsw m1, m14, m5 ; t22a paddsw m14, m5 ; t17a psubsw m5, m0, m6 ; t23 paddsw m0, m6 ; t16 psubsw m6, m4, m3 ; t24 paddsw m4, m3 ; t31 vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m3, [o(pw_2896_2896)] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m14 mova [tmp2q+32*3], m4 ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a mova [tmp1q+32*0], m13 mova [tmp1q+32*1], m2 mova [tmp1q+32*2], m7 mova [tmp1q+32*3], m6 mova [tmp2q-32*4], m5 mova [tmp2q-32*3], m1 mova [tmp2q-32*2], m10 mova [tmp2q-32*1], m9 ret ALIGN function_align .transpose_2x8x8_round: punpckhwd m6, m12, m13 punpcklwd m12, m13 punpckhwd m13, m8, m9 punpcklwd m8, m9 punpckhwd m9, m14, m15 punpcklwd m14, m15 punpckhwd m15, m10, m11 punpcklwd m10, m11 REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 punpckhdq m11, m8, m10 punpckldq m8, m10 punpckldq m10, m12, m14 punpckhdq m12, m14 punpckhdq m14, m13, m15 punpckldq m13, m15 punpckldq m15, m6, m9 punpckhdq m6, m9 punpckhqdq m9, m8, m10 punpcklqdq m8, m10 punpcklqdq m10, m11, m12 punpckhqdq m11, m12 
punpcklqdq m12, m13, m15 punpckhqdq m13, m15 punpckhqdq m15, m14, m6 punpcklqdq m14, m6 pmulhrsw m6, m7, [rsp+gprsize+32*0] REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 pmulhrsw m7, [rsp+gprsize+32*1] mova [rsp+gprsize+32*0], m15 punpckhwd m15, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m15, m1 punpckhdq m15, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m15 punpcklqdq m6, m15 ret ALIGN function_align .pass2_end: mova [rsp+gprsize+32*0], m7 mova [rsp+gprsize+32*2], m15 vpbroadcastd m15, [o(pw_2048)] IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m1, [rsp+gprsize+32*1] IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 add dstq, strideq sub r2, strideq mova m7, [rsp+gprsize+32*0] mova m1, [rsp+gprsize+32*2] IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 ret ; Perform the final sumsub step and YMM lane shuffling %macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] mova m%3, [tmp2q+32*( 3-%1)] psubsw m%4, m%1, m%3 paddsw m%1, m%3 mova m%3, [tmp1q+32*(11-%2)] mova [tmp1q+32*(11-%2)+16], xm%4 vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 paddsw m%4, m%2, m%3 psubsw m%2, m%3 mova [tmp1q+32*(11-%2)], xm%2 vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 vperm2i128 m%2, m%1, m%4, 0x31 vinserti128 m%1, xm%4, 1 %endmacro cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 16 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 vpbroadcastd m15, [o(pw_2896x8)] pmulhrsw m0, m15, [cq+32* 1] pmulhrsw m1, m15, [cq+32* 3] pmulhrsw m2, m15, [cq+32* 5] pmulhrsw m3, m15, [cq+32* 7] pmulhrsw m4, m15, [cq+32* 9] pmulhrsw m5, m15, [cq+32*11] pmulhrsw m6, m15, [cq+32*13] pmulhrsw m7, m15, [cq+32*15] pmulhrsw m8, m15, [cq+32*17] pmulhrsw m9, m15, [cq+32*19] pmulhrsw m10, m15, [cq+32*21] pmulhrsw m11, m15, [cq+32*23] pmulhrsw m12, m15, [cq+32*25] pmulhrsw m13, m15, [cq+32*27] pmulhrsw m14, m15, [cq+32*29] pmulhrsw m15, [cq+32*31] lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_16ROWS cq+32*0, 32*2, 1, 0 pxor m15, m15 mov r3d, 8 .zero_loop: mova [cq+32*0], m15 mova [cq+32*1], m15 mova [cq+32*2], m15 mova [cq+32*3], m15 add cq, 32*4 dec 
r3d jg .zero_loop call m(idct_16x16_internal_8bpc).main call .pass1_end lea r2, [strideq*3] mov r3, dstq .pass2: vpbroadcastd m7, [o(pw_16384)] call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round call m(idct_16x16_internal_8bpc).main mova [rsp+32*2], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m2, m3, m0 WRITE_16X2 2, 3, 1, 2, strideq*2, r2 pmulhrsw m1, m15, [rsp+32*1] WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m4, m5, m6, m7 WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 WRITE_16X2 6, 7, 2, 3, strideq*2, r2 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m8, m9, m10, m11 WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 WRITE_16X2 10, 11, 2, 3, strideq*2, r2 lea dstq, [dstq+strideq*4] REPX {pmulhrsw x, m15}, m11, m12, m13, m14 pmulhrsw m15, [rsp+32*2] WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 WRITE_16X2 14, 15, 2, 3, strideq*2, r2 test r3, r3 jnz .right_half RET .right_half: LOAD_8ROWS tmp1q-32*4, 32 LOAD_8ROWS_H tmp2q-32*4, 32 lea dstq, [r3+16] xor r3d, r3d mova [rsp+32*0], m6 mova [rsp+32*1], m7 jmp .pass2 ALIGN function_align .pass1_end: mova [rsp+gprsize+32*0], m9 IDCT32_PASS1_END 0, 8, 1, 9 IDCT32_PASS1_END 2, 10, 1, 9 IDCT32_PASS1_END 3, 11, 1, 9 IDCT32_PASS1_END 4, 12, 1, 9 IDCT32_PASS1_END 5, 13, 1, 9 IDCT32_PASS1_END 6, 14, 1, 9 IDCT32_PASS1_END 7, 15, 1, 9 mova m1, [rsp+gprsize+32*1] mova m9, [rsp+gprsize+32*0] mova [rsp+gprsize+32*0], m6 mova [rsp+gprsize+32*1], m7 IDCT32_PASS1_END 1, 9, 6, 7 ret cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob %undef cmp lea r6, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m12, [o(pw_8192)] cmp eobd, 43 ; if (eob > 43) setg r4b ; iteration_count++ cmp eobd, 150 ; if (eob > 150) setg al ; iteration_count++ add eobd, -279 ; if (eob > 278) adc r4b, al ; iteration_count++ lea r3, [strideq*3] mov r6, cq paddw m11, m12, m12 ; pw_16384 .loop: mova xm0, [cq+64* 0] mova xm1, [cq+64* 1] vinserti128 m0, [cq+64* 8], 1 vinserti128 m1, [cq+64* 9], 1 mova xm2, [cq+64* 2] mova xm3, [cq+64* 3] vinserti128 m2, [cq+64*10], 1 vinserti128 m3, [cq+64*11], 1 mova xm4, [cq+64* 4] mova xm5, [cq+64* 5] vinserti128 m4, [cq+64*12], 1 vinserti128 m5, [cq+64*13], 1 mova xm6, [cq+64* 6] mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] add cq, 16 dec r4b jge .loop sub cq, 32 pxor m0, m0 mov r0d, 8 cmp cq, r6 ja .zero_loop .zero_loop_half: mova [r6+64*0], m0 mova [r6+64*1], m0 add r6, 64*4 mova [r6-64*2], m0 mova [r6-64*1], m0 sub r0d, 2 jg .zero_loop_half RET .zero_loop: mova [r6+32*0], m0 mova [r6+32*1], m0 mova [r6+32*2], m0 mova [r6+32*3], m0 add r6, 32*4 dec r0d jg .zero_loop RET cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob %undef cmp lea r6, [o_base] vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m11, [o(pw_2048)] cmp eobd, 35 ; if (eob > 35) setg r4b ; iteration_count++ cmp eobd, 150 ; if (eob > 150) setg r3b ; iteration_count += 2 lea r4d, [r4+r3*2] lea r3, [strideq*3] mov r5, dstq mov r6, cq 
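; descriptive note: each iteration scales by pw_2896x8 (m9) and doubles, transposes 8x8 per lane, then applies pw_1697x16 (m10) and pw_2048 rounding (m11) before the 16-wide writes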
.loop: mova xm0, [cq+32* 0] mova xm1, [cq+32* 1] vinserti128 m0, [cq+32* 8], 1 vinserti128 m1, [cq+32* 9], 1 mova xm2, [cq+32* 2] mova xm3, [cq+32* 3] vinserti128 m2, [cq+32*10], 1 vinserti128 m3, [cq+32*11], 1 mova xm4, [cq+32* 4] mova xm5, [cq+32* 5] vinserti128 m4, [cq+32*12], 1 vinserti128 m5, [cq+32*13], 1 mova xm6, [cq+32* 6] mova xm7, [cq+32* 7] vinserti128 m6, [cq+32*14], 1 vinserti128 m7, [cq+32*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r3 lea dstq, [dstq+strideq*4] add cq, 16 dec r4b jl .ret test r4b, 1 jz .loop add cq, 32*15 lea dstq, [r5+16] jmp .loop .ret: sub cd, eax pxor m0, m0 add cd, 384 .zero_loop: mova [r6+32*0], m0 mova [r6+32*1], m0 mova [r6+32*2], m0 mova [r6+32*3], m0 add r6, 32*4 sub cd, 128 jge .zero_loop RET cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 %undef cmp lea tmp1q, [rsp+32*7] lea tmp2q, [tmp1q+32*8] sub eobd, 136 mov tmp4d, eobd .pass1_loop: LOAD_8ROWS cq+64*1, 64*2 pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 test tmp4d, tmp4d jl .fast LOAD_8ROWS_H cq+64*17, 64*2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H cq+64*16, 64*2 pxor m0, m0 REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 mova [rsp], m15 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: LOAD_8ROWS cq+64*0, 64*2 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea tmp3q, [tmp1q+32*32] mova m15, [rsp] mova [tmp3q-32*4], m0 mova [tmp3q-32*3], m2 mova [tmp3q-32*2], m4 mova [tmp3q-32*1], m6 mova [tmp3q+32*0], m8 mova [tmp3q+32*1], m10 mova [tmp3q+32*2], m12 mova [tmp3q+32*3], m14 add tmp3q, 32*8 mova [tmp3q-32*4], m1 mova [tmp3q-32*3], m3 mova [tmp3q-32*2], m5 mova [tmp3q-32*1], m7 mova [tmp3q+32*0], m9 mova [tmp3q+32*1], m11 mova [tmp3q+32*2], m13 mova [tmp3q+32*3], m15 vpbroadcastd m9, [o(pw_8192)] pmulhrsw m0, m9, [tmp1q-32*4] pmulhrsw m1, m9, [tmp1q-32*3] pmulhrsw m2, m9, [tmp1q-32*2] pmulhrsw m3, m9, [tmp1q-32*1] pmulhrsw m4, m9, [tmp1q+32*0] pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 pmulhrsw m1, m9, [tmp2q-32*3] mova [tmp1q-32*3], m2 pmulhrsw m2, m9, [tmp2q-32*2] mova [tmp2q-32*3], m3 pmulhrsw m3, m9, [tmp2q-32*1] mova [tmp1q-32*2], m4 pmulhrsw m4, m9, [tmp2q+32*0] mova [tmp2q-32*2], m5 pmulhrsw m5, m9, [tmp2q+32*1] mova [tmp1q-32*1], m6 pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] 
call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add cq, 32 add tmp1q, 32*16 add tmp2q, 32*16 add eobd, 0x80000000 jnc .pass1_loop add tmp1q, 32*24 imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq test tmp4d, tmp4d jge .pass2_loop add tmp1q, 32*16 add tmp2q, 32*16 add tmp3q, 32*16 .pass2_loop: LOAD_8ROWS tmp2q-32*4, 32 test tmp4d, tmp4d jl .fast2 LOAD_8ROWS_H tmp3q-32*4, 32 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf sub tmp3q, 32*8 LOAD_8ROWS_H tmp3q-32*4, 32 sub tmp3q, 32*16 jmp .pass2_loop_end .fast2: call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast sub tmp3q, 32*24 pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 .pass2_loop_end: LOAD_8ROWS tmp3q-32*4, 32 mova [rsp], m15 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end lea tmp3q, [tmp1q-32*32] cmp tmp2q, tmp3q jb .ret sub tmp2q, 32*32 sub dstq, r3 lea r2, [r2+r3+16] add dstq, 16 jmp .pass2_loop .ret: RET cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob %undef cmp vpbroadcastd m9, [pw_8192] sub eobd, 136 ; if (eob < 136) shr eobd, 30 ; topleft 16x16 only lea eobd, [eobq*2-8] lea r4, [strideq*3] mov r5, dstq lea r6, [cq+32] .loop: mova xm0, [cq+64* 0] mova xm1, [cq+64* 1] vinserti128 m0, [cq+64* 8], 1 vinserti128 m1, [cq+64* 9], 1 mova xm2, [cq+64* 2] mova xm3, [cq+64* 3] vinserti128 m2, [cq+64*10], 1 vinserti128 m3, [cq+64*11], 1 mova xm4, [cq+64* 4] mova xm5, [cq+64* 5] vinserti128 m4, [cq+64*12], 1 vinserti128 m5, [cq+64*13], 1 mova xm6, [cq+64* 6] mova xm7, [cq+64* 7] vinserti128 m6, [cq+64*14], 1 vinserti128 m7, [cq+64*15], 1 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 WRITE_16X2 2, 3, 0, 1, strideq*2, r4 lea dstq, [dstq+strideq*4] WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 WRITE_16X2 6, 7, 0, 1, strideq*2, r4 lea dstq, [dstq+strideq*4] add cq, 16 inc eobd jz .ret test eobd, 3 jnz .loop add cq, 64*15 lea dstq, [r5+16] jmp .loop .ret: pxor m0, m0 mov r0d, 16 cmp cq, r6 jne .zero_loop .zero_loop_topleft: mova [r6-32*1], m0 mova [r6+32*1], m0 mova [r6+32*3], m0 mova [r6+32*5], m0 add r6, 64*4 sub r0d, 4 jg .zero_loop_topleft RET .zero_loop: mova [r6-32*1], m0 mova [r6+32*0], m0 mova [r6+32*1], m0 mova [r6+32*2], m0 add r6, 32*4 dec r0d jg .zero_loop RET %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n %else mova m%5, [tmp1q-32*(45-%1)] mova m%4, [tmp2q-32*(20+%1)] %endif psubsw m%6, m%5, m%4 ; idct32 out31-n paddsw m%5, m%4 ; idct32 out 0+n psubsw m%4, m%6, m%3 ; out32+n paddsw m%6, m%3 ; out31-n psubsw m%3, m%5, m%2 ; out63-n paddsw m%5, m%2 ; out 0+n %if %0 == 6 ; pass 1 %if %1 & 1 mova [tmp2q-32*(19-%1)], m%4 mova [tmp1q-32*(14+%1)], m%6 mova [tmp1q+32*(18-%1)], m%3 mova [tmp2q-32*(51-%1)], m%5 %else mova [tmp1q-32*(13-%1)], m%4 mova [tmp2q-32*(20+%1)], m%6 mova [tmp2q+32*(12-%1)], m%3 mova [tmp1q-32*(45-%1)], m%5 %endif %else ; pass 2 REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 %if %1 & 1 %define %%d0 r2 %define %%d1 dstq %else %define %%d0 dstq %define %%d1 r2 %endif pmovzxbw m%2, [%%d0+%9 ] paddw m%2, m%4 pmovzxbw m%4, [%%d1+%8 ] paddw m%4, m%6 pmovzxbw m%6, [%%d1+%10] paddw m%3, m%6 pmovzxbw m%6, 
[%%d0+%7 ] paddw m%5, m%6 packuswb m%2, m%4 packuswb m%3, m%5 vpermq m%2, m%2, q3120 vpermq m%3, m%3, q3120 mova [%%d0+%9 ], xm%2 vextracti128 [%%d1+%8 ], m%2, 1 mova [%%d1+%10], xm%3 vextracti128 [%%d0+%7 ], m%3, 1 %endif %endmacro cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .normal: PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 %undef cmp lea tmp1q, [rsp+32*23] lea tmp2q, [tmp1q+32*24] sub eobd, 151 mov r7d, eobd .pass1_loop: LOAD_16ROWS cq, 64 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round mova m15, [rsp+32*0] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m2 mova [tmp1q-32*2], m4 mova [tmp1q-32*1], m6 mova [tmp1q+32*0], m8 mova [tmp1q+32*1], m10 mova [tmp1q+32*2], m12 mova [tmp1q+32*3], m14 mova [tmp2q-32*4], m1 mova [tmp2q-32*3], m3 mova [tmp2q-32*2], m5 mova [tmp2q-32*1], m7 mova [tmp2q+32*0], m9 mova [tmp2q+32*1], m11 mova [tmp2q+32*2], m13 mova [tmp2q+32*3], m15 add cq, 32 add tmp1q, 32*8 add tmp2q, 32*8 add eobd, 0x80000000 jnc .pass1_loop lea r2, [rsp+32*23] mova xm0, [r2-32*4+ 0] mova xm1, [r2-32*2+ 0] vinserti128 m0, [r2+32*0+ 0], 1 vinserti128 m1, [r2+32*2+ 0], 1 mova xm2, [r2-32*4+16] mova xm3, [r2-32*2+16] vinserti128 m2, [r2+32*0+16], 1 vinserti128 m3, [r2+32*2+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 test r7d, r7d jl .fast lea r3, [r2+32*8] mova xm4, [r3-32*4+ 0] mova xm5, [r3-32*2+ 0] vinserti128 m4, [r3+32*0+ 0], 1 vinserti128 m5, [r3+32*2+ 0], 1 mova xm6, [r3-32*4+16] mova xm7, [r3-32*2+16] vinserti128 m6, [r3+32*0+16], 1 vinserti128 m7, [r3+32*2+16], 1 .fast: mova [rsp], m8 lea tmp1q, [rsp+32*7] call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova xm0, [r2-32*3+ 0] mova xm1, [r2-32*1+ 0] vinserti128 m0, [r2+32*1+ 0], 1 vinserti128 m1, [r2+32*3+ 0], 1 mova xm2, [r2-32*3+16] mova xm3, [r2-32*1+16] vinserti128 m2, [r2+32*1+16], 1 vinserti128 m3, [r2+32*3+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r7d, r7d jl .fast2 mova xm4, [r3-32*3+ 0] mova xm5, [r3-32*1+ 0] vinserti128 m4, [r3+32*1+ 0], 1 vinserti128 m5, [r3+32*3+ 0], 1 mova xm6, [r3-32*3+16] mova xm7, [r3-32*1+16] vinserti128 m6, [r3+32*1+16], 1 vinserti128 m7, [r3+32*3+16], 1 .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast add r2, 32*24 vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova xm0, [r2-32*4+ 0] mova xm3, [r2-32*1+16] vinserti128 m0, [r2+32*0+ 0], 1 vinserti128 m3, [r2+32*3+16], 1 mova xm4, [r2-32*4+16] mova xm7, [r2-32*1+ 0] vinserti128 m4, [r2+32*0+16], 1 vinserti128 m7, [r2+32*3+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast3 add r3, 32*24 mova xm1, [r3-32*1+16] mova xm2, [r3-32*4+ 0] vinserti128 m1, [r3+32*3+16], 1 vinserti128 m2, [r3+32*0+ 0], 1 mova xm5, [r3-32*1+ 0] mova xm6, [r3-32*4+16] vinserti128 m5, [r3+32*3+ 
0], 1 vinserti128 m6, [r3+32*0+16], 1 .fast3: add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova xm0, [r2-32*2+ 0] mova xm3, [r2-32*3+16] vinserti128 m0, [r2+32*2+ 0], 1 vinserti128 m3, [r2+32*1+16], 1 mova xm4, [r2-32*2+16] mova xm7, [r2-32*3+ 0] vinserti128 m4, [r2+32*2+16], 1 vinserti128 m7, [r2+32*1+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast4 mova xm1, [r3-32*3+16] mova xm2, [r3-32*2+ 0] vinserti128 m1, [r3+32*1+16], 1 vinserti128 m2, [r3+32*2+ 0], 1 mova xm5, [r3-32*3+ 0] mova xm6, [r3-32*2+16] vinserti128 m5, [r3+32*1+ 0], 1 vinserti128 m6, [r3+32*2+16], 1 .fast4: call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 RET ALIGN function_align %define o_base idct64_mul - 8 cglobal_label .main_part1 ; idct64 steps 1-5: ; in1/31/17/15/ 9/23/25/ 7 -> ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a ; in5/27/21/11/13/19/29/ 3 -> ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a vpbroadcastd m11, [o(idct64_mul+4* 0)] vpbroadcastd m13, [o(idct64_mul+4* 1)] vpbroadcastd m10, [o(idct64_mul+4* 4)] vpbroadcastd m12, [o(idct64_mul+4* 5)] pmulhrsw m11, m0 ; t63a pmulhrsw m0, m13 ; t32a pmulhrsw m10, m1 ; t62a pmulhrsw m1, m12 ; t33a vpbroadcastd m9, [o(idct64_mul+4* 8)] vpbroadcastd m13, [o(idct64_mul+4* 9)] vpbroadcastd m8, [o(idct64_mul+4*12)] vpbroadcastd m12, [o(idct64_mul+4*13)] pmulhrsw m9, m2 ; t61a pmulhrsw m2, m13 ; t34a pmulhrsw m8, m3 ; t60a pmulhrsw m3, m12 ; t35a psubsw m12, m0, m1 ; t33 paddsw m0, m1 ; t32 psubsw m1, m3, m2 ; t34 paddsw m3, m2 ; t35 psubsw m2, m8, m9 ; t61 paddsw m8, m9 ; t60 psubsw m9, m11, m10 ; t62 paddsw m11, m10 ; t63 ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a vpbroadcastd m14, [o(pw_401_4076)] ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a psubsw m10, m0, m3 ; t35a paddsw m0, m3 ; t32a psubsw m3, m11, m8 ; t60a paddsw m11, m8 ; t63a psubsw m8, m9, m2 ; t34 paddsw m9, m2 ; t33 psubsw m2, m12, m1 ; t61 paddsw m12, m1 ; t62 mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m9 mova [tmp2q+32*2], m12 mova [tmp2q+32*3], m11 vpbroadcastd m13, [o(pw_m4017_799)] vpbroadcastd m14, [o(pw_799_4017)] ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp2q+32*0], m10 mova [tmp2q+32*1], m8 vpbroadcastd m3, [o(idct64_mul+4*16)] vpbroadcastd m11, [o(idct64_mul+4*17)] vpbroadcastd m2, [o(idct64_mul+4*20)] vpbroadcastd m10, [o(idct64_mul+4*21)] vpbroadcastd m1, [o(idct64_mul+4*24)] vpbroadcastd m9, [o(idct64_mul+4*25)] vpbroadcastd m0, [o(idct64_mul+4*28)] vpbroadcastd m8, [o(idct64_mul+4*29)] pmulhrsw m3, m4 ; t59a pmulhrsw m4, m11 ; t36a pmulhrsw m2, m5 ; t58a pmulhrsw m5, m10 ; t37a pmulhrsw m1, m6 ; t57a pmulhrsw m6, m9 ; t38a pmulhrsw m0, m7 ; t56a pmulhrsw m7, m8 ; t39a psubsw m8, m4, m5 ; t37 paddsw m4, m5 ; t36 psubsw m5, m7, m6 ; t38 paddsw m7, m6 ; t39 psubsw m6, m0, m1 ; t57 paddsw m0, m1 ; t56 psubsw m1, m3, m2 ; t58 paddsw m3, m2 ; t59 ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a vpbroadcastd m10, [o(pw_3166_2598)] ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a psubsw m2, m7, m4 ; t36a paddsw m7, m4 ; t39a psubsw m4, m0, m3 ; t59a paddsw m0, m3 ; t56a psubsw m3, m6, m1 ; t37 paddsw m6, m1 ; t38 psubsw m1, m5, m8 ; t58 paddsw m5, m8 ; t57 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 mova [tmp2q-32*4], m0 mova [tmp2q-32*3], m5 vpbroadcastd m6, [o(pw_m799_m4017)] vpbroadcastd m7, 
[o(pw_m4017_799)] ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m1 mova [tmp2q-32*2], m3 mova [tmp2q-32*1], m2 ret %define o_base pw_5 + 128 .main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub sub r6, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] vpbroadcastd m14, [o(pw_m2896_2896)] .main_part2_pass1_loop: call .main_part2_internal IDCT64_PART2_END 0, 7, 0, 6, 9, 10 IDCT64_PART2_END 7, 8, 5, 0, 6, 7 IDCT64_PART2_END 8, 2, 1, 0, 6, 7 IDCT64_PART2_END 15, 3, 4, 0, 6, 7 cmp tmp1q, tmp2q jne .main_part2_pass1_loop ret cglobal_label .main_part2_internal mova m0, [tmp1q-32*12] ; t32a mova m6, [tmp2q-32*13] ; t39a mova m1, [tmp1q-32* 4] ; t40a mova m5, [tmp2q+32* 3] ; t55a add tmp1q, 32 sub tmp2q, 32 mova m2, [tmp1q+32* 3] ; t48a mova m4, [tmp2q-32* 4] ; t47a mova m3, [tmp1q+32*11] ; t56a mova m7, [tmp2q+32*12] ; t63a psubsw m8, m0, m6 ; t39 paddsw m0, m6 ; t32 psubsw m6, m4, m1 ; t40 paddsw m4, m1 ; t47 psubsw m1, m2, m5 ; t55 paddsw m2, m5 ; t48 psubsw m5, m7, m3 ; t56 paddsw m7, m3 ; t63 ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a vpbroadcastd m9, [o(pw_m1567_m3784)] ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a psubsw m3, m0, m4 ; t47a paddsw m0, m4 ; t32a psubsw m4, m7, m2 ; t48a paddsw m7, m2 ; t63a psubsw m2, m5, m1 ; t40 paddsw m5, m1 ; t39 psubsw m1, m8, m6 ; t55 paddsw m8, m6 ; t56 ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a ret .main_part2_pass2: sub r6, o_idct64_offset + 8 vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m13, [o(pw_2896_2896)] lea r9, [strideq*5] ; stride*5 lea r3, [r9+strideq*1] ; stride*6 lea r7, [r9+strideq*2] ; stride*7 lea r8, [r3+strideq*2] ; stride*8 lea r2, [dstq+r7] .main_part2_pass2_loop: vpbroadcastd m14, [o(pw_m2896_2896)] call .main_part2_internal vpbroadcastd m14, [o(pw_2048)] IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 add dstq, strideq sub r2, strideq cmp tmp1q, tmp2q jne .main_part2_pass2_loop ret cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 16 .dconly: pmulhrsw xm0, xm2 movd xm2, [o(pw_2048)] pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 pxor m1, m1 .dconly_loop: mova m2, [dstq+32*0] mova m3, [dstq+32*1] punpckhbw m4, m2, m1 punpcklbw m2, m1 punpckhbw m5, m3, m1 punpcklbw m3, m1 paddw m4, m0 paddw m2, m0 paddw m5, m0 paddw m3, m0 packuswb m2, m4 packuswb m3, m5 mova [dstq+32*0], m2 mova [dstq+32*1], m3 add dstq, strideq dec r3d jg .dconly_loop RET .normal: PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 LOAD_8ROWS cq+32*0, 32*4 pxor m8, m8 REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 lea tmp1q, [rsp+32*7] call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 
mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+32*2, 32*4 pxor m8, m8 REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [cq+32* 1] mova m1, [cq+32*31] mova m2, [cq+32*17] mova m3, [cq+32*15] mova m4, [cq+32* 9] mova m5, [cq+32*23] mova m6, [cq+32*25] mova m7, [cq+32* 7] pxor m8, m8 REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+32* 5] mova m1, [cq+32*27] mova m2, [cq+32*21] mova m3, [cq+32*11] mova m4, [cq+32*13] mova m5, [cq+32*19] mova m6, [cq+32*29] mova m7, [cq+32* 3] pxor m8, m8 REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*36 lea r2, [strideq*3] mov tmp2d, 4 .pass2_loop: lea r3, [tmp1q-32*8] mova xm0, [r3 -32*4] mova xm1, [r3 -32*3] vinserti128 m0, [tmp1q-32*4], 1 vinserti128 m1, [tmp1q-32*3], 1 mova xm2, [r3 -32*2] mova xm3, [r3 -32*1] vinserti128 m2, [tmp1q-32*2], 1 vinserti128 m3, [tmp1q-32*1], 1 mova xm4, [r3 +32*0] mova xm5, [r3 +32*1] vinserti128 m4, [tmp1q+32*0], 1 vinserti128 m5, [tmp1q+32*1], 1 mova xm6, [r3 +32*2] mova xm7, [r3 +32*3] vinserti128 m6, [tmp1q+32*2], 1 vinserti128 m7, [tmp1q+32*3], 1 mova xm8, [r3 -32*4+16] mova xm9, [r3 -32*3+16] vinserti128 m8, [tmp1q-32*4+16], 1 vinserti128 m9, [tmp1q-32*3+16], 1 mova xm10, [r3 -32*2+16] mova xm11, [r3 -32*1+16] vinserti128 m10, [tmp1q-32*2+16], 1 vinserti128 m11, [tmp1q-32*1+16], 1 mova xm12, [r3 +32*0+16] mova xm13, [r3 +32*1+16] vinserti128 m12, [tmp1q+32*0+16], 1 vinserti128 m13, [tmp1q+32*1+16], 1 mova xm14, [r3 +32*2+16] mova xm15, [r3 +32*3+16] vinserti128 m14, [tmp1q+32*2+16], 1 vinserti128 m15, [tmp1q+32*3+16], 1 mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round call m(idct_16x16_internal_8bpc).main mova [rsp+32*0], m15 vpbroadcastd m15, [o(pw_2048)] REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 WRITE_16X2 2, 3, 1, 2, strideq*2, r2 pmulhrsw m1, m15, [rsp+32*1] WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 lea r3, [dstq+strideq*4] %define dstq r3 WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 WRITE_16X2 6, 7, 2, 3, strideq*2, r2 REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 lea r3, [r3+strideq*4] WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 WRITE_16X2 10, 11, 2, 3, strideq*2, r2 pmulhrsw m15, [rsp+32*0] lea r3, [r3+strideq*4] WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 WRITE_16X2 14, 15, 2, 3, strideq*2, r2 add tmp1q, 32*16 add r0, 16 dec tmp2d jg .pass2_loop RET cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 64 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*7] lea r10d, [eobq-136] sar r10d, 31 .pass1_loop: lea tmp2q, [tmp1q+32*16] LOAD_8ROWS cq+64*1, 64*2, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 test r10b, r10b jnz .fast LOAD_8ROWS_H cq+64*17, 64*2, 2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf LOAD_8ROWS_H 
cq+64*16, 64*2, 1 mova [rsp], m15 pxor m15, m15 REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: LOAD_8ROWS cq+64*0, 64*2, 1 pxor m15, m15 REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end vpbroadcastd m7, [o(pw_16384)] call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round lea r3, [tmp1q+32*48] mova m15, [rsp] mova [r3-32*4], m0 mova [r3-32*3], m2 mova [r3-32*2], m4 mova [r3-32*1], m6 mova [r3+32*0], m8 mova [r3+32*1], m10 mova [r3+32*2], m12 mova [r3+32*3], m14 add r3, 32*24 mova [r3-32*4], m1 mova [r3-32*3], m3 mova [r3-32*2], m5 mova [r3-32*1], m7 mova [r3+32*0], m9 mova [r3+32*1], m11 mova [r3+32*2], m13 mova [r3+32*3], m15 vpbroadcastd m9, [o(pw_16384)] pmulhrsw m0, m9, [tmp1q-32*4] pmulhrsw m1, m9, [tmp1q-32*3] pmulhrsw m2, m9, [tmp1q-32*2] pmulhrsw m3, m9, [tmp1q-32*1] pmulhrsw m4, m9, [tmp1q+32*0] pmulhrsw m5, m9, [tmp1q+32*1] pmulhrsw m6, m9, [tmp1q+32*2] pmulhrsw m7, m9, [tmp1q+32*3] call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q-32*4], m0 pmulhrsw m0, m9, [tmp2q-32*4] mova [tmp2q-32*4], m1 pmulhrsw m1, m9, [tmp2q-32*3] mova [tmp1q-32*3], m2 pmulhrsw m2, m9, [tmp2q-32*2] mova [tmp2q-32*3], m3 pmulhrsw m3, m9, [tmp2q-32*1] mova [tmp1q-32*2], m4 pmulhrsw m4, m9, [tmp2q+32*0] mova [tmp2q-32*2], m5 pmulhrsw m5, m9, [tmp2q+32*1] mova [tmp1q-32*1], m6 pmulhrsw m6, m9, [tmp2q+32*2] mova [tmp2q-32*1], m7 pmulhrsw m7, m9, [tmp2q+32*3] call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add cq, 32 add tmp1q, 32*8 add r10d, 0x80000000 jnc .pass1_loop lea r2, [rsp+32*55] lea r7, [r2+32*24] .pass2_loop: lea r3, [r2+32*8] lea r8, [r7+32*8] mova m0, [r2-32*4] mova m1, [r2-32*2] mova m2, [r2+32*0] mova m3, [r2+32*2] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 test r10b, r10b jnz .fast2 mova m4, [r3-32*4] mova m5, [r3-32*2] mova m6, [r3+32*0] mova m7, [r3+32*2] .fast2: mova [rsp], m8 lea tmp1q, [rsp+32*39] call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova m0, [r2-32*3] mova m1, [r2-32*1] mova m2, [r2+32*1] mova m3, [r2+32*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r10b, r10b jnz .fast3 mova m4, [r3-32*3] mova m5, [r3-32*1] mova m6, [r3+32*1] mova m7, [r3+32*3] .fast3: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [r7-32*4] mova m3, [r7+32*3] mova m4, [r7+32*0] mova m7, [r7-32*1] pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10b, r10b jnz .fast4 mova m1, [r8+32*3] mova m2, [r8-32*4] mova m5, [r8-32*1] mova m6, [r8+32*0] .fast4: add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova 
m0, [r7-32*2] mova m3, [r7+32*1] mova m4, [r7+32*2] mova m7, [r7-32*3] pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10b, r10b jnz .fast5 mova m1, [r8+32*1] mova m2, [r8-32*2] mova m5, [r8-32*3] mova m6, [r8+32*2] .fast5: call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 add r10d, 0x80000000 jc .ret lea r2, [rsp+32*7] lea r7, [r2+32*16] sub dstq, r8 lea dstq, [dstq+strideq*4+16] jmp .pass2_loop .ret: RET cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] mov [cq], eobd pmulhrsw xm0, xm1 or r3d, 32 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ base, tmp3, tmp4 lea tmp1q, [rsp+32*7] lea tmp4d, [eobq-136] .pass1_loop: LOAD_8ROWS cq+64*0, 64*4, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+64*2, 64*4, 1 pxor m8, m8 REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 vpbroadcastd m7, [o(pw_2896x8)] pmulhrsw m0, m7, [cq+64* 1] pmulhrsw m1, m7, [cq+64*31] pmulhrsw m2, m7, [cq+64*17] pmulhrsw m3, m7, [cq+64*15] pmulhrsw m4, m7, [cq+64* 9] pmulhrsw m5, m7, [cq+64*23] pmulhrsw m6, m7, [cq+64*25] pmulhrsw m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 pmulhrsw m0, m7, [cq+64* 5] pmulhrsw m1, m7, [cq+64*27] pmulhrsw m2, m7, [cq+64*21] pmulhrsw m3, m7, [cq+64*11] pmulhrsw m4, m7, [cq+64*13] pmulhrsw m5, m7, [cq+64*19] pmulhrsw m6, m7, [cq+64*29] pmulhrsw m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_16384)] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add tmp4d, 0x80000000 jnc .pass1_loop lea tmp1q, [rsp+32*15] imul r2, strideq, 19 lea r3, [strideq*3] add r2, dstq mov tmp4b, 4 .pass2_loop: lea tmp2q, [tmp1q+32*64] LOAD_8ROWS tmp1q-32*4, 32 test tmp4d, 0x40000000 jnz .fast LOAD_8ROWS_H tmp2q-32*4, 32 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf lea tmp3q, [tmp2q-32*8] LOAD_8ROWS_H tmp3q-32*4, 32 mova [rsp], m15 jmp .idct16 .fast: call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast pxor m8, m8 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 .idct16: lea tmp3q, [tmp1q-32*8] LOAD_8ROWS tmp3q-32*4, 32 call m(idct_16x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end add tmp1q, 32*16 sub dstq, r3 lea r2, [r2+r3+16] add dstq, 16 dec tmp4b jg .pass2_loop RET ALIGN function_align 
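; descriptive note: rounds the buffered coefficients by m10 (pw_16384 or pw_8192, set by the caller) and transposes them, pairing tmp1q/tmp2q rows as ymm lanes; 4 iterations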
.transpose_round_interleave: mov tmp3d, 4 .loop: lea tmp2q, [tmp1q+32*8] mova xm0, [tmp1q-32*4] mova xm1, [tmp1q-32*3] vinserti128 m0, [tmp2q-32*4], 1 vinserti128 m1, [tmp2q-32*3], 1 mova xm2, [tmp1q-32*2] mova xm3, [tmp1q-32*1] vinserti128 m2, [tmp2q-32*2], 1 vinserti128 m3, [tmp2q-32*1], 1 mova xm4, [tmp1q+32*0] mova xm5, [tmp1q+32*1] vinserti128 m4, [tmp2q+32*0], 1 vinserti128 m5, [tmp2q+32*1], 1 mova xm6, [tmp1q+32*2] mova xm7, [tmp1q+32*3] vinserti128 m6, [tmp2q+32*2], 1 vinserti128 m7, [tmp2q+32*3], 1 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova xm8, [tmp1q-32*4+16] mova xm9, [tmp1q-32*3+16] vinserti128 m8, [tmp2q-32*4+16], 1 vinserti128 m9, [tmp2q-32*3+16], 1 mova [tmp1q-32*4], m0 mova [tmp2q-32*4], m1 mova [tmp1q-32*3], m2 mova [tmp2q-32*3], m3 mova xm2, [tmp1q-32*2+16] mova xm3, [tmp1q-32*1+16] vinserti128 m2, [tmp2q-32*2+16], 1 vinserti128 m3, [tmp2q-32*1+16], 1 mova [tmp1q-32*2], m4 mova [tmp2q-32*2], m5 mova [tmp1q-32*1], m6 mova [tmp2q-32*1], m7 mova xm4, [tmp1q+32*0+16] mova xm5, [tmp1q+32*1+16] vinserti128 m4, [tmp2q+32*0+16], 1 vinserti128 m5, [tmp2q+32*1+16], 1 mova xm6, [tmp1q+32*2+16] mova xm7, [tmp1q+32*3+16] vinserti128 m6, [tmp2q+32*2+16], 1 vinserti128 m7, [tmp2q+32*3+16], 1 pmulhrsw m0, m8, m10 pmulhrsw m1, m9, m10 REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 mova [tmp1q+32*0], m0 mova [tmp2q+32*0], m1 mova [tmp1q+32*1], m2 mova [tmp2q+32*1], m3 mova [tmp1q+32*2], m4 mova [tmp2q+32*2], m5 mova [tmp1q+32*3], m6 mova [tmp2q+32*3], m7 add tmp1q, 32*16 dec tmp3d jg .loop ret cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob lea r6, [o_base] test eobd, eobd jnz .normal movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_8192)] mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly .normal: PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 lea tmp1q, [rsp+32*71] lea r10d, [eobq-136] .pass1_loop: LOAD_8ROWS cq+64*0, 64*4 pxor m8, m8 REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 mova [rsp], m8 call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 LOAD_8ROWS cq+64*2, 64*4 pxor m8, m8 REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] mova m4, [cq+64* 9] mova m5, [cq+64*23] mova m6, [cq+64*25] mova m7, [cq+64* 7] pxor m8, m8 REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] mova m4, [cq+64*13] mova m5, [cq+64*19] mova m6, [cq+64*29] mova m7, [cq+64* 3] pxor m8, m8 REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call 
m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 sub tmp1q, 32*44 vpbroadcastd m10, [o(pw_8192)] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave add cq, 32 add r10d, 0x80000000 jnc .pass1_loop lea tmp1q, [rsp+32*7] mov r10b, 4 .pass2_loop: lea r2, [tmp1q+32*64] mova m0, [r2-32*4] mova m1, [r2-32*2] mova m2, [r2+32*0] mova m3, [r2+32*2] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 mova [rsp], m4 test r10d, 0x40000000 jnz .fast lea r3, [r2+32*64] mova m4, [r3-32*4] mova m5, [r3-32*2] mova m6, [r3+32*0] mova m7, [r3+32*2] .fast: call m(idct_16x16_internal_8bpc).main mova m1, [rsp+32*1] mova [tmp1q-32*4], m0 mova [tmp1q-32*3], m1 mova [tmp1q-32*2], m2 mova [tmp1q-32*1], m3 mova [tmp1q+32*0], m4 mova [tmp1q+32*1], m5 mova [tmp1q+32*2], m6 mova [tmp1q+32*3], m7 add tmp1q, 32*8 mova [tmp1q-32*4], m8 mova [tmp1q-32*3], m9 mova [tmp1q-32*2], m10 mova [tmp1q-32*1], m11 mova [tmp1q+32*0], m12 mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 mova m0, [r2-32*3] mova m1, [r2-32*1] mova m2, [r2+32*1] mova m3, [r2+32*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r10d, 0x40000000 jnz .fast2 mova m4, [r3-32*3] mova m5, [r3-32*1] mova m6, [r3+32*1] mova m7, [r3+32*3] .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m15, [o(pd_2048)] add r2, 32*8 add r3, 32*8 add tmp1q, 32*16 add tmp2q, 32*32 mova m0, [r2-32*4] ; 1 mova m3, [r2+32*3] ; 15 mova m4, [r2+32*0] ; 9 mova m7, [r2-32*1] ; 7 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10d, 0x40000000 jnz .fast3 mova m1, [r3+32*3] ; 31 mova m2, [r3-32*4] ; 17 mova m5, [r3-32*1] ; 23 mova m6, [r3+32*0] ; 25 .fast3: add r6, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 add r6, 8 add tmp1q, 32*8 sub tmp2q, 32*8 mova m0, [r2-32*2] ; 5 mova m3, [r2+32*1] ; 11 mova m4, [r2+32*2] ; 13 mova m7, [r2-32*3] ; 3 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r10d, 0x40000000 jnz .fast4 mova m1, [r3+32*1] ; 27 mova m2, [r3-32*2] ; 21 mova m5, [r3-32*3] ; 19 mova m6, [r3+32*2] ; 29 .fast4: call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 sub tmp1q, 32*28 sub dstq, r8 lea dstq, [dstq+strideq*4+16] dec r10b jg .pass2_loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/itx_avx512.asm000064400000000000000000011232451046102023000145070ustar 00000000000000; Copyright © 2020-2023, VideoLAN and dav1d authors ; Copyright © 2020-2023, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 const \ dup16_perm, db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 const \ int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 ; packed 4-bit qword shuffle indices permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 dq 0xc824352d56128751, 0xd906171e74301e15 dq 0x6271604b03472d62, 0x735342782165b426 dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e dq 0x5115049dd9045b79, 0x733726bffb263d1f permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 dq 0x0f11fa9c01150415, 0x0988f326039d2637 dq 0x05640f1108269d8c, 0x05290edb0aaebfae dq 0x0005000509378c9d, 0xffffffff0bbfaebf pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 
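; A rough note on the constants in this block: permA-permD above pack several
; qword-shuffle index sets into a single register. vpermi2q/vpermt2q only read
; the low 4 bits of each qword lane (selecting one of the 16 qwords across the
; two sources), so shifting with psrlq by 4 exposes the next index set:
;   idx_set_k[lane] ~= (perm[lane] >> (4 * k)) & 0xf
; The gather8a-gather8d tables below hold dword row indices that get scaled by
; the stride (pmulld with a broadcast strided) to build per-lane offsets for
; vpgatherdq/vpscatterdq accesses to the destination rows.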
gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 gather8d: dd 0, 19, 1, 18, 2, 17, 3, 16 int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 pb_32: times 4 db 32 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 pw_8192: times 2 dw 8192 pw_16384: times 2 dw 16384 pw_1697x16: times 2 dw 1697*16 pw_1697x8: times 2 dw 1697*8 pw_2896x8: times 2 dw 2896*8 pd_2048: dd 2048 %define pw_5 (permD+52) %define pd_m1 (permD+60) %define pw_3803_1321 (permD+44) %define pw_2482_3803 (permD+12) %define pw_2440_3290 (permD+ 4) %define pw_m3290_2440 (permD+28) %define pw_3857_1380 (permD+36) %define pw_m1380_3857 (permD+20) pw_8192_m8192: dw 8192, -8192 pw_m8192_8192: dw -8192, 8192 pw_16384_m16384: dw 16384, -16384 pw_m16384_16384: dw -16384, 16384 pw_m1321_2482: dw -1321, 2482 pw_m3344_3344: dw -3344, 3344 pw_2482_3344: dw 2482, 3344 pw_m3803_3344: dw -3803, 3344 pd_3344: dd 3344 pw_m1321_m3344: dw -1321, -3344 pw_2896_m2896: dw 2896, -2896 pw_1567_m3784: dw 1567, -3784 pw_3784_m1567: dw 3784, -1567 pw_4017_m799: dw 4017, -799 pw_2276_m3406: dw 2276, -3406 pw_m799_m4017: dw -799, -4017 pw_m3406_m2276: dw -3406, -2276 %macro COEF_PAIR 2-3 0 pw_%1_%2: dw %1, %2 pw_m%2_%1: dw -%2, %1 %if %3 pw_m%1_m%2: dw -%1, -%2 %endif %endmacro COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784, 1 COEF_PAIR 3784, 1567 COEF_PAIR 201, 4091 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 3035, 2751 COEF_PAIR 3513, 2106 COEF_PAIR 4052, 601 COEF_PAIR 3166, 2598, 1 COEF_PAIR 3920, 1189, 1 COEF_PAIR 2276, 3406 COEF_PAIR 4017, 799 %macro COEF_X8 1-* %rep %0 dw %1*8, %1*8 %rotate 1 %endrep %endmacro pw_m2276x8: COEF_X8 -2276 pw_3406x8: COEF_X8 3406 pw_4017x8: COEF_X8 4017 pw_799x8: COEF_X8 799 pw_3784x8: COEF_X8 3784 pw_1567x8: COEF_X8 1567 pw_4076x8: COEF_X8 4076 pw_401x8: COEF_X8 401 pw_m2598x8: COEF_X8 -2598 pw_3166x8: COEF_X8 3166 pw_3612x8: COEF_X8 3612 pw_1931x8: COEF_X8 1931 pw_m1189x8: COEF_X8 -1189 pw_3920x8: COEF_X8 3920 pw_4091x8: COEF_X8 4091 pw_201x8: COEF_X8 201 pw_m2751x8: COEF_X8 -2751 pw_3035x8: COEF_X8 3035 pw_3703x8: COEF_X8 3703 pw_1751x8: COEF_X8 1751 pw_m1380x8: COEF_X8 -1380 pw_3857x8: COEF_X8 3857 pw_3973x8: COEF_X8 3973 pw_995x8: COEF_X8 995 pw_m2106x8: COEF_X8 -2106 pw_3513x8: COEF_X8 3513 pw_3290x8: COEF_X8 3290 pw_2440x8: COEF_X8 2440 pw_m601x8: COEF_X8 -601 pw_4052x8: COEF_X8 4052 pw_401_4076x8: dw 401*8, 4076*8 pw_m2598_3166x8: dw -2598*8, 3166*8 pw_1931_3612x8: dw 1931*8, 3612*8 pw_m1189_3920x8: dw -1189*8, 3920*8 pw_799_4017x8: dw 799*8, 4017*8 pw_m2276_3406x8: dw -2276*8, 3406*8 pw_201_4091x8: dw 201*8, 4091*8 pw_m601_4052x8: dw -601*8, 4052*8 pw_995_3973x8: dw 995*8, 3973*8 pw_m1380_3857x8: dw -1380*8, 3857*8 pw_1751_3703x8: dw 1751*8, 3703*8 pw_m2106_3513x8: dw -2106*8, 3513*8 pw_2440_3290x8: dw 2440*8, 3290*8 pw_m2751_3035x8: dw -2751*8, 3035*8 pw_101_4095x8: dw 101*8, 4095*8 pw_m2824_2967x8: dw -2824*8, 2967*8 pw_1660_3745x8: dw 1660*8, 3745*8 pw_m1474_3822x8: dw -1474*8, 3822*8 pw_897_3996x8: dw 897*8, 3996*8 pw_m2191_3461x8: dw -2191*8, 3461*8 pw_2359_3349x8: dw 2359*8, 3349*8 pw_m700_4036x8: dw -700*8, 4036*8 pw_501_4065x8: dw 501*8, 4065*8 pw_m2520_3229x8: dw -2520*8, 3229*8 pw_2019_3564x8: dw 2019*8, 3564*8 
pw_m1092_3948x8: dw -1092*8, 3948*8 pw_1285_3889x8: dw 1285*8, 3889*8 pw_m1842_3659x8: dw -1842*8, 3659*8 pw_2675_3102x8: dw 2675*8, 3102*8 pw_m301_4085x8: dw -301*8, 4085*8 idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 COEF_PAIR 401, 4076, 1 COEF_PAIR 799, 4017 COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 COEF_PAIR 1931, 3612, 1 COEF_PAIR 3406, 2276 COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 SECTION .text %define o_base int8_permA+64*18 %define o(x) (r5 - (o_base) + (x)) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, ; 16 = special_mul1, 32 = special_mul2 %macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags mova m%2, m%4 %if %7 & 16 vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} mova m%3, m%4 %if %7 & 32 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} %else vpdpwssd m%3, m%1, m%6 %endif %elif %7 & 32 vpdpwssd m%2, m%1, m%5 mova m%3, m%4 vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} %elif %6 < 32 vpdpwssd m%2, m%1, m%5 mova m%3, m%4 vpdpwssd m%3, m%1, m%6 %elif %7 & 1 vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} mova m%3, m%4 vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} %else vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} mova m%3, m%4 vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} %endif %if %7 & 2 psrld m%2, 12 pslld m%3, 4 vpshrdd m%1, m%3, m%2, 16 %elif %7 & 4 ; compared to using shifts (as above) this has better throughput, ; but worse latency and requires setting up the opmask/index ; registers, so only use this method for the larger transforms pslld m%1, m%2, 4 vpmultishiftqb m%1{k7}, m13, m%3 %else psrad m%2, 12 psrad m%3, 12 %if %7 & 8 == 0 packssdw m%1, m%3, m%2 %endif %endif %endmacro ; flags: same as ITX_MUL2X_PACK %macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags %if %11 & 1 vpbroadcastd m%4, [o(pw_%9_%10)] vpbroadcastd m%4{k1}, [o(pw_%7_%8)] vpbroadcastd m%5, [o(pw_m%10_%9)] vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] %else vpbroadcastd m%4, [o(pw_m%10_%9)] vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] vpbroadcastd m%5, [o(pw_%9_%10)] vpbroadcastd m%5{k1}, [o(pw_%7_%8)] %endif ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 %endmacro ; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 %macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 punpcklwd m%3, m%2, m%1 punpckhwd m%2, m%1 %if %7 < 32 mova m%1, m%5 vpdpwssd m%1, m%3, m%7 mova m%4, m%5 vpdpwssd m%4, m%2, m%7 %else mova m%1, m%5 vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} mova m%4, m%5 vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} %endif psrad m%1, 12 psrad m%4, 12 packssdw m%1, m%4 mova m%4, m%5 %if %7 < 32 vpdpwssd m%4, m%2, m%6 mova m%2, m%5 vpdpwssd m%2, m%3, m%6 %else vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} mova m%2, m%5 vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} %endif psrad m%4, 12 psrad m%2, 12 %if %0 == 8 packssdw m%8, m%2, m%4 %else packssdw m%2, m%4 %endif %endmacro %macro WRAP_XMM 1+ %xdefine %%reset RESET_MM_PERMUTATION INIT_XMM cpuname DEFINE_MMREGS xmm AVX512_MM_PERMUTATION %1 %%reset %endmacro %macro WRAP_YMM 1+ INIT_YMM cpuname %1 INIT_ZMM cpuname %endmacro %macro ITX4_END 4-5 2048 ; row[1-4], rnd %if %5 vpbroadcastd m2, [o(pw_%5)] pmulhrsw m0, m2 pmulhrsw m1, m2 %endif lea r2, [dstq+strideq*2] %assign %%i 1 %rep 4 %if %1 & 2 CAT_XDEFINE 
%%row_adr, %%i, r2 + strideq*(%1&1) %else CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) %endif %assign %%i %%i + 1 %rotate 1 %endrep movd m2, [%%row_adr1] pinsrd m2, [%%row_adr2], 1 movd m3, [%%row_adr3] pinsrd m3, [%%row_adr4], 1 pmovzxbw m2, m2 pmovzxbw m3, m3 paddw m0, m2 paddw m1, m3 packuswb m0, m1 movd [%%row_adr1], m0 pextrd [%%row_adr2], m0, 1 pextrd [%%row_adr3], m0, 2 pextrd [%%row_adr4], m0, 3 ret %endmacro %macro INV_TXFM_FN 3 ; type1, type2, size cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base %define %%p1 m(i%1_%3_internal_8bpc) lea baseq, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. lea tx2q, [m(i%2_%3_internal_8bpc).pass2] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else ; jump to the 1st txfm function unless it's located directly after this times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x4 %ifidn %1_%2, dct_dct vpbroadcastw m0, [cq] vpbroadcastd m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [cq], eobd pmulhrsw m0, m1 mova m1, m0 jmp m(iadst_4x4_internal_8bpc).end2 %endif %endmacro %macro IDCT4_1D_PACKED 0 vpbroadcastd m4, [o(pd_2048)] punpckhwd m2, m1, m0 punpcklwd m1, m0 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 paddsw m0, m1, m2 ; out0 out1 psubsw m1, m2 ; out3 out2 %endmacro %macro IADST4_1D_PACKED 0 punpcklwd m4, m1, m0 ; in2 in0 punpckhwd m5, m1, m0 ; in3 in1 .main2: vpbroadcastd m3, [o(pd_2048)] mova m0, m3 vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} mova m2, m3 vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} mova m1, m3 vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} vpdpwssd m1, m5, [o(pd_3344)] {bcstd} vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} REPX {psrad x, 12}, m0, m2, m1, m3 packssdw m0, m2 ; out0 out1 packssdw m1, m3 ; out2 out3 %endmacro INIT_XMM avx512icl INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] IDCT4_1D_PACKED mova m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: IDCT4_1D_PACKED pxor ymm16, ymm16 mova [cq], ymm16 ITX4_END 0, 1, 3, 2 INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: pxor ymm16, ymm16 mova [cq], ymm16 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align .main: IADST4_1D_PACKED ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: call m(iadst_4x4_internal_8bpc).main .end: pxor ymm16, ymm16 mova [cq], ymm16 .end2: ITX4_END 3, 2, 1, 0 INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst 
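; The identity ("idtx") kernels do not transform at all, they only rescale:
; the 4-point identity scales by sqrt(2), computed below without widening as
;   out = x + pmulhrsw(x, 1697*8)      ; 1697/4096 ~= sqrt(2) - 1
; (roughly; the 8-point variant scales by 2 and the 16-point one by 2*sqrt(2),
; which is where the pw_4096 and pw_1697x16 constants used elsewhere come in).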
INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x4_internal_8bpc).end %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x8 %ifidn %1_%2, dct_dct movd xmm1, [o(pw_2896x8)] pmulhrsw xmm0, xmm1, [cq] movd xmm2, [o(pw_2048)] pmulhrsw xmm0, xmm1 pmulhrsw xmm0, xmm1 pmulhrsw xmm0, xmm2 vpbroadcastw ym0, xmm0 mova ym1, ym0 jmp m(iadst_4x8_internal_8bpc).end3 %endif %endmacro %macro IDCT8_1D_PACKED 0 punpckhwd m5, m3, m0 ; in7 in1 punpckhwd m4, m1, m2 ; in3 in5 punpcklwd m3, m1 ; in6 in2 punpcklwd m2, m0 ; in4 in0 .main2: vpbroadcastd m6, [o(pd_2048)] ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 psubsw m0, m5, m4 ; t5a t6a (interleaved) paddsw m4, m5 ; t4 t7 (interleaved) ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 %if mmsize > 16 vbroadcasti32x4 m1, [o(deint_shuf)] pshufb m4, m1 %else pshufb m4, [o(deint_shuf)] %endif psubsw m1, m2, m3 ; tmp3 tmp2 paddsw m3, m2 ; tmp0 tmp1 punpckhqdq m2, m4, m0 ; t7 t6 punpcklqdq m4, m0 ; t4 t5 paddsw m0, m3, m2 ; out0 out1 psubsw m3, m2 ; out7 out6 psubsw m2, m1, m4 ; out4 out5 paddsw m1, m4 ; out3 out2 %endmacro %macro IADST8_1D_PACKED 1 ; pass vpbroadcastd m6, [o(pd_2048)] %if %1 == 1 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a psubsw m4, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a %if mmsize > 16 vbroadcasti32x4 m2, [o(deint_shuf)] %else mova m2, [o(deint_shuf)] %endif vprord m1, 16 psubsw m3, m0, m1 ; t3 t2 paddsw m0, m1 ; -out7 out0 psubsw m1, m4, m5 ; t7 t6 paddsw m4, m5 ; out6 -out1 pshufb m0, m2 pshufb m4, m2 mova m2, m6 vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} mova m5, m6 vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} psrad m2, 12 psrad m5, 12 packssdw m2, m5 ; out4 -out5 mova m5, m6 vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} mova m3, m6 vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} psrad m5, 12 psrad m3, 12 packssdw m1, m3, m5 ; out2 -out3 %else punpckhwd m0, m4, m3 ; 0 7 punpckhwd m1, m5, m2 ; 2 5 punpcklwd m2, m5 ; 4 3 punpcklwd m3, m4 ; 6 1 ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a psubsw m4, m0, m2 ; t4 t5 paddsw m0, m2 ; t0 t1 psubsw m5, m1, m3 ; t6 t7 paddsw m1, m3 ; t2 t3 shufps m2, m5, m4, q1032 punpckhwd m4, m2 punpcklwd m5, m2 ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a psubsw m2, m0, m1 ; t2 t3 paddsw m0, m1 ; out0 -out7 psubsw m1, m4, m5 ; t6 t7 paddsw m4, m5 ; -out1 out6 vpbroadcastd m5, [o(pw_2896x8)] punpckhqdq m3, m2, m1 ; t3 t7 punpcklqdq m2, m1 ; t2 t6 paddsw m1, m2, m3 ; t2+t3 t6+t7 psubsw m2, m3 ; t2-t3 t6-t7 punpckhqdq m3, m4, m0 ; out6 -out7 punpcklqdq m0, m4 ; out0 -out1 pmulhrsw m2, m5 ; 
out4 -out5 pshufd m1, m1, q1032 pmulhrsw m1, m5 ; out2 -out3 %endif %endmacro INIT_YMM avx512icl INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, identity INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 IDCT4_1D_PACKED vbroadcasti32x4 m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q .pass2: vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 call .main vpbroadcastd m4, [o(pw_2048)] vinserti32x4 m0, m0, xm2, 1 vinserti32x4 m1, m1, xm3, 1 pshufd m1, m1, q1032 jmp m(iadst_4x8_internal_8bpc).end2 ALIGN function_align .main: WRAP_XMM IDCT8_1D_PACKED ret INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal_8bpc).main punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call .main_pass2 vpbroadcastd m4, [o(pw_2048)] vinserti32x4 m0, xm2, 1 vinserti32x4 m1, xm3, 1 pxor m5, m5 psubw m5, m4 .end: punpcklqdq m4, m5 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 .end3: vpbroadcastd m3, strided pmulld m5, m3, [o(pd_0to15)] kxnorb k1, k1, k1 kmovb k2, k1 vpgatherdd m3{k1}, [dstq+m5] pxor m4, m4 mova [cq], zmm20 punpcklbw m2, m3, m4 punpckhbw m3, m4 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpscatterdd [dstq+m5]{k2}, m0 RET ALIGN function_align .main_pass1: punpckhwd xm0, xm4, xm3 ; 0 7 punpckhwd xm1, xm5, xm2 ; 2 5 punpcklwd xm2, xm5 ; 4 3 punpcklwd xm3, xm4 ; 6 1 WRAP_XMM IADST8_1D_PACKED 1 punpcklqdq xm3, xm4, xm0 ; out6 -out7 punpckhqdq xm0, xm4 ; out0 -out1 ret ALIGN function_align .main_pass2: WRAP_XMM IADST8_1D_PACKED 2 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 vpermq m1, [cq+32*1], q3120 vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal_8bpc).main punpcklwd m3, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m3 punpckhwd m1, m3 jmp tx2q .pass2: vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 call m(iadst_4x8_internal_8bpc).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti32x4 m3, xm1, 1 vinserti32x4 m2, xm0, 1 pxor m4, m4 psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 jmp m(iadst_4x8_internal_8bpc).end INIT_ZMM avx512icl INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd m0, [o(pw_2896x8)] pmulhrsw m0, [cq] mova m1, [o(int8_permB)] vpbroadcastd m2, [o(pw_1697x8)] vpermb m0, m1, m0 pmulhrsw m2, m0 paddsw m0, m2 vextracti32x8 ym1, m0, 1 jmp tx2q .pass2: vpbroadcastd ym4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 imul r6d, 181 add r6d, 128+2048 sar 
r6d, 8+4 vpbroadcastw m0, r6d mova m1, m0 jmp m(iadst_4x16_internal_8bpc).end3 %endif %endmacro %macro IDCT16_1D_PACKED 0 punpckhwd m8, m7, m0 ; dct16 in15 in1 punpcklwd m9, m4, m0 ; dct4 in2 in0 punpckhwd m0, m3, m4 ; dct16 in7 in9 punpcklwd m7, m1 ; dct8 in7 in1 punpckhwd m1, m6 ; dct16 in3 in13 punpcklwd m3, m5 ; dct8 in3 in5 punpckhwd m5, m2 ; dct16 in11 in5 punpcklwd m6, m2 ; dct4 in3 in1 cglobal_label .main2 vpbroadcastd m10, [o(pd_2048)] .main3: vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 ; 0x33... ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a .main4: psubsw m2, m8, m0 ; t9 t14 paddsw m8, m0 ; t8 t15 psubsw m4, m1, m5 ; t10 t13 paddsw m1, m5 ; t11 t12 ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 psubsw m0, m8, m1 ; t11a t12a paddsw m8, m1 ; t8a t15a psubsw m1, m7, m3 ; t5a t6a paddsw m7, m3 ; t4 t7 .main5: ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a %if mmsize > 16 vbroadcasti32x4 m5, [o(deint_shuf)] %else mova m5, [o(deint_shuf)] %endif vpbroadcastd m11, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] paddsw m3, m2, m4 ; t9 t14 psubsw m2, m4 ; t10 t13 pshufb m8, m5 pshufb m7, m5 pshufb m3, m5 ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a punpckhqdq m2, m7, m1 ; t7 t6 punpcklqdq m7, m1 ; t4 t5 psubsw m1, m9, m6 ; dct4 out3 out2 paddsw m9, m6 ; dct4 out0 out1 packssdw m5, m11 ; t12 t13a packssdw m4, m0 ; t11 t10a punpckhqdq m0, m8, m3 ; t15a t14 punpcklqdq m8, m3 ; t8a t9 psubsw m3, m9, m2 ; dct8 out7 out6 paddsw m9, m2 ; dct8 out0 out1 psubsw m2, m1, m7 ; dct8 out4 out5 paddsw m1, m7 ; dct8 out3 out2 psubsw m7, m9, m0 ; out15 out14 paddsw m0, m9 ; out0 out1 psubsw m6, m1, m5 ; out12 out13 paddsw m1, m5 ; out3 out2 psubsw m5, m2, m4 ; out11 out10 paddsw m2, m4 ; out4 out5 psubsw m4, m3, m8 ; out8 out9 paddsw m3, m8 ; out7 out6 %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, identity INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova ym1, [cq+32*2] vinserti32x8 m1, [cq+32*0], 1 mova m0, [o(int16_perm)] mova ym2, [cq+32*3] vinserti32x8 m2, [cq+32*1], 1 vpbroadcastd m4, [o(pd_2048)] vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 vpbroadcastd m4, [o(pw_16384)] psubsw m3, m1, m2 paddsw m1, m2 ; out0 out1 vprord m3, 16 ; out2 out3 punpckldq m0, m1, m3 punpckhdq m1, m3 pmulhrsw m0, m4 pmulhrsw m1, m4 jmp tx2q .pass2: vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, ym1, 1 vextracti32x4 xm4, m0, 2 vextracti32x4 xm5, m1, 2 vextracti32x4 xm6, m0, 3 vextracti32x4 xm7, m1, 3 call .main vinserti32x4 ym0, xm2, 1 vinserti32x4 ym1, xm3, 1 vinserti32x4 ym4, xm6, 1 vinserti32x4 ym5, xm7, 1 vinserti32x8 m0, ym4, 1 vinserti32x8 m1, ym5, 1 vpbroadcastd m5, [o(pw_2048)] pshufd m1, m1, q1032 jmp m(iadst_4x16_internal_8bpc).end2 ALIGN function_align .main: WRAP_XMM IDCT16_1D_PACKED ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal 
iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m1, [o(permB)] vpermq m0, m1, [cq+64*0] vpermq m1, m1, [cq+64*1] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m3, [o(pw_16384)] punpckhwd m2, m0, m1 punpcklwd m0, m1 pmulhrsw m2, m3 pmulhrsw m0, m3 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2: call .main vpbroadcastd m5, [o(pw_2048)] psrlq m10, 4 psubw m6, m8, m5 .end: vpbroadcastd m7, [o(pw_2896x8)] paddsw ym1, ym2, ym4 psubsw ym2, ym4 vinserti32x8 m1, ym2, 1 pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 psrlq m0, m10, 4 vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f punpcklqdq m5, m6 .end2: pmulhrsw m0, m5 pmulhrsw m1, m5 .end3: vpbroadcastd m3, strided pmulld m5, m3, [o(pd_0to15)] kxnorw k1, k1, k1 kmovw k2, k1 vpgatherdd m3{k1}, [dstq+m5] pxor m4, m4 mova [cq+64*0], m4 mova [cq+64*1], m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 paddw m0, m2 paddw m1, m3 packuswb m0, m1 vpscatterdd [dstq+m5]{k2}, m0 RET ALIGN function_align .main: movu m3, [o(permB+1)] psrlq m10, m3, 4 .main2: vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 vpbroadcastd m9, [o(pd_2048)] vpbroadcastq ym13, [o(int_mshift)] kxnorb k1, k1, k1 punpckhwd m4, m3, m0 ; in12 in3 in14 in1 punpcklwd m0, m3 ; in0 in15 in2 in13 kshiftrb k1, k1, 4 vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 INIT_YMM avx512icl vpcmpub k7, m13, m9, 6 ; 0x33... pxor m8, m8 ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 psubsw m2, m0, m3 ; t9a t8a t11a t10a paddsw m0, m3 ; t1a t0a t3a t2a psubsw m3, m1, m4 ; t13a t12a t15a t14a paddsw m4, m1 ; t5a t4a t7a t6a ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 psubw m7, m8, m7 ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 vpbroadcastd m6, [o(pw_3784_m1567)] vpbroadcastd m6{k1}, [o(pw_m3784_1567)] psubsw m1, m0, m4 ; t5 t4 t7 t6 paddsw m0, m4 ; t1 t0 t3 t2 psubsw m4, m2, m3 ; t13a t12a t15a t14a paddsw m2, m3 ; t9a t8a t11a t10a ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 vbroadcasti32x4 m5, [o(deint_shuf)] pshufb m0, m5 pshufb m2, m5 vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 pshufd m2, m2, q1032 ; t6a t7a t14 t15 psubsw m4, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 paddsw m3, m1, m2 ; out12 -out3 -out13 out2 psubsw m1, m2 ; t7 t6 t15a t14a punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a punpcklqdq m4, m1 ; t3a t7 t11 t15a INIT_ZMM avx512icl vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m1, [o(permB)] vpermq m0, m1, [cq+64*0] vpermq m1, m1, [cq+64*1] call m(iadst_16x4_internal_8bpc).main vpbroadcastd m3, [o(pw_16384)] punpcklwd m2, m1, m0 punpckhwd m1, m0 pmulhrsw m2, m3 pmulhrsw m1, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 jmp tx2q .pass2: call m(iadst_4x16_internal_8bpc).main vpbroadcastd m6, [o(pw_2048)] psrlq m10, 12 psubw m5, m8, m6 jmp m(iadst_4x16_internal_8bpc).end 
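; Note on the flipadst variants: FLIPADST is the ADST with its outputs taken
; in reverse order, roughly
;   flipadst_N(x)[i] = adst_N(x)[N - 1 - i]
; which is why the iflipadst_4x16 pass 2 above reuses the iadst_4x16 .main
; kernel unchanged and only picks a different output permutation (psrlq m10,
; 12 instead of 4) and swaps the signs of the rounding multipliers before
; jumping to the shared .end code.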
INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m2, [o(int16_perm)] vpermb m1, m2, [cq+64*0] vpermb m2, m2, [cq+64*1] vpbroadcastd m4, [o(pw_1697x8)] vpbroadcastd m0, [o(pd_m1)] pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal pmulhrsw m4, m2 ; it still works, but if the input is -1 the vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. punpckldq m0, m1, m2 punpckhdq m1, m2 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x16)] vpbroadcastd m5, [o(pw_2048)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m0 paddsw m1, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x16_internal_8bpc).end2 %macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] movhps xm%3, [dstq+%5] movq xm%4, [dstq+%6] movhps xm%4, [dstq+%7] pmovzxbw m%3, xm%3 pmovzxbw m%4, xm%4 %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vextracti32x4 xm%4, m%3, 1 movq [dstq ], xm%3 movhps [dstq+%6], xm%3 movq [dstq+%5], xm%4 movhps [dstq+%7], xm%4 %endmacro %macro INV_TXFM_8X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x4 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_2048)] pmulhrsw xm0, xm1 pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 jmp m(iadst_8x4_internal_8bpc).end3 %endif %endmacro INIT_YMM avx512icl INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] pmulhrsw xm0, xm3, [cq+16*0] pmulhrsw xm1, xm3, [cq+16*1] pmulhrsw xm2, xm3, [cq+16*2] pmulhrsw xm3, [cq+16*3] call m(idct_4x8_internal_8bpc).main vbroadcasti32x4 m4, [o(deint_shuf)] vinserti32x4 m3, m1, xm3, 1 vinserti32x4 m1, m0, xm2, 1 shufps m0, m1, m3, q0220 shufps m1, m3, q1331 pshufb m0, m4 pshufb m1, m4 jmp tx2q .pass2: IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal_8bpc).main_pass1 vinserti32x4 m0, xm2, 1 vinserti32x4 m1, xm3, 1 pxor m3, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 psubsw m3, m2 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main .end: vpermq m0, m0, q3120 vpermq m1, m1, q3120 .end2: vpbroadcastd m2, [o(pw_2048)] pmulhrsw m0, m2 pmulhrsw m1, m2 .end3: pxor m2, m2 mova [cq], zmm18 lea r6, [strideq*3] WRITE_8X4 0, 1, 4, 5 RET ALIGN function_align .main: IADST4_1D_PACKED ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd xm0, [o(pw_2896x8)] pshufd xm4, [cq+16*0], q1032 pmulhrsw xm3, xm0, [cq+16*3] pshufd xm5, [cq+16*1], q1032 pmulhrsw xm2, xm0, 
[cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal_8bpc).main_pass1 vinserti32x4 m3, m3, xm1, 1 vinserti32x4 m2, m2, xm0, 1 punpckhwd m1, m3, m2 punpcklwd m3, m2 pxor m0, m0 psubsw m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 jmp m(iadst_8x4_internal_8bpc).end2 INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova xm2, [cq+16*0] mova xm0, [cq+16*1] vinserti32x4 m2, [cq+16*2], 1 vinserti32x4 m0, [cq+16*3], 1 vpbroadcastd m3, [o(pw_2896x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 pmulhrsw m1, m3 pmulhrsw m2, m3 punpcklwd m0, m1, m2 punpckhwd m1, m2 paddsw m0, m0 paddsw m1, m1 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8 %ifidn %1_%2, dct_dct INIT_ZMM avx512icl movsx r6d, word [cq] mov [cq], eobd .dconly: imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 .dconly2: vpbroadcastd ym2, strided imul r6d, 181 pmulld ym5, ym2, [o(pd_0to15)] kxnorb k1, k1, k1 add r6d, 128+2048 sar r6d, 8+4 pxor m3, m3 vpbroadcastw m4, r6d .dconly_loop: kmovb k2, k1 vpgatherdq m2{k1}, [dstq+ym5] punpcklbw m0, m2, m3 punpckhbw m1, m2, m3 paddw m0, m4 paddw m1, m4 packuswb m0, m1 kmovb k1, k2 vpscatterdq [dstq+ym5]{k2}, m0 lea dstq, [dstq+strideq*8] sub r3d, 8 jg .dconly_loop RET INIT_YMM avx512icl %endif %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, identity INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m2, [cq+32*2], q3120 ; 4 5 vpermq m1, [cq+32*1], q3120 ; 2 3 call .main shufps m4, m0, m1, q0220 shufps m5, m0, m1, q1331 shufps m1, m2, m3, q0220 shufps m3, m2, m3, q1331 vbroadcasti32x4 m0, [o(deint_shuf)] vpbroadcastd m2, [o(pw_16384)] REPX {pshufb x, m0}, m4, m5, m1, m3 REPX {pmulhrsw x, m2}, m4, m5, m1, m3 vinserti32x4 m0, m4, xm1, 1 vshufi32x4 m2, m4, m1, 0x03 vinserti32x4 m1, m5, xm3, 1 vshufi32x4 m3, m5, m3, 0x03 jmp tx2q .pass2: call .main vpbroadcastd m4, [o(pw_2048)] vpermq m0, m0, q3120 vpermq m1, m1, q2031 vpermq m2, m2, q3120 vpermq m3, m3, q2031 jmp m(iadst_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call .main_pass1 vpbroadcastd m5, [o(pw_16384_m16384)] punpcklwd m4, m0, m1 punpckhwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 punpcklwd m0, m1, m2 punpckhwd m1, m2 REPX {pmulhrsw x, m5}, m3, m4, m0, m1 vshufi32x4 m2, m3, m0, 0x03 vinserti32x4 m0, m3, xm0, 1 vshufi32x4 m3, m4, m1, 0x03 vinserti32x4 m1, m4, xm1, 1 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call .main_pass2 vpbroadcastd m5, [o(pw_2048)] vpbroadcastd xm4, [o(pw_4096)] psubw m4, m5 ; lower half = 2048, upper half = -2048 .end: REPX {vpermq x, x, q3120}, m0, m1, m2, m3 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 .end3: pmulhrsw m2, m4 pmulhrsw m3, m4 
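; .end4 below is the shared store tail for the 8x8 pass-2 paths: it clears the
; four 32-byte coefficient rows at cq (presumably so the caller can skip
; zeroing the coefficient buffer itself) and then adds the result to the
; destination in two 8x4 halves via WRITE_8X4, in scalar terms roughly
;   dst[y][x] = clamp_u8(dst[y][x] + res[y][x])
; where res has already been rounded down to pixel scale by the preceding
; pmulhrsw with the per-transform rounding constants in m4.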
.end4: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 lea r6, [strideq*3] WRITE_8X4 0, 1, 4, 5 lea dstq, [dstq+strideq*4] WRITE_8X4 2, 3, 4, 5 RET ALIGN function_align .main_pass1: punpckhwd m0, m4, m3 ; 0 7 punpckhwd m1, m5, m2 ; 2 5 punpcklwd m2, m5 ; 4 3 punpcklwd m3, m4 ; 6 1 IADST8_1D_PACKED 1 punpcklqdq m3, m4, m0 ; out6 -out7 punpckhqdq m0, m4 ; out0 -out1 ret ALIGN function_align cglobal_label .main_pass2 IADST8_1D_PACKED 2 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpermq m4, [cq+32*0], q1302 ; 1 0 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 call m(iadst_8x8_internal_8bpc).main_pass1 vpbroadcastd m5, [o(pw_m16384_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 punpckhwd m2, m1, m0 punpcklwd m1, m0 punpckhwd m0, m4, m3 punpcklwd m4, m3 punpckhwd m3, m2, m1 punpcklwd m2, m1 REPX {pmulhrsw x, m5}, m0, m4, m3, m2 vinserti32x4 m1, m0, xm3, 1 vshufi32x4 m3, m0, m3, 0x03 vinserti32x4 m0, m4, xm2, 1 vshufi32x4 m2, m4, m2, 0x03 jmp tx2q .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 ; lower half = -2048, upper half = 2048 vpermq m5, m3, q2031 vpermq m3, m0, q2031 vpermq m0, m2, q2031 vpermq m2, m1, q2031 pmulhrsw m1, m0, m4 pmulhrsw m0, m5, m4 jmp m(iadst_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova xm3, [cq+16*0] mova xm2, [cq+16*1] vinserti32x4 m3, [cq+16*4], 1 vinserti32x4 m2, [cq+16*5], 1 mova xm4, [cq+16*2] mova xm0, [cq+16*3] vinserti32x4 m4, [cq+16*6], 1 vinserti32x4 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 punpckhwd m4, m0 punpckldq m0, m1, m2 punpckhdq m1, m2 punpckldq m2, m3, m4 punpckhdq m3, m4 jmp tx2q .pass2: vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal_8bpc).end %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd or r3d, 16 imul r6d, 181 add r6d, 128 sar r6d, 8 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly %endif %endmacro %macro ITX_8X16_LOAD_COEFS 0 vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m0, m4, [cq+32*0] add cq, 32*4 pmulhrsw m7, m4, [cq+32*3] pmulhrsw m1, m4, [cq-32*3] pmulhrsw m6, m4, [cq+32*2] pmulhrsw m2, m4, [cq-32*2] pmulhrsw m5, m4, [cq+32*1] pmulhrsw m3, m4, [cq-32*1] pmulhrsw m4, [cq+32*0] %endmacro INIT_ZMM avx512icl INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, identity INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m3, [o(permB)] vpermq m0, m3, [cq+64*0] vpbroadcastd m4, [o(pw_2896x8)] vpermq m1, m3, [cq+64*1] vpermq m2, m3, [cq+64*2] vpermq m3, m3, [cq+64*3] REPX {pmulhrsw x, m4}, m0, m1, m2, m3 call m(idct_16x8_internal_8bpc).main vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 REPX {pmulhrsw x, m5}, m4, m0, m2, m1 punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1 punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 
h3 punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 punpckhdq m1, m0, m2 ; 1 5 9 13 punpckldq m0, m2 ; 0 4 8 12 punpckldq m2, m3, m4 ; 2 6 10 14 punpckhdq m3, m4 ; 3 7 11 15 jmp tx2q .pass2: vprord m5, [o(int16_perm)], 16 vshufi32x4 m2, m2, q1320 ; 2 10 14 6 vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 vshufi32x4 m1, m3, q0132 ; 9 13 7 3 vpermb m9, m5, m0 vpermb m7, m5, m2 vpermb m8, m5, m4 vpermb m0, m5, m1 vextracti32x8 ym6, m9, 1 vextracti32x8 ym3, m7, 1 vextracti32x8 ym5, m8, 1 vextracti32x8 ym1, m0, 1 call .main2 mova ym8, [o(gather8a)] lea r3, [dstq+strideq*4] pmovzxdq m9, ym8 pshufd ym8, ym8, q1230 vpermt2q m0, m9, m4 vpermt2q m1, m9, m5 vpermt2q m2, m9, m6 vpermt2q m3, m9, m7 .end: vpbroadcastd m7, [o(pw_2048)] .end2: pmulhrsw m0, m7 pmulhrsw m1, m7 .end3: pmulhrsw m2, m7 pmulhrsw m3, m7 .end4: vpbroadcastd ym6, strided kxnorb k1, k1, k1 pxor m4, m4 pmulld ym8, ym6 kmovb k2, k1 vpgatherdq m6{k1}, [dstq+ym8] kmovb k1, k2 vpgatherdq m7{k2}, [r3+ym8] mova [cq+64*0], m4 mova [cq+64*1], m4 kmovb k2, k1 mova [cq+64*2], m4 mova [cq+64*3], m4 punpcklbw m5, m6, m4 punpckhbw m6, m4 paddw m0, m5 paddw m1, m6 packuswb m0, m1 vpscatterdq [dstq+ym8]{k1}, m0 punpcklbw m6, m7, m4 punpckhbw m7, m4 paddw m2, m6 paddw m3, m7 packuswb m2, m3 vpscatterdq [r3+ym8]{k2}, m2 RET ALIGN function_align cglobal_label .main_fast2 ; bottom three-quarters are zero vpbroadcastd ym10, [o(pd_2048)] vpbroadcastq ym13, [o(int_mshift)] vpbroadcastd ym3, [o(pw_401_4076x8)] vpbroadcastd ym5, [o(pw_799_4017x8)] vpbroadcastd ym4, [o(pw_m1189_3920x8)] pxor ym6, ym6 punpckhwd ym2, ym0, ym0 pmulhrsw ym2, ym3 ; t8a t15a punpcklwd ym7, ym1, ym1 pmulhrsw ym7, ym5 ; t4a t7a punpckhwd ym1, ym1 pmulhrsw ym4, ym1 ; t11a t12a vpcmpub k7, ym13, ym10, 6 punpcklwd ym9, ym6, ym0 psubsw ym0, ym2, ym4 ; t11a t12a paddsw ym8, ym2, ym4 ; t8a t15a mova ym1, ym7 jmp .main5 ALIGN function_align cglobal_label .main_fast ; bottom half is zero vpbroadcastd ym10, [o(pd_2048)] vpbroadcastq ym13, [o(int_mshift)] pxor ym6, ym6 punpckhwd ym8, ym0, ym0 punpckhwd ym4, ym3, ym3 punpckhwd ym5, ym2, ym2 punpcklwd ym7, ym1, ym1 punpckhwd ym1, ym1 punpcklwd ym3, ym3 punpcklwd ym9, ym6, ym0 punpcklwd ym6, ym2 vpbroadcastd ym2, [o(pw_401_4076x8)] vpbroadcastd ym0, [o(pw_m2598_3166x8)] vpbroadcastd ym11, [o(pw_1931_3612x8)] vpbroadcastd ym12, [o(pw_m1189_3920x8)] pmulhrsw ym8, ym2 ; t8a t15a vpbroadcastd ym2, [o(pw_799_4017x8)] pmulhrsw ym0, ym4 ; t9a t14a vpbroadcastd ym4, [o(pw_m2276_3406x8)] pmulhrsw ym5, ym11 ; t10a t13a pmulhrsw ym1, ym12 ; t11a t12a pmulhrsw ym7, ym2 ; t4a t7a pmulhrsw ym3, ym4 ; t5a t6a vpcmpub k7, ym13, ym10, 6 jmp .main4 ALIGN function_align cglobal_label .main WRAP_YMM IDCT16_1D_PACKED ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_8bpc).main_pass1 vbroadcasti32x4 m6, [o(int_shuf1)] vpbroadcastd m7, [o(pw_16384_m16384)] punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 .pass1_end: REPX {pmulhrsw x, m7}, m3, m5, m4, m2 punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m5 punpckhqdq m3, m5 jmp tx2q .pass2: call .main_pass2 vpbroadcastd m6, [o(pw_2048)] psrlq m10, 4 psubw 
m7, m8, m6 .pass2_end: vpbroadcastd m5, [o(pw_2896x8)] paddsw m1, m2, m4 psubsw m2, m4 pmulhrsw m1, m5 ; -out7 out4 out6 -out5 pmulhrsw m5, m2 ; out8 -out11 -out9 out10 mova ym8, [o(gather8c)] lea r3, [dstq+strideq] psrlq m2, m10, 4 vpermi2q m2, m0, m3 ; 1 3 13 15 vpermt2q m0, m10, m3 ; 0 2 12 14 psrlq m3, m10, 8 vpermi2q m3, m1, m5 ; 5 7 9 11 psrlq m10, 12 vpermt2q m1, m10, m5 ; 4 6 8 10 pmulhrsw m0, m6 pmulhrsw m1, m6 jmp m(idct_8x16_internal_8bpc).end3 ALIGN function_align .main_pass1: vpbroadcastd m2, [o(pw_2896x8)] pmulhrsw m5, m2, [cq+64*0] pmulhrsw m3, m2, [cq+64*3] pmulhrsw m1, m2, [cq+64*1] pmulhrsw m2, [cq+64*2] movu m4, [o(permA+3)] psrlq m10, m4, 4 mova m6, m4 vpermi2q m4, m5, m3 ; in0 in12 in2 in14 vpermt2q m5, m10, m3 ; in15 in3 in13 in1 vpermi2q m6, m1, m2 ; in4 in8 in6 in10 vpermt2q m1, m10, m2 ; in11 in7 in9 in5 jmp .main ALIGN function_align .main_pass2: mova m4, [o(permC)] psrlq m5, m4, 4 vpermi2q m4, m0, m2 ; in0 in12 in2 in14 psrlq m6, m5, 4 vpermi2q m5, m1, m3 ; in15 in3 in13 in1 psrlq m10, m6, 4 vpermi2q m6, m0, m2 ; in4 in8 in6 in10 vpermt2q m1, m10, m3 ; in11 in7 in9 in5 .main: punpcklwd m0, m4, m5 ; in0 in15 in2 in13 punpckhwd m4, m5 ; in12 in3 in14 in1 punpcklwd m5, m6, m1 ; in4 in11 in6 in9 punpckhwd m6, m1 ; in8 in7 in10 in5 cglobal_label .main2 vpbroadcastd m9, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] kxnorb k1, k1, k1 vpcmpub k7, m13, m9, 6 ; 0x33... pxor m8, m8 ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 psubsw m2, m0, m6 ; t9a t8a t11a t10a paddsw m0, m6 ; t1a t0a t3a t2a psubsw m3, m5, m4 ; t13a t12a t15a t14a paddsw m5, m4 ; t5a t4a t7a t6a ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 psubw m7, m8, m7 ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 vpbroadcastd m6, [o(pw_3784_m1567)] vpbroadcastd m6{k1}, [o(pw_m3784_1567)] psubsw m1, m0, m5 ; t5 t4 t7 t6 paddsw m0, m5 ; t1 t0 t3 t2 psubsw m4, m2, m3 ; t13a t12a t15a t14a paddsw m2, m3 ; t9a t8a t11a t10a ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 vbroadcasti32x4 m5, [o(deint_shuf)] pshufb m0, m5 pshufb m2, m5 vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 pshufd m2, m2, q1032 ; t7a t6a t15 t14 psubsw m4, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 paddsw m3, m1, m2 ; out12 -out3 -out13 out2 psubsw m1, m2 ; t7 t6 t15a t14a punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a punpcklqdq m4, m1 ; t3a t7 t11 t15a ret INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_16x8_internal_8bpc).main_pass1 vbroadcasti32x4 m6, [o(int_shuf2)] vpbroadcastd m7, [o(pw_m16384_16384)] punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 jmp m(iadst_8x16_internal_8bpc).pass1_end .pass2: call m(iadst_8x16_internal_8bpc).main_pass2 vpbroadcastd m7, [o(pw_2048)] psrlq m10, 36 psubw m6, m8, m7 jmp m(iadst_8x16_internal_8bpc).pass2_end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst 
INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [o(int16_perm)] vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 vpbroadcastd m5, [o(pw_2896x8)] punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1 punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 REPX {pmulhrsw x, m5}, m1, m2, m3, m4 punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 jmp tx2q .pass2: vpbroadcastd m7, [o(pw_1697x16)] mova ym8, [o(gather8b)] lea r3, [dstq+strideq*2] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 REPX {paddsw x, x}, m0, m1, m2, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(idct_8x16_internal_8bpc).end %macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] pmovzxbw m%3, [dstq+%5] %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif pmovzxbw m%4, [dstq+%6] %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 vpermq m%3, m%3, q3120 mova [dstq+%5], xm%3 vextracti32x4 [dstq+%6], m%3, 1 %endmacro %macro INV_TXFM_16X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x4 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 %endif %endmacro INIT_ZMM avx512icl INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova xm0, [cq+16*0] mova xm1, [cq+16*1] mova xm2, [cq+16*2] mova xm3, [cq+16*3] mova xm4, [cq+16*4] mova xm5, [cq+16*5] mova xm6, [cq+16*6] mova xm7, [cq+16*7] call m(idct_4x16_internal_8bpc).main vpbroadcastd m8, [o(pw_16384)] vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 vinserti32x4 ym5, xm7, 1 ; b a f e vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 vinserti32x4 ym4, xm6, 1 ; 8 9 c d vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d pmulhrsw m1, m8 pmulhrsw m0, m8 pshufd m1, m1, q1032 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 jmp tx2q .pass2: IDCT4_1D_PACKED mova m2, [o(permA)] jmp m(iadst_16x4_internal_8bpc).end INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+64*0] mova m1, [cq+64*1] movshdup m3, [o(permB)] psrlq m10, m3, 4 call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m6, [o(pw_16384_m16384)] psrlq m0, m10, 4 psrlq m10, 8 .pass1_end: punpcklwd ym5, ym4, ym2 punpckhwd ym4, ym2 vinserti32x8 m5, ym4, 1 mova m1, m9 vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} mova m4, m9 vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} psrad m1, 12 psrad m4, 12 packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 punpcklwd m0, m2 pmulhrsw m0, m6 pmulhrsw m1, m6 jmp tx2q .pass2: call .main movu m2, [o(permA+1)] .end: vpbroadcastd m3, [o(pw_2048)] pmulhrsw m0, m3 pmulhrsw m1, m3 .end2: psrlq m3, m2, 4 vpermi2q m2, m0, m1 vpermi2q m3, m0, m1 .end3: lea r3, [dstq+strideq*2] mova xm1, [dstq+strideq*0] vinserti32x4 ym1, 
[dstq+strideq*1], 1 vinserti32x4 m1, [r3 +strideq*0], 2 vinserti32x4 m1, [r3 +strideq*1], 3 pxor m4, m4 mova [cq+64*0], m4 mova [cq+64*1], m4 punpcklbw m0, m1, m4 punpckhbw m1, m4 paddw m0, m2 paddw m1, m3 packuswb m0, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [r3 +strideq*0], m0, 2 vextracti32x4 [r3 +strideq*1], m0, 3 RET ALIGN function_align .main: IADST4_1D_PACKED ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m0, [cq+64*0] mova m1, [cq+64*1] movshdup m3, [o(permB)] psrlq m10, m3, 4 call m(iadst_4x16_internal_8bpc).main2 vpbroadcastd m6, [o(pw_m16384_16384)] psrlq m0, m10, 12 psrlq m10, 16 jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: call m(iadst_16x4_internal_8bpc).main movu m2, [o(permA+2)] jmp m(iadst_16x4_internal_8bpc).end INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m1, [cq+64*0] mova m2, [cq+64*1] vpbroadcastd m3, [o(pw_1697x16)] vpbroadcastd m4, [o(pw_16384)] mova m5, [o(idtx_16x4p)] shufps m0, m1, m2, q2020 shufps m1, m2, q3131 pmulhrsw m2, m3, m0 pmulhrsw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 paddsw m0, m2 paddsw m1, m3 vpermb m0, m5, m0 vpermb m1, m5, m1 jmp tx2q .pass2: vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 movu m2, [o(permA+1)] jmp m(iadst_16x4_internal_8bpc).end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd or r3d, 8 .dconly: imul r6d, 181 add r6d, 128 sar r6d, 8 .dconly2: imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 .dconly3: imul r6d, 181 lea r2, [strideq*3] add r6d, 128+2048 sar r6d, 8+4 pxor m2, m2 vpbroadcastw m3, r6d .dconly_loop: mova xm1, [dstq+strideq*0] vinserti32x4 ym1, [dstq+strideq*1], 1 vinserti32x4 m1, [dstq+strideq*2], 2 vinserti32x4 m1, [dstq+r2 ], 3 punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+r2 ], m0, 3 lea dstq, [dstq+strideq*4] sub r3d, 4 jg .dconly_loop RET %endif %endmacro %macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd vpbroadcastd m8, [o(pw_2896x8)] vpermq m0, [cq+32*0], q3120 add cq, 32*4 vpermq m7, [cq+32*3], q%1 vpermq m1, [cq-32*3], q%1 vpermq m6, [cq+32*2], q3120 vpermq m2, [cq-32*2], q3120 vpermq m5, [cq+32*1], q%1 vpermq m3, [cq-32*1], q%1 vpermq m4, [cq+32*0], q3120 REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, identity INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd m1, [o(pw_2896x8)] vpermq m0, [cq+64*0], q3120 vpermq m2, [cq+64*1], q3120 vpermq m4, [cq+64*2], q3120 vpermq m6, [cq+64*3], q3120 REPX {pmulhrsw x, m1}, m0, m2, m4, m6 vextracti32x8 ym1, m0, 1 vextracti32x8 ym3, m2, 1 vextracti32x8 ym5, m4, 1 vextracti32x8 ym7, m6, 1 call m(idct_8x16_internal_8bpc).main vbroadcasti32x4 m8, [o(int_shuf1)] vbroadcasti32x4 m9, [o(int_shuf2)] vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 
k3 vpbroadcastd m2, [o(pw_16384)] pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3 REPX {pmulhrsw x, m2}, m0, m1, m6, m7 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp tx2q .pass2: vshufi32x4 m0, m2, m4, q2020 ; 0 1 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 ; 2 3 vshufi32x4 m3, m5, q3131 ; 6 7 call .main movshdup m4, [o(permC)] psrlq m6, m4, 4 vpermq m5, m4, q1032 vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1 psrlq m6, m5, 4 vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 vpbroadcastd m6, [o(pw_2048)] .end: REPX {pmulhrsw x, m6}, m0, m4, m1, m5 .end2: lea r3, [dstq+strideq*4] lea r4, [strideq*3] mova xm3, [dstq+strideq*0] mova xm6, [dstq+strideq*2] vinserti32x4 ym3, [dstq+strideq*1], 1 vinserti32x4 ym6, [dstq+r4 ], 1 vinserti32x4 m3, [r3 +strideq*0], 2 vinserti32x4 m6, [r3 +strideq*2], 2 vinserti32x4 m3, [r3 +strideq*1], 3 vinserti32x4 m6, [r3 +r4 ], 3 pxor m7, m7 mova [cq+64*0], m7 mova [cq+64*1], m7 mova [cq+64*2], m7 mova [cq+64*3], m7 punpcklbw m2, m3, m7 punpckhbw m3, m7 paddw m0, m2 paddw m4, m3 packuswb m0, m4 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [r3 +strideq*0], m0, 2 vextracti32x4 [r3 +strideq*1], m0, 3 punpcklbw m3, m6, m7 punpckhbw m6, m7 paddw m1, m3 paddw m5, m6 packuswb m1, m5 mova [dstq+strideq*2], xm1 vextracti32x4 [dstq+r4 ], ym1, 1 vextracti32x4 [r3 +strideq*2], m1, 2 vextracti32x4 [r3 +r4 ], m1, 3 RET ALIGN function_align cglobal_label .main IDCT8_1D_PACKED ret INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_8x16_internal_8bpc).main_pass1 vpbroadcastd m7, [o(pw_16384_m16384)] psrlq m10, 4 .pass1_end: punpcklwd m5, m4, m2 punpckhwd m4, m2 mova m1, m9 vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} mova m6, m9 vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} mova m2, m9 vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} psrad m1, 12 psrad m6, 12 packssdw m1, m6 ; out8 -out7 -out9 out6 psrad m2, 12 psrad m9, 12 packssdw m2, m9 ; -out11 out4 out10 -out5 psrlq m4, m10, 4 vpermi2q m4, m0, m2 vpermt2q m0, m10, m2 psrlq m5, m10, 8 vpermi2q m5, m1, m3 psrlq m10, 12 vpermt2q m1, m10, m3 punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3 punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 REPX {pmulhrsw x, m7}, m2, m3, m4, m5 jmp tx2q .pass2: vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 vshufi32x4 m3, m5, q3131 ; 6 7 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call .main_pass2 movshdup m4, [o(permC)] pmulhrsw m0, m6 pmulhrsw m1, m6 psrlq m6, m4, 4 mova m5, m4 vpermi2q m4, m0, m2 vpermt2q m0, m6, m2 vpermi2q m5, m1, m3 vpermt2q m1, m6, m3 jmp m(idct_16x8_internal_8bpc).end2 ALIGN function_align .main_pass1: vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m3, m4, [cq+64*0] pmulhrsw m1, m4, [cq+64*3] 
pmulhrsw m2, m4, [cq+64*1] pmulhrsw m4, [cq+64*2] mova m5, [o(int16_perm)] kxnorb k1, k1, k1 vpblendmd m0{k1}, m1, m3 ; 0 7 vmovdqa32 m3{k1}, m1 ; 6 1 vpblendmd m1{k1}, m4, m2 ; 2 5 vmovdqa32 m2{k1}, m4 ; 4 3 REPX {vpermb x, m5, x}, m0, m1, m2, m3 IADST8_1D_PACKED 1 ret ALIGN function_align cglobal_label .main_pass2 IADST8_1D_PACKED 2 pxor m5, m5 psubd m5, m6 packssdw m6, m5 pmulhrsw m2, m6 pmulhrsw m3, m6 ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_8x16_internal_8bpc).main_pass1 vpbroadcastd m7, [o(pw_m16384_16384)] psrlq m10, 20 jmp m(iadst_16x8_internal_8bpc).pass1_end .pass2: vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 vshufi32x4 m3, m5, q3131 ; 6 7 pshufd m4, m0, q1032 ; 1 0 pshufd m5, m1, q1032 ; 3 2 call m(iadst_16x8_internal_8bpc).main_pass2 movshdup m4, [o(permC)] pmulhrsw m5, m6, m0 pmulhrsw m0, m6, m1 psrlq m1, m4, 12 psrlq m4, 8 mova m7, m4 vpermi2q m4, m0, m3 vpermt2q m0, m1, m3 vpermi2q m1, m5, m2 vpermt2q m5, m7, m2 jmp m(idct_16x8_internal_8bpc).end2 INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 vpbroadcastd m0, [o(pw_2896x8)] pmulhrsw m3, m0, [cq+64*0] pmulhrsw m4, m0, [cq+64*1] pmulhrsw m5, m0, [cq+64*2] pmulhrsw m0, [cq+64*3] vpbroadcastd m7, [o(pw_1697x16)] vpbroadcastd m8, [o(pw_16384)] shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 mova m9, [o(int8_permA)] pmulhrsw m0, m7, m2 pmulhrsw m1, m7, m3 pmulhrsw m6, m7, m4 pmulhrsw m7, m5 REPX {pmulhrsw x, m8}, m0, m1, m6, m7 paddsw m2, m0 paddsw m3, m1 paddsw m4, m6 paddsw m5, m7 REPX {vpermb x, m9, x}, m2, m3, m4, m5 jmp tx2q .pass2: mova m7, [o(permB)] vpbroadcastd m6, [o(pw_4096)] vpermq m0, m7, m2 vpermq m4, m7, m4 vpermq m1, m7, m3 vpermq m5, m7, m5 jmp m(idct_16x8_internal_8bpc).end %macro INV_TXFM_16X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x16 %ifidn %1_%2, dct_dct movsx r6d, word [cq] mov [cq], eobd or r3d, 16 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, identity INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m7, [o(permB)] vpermq m0, m7, [cq+64*0] vpermq m1, m7, [cq+64*1] vpermq m2, m7, [cq+64*2] vpermq m3, m7, [cq+64*3] vpermq m4, m7, [cq+64*4] vpermq m5, m7, [cq+64*5] vpermq m6, m7, [cq+64*6] vpermq m7, m7, [cq+64*7] call .main vbroadcasti32x4 m12, [o(int_shuf1)] vbroadcasti32x4 m11, [o(int_shuf2)] vpbroadcastd m13, [o(pw_8192)] pshufb m0, m12 pshufb m8, m1, m11 pshufb m2, m12 pshufb m9, m3, m11 pshufb m4, m12 pshufb m10, m5, m11 pshufb m6, m12 pshufb m11, m7, m11 REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 punpckhdq m1, m0, m8 punpckldq m0, m8 punpckhdq m3, m2, m9 punpckldq m2, m9 punpckhdq m5, m4, m10 punpckldq m4, m10 punpckhdq m7, m6, m11 punpckldq m6, m11 jmp tx2q .pass2: vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me 
vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 vshufi32x4 m2, m0, m4, q3131 ; 4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 vshufi32x4 m3, m1, m5, q3131 ; 6 7 vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 call .main mova m8, [o(permD)] psrlq m12, m8, 4 psrlq m9, m8, 8 psrlq m13, m8, 12 mova m10, m8 vpermi2q m8, m0, m2 ; 0 1 4 5 vpermt2q m0, m12, m2 mova m11, m9 vpermi2q m9, m1, m3 ; 2 3 6 7 vpermt2q m1, m13, m3 vpermi2q m10, m4, m6 ; 8 9 12 13 vpermt2q m4, m12, m6 vpermi2q m11, m5, m7 ; 10 11 14 15 vpermt2q m5, m13, m7 .end: vpbroadcastd m12, [o(pw_2048)] .end2: REPX {pmulhrsw x, m12}, m0, m1, m4, m5 .end3: REPX {pmulhrsw x, m12}, m8, m9, m10, m11 lea r3, [strideq*3] lea r4, [dstq+strideq*4] lea r5, [dstq+strideq*8] lea r6, [r4 +strideq*8] mova xm3, [dstq+strideq*0] mova xm6, [dstq+strideq*2] vinserti32x4 ym3, [dstq+strideq*1], 1 vinserti32x4 ym6, [dstq+r3 ], 1 vinserti32x4 m3, [r4+strideq*0], 2 vinserti32x4 m6, [r4+strideq*2], 2 vinserti32x4 m3, [r4+strideq*1], 3 vinserti32x4 m6, [r4+r3 ], 3 mova xm12, [r5+strideq*0] mova xm13, [r5+strideq*2] vinserti32x4 ym12, [r5+strideq*1], 1 vinserti32x4 ym13, [r5+r3 ], 1 vinserti32x4 m12, [r6+strideq*0], 2 vinserti32x4 m13, [r6+strideq*2], 2 vinserti32x4 m12, [r6+strideq*1], 3 vinserti32x4 m13, [r6+r3 ], 3 pxor m7, m7 REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklbw m2, m3, m7 punpckhbw m3, m7 paddw m0, m2 paddw m8, m3 packuswb m0, m8 punpcklbw m2, m6, m7 punpckhbw m6, m7 paddw m1, m2 paddw m9, m6 packuswb m1, m9 punpcklbw m2, m12, m7 punpckhbw m12, m7 paddw m2, m4 paddw m10, m12 packuswb m2, m10 punpcklbw m3, m13, m7 punpckhbw m13, m7 paddw m3, m5 paddw m11, m13 packuswb m3, m11 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 mova [dstq+strideq*2], xm1 vextracti32x4 [dstq+r3 ], ym1, 1 vextracti32x4 [r4+strideq*0], m0, 2 vextracti32x4 [r4+strideq*1], m0, 3 vextracti32x4 [r4+strideq*2], m1, 2 vextracti32x4 [r4+r3 ], m1, 3 mova [r5+strideq*0], xm2 vextracti32x4 [r5+strideq*1], ym2, 1 mova [r5+strideq*2], xm3 vextracti32x4 [r5+r3 ], ym3, 1 vextracti32x4 [r6+strideq*0], m2, 2 vextracti32x4 [r6+strideq*1], m2, 3 vextracti32x4 [r6+strideq*2], m3, 2 vextracti32x4 [r6+r3 ], m3, 3 RET ALIGN function_align cglobal_label .main_fast2 ; bottom three-quarters are zero vpbroadcastd m10, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 .main_fast4: vpbroadcastd m2, [o(pw_401_4076x8)] vpbroadcastd m4, [o(pw_m1189_3920x8)] vpbroadcastd m3, [o(pw_799_4017x8)] pmulhrsw m2, m8 ; t8a t15a pmulhrsw m4, m1 ; t11a t12a pmulhrsw m7, m3 ; t4a t7a pxor m6, m6 psubsw m0, m2, m4 ; t11a t12a paddsw m8, m2, m4 ; t8a t15a mova m1, m7 jmp .main5 ALIGN function_align cglobal_label .main_fast ; bottom half is zero vpbroadcastd m10, [o(pd_2048)] .main_fast3: vpbroadcastq m13, [o(int_mshift)] vpcmpub k7, m13, m10, 6 .main_fast5: vpbroadcastd m2, [o(pw_401_4076x8)] vpbroadcastd m4, [o(pw_m2598_3166x8)] vpbroadcastd m11, [o(pw_1931_3612x8)] vpbroadcastd m12, [o(pw_m1189_3920x8)] pmulhrsw m8, m2 ; t8a t15a vpbroadcastd m2, [o(pw_799_4017x8)] pmulhrsw m0, m4 ; t9a t14a vpbroadcastd m4, [o(pw_m2276_3406x8)] pmulhrsw m5, m11 ; t10a t13a pmulhrsw m1, m12 ; t11a t12a pmulhrsw m7, m2 ; t4a t7a pmulhrsw m3, m4 ; t5a t6a jmp .main4 ALIGN function_align cglobal_label .main IDCT16_1D_PACKED ret INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN 
adst, flipadst cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call .main_pass1 vpbroadcastd m10, [o(pw_8192_m8192)] punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 .pass1_end: REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: call .main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 psrlq m12, m10, 12 psrlq m13, m10, 4 mova m9, m8 vpermi2q m8, m0, m2 ; 0 1 4 5 vpermt2q m0, m12, m2 vpermi2q m9, m1, m3 ; 2 3 6 7 vpermt2q m1, m12, m3 vpbroadcastd m12, [o(pw_2048)] mov r3d, 0xff00ff00 mova m11, m10 vpermi2q m10, m4, m6 ; 8 9 12 13 vpermt2q m4, m13, m6 kmovd k1, r3d vpermi2q m11, m5, m7 ; 10 11 14 15 vpermt2q m5, m13, m7 pxor m7, m7 vpsubw m12{k1}, m7, m12 jmp m(idct_16x16_internal_8bpc).end2 ALIGN function_align .main_pass1: mova m4, [o(permB)] psrlq m3, m4, 4 vpermq m0, m4, [cq+64*0] vpermq m7, m3, [cq+64*7] vpermq m6, m4, [cq+64*6] vpermq m1, m3, [cq+64*1] vpermq m2, m4, [cq+64*2] vpermq m5, m3, [cq+64*5] vpermq m4, m4, [cq+64*4] vpermq m3, m3, [cq+64*3] call .main vpbroadcastd m13, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] mova m2, m10 vpdpwssd m2, m5, m13 ; -out5 mova m8, m10 vpdpwssd m8, m11, m13 ; out4 mova m9, m10 vpdpwssd m9, m5, m12 ; out10 mova m5, m10 vpdpwssd m5, m11, m12 ; -out11 mova m11, m10 vpdpwssd m11, m3, m13 ; -out7 mova m14, m10 vpdpwssd m14, m4, m13 ; out6 mova m13, m10 vpdpwssd m13, m3, m12 ; out8 vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 packssdw m2, m8 ; -out5 out4 packssdw m5, m9, m5 ; out10 -out11 packssdw m3, m11, m14 ; -out7 out6 packssdw m4, m13, m10 ; out8 -out9 ret ALIGN function_align .main_pass2: vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 vshufi32x4 m2, m0, m4, q3131 ; 4 5 vshufi32x4 m0, m4, q2020 ; 0 1 vshufi32x4 m4, m6, m8, q2020 ; 8 9 vshufi32x4 m6, m8, q3131 ; 12 13 vshufi32x4 m3, m1, m5, q3131 ; 6 7 vshufi32x4 m1, m5, q2020 ; 2 3 vshufi32x4 m5, m7, m9, q2020 ; 10 11 vshufi32x4 m7, m9, q3131 ; 14 15 cglobal_label .main_pass2b REPX {pshufd x, x, q1032}, m1, m3, m5, m7 call .main vpbroadcastd m8, [o(pw_2896x8)] pshufb m2, m11, m12 pshufb m5, m12 pshufb m3, m12 pshufb m4, m12 punpcklqdq m9, m5, m2 ; t15a t7 punpckhqdq m5, m2 ; t14a t6 shufps m2, m3, m4, q1032 ; t2a t10 shufps m3, m4, q3210 ; t3a t11 psubsw m4, m2, m3 ; out8 -out9 paddsw m3, m2 ; -out7 out6 paddsw m2, m5, m9 ; -out5 out4 psubsw m5, m9 ; out10 -out11 REPX {pmulhrsw x, m8}, m2, m3, m4, m5 ret ALIGN function_align .main: vpbroadcastd m10, [o(pd_2048)] vpbroadcastq m13, [o(int_mshift)] punpckhwd m8, m7, m0 ; in14 in1 punpcklwd m0, m7 ; in0 in15 
punpcklwd m7, m6, m1 ; in12 in3 punpckhwd m1, m6 ; in2 in13 punpckhwd m6, m5, m2 ; in10 in5 punpcklwd m2, m5 ; in4 in11 punpcklwd m5, m4, m3 ; in8 in7 punpckhwd m3, m4 ; in6 in9 vpcmpub k7, m13, m10, 6 ; 0x33... ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 psubsw m4, m0, m5 ; t9a t8a paddsw m0, m5 ; t1a t0a psubsw m5, m1, m6 ; t11a t10a paddsw m1, m6 ; t3a t2a psubsw m6, m2, m7 ; t13a t12a paddsw m2, m7 ; t5a t4a psubsw m7, m3, m8 ; t15a t14a paddsw m3, m8 ; t7a t6a ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 psubsw m8, m1, m3 ; t7 t6 paddsw m1, m3 ; t3 t2 psubsw m3, m0, m2 ; t5 t4 paddsw m0, m2 ; t1 t0 psubsw m2, m5, m7 ; t14a t15a paddsw m7, m5 ; t10a t11a psubsw m5, m4, m6 ; t12a t13a paddsw m4, m6 ; t8a t9a ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 vbroadcasti32x4 m12, [o(deint_shuf)] paddsw m6, m4, m7 ; -out1 out14 psubsw m4, m7 ; t10 t11 psubsw m11, m3, m8 ; t7 t6 paddsw m8, m3 ; out12 -out3 psubsw m3, m0, m1 ; t3a t2a paddsw m0, m1 ; -out15 out0 paddsw m1, m2, m5 ; -out13 out2 psubsw m5, m2 ; t15a t14a pshufb m0, m12 pshufb m6, m12 pshufb m8, m12 pshufb m1, m12 shufps m7, m6, m0, q1032 ; out14 -out15 shufps m0, m6, m0, q3210 ; -out1 out0 punpcklqdq m6, m8, m1 ; out12 -out13 punpckhqdq m1, m8, m1 ; -out3 out2 ret INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 call m(iadst_16x16_internal_8bpc).main_pass1 vpbroadcastd m10, [o(pw_m8192_8192)] punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp m(iadst_16x16_internal_8bpc).pass1_end .pass2: call m(iadst_16x16_internal_8bpc).main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 psrlq m12, m10, 12 psrlq m13, m10, 4 mova m9, m8 vpermi2q m8, m7, m5 ; 0 1 4 5 vpermt2q m7, m12, m5 vpermi2q m9, m6, m4 ; 2 3 6 7 vpermt2q m6, m12, m4 vpbroadcastd m12, [o(pw_2048)] mov r3d, 0x00ff00ff mova m11, m10 vpermi2q m10, m3, m1 ; 8 9 12 13 vpermt2q m3, m13, m1 kmovd k1, r3d vpermi2q m11, m2, m0 ; 10 11 14 15 vpermt2q m2, m13, m0 pxor m0, m0 vpsubw m12{k1}, m0, m12 pmulhrsw m0, m7, m12 pmulhrsw m1, m6, m12 pmulhrsw m4, m3, m12 pmulhrsw m5, m2, 
m12 jmp m(idct_16x16_internal_8bpc).end3 INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 mova m8, [o(int16_perm)] vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 vpbroadcastd m0, [o(pw_1697x16)] vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 pmulhrsw m9, m0, m1 pmulhrsw m10, m0, m2 pmulhrsw m11, m0, m3 pmulhrsw m12, m0, m4 pmulhrsw m13, m0, m5 pmulhrsw m14, m0, m6 pmulhrsw m15, m0, m7 pmulhrsw m0, m8 REPX {psraw x, 1}, m9, m10, m11, m12 pavgw m1, m9 pavgw m2, m10 pavgw m3, m11 pavgw m4, m12 REPX {psraw x, 1}, m13, m14, m15, m0 pavgw m5, m13 pavgw m6, m14 pavgw m7, m15 pavgw m8, m0 punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 jmp tx2q ALIGN function_align .pass2: vpbroadcastd m11, [o(pw_1697x16)] pmulhrsw m12, m11, m0 pmulhrsw m13, m11, m1 pmulhrsw m14, m11, m2 pmulhrsw m15, m11, m3 pmulhrsw m8, m11, m4 pmulhrsw m9, m11, m5 pmulhrsw m10, m11, m6 pmulhrsw m11, m7 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 paddsw m0, m12 paddsw m1, m13 paddsw m2, m14 paddsw m3, m15 paddsw m8, m4 movu m4, [o(permD+2)] paddsw m9, m5 paddsw m6, m10 paddsw m7, m11 psrlq m12, m4, 4 mova m5, m4 mova m10, m4 mova m11, m4 vpermi2q m4, m0, m2 ; 8 9 12 13 vpermt2q m0, m12, m2 ; 0 1 4 5 vpermi2q m5, m1, m3 ; 10 11 14 15 vpermt2q m1, m12, m3 ; 2 3 6 7 vpermi2q m10, m8, m6 vpermt2q m8, m12, m6 vpermi2q m11, m9, m7 vpermt2q m9, m12, m7 jmp m(idct_16x16_internal_8bpc).end %macro ITX_UNPACK_MULHRSW 8 ; dst[1-2], src, tmp, coef[1-4] vpbroadcastd m%4, [o(pw_%5_%6x8)] punpcklwd m%1, m%3, m%3 pmulhrsw m%1, m%4 vpbroadcastd m%4, [o(pw_%7_%8x8)] punpckhwd m%2, m%3, m%3 pmulhrsw m%2, m%4 %endmacro cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly cmp eobd, 107 jb .fast mova m5, [cq+64*5] mova m3, [cq+64*3] mova m1, [cq+64*1] mova m7, [cq+64*7] mova m2, [cq+64*2] mova m6, [cq+64*6] mova m0, [cq+64*0] mova m4, [cq+64*4] call m(inv_txfm_add_dct_dct_32x8_8bpc).main mova m8, [o(idct_8x32p)] vpbroadcastd m9, [o(pw_8192)] REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 punpckldq m8, m0, m1 ; ab punpckhdq m0, m1 punpckldq m1, m2, m3 ; cd punpckhdq m2, m3 punpckldq m3, m4, m5 ; ef punpckhdq m4, m5 punpckldq m5, m6, m7 ; gh punpckhdq m6, m7 REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 punpcklqdq m20, m3, m5 punpckhqdq m16, m3, m5 punpcklqdq m19, m4, m6 punpckhqdq m17, m4, m6 vinserti32x4 ym8, ym18, xm20, 1 vshufi32x4 ym1, ym18, ym20, 0x03 vinserti32x4 ym9, ym14, xm16, 1 vshufi32x4 ym3, ym14, ym16, 0x03 vinserti32x4 ym0, ym21, xm19, 1 vshufi32x4 ym5, ym21, ym19, 0x03 vinserti32x4 ym7, ym15, xm17, 1 vshufi32x4 ym6, ym15, ym17, 0x03 call 
m(idct_8x16_internal_8bpc).main2 psrlq m12, [o(permB)], 60 vpermt2q m14, m12, m16 vpermt2q m21, m12, m19 vpermt2q m15, m12, m17 vpermi2q m12, m18, m20 vextracti32x8 ym16, m14, 1 vextracti32x8 ym19, m21, 1 vextracti32x8 ym17, m15, 1 vextracti32x8 ym20, m12, 1 call .main2 jmp .end .fast: ; right half is zero mova m0, [o(int16_perm)] mova ym2, [cq+64*4] vinserti32x8 m2, [cq+64*0], 1 mova ym3, [cq+64*6] vinserti32x8 m3, [cq+64*2], 1 mova ym4, [cq+64*3] vinserti32x8 m4, [cq+64*5], 1 mova ym5, [cq+64*7] vinserti32x8 m5, [cq+64*1], 1 REPX {vpermb x, m0, x}, m2, m3, m4, m5 call m(idct_16x8_internal_8bpc).main2 vbroadcasti32x4 m4, [o(int_shuf3)] vbroadcasti32x4 m5, [o(int_shuf4)] pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 vpbroadcastd m4, [o(pw_8192)] psrlq m5, [o(permB)], 60 punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 REPX {pmulhrsw x, m4}, m6, m17, m2, m16 vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 vpermt2q m2, m5, m6 ; 8 10 vpermt2q m16, m5, m17 ; 9 11 vextracti32x8 ym3, m2, 1 ; 12 14 vextracti32x8 ym17, m16, 1 ; 13 15 call m(idct_8x16_internal_8bpc).main_fast call .main_fast .end: vpbroadcastd ym8, strided pmulld ym8, [o(gather8d)] call .main_end lea r3, [dstq+strideq*4] kxnorb k1, k1, k1 lea r4, [dstq+strideq*8] pxor m9, m9 lea r1, [r3+strideq*8] kmovb k2, k1 vpgatherdq m12{k1}, [r0+ym8] kmovb k1, k2 vpgatherdq m13{k2}, [r3+ym8] kmovb k2, k1 vpgatherdq m14{k1}, [r4+ym8] kmovb k1, k2 vpgatherdq m15{k2}, [r1+ym8] REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 REPX {mova [cq+64*x], m9}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklbw m11, m12, m9 punpckhbw m12, m9 paddw m0, m11 paddw m1, m12 packuswb m0, m1 kmovb k2, k1 vpscatterdq [r0+ym8]{k1}, m0 punpcklbw m12, m13, m9 punpckhbw m13, m9 paddw m2, m12 paddw m3, m13 packuswb m2, m3 kmovb k1, k2 vpscatterdq [r3+ym8]{k2}, m2 punpcklbw m13, m14, m9 punpckhbw m14, m9 paddw m4, m13 paddw m5, m14 packuswb m4, m5 kmovb k2, k1 vpscatterdq [r4+ym8]{k1}, m4 punpcklbw m14, m15, m9 punpckhbw m15, m9 paddw m6, m14 paddw m7, m15 packuswb m6, m7 vpscatterdq [r1+ym8]{k2}, m6 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 INIT_YMM avx512icl ALIGN function_align cglobal_label .main_fast2 ; bottom three-quarters are zero ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 21, 20, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a mova m11, m12 mova m17, m20 mova m15, m21 mova m16, m14 jmp .main4 ALIGN function_align cglobal_label .main_fast ; bottom half is zero ITX_UNPACK_MULHRSW 12, 14, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a ITX_UNPACK_MULHRSW 21, 15, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a ITX_UNPACK_MULHRSW 20, 16, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a ITX_UNPACK_MULHRSW 19, 17, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a jmp .main3 ALIGN function_align cglobal_label .main punpcklwd m12, m21, m14 ; in31 in1 punpckhwd m14, m21 ; in3 in29 punpcklwd m21, m20, m15 ; in27 in5 punpckhwd m15, m20 ; in7 in25 punpcklwd m20, m19, m16 ; in23 in9 punpckhwd m16, m19 ; in11 in21 punpcklwd m19, m18, m17 ; 
in19 in13 punpckhwd m17, m18 ; in15 in17 .main2: ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a .main3: psubsw m11, m12, m17 ; t17 t30 paddsw m12, m17 ; t16 t31 psubsw m17, m15, m20 ; t18 t29 paddsw m20, m15 ; t19 t28 psubsw m15, m21, m16 ; t21 t26 paddsw m21, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 .main4: ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a vpbroadcastd m8, [o(pw_m3784_1567)] psubsw m19, m12, m20 ; t19a t28a paddsw m20, m12 ; t16a t31a psubsw m12, m14, m21 ; t20a t27a paddsw m14, m21 ; t23a t24a psubsw m21, m11, m17 ; t18 t29 paddsw m11, m17 ; t17 t30 psubsw m17, m16, m15 ; t21 t26 paddsw m16, m15 ; t22 t25 ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a vbroadcasti32x4 m18, [o(deint_shuf)] vpbroadcastd m8, [o(pw_m2896_2896)] vpbroadcastd m9, [o(pw_2896_2896)] psubsw m15, m20, m14 ; t23 t24 paddsw m20, m14 ; t16 t31 psubsw m14, m11, m16 ; t22a t25a paddsw m11, m16 ; t17a t30a psubsw m16, m21, m17 ; t21 t26 paddsw m21, m17 ; t18 t29 psubsw m17, m19, m12 ; t20a t27a paddsw m19, m12 ; t19a t28a REPX {pshufb x, m18}, m20, m11, m21, m19 ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 packssdw m18, m13 ; t23a t22 packssdw m12, m15 ; t24a t25 ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 packssdw m16, m13 ; t20 t21a packssdw m14, m15 ; t27 t26a punpcklqdq m13, m19, m21 ; t19a t18 punpckhqdq m19, m21 ; t28a t29 punpcklqdq m21, m20, m11 ; t16 t17a punpckhqdq m20, m11 ; t31 t30a INIT_ZMM avx512icl mova m15, [o(permA)] ret cglobal_label .main_end vpbroadcastd m10, [o(pw_2048)] vpermt2q m0, m15, m1 ; t0 t1 t2 t3 vpermt2q m20, m15, m19 ; t31 t30a t29 t28a vpermt2q m2, m15, m3 ; t4 t5 t6 t7 vpermt2q m14, m15, m12 ; t27 t26a t25 t24a vpermt2q m4, m15, m5 ; t8 t9 t10 t11 vpermt2q m18, m15, m16 ; t23a t22 t21a t20 vpermt2q m6, m15, m7 ; t12 t13 t14 t15 vpermt2q m13, m15, m21 ; t19a t18 t17a t16 psubsw m7, m0, m20 ; out31 out30 out29 out28 paddsw m0, m20 ; out0 out1 out2 out3 psubsw m5, m2, m14 ; out27 out26 out25 out24 paddsw m2, m14 ; out4 out5 out6 out7 psubsw m3, m4, m18 ; out23 out22 out21 out20 paddsw m4, m18 ; out8 out9 out10 out11 psubsw m1, m6, m13 ; out19 out18 out17 out16 paddsw m6, m13 ; out12 out13 out14 out15 vzeroupper ret %macro LOAD_PACKED_16X2 3 ; dst, row[1-2] vbroadcasti32x4 ym%1, [cq+16*%2] vbroadcasti32x4 ym8, [cq+16*%3] shufpd ym%1, ym8, 0x0c %endmacro cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob %undef cmp test eobd, eobd jz .dconly lea r5, [o_base] LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 LOAD_PACKED_16X2 15, 5, 7 ; 
in5 in7 LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 pxor m4, m4 REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 cmp eobd, 107 jb .fast LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 call m(idct_8x16_internal_8bpc).main LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 pxor m8, m8 REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 call m(inv_txfm_add_dct_dct_8x32_8bpc).main jmp .pass2 .fast: ; bottom half is zero mova ym5, ym4 mova ym6, ym4 mova ym7, ym4 call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: vpbroadcastd m10, [o(pw_8192)] vpermt2q m0, m15, m4 ; t0 t1 t9 t8 vpermt2q m20, m15, m18 ; t31 t30a t23a t22 vpermt2q m3, m15, m7 ; t7 t6 t14 t15 vpermt2q m12, m15, m21 ; t25 t24a t17a t16 vpermt2q m2, m15, m6 ; t4 t5 t13 t12 vpermt2q m14, m15, m13 ; t23a t22 t21a t20 vpermt2q m1, m15, m5 ; t3 t2 t10 t11 vpermt2q m19, m15, m16 ; t27 t26a t19a t18 psubsw m8, m0, m20 ; out31 out30 out22 out23 paddsw m0, m20 ; out0 out1 out9 out8 paddsw m6, m3, m12 ; out7 out6 out14 out15 psubsw m3, m12 ; out24 out25 out17 out16 psubsw m5, m2, m14 ; out27 out26 out18 out19 paddsw m4, m2, m14 ; out4 out5 out13 out12 psubsw m7, m1, m19 ; out28 out29 out21 out20 paddsw m2, m1, m19 ; out3 out2 out10 out11 vzeroupper vshufi32x4 m1, m0, m3, q1221 ; out1 out9 out17 out25 vshufi32x4 m0, m3, q0330 ; out0 out8 out16 out24 vshufi32x4 m3, m2, m5, q0330 ; out3 out11 out19 out27 vshufi32x4 m2, m5, q1221 ; out2 out10 out18 out26 vshufi32x4 m5, m4, m7, q1221 ; out5 out13 out21 out29 vshufi32x4 m4, m7, q0330 ; out4 out12 out20 out28 vshufi32x4 m7, m6, m8, q0330 ; out7 out15 out23 out31 vshufi32x4 m6, m8, q1221 ; out6 out14 out22 out30 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 call .main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 lea r2, [strideq*3] lea r3, [dstq+strideq*4] movshdup m12, [o(permD)] pmovzxbw m8, [dstq+strideq*0] pmovzxbw m9, [dstq+strideq*1] pmovzxbw m10, [dstq+strideq*2] pmovzxbw m11, [dstq+r2 ] paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 pmovzxbw m8, [r3+strideq*0] pmovzxbw m9, [r3+strideq*1] pmovzxbw m10, [r3+strideq*2] pmovzxbw m11, [r3+r2 ] paddw m4, m8 paddw m5, m9 paddw m6, m10 paddw m7, m11 packuswb m0, m1 packuswb m2, m3 vpermq m0, m12, m0 vpermq m2, m12, m2 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym2 vextracti32x8 [dstq+r2 ], m2, 1 packuswb m4, m5 packuswb m6, m7 vpermq m4, m12, m4 vpermq m6, m12, m6 mova [r3+strideq*0], ym4 vextracti32x8 [r3+strideq*1], m4, 1 mova [r3+strideq*2], ym6 vextracti32x8 [r3+r2 ], m6, 1 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 8 .dconly2: imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 .dconly3: imul r6d, 181 add r6d, 128+2048 sar r6d, 8+4 pxor m2, m2 vpbroadcastw m3, r6d .dconly_loop: mova ym1, [dstq+strideq*0] vinserti32x8 m1, [dstq+strideq*1], 1 punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub r3d, 2 jg .dconly_loop RET ALIGN function_align cglobal_label .main vpbroadcastd m10, [o(pd_2048)] .main2: ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a 
ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 .main3: paddsw m8, m1, m5 ; t4 psubsw m1, m5 ; t5a paddsw m9, m7, m3 ; t7 psubsw m7, m3 ; t6a ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 psubsw m5, m0, m2 ; dct4 out2 paddsw m2, m0 ; dct4 out1 paddsw m0, m4, m6 ; dct4 out0 psubsw m4, m6 ; dct4 out3 psubsw m6, m2, m1 ; out6 paddsw m1, m2 ; out1 paddsw m2, m5, m7 ; out2 psubsw m5, m7 ; out5 psubsw m7, m0, m9 ; out7 paddsw m0, m9 ; out0 paddsw m3, m4, m8 ; out3 psubsw m4, m8 ; out4 ret cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c vpbroadcastd m7, [pw_5] paddsw m0, m7, [cq+64*0] paddsw m1, m7, [cq+64*1] vpbroadcastd ym9, strided paddsw m2, m7, [cq+64*2] paddsw m3, m7, [cq+64*3] paddsw m4, m7, [cq+64*4] paddsw m5, m7, [cq+64*5] paddsw m6, m7, [cq+64*6] paddsw m7, [cq+64*7] pmulld ym14, ym9, [pd_0to15] lea r3, [dstq+strideq*1] lea r4, [dstq+strideq*2] kxnorb k1, k1, k1 pxor m13, m13 add r1, r4 ; dstq+strideq*3 kmovb k2, k1 vpgatherdq m9{k1}, [r0+ym14*4] kmovb k1, k2 vpgatherdq m10{k2}, [r3+ym14*4] kmovb k2, k1 call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 vpgatherdq m11{k1}, [r4+ym14*4] kmovb k1, k2 vpgatherdq m12{k2}, [r1+ym14*4] REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklbw m8, m9, m13 ; 0 8 16 24 punpckhbw m9, m13 ; 4 12 20 28 paddw m0, m8 paddw m4, m9 packuswb m0, m4 kmovb k2, k1 vpscatterdq [r0+ym14*4]{k1}, m0 punpcklbw m8, m10, m13 ; 1 9 17 25 punpckhbw m10, m13 ; 5 13 21 29 paddw m1, m8 paddw m5, m10 packuswb m1, m5 kmovb k1, k2 vpscatterdq [r3+ym14*4]{k2}, m1 punpcklbw m8, m11, m13 ; 2 10 18 26 punpckhbw m11, m13 ; 6 14 22 30 paddw m2, m8 paddw m6, m11 packuswb m2, m6 kmovb k2, k1 vpscatterdq [r4+ym14*4]{k1}, m2 punpcklbw m8, m12, m13 ; 3 11 19 27 punpckhbw m12, m13 ; 7 15 23 31 paddw m3, m8 paddw m7, m12 packuswb m3, m7 vpscatterdq [r1+ym14*4]{k2}, m3 RET cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c vpbroadcastd m0, [pw_4096] pmulhrsw m3, m0, [cq+64*0] pmulhrsw m4, m0, [cq+64*4] pmulhrsw m6, m0, [cq+64*1] pmulhrsw m5, m0, [cq+64*5] pmulhrsw m7, m0, [cq+64*2] pmulhrsw m2, m0, [cq+64*6] pmulhrsw m8, m0, [cq+64*3] pmulhrsw m0, [cq+64*7] mova m13, [int8_permA] lea r3, [strideq*3] lea r4, [dstq+strideq*4] punpckldq m1, m3, m4 punpckhdq m3, m4 punpckldq m4, m6, m5 punpckhdq m6, m5 punpckldq m5, m7, m2 punpckhdq m7, m2 punpckldq m2, m8, m0 punpckhdq m8, m0 mova ym9, [dstq+strideq*0] vinserti32x8 m9, [dstq+strideq*2], 1 mova ym10, [dstq+strideq*1] vinserti32x8 m10, [dstq+r3 ], 1 mova ym11, [r4+strideq*0] vinserti32x8 m11, [r4+strideq*2], 1 mova ym12, [r4+strideq*1] vinserti32x8 m12, [r4+r3 ], 1 REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 pxor m13, m13 REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 punpckhqdq m1, m4 ; b0 b2 d0 d2 punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 punpckhqdq m5, m2 ; b1 b3 d1 d3 punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 punpckhqdq m3, m6 ; f0 f2 h0 h2 punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 punpckhqdq m7, m8 ; f1 f3 h1 h3 punpcklbw m8, m9, m13 punpckhbw m9, m13 paddw m0, m8 paddw m4, m9 packuswb m0, m4 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*2], m0, 1 punpcklbw m8, m10, m13 punpckhbw m10, m13 paddw m1, m8 paddw m5, m10 packuswb m1, m5 mova [dstq+strideq*1], ym1 vextracti32x8 [dstq+r3 ], m1, 1 punpcklbw m8, m11, m13 punpckhbw m11, m13 paddw m2, m8 paddw m6, m11 packuswb m2, 
m6 mova [r4+strideq*0], ym2 vextracti32x8 [r4+strideq*2], m2, 1 punpcklbw m8, m12, m13 punpckhbw m12, m13 paddw m3, m8 paddw m7, m12 packuswb m3, m7 mova [r4+strideq*1], ym3 vextracti32x8 [r4+r3 ], m3, 1 RET %macro IDCT_16x32_END 3 ; src[1-2], row mova xm8, [dstq+strideq*0] vinserti32x4 ym8, [dstq+strideq*1], 1 mova xm9, [dstq+r3 ] vinserti32x4 ym9, [dstq+strideq*2], 1 pmulhrsw m%1, m10 pmulhrsw m%2, m10 vpermb m8, m11, m8 vpermb m9, m11, m9 mova [cq+64*(%3*2+0)], m13 mova [cq+64*(%3*2+1)], m13 paddw m8, m%1 paddw m9, m%2 packuswb m8, m9 vpermd m8, m12, m8 mova [dstq+strideq*0], xm8 vextracti32x4 [dstq+strideq*1], ym8, 1 vextracti32x4 [dstq+strideq*2], m8, 2 vextracti32x4 [dstq+r3 ], m8, 3 %if %1 != 20 lea dstq, [dstq+strideq*4] %endif %endmacro cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly vpbroadcastd m15, [o(pw_2896x8)] cmp eobd, 151 jb .fast pmulhrsw m5, m15, [cq+64*10] pmulhrsw m3, m15, [cq+64* 6] pmulhrsw m1, m15, [cq+64* 2] pmulhrsw m7, m15, [cq+64*14] pmulhrsw m2, m15, [cq+64* 4] pmulhrsw m6, m15, [cq+64*12] pmulhrsw m0, m15, [cq+64* 0] pmulhrsw m4, m15, [cq+64* 8] call m(inv_txfm_add_dct_dct_32x8_8bpc).main pmulhrsw m14, m15, [cq+64* 1] pmulhrsw m21, m15, [cq+64*15] pmulhrsw m18, m15, [cq+64* 9] pmulhrsw m17, m15, [cq+64* 7] pmulhrsw m16, m15, [cq+64* 5] pmulhrsw m19, m15, [cq+64*11] pmulhrsw m20, m15, [cq+64*13] pmulhrsw m15, [cq+64* 3] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova m8, [o(idct_16x32p)] vpbroadcastd m9, [o(pw_16384)] REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ m14, m15, m16, m17, m18, m19, m20, m21 punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m1, m2, m3 punpckhdq m2, m3 REPX {pmulhrsw x, m9}, m8, m0, m1, m2 punpckldq m3, m4, m5 punpckhdq m4, m5 punpckldq m5, m6, m7 punpckhdq m6, m7 REPX {pmulhrsw x, m9}, m3, m4, m5, m6 punpckldq m7, m14, m15 punpckhdq m14, m15 punpckldq m15, m16, m17 punpckhdq m16, m17 REPX {pmulhrsw x, m9}, m7, m14, m15, m16 punpckldq m17, m18, m19 punpckhdq m18, m19 punpckldq m19, m20, m21 punpckhdq m20, m21 REPX {pmulhrsw x, m9}, m17, m18, m19, m20 punpcklqdq m21, m8, m1 punpckhqdq m8, m1 punpcklqdq m1, m0, m2 punpckhqdq m0, m2 punpcklqdq m2, m3, m5 punpckhqdq m3, m5 punpcklqdq m5, m4, m6 punpckhqdq m4, m6 punpcklqdq m6, m7, m15 punpckhqdq m7, m15 punpcklqdq m15, m14, m16 punpckhqdq m14, m16 punpcklqdq m16, m17, m19 punpckhqdq m17, m19 punpcklqdq m19, m18, m20 punpckhqdq m18, m20 vinserti32x8 m20, m21, ym2, 1 vshufi32x4 m21, m2, q3232 vinserti32x8 m2, m8, ym3, 1 vshufi32x4 m8, m3, q3232 vinserti32x8 m3, m1, ym5, 1 vshufi32x4 m1, m5, q3232 vinserti32x8 m5, m0, ym4, 1 vshufi32x4 m0, m4, q3232 vinserti32x8 m4, m6, ym16, 1 vshufi32x4 m6, m16, q3232 vinserti32x8 m16, m7, ym17, 1 vshufi32x4 m7, m17, q3232 vinserti32x8 m17, m15, ym19, 1 vshufi32x4 m15, m19, q3232 vinserti32x8 m19, m14, ym18, 1 vshufi32x4 m14, m18, q3232 vshufi32x4 m18, m21, m6, q3131 ; 27 5 vshufi32x4 m21, m6, q2020 ; 31 1 vshufi32x4 m6, m8, m7, q2020 ; 24 8 vshufi32x4 m8, m7, q3131 ; 30 2 vshufi32x4 m7, m1, m15, q2020 ; 28 4 vshufi32x4 m1, m15, q3131 ; 6 26 vshufi32x4 m15, m0, m14, q2020 ; 7 25 vshufi32x4 m0, m14, q3131 ; 14 18 vshufi32x4 m14, m20, m4, q2020 ; 3 29 vshufi32x4 m20, m4, q3131 ; 23 9 vshufi32x4 m9, m3, m17, q2020 ; 16 0 vshufi32x4 m3, m17, q3131 ; 12 20 vshufi32x4 m17, m5, m19, q2020 ; 15 17 vshufi32x4 m5, m19, q3131 ; 22 10 vshufi32x4 m19, m2, m16, q2020 ; 19 13 vshufi32x4 m16, m2, m16, q3131 ; 11 21 call m(idct_16x16_internal_8bpc).main3 call .main_oddhalf jmp 
.pass2 .fast: ; right half is zero mova ym8, [cq+64*15] vinserti32x8 m8, [cq+64* 1], 1 mova m2, [o(int16_perm)] mova ym9, [cq+64* 8] vinserti32x8 m9, [cq+64* 0], 1 mova ym0, [cq+64* 7] vinserti32x8 m0, [cq+64* 9], 1 mova ym7, [cq+64*14] vinserti32x8 m7, [cq+64* 2], 1 mova ym1, [cq+64* 3] vinserti32x8 m1, [cq+64*13], 1 mova ym3, [cq+64* 6] vinserti32x8 m3, [cq+64*10], 1 mova ym5, [cq+64*11] vinserti32x8 m5, [cq+64* 5], 1 mova ym6, [cq+64*12] vinserti32x8 m6, [cq+64* 4], 1 REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 call m(idct_16x16_internal_8bpc).main2 vbroadcasti32x4 m8, [o(int_shuf3)] vbroadcasti32x4 m9, [o(int_shuf4)] vpbroadcastd m11, [o(pw_16384)] pshufb m0, m8 pshufb m1, m9 pshufb m2, m8 pshufb m3, m9 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 pshufb m4, m8 pshufb m5, m9 pshufb m6, m8 pshufb m7, m9 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 punpckhdq m17, m0, m1 punpckldq m0, m1 punpckhdq m16, m2, m3 punpckldq m2, m3 punpckhdq m18, m4, m5 punpckldq m4, m5 punpckhdq m5, m6, m7 punpckldq m6, m7 vinserti32x8 m1, m0, ym2, 1 vshufi32x4 m3, m0, m2, q3232 vinserti32x8 m2, m4, ym6, 1 vshufi32x4 m4, m6, q3232 vinserti32x8 m15, m17, ym16, 1 vshufi32x4 m17, m16, q3232 vinserti32x8 m16, m18, ym5, 1 vshufi32x4 m18, m5, q3232 vshufi32x4 m0, m1, m2, q2020 ; 0 2 vshufi32x4 m1, m2, q3131 ; 4 6 vshufi32x4 m2, m3, m4, q2020 ; 8 10 vshufi32x4 m3, m4, q3131 ; 12 14 vshufi32x4 m14, m15, m16, q2020 ; 1 3 vshufi32x4 m15, m16, q3131 ; 5 7 vshufi32x4 m16, m17, m18, q2020 ; 9 11 vshufi32x4 m17, m18, q3131 ; 13 15 pxor m6, m6 punpckhwd m8, m0, m0 punpcklwd m9, m6, m0 punpckhwd m0, m3, m3 punpckhwd m5, m2, m2 punpcklwd m7, m1, m1 punpckhwd m1, m1 punpcklwd m3, m3 punpcklwd m6, m2 call m(idct_16x16_internal_8bpc).main_fast5 punpcklwd m21, m14, m14 punpckhwd m14, m14 punpcklwd m18, m15, m15 punpckhwd m15, m15 punpcklwd m20, m16, m16 punpckhwd m16, m16 punpcklwd m19, m17, m17 punpckhwd m17, m17 call .main_oddhalf_fast .pass2: vpbroadcastd m10, [o(pw_2048)] mova m11, [o(end_16x32p)] lea r3, [strideq*3] pxor m13, m13 psrld m12, m11, 8 IDCT_16x32_END 0, 1, 0 IDCT_16x32_END 2, 3, 1 IDCT_16x32_END 4, 5, 2 IDCT_16x32_END 6, 7, 3 IDCT_16x32_END 14, 15, 4 IDCT_16x32_END 16, 17, 5 IDCT_16x32_END 18, 19, 6 IDCT_16x32_END 20, 21, 7 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly ALIGN function_align cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m20, [o(pw_m1380_3857x8)] vpbroadcastd m9, [o(pw_995_3973x8)] vpbroadcastd m16, [o(pw_m601_4052x8)] pmulhrsw m21, m8 ; t16a, t31a pmulhrsw m20, m15 ; t19a, t28a pmulhrsw m18, m9 ; t20a, t27a pmulhrsw m14, m16 ; t23a, t24a mova m8, m21 mova m17, m20 mova m15, m18 mova m16, m14 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m8, [o(pw_201_4091x8)] vpbroadcastd m9, [o(pw_m2751_3035x8)] vpbroadcastd m11, [o(pw_1751_3703x8)] vpbroadcastd m12, [o(pw_m1380_3857x8)] pmulhrsw m21, m8 ; t16a, t31a vpbroadcastd m8, [o(pw_995_3973x8)] pmulhrsw m17, m9 ; t17a, t30a vpbroadcastd m9, [o(pw_m2106_3513x8)] pmulhrsw m20, m11 ; t18a, t29a vpbroadcastd m11, [o(pw_2440_3290x8)] pmulhrsw m15, m12 ; t19a, t28a vpbroadcastd m12, [o(pw_m601_4052x8)] pmulhrsw m18, m8 ; t20a, t27a pmulhrsw m16, m9 ; t21a, t26a pmulhrsw m19, m11 ; t22a, t25a pmulhrsw m14, m12 ; t23a, t24a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MUL2X_PACK 21, 8, 9, 10, 
201, 4091, 5 ; t16a, t31a ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a .main2: psubsw m8, m21, m17 ; t17 t30 paddsw m21, m17 ; t16 t31 psubsw m17, m15, m20 ; t18 t29 paddsw m20, m15 ; t19 t28 psubsw m15, m18, m16 ; t21 t26 paddsw m18, m16 ; t20 t27 psubsw m16, m14, m19 ; t22 t25 paddsw m14, m19 ; t23 t24 .main3: ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a vpbroadcastd m11, [o(pw_m3784_1567)] psubsw m19, m21, m20 ; t19a t28a paddsw m21, m20 ; t16a t31a psubsw m20, m14, m18 ; t20a t27a paddsw m14, m18 ; t23a t24a psubsw m18, m8, m17 ; t18 t29 paddsw m8, m17 ; t17 t30 psubsw m17, m16, m15 ; t21 t26 paddsw m15, m16 ; t22 t25 ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a vbroadcasti32x4 m9, [o(deint_shuf)] psubsw m16, m21, m14 ; t23 t24 paddsw m14, m21 ; t16 t31 psubsw m21, m8, m15 ; t22a t25a paddsw m15, m8 ; t17a t30a psubsw m8, m18, m17 ; t21 t26 paddsw m18, m17 ; t18 t29 paddsw m17, m19, m20 ; t19a t28a psubsw m19, m20 ; t20a t27a vpbroadcastd m11, [o(pw_m2896_2896)] vpbroadcastd m12, [o(pw_2896_2896)] REPX {pshufb x, m9}, m14, m15, m18, m17 mova m9, m10 vpdpwssd m9, m16, m11 mova m20, m10 vpdpwssd m20, m21, m11 psrad m9, 12 psrad m20, 12 packssdw m9, m20 ; t23a t22 mova m20, m10 vpdpwssd m20, m16, m12 mova m16, m10 vpdpwssd m16, m21, m12 psrad m20, 12 psrad m16, 12 packssdw m16, m20, m16 ; t24a t25 ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 packssdw m11, m20 ; t27 t26a packssdw m8, m21 ; t20 t21a punpcklqdq m20, m14, m15 ; t16 t17a punpckhqdq m14, m15 ; t31 t30a punpckhqdq m15, m17, m18 ; t28a t29 punpcklqdq m17, m18 ; t19a t18 psubsw m21, m0, m14 ; out31 out30 paddsw m0, m14 ; out0 out1 psubsw m14, m7, m20 ; out16 out17 paddsw m7, m20 ; out15 out14 psubsw m20, m1, m15 ; out28 out29 paddsw m1, m15 ; out3 out2 psubsw m15, m6, m17 ; out19 out18 paddsw m6, m17 ; out12 out13 psubsw m17, m4, m9 ; out23 out22 paddsw m4, m9 ; out8 out9 psubsw m18, m3, m16 ; out24 out25 paddsw m3, m16 ; out7 out6 psubsw m16, m5, m8 ; out20 out21 paddsw m5, m8 ; out11 out10 psubsw m19, m2, m11 ; out27 out26 paddsw m2, m11 ; out4 out5 ret cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly mova m21, [o(permB)] vpermq m1, m21, [cq+64* 0] ; 0 1 vpermq m14, m21, [cq+64* 1] ; 2 3 vpermq m20, m21, [cq+64* 2] ; 4 5 vpermq m15, m21, [cq+64* 3] ; 6 7 vpbroadcastd m8, [o(pw_2896x8)] vpermq m2, m21, [cq+64* 4] ; 8 9 vpermq m16, m21, [cq+64* 5] ; 10 11 vpermq m3, m21, [cq+64* 6] ; 12 13 vpermq m17, m21, [cq+64* 7] ; 14 15 REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 pxor m12, m12 REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 cmp eobd, 151 jb .fast vpermq m9, m21, [cq+64* 8] ; 16 17 vpermq m19, m21, [cq+64* 9] ; 18 19 vpermq m4, m21, [cq+64*10] ; 20 21 vpermq m5, m21, 
[cq+64*11] ; 22 23 vpermq m6, m21, [cq+64*12] ; 24 25 vpermq m18, m21, [cq+64*13] ; 26 27 vpermq m7, m21, [cq+64*14] ; 28 29 vpermq m21, m21, [cq+64*15] ; 30 31 REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 punpcklwd m8, m21, m14 ; 30 2 punpckhwd m21, m1 ; 31 1 punpcklwd m0, m17, m19 ; 14 18 punpckhwd m17, m9 ; 15 17 punpcklwd m9, m1 ; 16 0 punpckhwd m14, m7 ; 3 29 punpcklwd m1, m15, m18 ; 6 26 punpckhwd m15, m6 ; 7 25 punpcklwd m6, m2 ; 24 8 punpckhwd m19, m3 ; 19 13 punpcklwd m3, m4 ; 12 20 punpckhwd m18, m20 ; 27 5 punpcklwd m7, m20 ; 28 4 punpckhwd m20, m5, m2 ; 23 9 punpcklwd m5, m16 ; 22 10 punpckhwd m16, m4 ; 11 21 call m(idct_16x16_internal_8bpc).main2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf jmp .pass2 .fast: ; bottom half zero punpcklwd m8, m14, m14 ; 2 punpcklwd m0, m17, m17 ; 14 punpcklwd m5, m16, m16 ; 10 punpcklwd m9, m12, m1 ; __ 0 punpckhwd m21, m1, m1 ; 1 punpcklwd m1, m15, m15 ; 6 punpcklwd m7, m20, m20 ; 4 punpckhwd m19, m3, m3 ; 13 punpcklwd m3, m3 ; 12 punpcklwd m6, m12, m2 ; __ 8 punpckhwd m18, m20, m20 ; 5 punpckhwd m20, m2, m2 ; 9 call m(idct_16x16_internal_8bpc).main_fast punpckhwd m15, m15 ; 7 punpckhwd m14, m14 ; 3 punpckhwd m16, m16 ; 11 punpckhwd m17, m17 ; 15 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast .pass2: vpbroadcastd m9, [o(pw_16384)] call .transpose_round vshufi32x4 m16, m14, m2, q3131 ; 5 vshufi32x4 m14, m2, q2020 ; 1 vshufi32x4 m2, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m1, m18, q3131 ; 6 vshufi32x4 m1, m18, q2020 ; 2 vshufi32x4 m18, m20, m6, q2020 ; 9 vshufi32x4 m20, m6, q3131 ; 13 vshufi32x4 m6, m21, m4, q3131 ; 12 vshufi32x4 m4, m21, m4, q2020 ; 8 vshufi32x4 m21, m19, m7, q3131 ; 15 vshufi32x4 m19, m7, q2020 ; 11 vshufi32x4 m7, m5, m15, q3131 ; 14 vshufi32x4 m5, m15, q2020 ; 10 vshufi32x4 m15, m17, m9, q2020 ; 3 vshufi32x4 m17, m9, q3131 ; 7 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 call .main_oddhalf vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r2, [strideq*3] pmovzxbw m8, [dstq+strideq*0] pmovzxbw m9, [dstq+strideq*1] pmovzxbw m10, [dstq+strideq*2] pmovzxbw m11, [dstq+r2 ] REPX {pmulhrsw x, m12}, m0, m1, m2, m3 lea r3, [dstq+strideq*4] paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 pmovzxbw m8, [r3+strideq*0] pmovzxbw m9, [r3+strideq*1] pmovzxbw m10, [r3+strideq*2] pmovzxbw m11, [r3+r2 ] REPX {pmulhrsw x, m12}, m4, m5, m6, m7 lea r4, [dstq+strideq*8] packuswb m0, m1 paddw m4, m8 paddw m5, m9 packuswb m2, m3 paddw m6, m10 paddw m7, m11 pmovzxbw m8, [r4+strideq*0] pmovzxbw m9, [r4+strideq*1] pmovzxbw m10, [r4+strideq*2] pmovzxbw m11, [r4+r2 ] REPX {pmulhrsw x, m12}, m14, m15, m16, m17 lea r5, [r3+strideq*8] packuswb m4, m5 paddw m14, m8 paddw m15, m9 packuswb m6, m7 paddw m16, m10 paddw m17, m11 pmovzxbw m8, [r5+strideq*0] pmovzxbw m9, [r5+strideq*1] pmovzxbw m10, [r5+strideq*2] pmovzxbw m11, [r5+r2 ] REPX {pmulhrsw x, m12}, m18, m19, m20, m21 packuswb m14, m15 paddw m18, m8 paddw m19, m9 packuswb m16, m17 paddw m20, m10 paddw m21, m11 packuswb m18, m19 packuswb m20, m21 REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym2 vextracti32x8 [dstq+r2 ], m2, 1 mova [r3+strideq*0], ym4 vextracti32x8 [r3+strideq*1], m4, 1 mova [r3+strideq*2], ym6 vextracti32x8 [r3+r2 ], m6, 1 mova [r4+strideq*0], ym14 vextracti32x8 [r4+strideq*1], m14, 1 mova [r4+strideq*2], ym16 vextracti32x8 [r4+r2 ], m16, 1 mova [r5+strideq*0], ym18 
vextracti32x8 [r5+strideq*1], m18, 1 mova [r5+strideq*2], ym20 vextracti32x8 [r5+r2 ], m20, 1 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 16 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero vpbroadcastd m8, [o(pw_2896x8)] vpbroadcastd m4, [o(pw_4076x8)] vpbroadcastd m3, [o(pw_401x8)] pmulhrsw m8, m0 ; t0 pmulhrsw m4, m14 ; t15a pmulhrsw m3, m14 ; t8a punpcklwd m9, m3, m4 punpckhwd m5, m3, m4 mova m2, m10 vpdpwssd m2, m9, [o(pw_m3784_1567)] {bcstd} mova m1, m10 vpdpwssd m1, m5, [o(pw_m3784_1567)] {bcstd} mova m6, m10 vpdpwssd m6, m5, [o(pw_1567_3784)] {bcstd} mova m5, m10 vpdpwssd m5, m9, [o(pw_1567_3784)] {bcstd} vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] psubsw m21, m8, m4 ; out15 paddsw m0, m8, m4 ; out0 psubsw m14, m8, m3 ; out8 paddsw m7, m8, m3 ; out7 REPX {psrad x, 12}, m2, m1, m6, m5 packssdw m2, m1 ; t9a packssdw m5, m6 ; t14a ITX_MULSUB_2W 4, 3, 16, 17, 10, 11, 12 ; t11, t12 psubsw m20, m8, m5 ; out14 paddsw m1, m8, m5 ; out1 psubsw m15, m8, m2 ; out9 paddsw m6, m8, m2 ; out6 ITX_MULSUB_2W 5, 2, 16, 17, 10, 11, 12 ; t10a, t13a psubsw m18, m8, m3 ; out12 paddsw m3, m8 ; out3 psubsw m17, m8, m4 ; out11 paddsw m4, m8 ; out4 psubsw m19, m8, m2 ; out13 paddsw m2, m8 ; out2 psubsw m16, m8, m5 ; out10 paddsw m5, m8 ; out5 ret cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m9, [o(pw_2896x8)] vpbroadcastd m2, [o(pw_4017x8)] vpbroadcastd m3, [o(pw_799x8)] vpbroadcastd m18, [o(pw_4076x8)] vpbroadcastd m19, [o(pw_401x8)] vpbroadcastd m20, [o(pw_m1189x8)] vpbroadcastd m16, [o(pw_3920x8)] pmulhrsw m9, m0 ; t0 pmulhrsw m2, m1 ; t7a pmulhrsw m1, m3 ; t4a pmulhrsw m18, m14 ; t15a pmulhrsw m14, m19 ; t8a pmulhrsw m20, m15 ; t11a pmulhrsw m15, m16 ; t12a psubsw m7, m9, m2 ; idct8 out7 paddsw m0, m9, m2 ; idct8 out0 psubsw m4, m9, m1 ; idct8 out4 paddsw m3, m9, m1 ; idct8 out3 ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 mova m21, m18 mova m19, m14 mova m16, m15 mova m8, m20 psubsw m6, m9, m1 ; idct8 out6 paddsw m1, m9 ; idct8 out1 psubsw m5, m9, m2 ; idct8 out5 paddsw m2, m9 ; idct8 out2 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m5, [o(pw_m2276x8)] vpbroadcastd m11, [o(pw_3406x8)] vpbroadcastd m7, [o(pw_4017x8)] vpbroadcastd m12, [o(pw_799x8)] vpbroadcastd m6, [o(pw_3784x8)] vpbroadcastd m10, [o(pw_1567x8)] vpbroadcastd m4, [o(pw_2896x8)] pmulhrsw m5, m3 ; t5a pmulhrsw m3, m11 ; t6a pmulhrsw m7, m1 ; t7a pmulhrsw m1, m12 ; t4a pmulhrsw m6, m2 ; t3 pmulhrsw m2, m10 ; t2 pmulhrsw m4, m0 ; t0 vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] vpbroadcastd m10, [o(pd_2048)] mova m0, m4 ; t1 call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 vpbroadcastd m21, [o(pw_4076x8)] vpbroadcastd m8, [o(pw_401x8)] vpbroadcastd m18, [o(pw_m2598x8)] vpbroadcastd m9, [o(pw_3166x8)] vpbroadcastd m19, [o(pw_3612x8)] vpbroadcastd m11, [o(pw_1931x8)] vpbroadcastd m20, [o(pw_m1189x8)] vpbroadcastd m12, [o(pw_3920x8)] pmulhrsw m21, m14 ; t15a pmulhrsw m14, m8 ; t8a pmulhrsw m18, m17 ; t9a pmulhrsw m17, m9 ; t14a pmulhrsw m19, m16 ; t13a pmulhrsw m16, m11 ; t10a pmulhrsw m20, m15 ; t11a pmulhrsw m15, m12 ; t12a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a 
ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a .main2: paddsw m8, m20, m16 ; t11 psubsw m20, m16 ; t10 paddsw m16, m15, m19 ; t12 psubsw m15, m19 ; t13 psubsw m19, m14, m18 ; t9 paddsw m14, m18 ; t8 psubsw m18, m21, m17 ; t14 paddsw m21, m17 ; t15 .main3: vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a vpbroadcastd m11, [o(pw_m1567_m3784)] ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] psubsw m17, m14, m8 ; t11a paddsw m8, m14 ; t8a paddsw m14, m18, m15 ; t9 psubsw m18, m15 ; t10 psubsw m15, m19, m20 ; t13 paddsw m19, m20 ; t14 paddsw m20, m21, m16 ; t15a psubsw m16, m21, m16 ; t12a ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 psubsw m21, m0, m20 ; out15 paddsw m0, m20 ; out0 psubsw m20, m1, m19 ; out14 paddsw m1, m19 ; out1 psubsw m19, m2, m18 ; out13 paddsw m2, m18 ; out2 psubsw m18, m3, m17 ; out12 paddsw m3, m17 ; out3 psubsw m17, m4, m16 ; out11 paddsw m4, m16 ; out4 psubsw m16, m5, m15 ; out10 paddsw m5, m15 ; out5 psubsw m15, m6, m14 ; out9 paddsw m6, m14 ; out6 psubsw m14, m7, m8 ; out8 paddsw m7, m8 ; out7 ret .transpose_round: punpcklwd m8, m0, m2 punpckhwd m0, m2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m6 punpckhwd m4, m6 punpcklwd m6, m5, m7 punpckhwd m5, m7 punpcklwd m7, m14, m16 punpckhwd m14, m16 punpcklwd m16, m15, m17 punpckhwd m15, m17 punpcklwd m17, m19, m21 punpckhwd m19, m21 punpckhwd m21, m18, m20 punpcklwd m18, m20 punpcklwd m20, m8, m1 punpckhwd m8, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 REPX {pmulhrsw x, m9}, m20, m8, m1, m0 punpcklwd m6, m7, m15 punpckhwd m7, m15 punpcklwd m15, m14, m16 punpckhwd m14, m16 REPX {pmulhrsw x, m9}, m2, m3, m5, m4 punpckhwd m16, m18, m19 punpcklwd m18, m19 punpcklwd m19, m21, m17 punpckhwd m21, m17 REPX {pmulhrsw x, m9}, m6, m7, m15, m14 punpcklwd m17, m8, m0 ; a2 a6 aa ae punpckhwd m8, m0 ; a3 a7 ab af punpcklwd m0, m20, m1 ; a0 a4 a8 ac punpckhwd m20, m1 ; a1 a5 a9 ad REPX {pmulhrsw x, m9}, m16, m18, m19, m21 punpcklwd m1, m2, m5 ; b0 b4 b8 bc punpckhwd m2, m5 ; b1 b5 b9 bd punpcklwd m5, m3, m4 ; b2 b6 ba be punpckhwd m3, m4 ; b3 b7 bb bf punpcklwd m4, m6, m15 ; c0 c4 c8 cc punpckhwd m6, m15 ; c1 c5 c9 cd punpcklwd m15, m7, m14 ; c2 c6 ca ce punpckhwd m7, m14 ; c3 c7 cb cf punpcklwd m14, m18, m19 ; d0 d4 d8 dc punpckhwd m18, m19 ; d1 d5 d9 dd punpcklwd m9, m16, m21 ; d2 d6 da de punpckhwd m16, m21 ; d3 d7 db df vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 vshufi32x4 m15, m9, q3232 ; ca ce da de vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 vshufi32x4 m7, m16, q3232 ; cb cf db df ret %macro IDTX_16x32 4 ; src/dst[1-4] pmulhrsw m%1, m15, [cq+64*%1] pmulhrsw m%2, m15, [cq+64*%2] pmulhrsw m%3, m15, [cq+64*%3] pmulhrsw m%4, m15, [cq+64*%4] pmulhrsw m18, m16, m%1 pmulhrsw m19, 
m16, m%2 pmulhrsw m20, m16, m%3 pmulhrsw m21, m16, m%4 REPX {pmulhrsw x, m17}, m18, m19, m20, m21 paddsw m%1, m18 paddsw m%2, m19 paddsw m%3, m20 paddsw m%4, m21 %endmacro %macro IDTX_16x32_STORE 2 ; src[1-2] mova xm17, [dstq+r3*0] vinserti128 ym17, [dstq+r3*4], 1 vinserti32x4 m17, [dstq+r3*8], 2 vinserti32x4 m17, [dstq+r4*8], 3 mova [cq+64*(%1*2+0)], m18 mova [cq+64*(%1*2+1)], m18 punpcklbw m16, m17, m18 punpckhbw m17, m18 paddw m16, m%1 paddw m17, m%2 packuswb m16, m17 mova [dstq+r3*0], xm16 vextracti128 [dstq+r3*4], ym16, 1 vextracti32x4 [dstq+r3*8], m16, 2 vextracti32x4 [dstq+r4*8], m16, 3 %if %1 != 7 add dstq, strideq %endif %endmacro cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c vpbroadcastd m15, [pw_2896x8] vpbroadcastd m16, [pw_1697x16] vpbroadcastd m17, [pw_16384] IDTX_16x32 0, 1, 2, 3 IDTX_16x32 4, 5, 6, 7 IDTX_16x32 8, 9, 10, 11 IDTX_16x32 12, 13, 14, 15 vpbroadcastd m16, [pw_8192] call .transpose_2x8x8_round lea r3, [strideq*2] lea r4, [strideq*3] pxor m18, m18 IDTX_16x32_STORE 0, 8 IDTX_16x32_STORE 1, 9 IDTX_16x32_STORE 2, 10 IDTX_16x32_STORE 3, 11 IDTX_16x32_STORE 4, 12 IDTX_16x32_STORE 5, 13 IDTX_16x32_STORE 6, 14 IDTX_16x32_STORE 7, 15 RET ALIGN function_align .transpose_2x8x8_round: punpckhwd m17, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m17, m1 punpckhdq m17, m1 REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m17 punpcklqdq m6, m17 punpckhwd m17, m12, m13 punpcklwd m12, m13 punpckhwd m13, m8, m9 punpcklwd m8, m9 punpckhwd m9, m14, m15 punpcklwd m14, m15 punpckhwd m15, m10, m11 punpcklwd m10, m11 punpckhdq m11, m8, m10 punpckldq m8, m10 punpckldq m10, m12, m14 punpckhdq m12, m14 punpckhdq m14, m13, m15 punpckldq m13, m15 punpckldq m15, m17, m9 punpckhdq m17, m9 REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 punpckhqdq m9, m8, m10 punpcklqdq m8, m10 punpcklqdq m10, m11, m12 punpckhqdq m11, m12 punpcklqdq m12, m13, m15 punpckhqdq m13, m15 punpckhqdq m15, m14, m17 punpcklqdq m14, m17 ret %macro IDTX_32x16 4 ; dst[1-4] pmulhrsw m%2, m12, [cq+32*(%1+ 0)] pmulhrsw m18, m12, [cq+32*(%1+16)] pmulhrsw m%4, m12, [cq+32*(%3+ 0)] pmulhrsw m19, m12, [cq+32*(%3+16)] REPX {paddsw x, x}, m%2, m18, m%4, m19 mova m%1, m14 vpermi2q m%1, m%2, m18 vpermt2q m%2, m16, m18 %if %3 != 14 mova m%3, m14 %endif vpermi2q m%3, m%4, m19 vpermt2q m%4, m16, m19 pmulhrsw m18, m17, m%1 pmulhrsw m19, m17, m%2 pmulhrsw m20, m17, m%3 pmulhrsw m21, m17, m%4 REPX {paddsw x, x}, m%1, m%2, m%3, m%4 paddsw m%1, m18 paddsw m%2, m19 paddsw m%3, m20 paddsw m%4, m21 %endmacro %macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 mova ym19, [dstq+strideq*0] vinserti32x8 m19, [dstq+strideq*8], 1 %if %3 == 0 mova [cq+64*(%1*2+0)], m20 mova [cq+64*(%1*2+1)], m20 %endif punpcklbw m18, m19, m20 punpckhbw m19, m20 paddw m18, m%1 paddw m19, m%2 packuswb m18, m19 mova [dstq+strideq*0], ym18 vextracti32x8 [dstq+strideq*8], m18, 1 %if %3 || %1 != 7 add dstq, strideq %endif %endmacro cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c vpbroadcastd m12, [pw_2896x8] movu m14, [permB+7] vpbroadcastd m17, [pw_1697x16] psrlq m16, m14, 4 IDTX_32x16 0, 1, 2, 3 IDTX_32x16 4, 5, 6, 7 IDTX_32x16 8, 9, 10, 11 
IDTX_32x16 12, 13, 14, 15 vpbroadcastd m16, [pw_2048] call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round pxor m20, m20 IDTX_32x16_STORE 0, 8 IDTX_32x16_STORE 1, 9 IDTX_32x16_STORE 2, 10 IDTX_32x16_STORE 3, 11 IDTX_32x16_STORE 4, 12 IDTX_32x16_STORE 5, 13 IDTX_32x16_STORE 6, 14 IDTX_32x16_STORE 7, 15 RET %macro IDCT_32x32_END 4 ; src, mem, stride[1-2] pmovzxbw m10, [dstq+%3] pmovzxbw m11, [r3 +%4] %if %2 < 8 paddsw m8, m%2, m%1 psubsw m9, m%2, m%1 %else mova m9, [cq+64*(%2*2-16)] paddsw m8, m9, m%1 psubsw m9, m%1 %endif pmulhrsw m8, m12 pmulhrsw m9, m12 %if %2 >= 8 %if %2 == 8 pxor m0, m0 %endif mova [cq+64*(%2*2-16)], m0 mova [cq+64*(%2*2-15)], m0 %endif paddw m8, m10 paddw m9, m11 packuswb m8, m9 vpermq m8, m13, m8 mova [dstq+%3], ym8 vextracti32x8 [r3 +%4], m8, 1 %if %2 == 3 || %2 == 7 || %2 == 11 add dstq, r5 sub r3, r5 %endif %endmacro cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly WIN64_SPILL_XMM 30 cmp eobd, 136 jb .fast mova m5, [cq+64*20] mova m3, [cq+64*12] mova m1, [cq+64* 4] mova m7, [cq+64*28] mova m2, [cq+64* 8] mova m6, [cq+64*24] mova m0, [cq+64* 0] mova m4, [cq+64*16] call m(inv_txfm_add_dct_dct_32x8_8bpc).main mova m14, [cq+64* 2] mova m21, [cq+64*30] mova m18, [cq+64*18] mova m17, [cq+64*14] mova m16, [cq+64*10] mova m19, [cq+64*22] mova m20, [cq+64*26] mova m15, [cq+64* 6] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 mova m22, [cq+64* 1] mova m21, [cq+64*31] mova m14, [cq+64*17] mova m29, [cq+64*15] mova m26, [cq+64* 9] mova m17, [cq+64*23] mova m18, [cq+64*25] mova m25, [cq+64* 7] mova m24, [cq+64* 5] mova m19, [cq+64*27] mova m16, [cq+64*21] mova m27, [cq+64*11] mova m28, [cq+64*13] mova m15, [cq+64*19] mova m20, [cq+64*29] mova m23, [cq+64* 3] call .main_oddhalf vpbroadcastd m10, [o(pw_8192)] psubsw m13, m0, m29 ; 31 paddsw m0, m29 ; 0 psubsw m29, m1, m28 ; 30 paddsw m1, m28 ; 1 psubsw m28, m2, m27 ; 29 paddsw m2, m27 ; 2 psubsw m27, m3, m26 ; 28 paddsw m3, m26 ; 3 psubsw m26, m4, m25 ; 27 paddsw m4, m25 ; 4 psubsw m25, m5, m24 ; 26 paddsw m5, m24 ; 5 psubsw m24, m6, m23 ; 25 paddsw m6, m23 ; 6 psubsw m23, m7, m22 ; 24 paddsw m7, m22 ; 7 pxor m9, m9 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 punpckhwd m3, m23, m24 punpcklwd m23, m24 punpckhwd m24, m25, m26 punpcklwd m25, m26 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 punpckhwd m26, m27, m28 punpcklwd m27, m28 punpckhwd m28, m29, m13 punpcklwd m29, m13 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 REPX {pmulhrsw x, m10}, m0, m4, m8, m22 punpckhdq m13, m23, m25 punpckldq m23, m25 punpckhdq m25, m27, m29 
punpckldq m27, m29 REPX {pmulhrsw x, m10}, m13, m23, m25, m27 punpckhdq m9, m3, m24 punpckldq m3, m24 punpckhdq m24, m26, m28 punpckldq m26, m28 punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 punpckhqdq m23, m27 ; d01 d09 d17 d25 punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 punpcklqdq m13, m25 ; d02 d10 d18 d26 punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 punpcklqdq m3, m26 ; d04 d12 d20 d28 punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 punpcklqdq m9, m24 ; d06 d14 d22 d30 REPX {pmulhrsw x, m10}, m25, m3, m26 mova [cq+64* 9], m23 mova [cq+64*11], m27 mova [cq+64*13], m25 mova [cq+64*15], m26 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 punpcklqdq m8, m22 ; a04 a12 a20 a28 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 punpcklqdq m0, m4 ; a00 a08 a16 a24 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 punpcklqdq m7, m2 ; a02 a10 a18 a26 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 punpcklqdq m6, m1 ; a06 a14 a22 a30 mova m2, [cq+64* 0] mova m11, [cq+64* 2] mova m12, [cq+64* 4] mova m29, [cq+64* 6] mova m27, [cq+64* 8] mova m26, [cq+64*10] mova m4, [cq+64*12] mova m28, [cq+64*14] psubsw m1, m2, m21 ; 23 paddsw m2, m21 ; 8 psubsw m21, m11, m20 ; 22 paddsw m11, m20 ; 9 psubsw m20, m12, m19 ; 21 paddsw m12, m19 ; 10 psubsw m19, m29, m18 ; 20 paddsw m29, m18 ; 11 psubsw m18, m27, m17 ; 19 paddsw m27, m17 ; 12 psubsw m17, m26, m16 ; 18 paddsw m26, m16 ; 13 paddsw m16, m4, m15 ; 14 psubsw m4, m15 ; 17 pmulhrsw m15, m6, m10 psubsw m6, m28, m14 ; 16 paddsw m28, m14 ; 15 pmulhrsw m14, m7, m10 punpcklwd m7, m6, m4 punpckhwd m6, m4 punpckhwd m4, m17, m18 punpcklwd m17, m18 punpckhwd m18, m19, m20 punpcklwd m19, m20 punpckhwd m20, m21, m1 punpcklwd m21, m1 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 pmulhrsw m23, m10 pmulhrsw m25, m10 punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 REPX {pmulhrsw x, m10}, m28, m2, m12, m27 punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 REPX {pmulhrsw x, m10}, m16, m1, m11, m29 punpckhdq m26, m19, m21 punpckldq m19, m21 punpckhdq m21, m6, m4 punpckldq m6, m4 REPX {pmulhrsw x, m10}, m26, m19, m21, m6 punpckhdq m4, m18, m20 punpckldq m18, m20 punpckhdq m20, m7, m17 punpckldq m7, m17 REPX {pmulhrsw x, m10}, m4, m18, m20, m7 punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 punpckhqdq m28, m12 ; b03 b11 b19 b27 punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 punpcklqdq m2, m27 ; b00 b08 b16 b24 punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 punpcklqdq m1, m29 ; b04 b12 b20 b28 punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 punpcklqdq m16, m11 ; b06 b14 b22 b30 mova [cq+64* 1], m12 mova [cq+64* 3], m28 mova [cq+64* 5], m27 mova [cq+64* 7], m29 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 punpcklqdq m20, m26 ; c02 c10 c18 c26 punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 punpcklqdq m7, m19 ; c00 c08 c16 c24 punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 punpcklqdq m6, m18 ; c04 c12 c20 c28 punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 punpcklqdq m21, m4 ; c06 c14 c22 c30 pmulhrsw m19, m9, m10 vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 
vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 vshufi32x4 m3, m1, m6, q3131 ; 12 vshufi32x4 m1, m6, q2020 ; 4 vshufi32x4 m6, m4, m2, q3131 ; 24 vshufi32x4 m4, m2, q2020 ; 16 vshufi32x4 m2, m0, m7, q3131 ; 8 vshufi32x4 m0, m7, q2020 ; 0 vshufi32x4 m7, m5, m8, q3131 ; 28 vshufi32x4 m5, m8, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 vshufi32x4 m16, m14, m20, q3131 ; 10 vshufi32x4 m14, m20, q2020 ; 2 vshufi32x4 m20, m18, m17, q3131 ; 26 vshufi32x4 m18, m17, q2020 ; 18 vshufi32x4 m17, m15, m21, q3131 ; 14 vshufi32x4 m15, m21, q2020 ; 6 vshufi32x4 m21, m19, m13, q3131 ; 30 vshufi32x4 m19, m13, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 mova m15, [cq+64* 1] mova m16, [cq+64* 3] mova m17, [cq+64* 5] mova m19, [cq+64* 7] mova m20, [cq+64* 9] mova m21, [cq+64*11] mova m13, [cq+64*13] mova m18, [cq+64*15] vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 vshufi32x4 m18, m14, m26, q3131 ; 25 vshufi32x4 m14, m26, q2020 ; 17 vshufi32x4 m19, m15, m27, q3131 ; 27 vshufi32x4 m15, m27, q2020 ; 19 vshufi32x4 m20, m16, m28, q3131 ; 29 vshufi32x4 m16, m28, q2020 ; 21 vshufi32x4 m21, m17, m29, q3131 ; 31 vshufi32x4 m17, m29, q2020 ; 23 vshufi32x4 m26, m22, m8, q3131 ; 9 vshufi32x4 m22, m8, q2020 ; 1 vshufi32x4 m27, m23, m9, q3131 ; 11 vshufi32x4 m23, m9, q2020 ; 3 vshufi32x4 m28, m24, m11, q3131 ; 13 vshufi32x4 m24, m11, q2020 ; 5 vshufi32x4 m29, m25, m12, q3131 ; 15 vshufi32x4 m25, m12, q2020 ; 7 call .main_oddhalf jmp .end .fast: ; bottom/right halves are zero mova m14, [o(dup16_perm)] pmovzxwd m9, [cq+64* 0] pmovzxwd m6, [cq+64* 8] vpermb m8, m14, [cq+64* 2] vpermb ym0, ym14, [cq+64*14] vpermb ym5, ym14, [cq+64*10] vpermb m1, m14, [cq+64* 6] vpermb m7, m14, [cq+64* 4] vpermb ym3, ym14, [cq+64*12] pslld m9, 16 pslld m6, 16 call m(idct_16x16_internal_8bpc).main_fast vpermb m21, m14, [cq+64* 1] vpermb ym17, ym14, [cq+64*15] vpermb ym20, ym14, [cq+64* 9] vpermb m15, m14, [cq+64* 7] vpermb m18, m14, [cq+64* 5] vpermb ym16, ym14, [cq+64*11] vpermb ym19, ym14, [cq+64*13] vpermb m14, m14, [cq+64* 3] 
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m9, [o(pw_8192)] call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round vshufi32x4 m22, m14, m2, q2020 ; 1 vshufi32x4 m24, m14, m2, q3131 ; 5 vshufi32x4 m23, m17, m9, q2020 ; 3 vshufi32x4 m25, m17, m9, q3131 ; 7 vshufi32x4 m16, m5, m15, q2020 ; 10 vshufi32x4 m17, m5, m15, q3131 ; 14 vshufi32x4 m14, m1, m18, q2020 ; 2 vshufi32x4 m15, m1, m18, q3131 ; 6 vshufi32x4 m1, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m21, m4, q3131 ; 12 vshufi32x4 m2, m21, m4, q2020 ; 8 vshufi32x4 m26, m20, m6, q2020 ; 9 vshufi32x4 m28, m20, m6, q3131 ; 13 vshufi32x4 m27, m19, m7, q2020 ; 11 vshufi32x4 m29, m19, m7, q3131 ; 15 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 call .main_oddhalf_fast .end: lea r4, [strideq*3] vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r3, [dstq+r4*8] lea r5, [strideq+r4] ; stride*4 add r3, r5 ; dst+stride*28 IDCT_32x32_END 29, 0, strideq*0, r4 IDCT_32x32_END 28, 1, strideq*1, strideq*2 IDCT_32x32_END 27, 2, strideq*2, strideq*1 IDCT_32x32_END 26, 3, r4 , strideq*0 IDCT_32x32_END 25, 4, strideq*0, r4 IDCT_32x32_END 24, 5, strideq*1, strideq*2 IDCT_32x32_END 23, 6, strideq*2, strideq*1 IDCT_32x32_END 22, 7, r4 , strideq*0 IDCT_32x32_END 21, 8, strideq*0, r4 IDCT_32x32_END 20, 9, strideq*1, strideq*2 IDCT_32x32_END 19, 10, strideq*2, strideq*1 IDCT_32x32_END 18, 11, r4 , strideq*0 IDCT_32x32_END 17, 12, strideq*0, r4 IDCT_32x32_END 16, 13, strideq*1, strideq*2 IDCT_32x32_END 15, 14, strideq*2, strideq*1 IDCT_32x32_END 14, 15, r4 , strideq*0 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 ALIGN function_align cglobal_label .main_oddhalf_fast3 ; bottom seven-eights are zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m24, [o(pw_m601x8)] vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m21, m22 ; t31a pmulhrsw m22, m8 ; t16a pmulhrsw m24, m23 ; t23a pmulhrsw m23, m12 ; t24a punpcklwd m9, m22, m21 punpckhwd m8, m22, m21 mova m15, m10 vpdpwssd m15, m9, [o(pw_m4017_799)] {bcstd} mova m17, m10 vpdpwssd m17, m8, [o(pw_m4017_799)] {bcstd} REPX {psrad x, 12}, m15, m17 packssdw m15, m17 mova m17, m10 vpdpwssd m17, m8, [o(pw_799_4017)] {bcstd} mova m8, m10 vpdpwssd m8, m9, [o(pw_799_4017)] {bcstd} REPX {psrad x, 12}, m17, m8 packssdw m8, m17 punpcklwd m9, m24, m23 punpckhwd m16, m24, m23 mova m20, m10 vpdpwssd m20, m9, [o(pw_m3406_m2276)] {bcstd} mova m17, m10 vpdpwssd m17, m16, [o(pw_m3406_m2276)] {bcstd} REPX {psrad x, 12}, m20, m17 packssdw m20, m17 mova m17, m10 vpdpwssd m17, m16, [o(pw_m2276_3406)] {bcstd} mova m16, m10 vpdpwssd m16, m9, [o(pw_m2276_3406)] {bcstd} REPX {psrad x, 12}, m17, m16 packssdw m16, m17 mova m17, m21 mova m27, m15 mova m25, m20 mova m29, m8 mova m18, m22 mova m14, m24 mova m28, m16 mova m26, m23 jmp .main4 cglobal_label .main_oddhalf_fast2 ; bottom three-quarters are zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m18, [o(pw_m1380x8)] vpbroadcastd m9, [o(pw_3857x8)] vpbroadcastd m19, [o(pw_3973x8)] vpbroadcastd m11, [o(pw_995x8)] vpbroadcastd m28, [o(pw_m601x8)] vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m21, m22 ; t31a pmulhrsw m22, m8 ; t16a pmulhrsw m18, m25 ; t19a pmulhrsw m25, m9 ; t28a pmulhrsw m19, m24 ; t27a pmulhrsw m24, m11 ; t20a pmulhrsw m28, m23 ; t23a pmulhrsw 
m23, m12 ; t24a mova m15, m21 mova m8, m22 mova m14, m18 mova m27, m25 mova m29, m19 mova m26, m24 mova m16, m28 mova m20, m23 jmp .main3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom half is zero vpbroadcastd m21, [o(pw_4091x8)] vpbroadcastd m8, [o(pw_201x8)] vpbroadcastd m14, [o(pw_m2751x8)] vpbroadcastd m9, [o(pw_3035x8)] vpbroadcastd m17, [o(pw_3703x8)] vpbroadcastd m11, [o(pw_1751x8)] vpbroadcastd m18, [o(pw_m1380x8)] vpbroadcastd m12, [o(pw_3857x8)] pmulhrsw m21, m22 ; t31a vpbroadcastd m19, [o(pw_3973x8)] pmulhrsw m22, m8 ; t16a vpbroadcastd m8, [o(pw_995x8)] pmulhrsw m14, m29 ; t30a vpbroadcastd m16, [o(pw_m2106x8)] pmulhrsw m29, m9 ; t17a vpbroadcastd m9, [o(pw_3513x8)] pmulhrsw m17, m26 ; t29a vpbroadcastd m15, [o(pw_3290x8)] pmulhrsw m26, m11 ; t18a vpbroadcastd m11, [o(pw_2440x8)] pmulhrsw m18, m25 ; t19a vpbroadcastd m20, [o(pw_m601x8)] pmulhrsw m25, m12 ; t28a vpbroadcastd m12, [o(pw_4052x8)] pmulhrsw m19, m24 ; t27a pmulhrsw m24, m8 ; t20a pmulhrsw m16, m27 ; t21a pmulhrsw m27, m9 ; t26a pmulhrsw m15, m28 ; t25a pmulhrsw m28, m11 ; t22a pmulhrsw m20, m23 ; t23a pmulhrsw m23, m12 ; t24a jmp .main2 ALIGN function_align cglobal_label .main_oddhalf ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a .main2: psubsw m8, m22, m14 ; t17 paddsw m22, m14 ; t16 paddsw m14, m18, m26 ; t19 psubsw m18, m26 ; t18 psubsw m26, m24, m16 ; t21 paddsw m24, m16 ; t20 psubsw m16, m20, m28 ; t22 paddsw m28, m20 ; t23 psubsw m20, m23, m15 ; t25 paddsw m23, m15 ; t24 psubsw m15, m21, m29 ; t30 paddsw m21, m29 ; t31 psubsw m29, m19, m27 ; t26 paddsw m19, m27 ; t27 paddsw m27, m25, m17 ; t28 psubsw m25, m17 ; t29 .main3: ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a psubsw m17, m21, m27 ; t28a paddsw m21, m27 ; t31a psubsw m27, m15, m25 ; t18 paddsw m15, m25 ; t17 psubsw m25, m20, m29 ; t21 paddsw m20, m29 ; t22 psubsw m29, m8, m18 ; t29 paddsw m8, m18 ; t30 psubsw m18, m22, m14 ; t19a paddsw m22, m14 ; t16a psubsw m14, m28, m24 ; t20a paddsw m24, m28 ; t23a paddsw m28, m16, m26 ; t25 psubsw m16, m26 ; t26 psubsw m26, m23, m19 ; t27a paddsw m23, m19 ; t24a .main4: vpbroadcastd m12, [o(pw_m3784_1567)] vpbroadcastd m11, [o(pw_1567_3784)] ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 vpbroadcastd m11, [o(pw_m1567_m3784)] ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 vpbroadcastd m12, [o(pw_m2896_2896)] vpbroadcastd m11, [o(pw_2896_2896)] psubsw m19, m27, m25 ; t26 paddsw m27, m25 ; t29 psubsw m25, m17, m26 ; t20a paddsw m17, m26 ; t19a paddsw m26, m18, m14 ; t28a psubsw m18, m14 ; t27a paddsw m14, m22, m24 ; t16 psubsw m22, m24 ; t23 psubsw m24, m29, m16 ; t21 paddsw m16, m29 ; t18 paddsw m29, m21, m23 ; t31 psubsw m21, m23 ; t24 psubsw m23, m15, m20 ; t22a paddsw m15, m20 ; t17a psubsw m20, m8, m28 ; t25a paddsw m28, m8 ; t30a ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 ITX_MULSUB_2W 19, 24, 8, 9, 10, 
11, 12 ; t21a, t26a ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 ret %macro IDTX_32x32 2 ; dst[1-2] vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements vmovdqa32 ym18, [cq+64*(%2+16)] vpermt2q m%1, m21, m17 vpermt2q m%2, m21, m18 %endmacro cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c movu m21, [permB+7] vpbroadcastd m16, [pw_8192] pxor m20, m20 .loop: IDTX_32x32 0, 1 IDTX_32x32 2, 3 IDTX_32x32 4, 5 IDTX_32x32 6, 7 IDTX_32x32 8, 9 IDTX_32x32 10, 11 IDTX_32x32 12, 13 IDTX_32x32 14, 15 call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round IDTX_32x16_STORE 0, 8, 1 IDTX_32x16_STORE 1, 9, 1 IDTX_32x16_STORE 2, 10, 1 IDTX_32x16_STORE 3, 11, 1 IDTX_32x16_STORE 4, 12, 1 IDTX_32x16_STORE 5, 13, 1 IDTX_32x16_STORE 6, 14, 1 IDTX_32x16_STORE 7, 15, 1 lea dstq, [dstq+strideq*8] btc cq, 5 jnc .loop mov r0d, 8 .zero_loop: mova [cq+64*0], m20 mova [cq+64*1], m20 mova [cq+64*2], m20 mova [cq+64*3], m20 add cq, 64*4 dec r0d jg .zero_loop RET cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jz .dconly WIN64_SPILL_XMM 30 cmp eobd, 151 jb .fast mova m5, [cq+64*10] mova m3, [cq+64* 6] mova m1, [cq+64* 2] mova m7, [cq+64*14] mova m2, [cq+64* 4] mova m6, [cq+64*12] mova m0, [cq+64* 0] mova m4, [cq+64* 8] call m(inv_txfm_add_dct_dct_32x8_8bpc).main mova m14, [cq+64* 1] mova m21, [cq+64*15] mova m18, [cq+64* 9] mova m17, [cq+64* 7] mova m16, [cq+64* 5] mova m19, [cq+64*11] mova m20, [cq+64*13] mova m15, [cq+64* 3] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf vpbroadcastd m9, [o(pw_8192)] %macro TRANSPOSE_8x4_ROUND 4 punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7 REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 %endmacro TRANSPOSE_8x4_ROUND 0, 1, 2, 3 TRANSPOSE_8x4_ROUND 4, 5, 6, 7 TRANSPOSE_8x4_ROUND 14, 15, 16, 17 TRANSPOSE_8x4_ROUND 18, 19, 20, 21 vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 vshufi32x4 m22, m26, m4, q2020 ; 0 1 vshufi32x4 m26, m4, q3131 ; 8 9 vshufi32x4 m23, m27, m5, q2020 ; 2 3 vshufi32x4 m27, m5, q3131 ; 10 11 vshufi32x4 m24, m28, m6, q2020 ; 4 5 vshufi32x4 m28, m6, q3131 ; 12 13 vshufi32x4 m25, m29, m7, q2020 ; 6 7 vshufi32x4 m29, m7, q3131 ; 14 15 vshufi32x4 m4, m0, m14, q2020 ; 16 17 vshufi32x4 m3, m0, m14, q3131 ; 24 25 vshufi32x4 m20, m1, m15, q2020 ; 18 19 vshufi32x4 m19, m1, m15, q3131 ; 26 27 vshufi32x4 m5, m2, m16, q2020 ; 20 21 
vshufi32x4 m0, m2, m16, q3131 ; 28 29 vshufi32x4 m16, m8, m17, q2020 ; 22 23 vshufi32x4 m17, m8, m17, q3131 ; 30 31 pxor m6, m6 mova [cq+64* 0], m4 mova [cq+64* 2], m5 mova [cq+64* 4], m3 mova [cq+64* 6], m0 punpcklwd m8, m24, m24 ; 4 punpcklwd m0, m0 ; 28 punpcklwd m5, m5 ; 20 punpcklwd m1, m28, m28 ; 12 punpcklwd m7, m26, m26 ; 8 punpcklwd m3, m3 ; 24 punpcklwd m9, m6, m22 ; __ 0 punpcklwd m6, m4 ; __ 16 call m(idct_16x16_internal_8bpc).main_fast3 mova [cq+64* 1], m20 mova [cq+64* 3], m16 mova [cq+64* 5], m19 mova [cq+64* 7], m17 punpcklwd m21, m23, m23 ; 2 punpcklwd m17, m17 ; 30 punpcklwd m20, m20 ; 18 punpcklwd m15, m29, m29 ; 14 punpcklwd m18, m27, m27 ; 10 punpcklwd m16, m16 ; 22 punpcklwd m19, m19 ; 26 punpcklwd m14, m25, m25 ; 6 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 mova m21, [cq+64* 7] mova m14, [cq+64* 0] mova m17, [cq+64* 3] mova m18, [cq+64* 4] mova m19, [cq+64* 5] mova m16, [cq+64* 2] mova m15, [cq+64* 1] mova m20, [cq+64* 6] REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ m24, m19, m16, m27, m28, m15, m20, m23 call .main_oddhalf jmp .end .fast: ; right half is zero mova ym8, [cq+64*15] vinserti32x8 m8, [cq+64* 1], 1 mova m2, [o(int16_perm)] mova ym9, [cq+64* 8] vinserti32x8 m9, [cq+64* 0], 1 mova ym0, [cq+64* 7] vinserti32x8 m0, [cq+64* 9], 1 mova ym7, [cq+64*14] vinserti32x8 m7, [cq+64* 2], 1 mova ym1, [cq+64* 3] vinserti32x8 m1, [cq+64*13], 1 mova ym3, [cq+64* 6] vinserti32x8 m3, [cq+64*10], 1 mova ym5, [cq+64*11] vinserti32x8 m5, [cq+64* 5], 1 mova ym6, [cq+64*12] vinserti32x8 m6, [cq+64* 4], 1 REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 call m(idct_16x16_internal_8bpc).main2 vbroadcasti32x4 m8, [o(int_shuf3)] vbroadcasti32x4 m9, [o(int_shuf4)] vpbroadcastd m11, [o(pw_8192)] pshufb m0, m8 pshufb m1, m9 pshufb m2, m8 pshufb m3, m9 REPX {pmulhrsw x, m11}, m0, m1, m2, m3 pshufb m4, m8 pshufb m5, m9 pshufb m6, m8 pshufb m7, m9 REPX {pmulhrsw x, m11}, m4, m5, m6, m7 punpckhdq m28, m0, m1 punpckldq m0, m1 punpckhdq m27, m2, m3 punpckldq m2, m3 punpckhdq m22, m4, m5 punpckldq m4, m5 punpckhdq m23, m6, m7 punpckldq m6, m7 vinserti32x8 m14, m0, ym2, 1 vshufi32x4 m15, m0, m2, q3232 vinserti32x8 m2, m4, ym6, 1 vshufi32x4 m4, m6, q3232 vshufi32x4 m21, m14, m2, q2020 ; 0 2 vshufi32x4 m14, m2, q3131 ; 4 6 vshufi32x4 m18, m15, m4, q2020 ; 8 10 vshufi32x4 m15, m4, q3131 ; 12 14 pxor m9, m9 punpcklwd m8, m14, m14 ; 4 punpcklwd m1, m15, m15 ; 12 punpcklwd m7, m18, m18 ; 8 punpcklwd m9, m21 ; __ 0 call m(idct_16x16_internal_8bpc).main_fast4 punpckhwd m21, m21 ; 2 punpckhwd m15, m15 ; 14 punpckhwd m18, m18 ; 10 punpckhwd m14, m14 ; 6 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 vinserti32x8 m24, m28, ym27, 1 vshufi32x4 m28, m27, q3232 vinserti32x8 m27, m22, ym23, 1 vshufi32x4 m22, m23, q3232 vshufi32x4 m23, m24, m27, q2020 ; 1 3 vshufi32x4 m24, m27, q3131 ; 5 7 vshufi32x4 m27, m28, m22, q2020 ; 9 11 vshufi32x4 m28, m22, q3131 ; 13 15 punpcklwd m22, m23, m23 ; 1 punpckhwd m29, m28, m28 ; 15 punpcklwd m26, m27, m27 ; 9 punpckhwd m25, m24, m24 ; 7 mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 punpcklwd m24, m24 ; 5 punpckhwd m27, m27 ; 11 punpcklwd m28, m28 ; 13 punpckhwd m23, m23 ; 3 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 call .main_oddhalf_fast .end: imul r6, strideq, 60 mova m10, 
[o(end_16x32p)] vpbroadcastd m11, [o(pw_2048)] lea r3, [strideq*3] pxor m12, m12 add r6, dstq ; dst+stride*60 psrldq m13, m10, 1 lea r4, [strideq+r3] ; stride*4 %macro IDCT_16x64_END 3 ; idct32, idct64, tmp %if %1 & 1 %define %%s0 r3 %define %%s1 strideq*2 %define %%s2 strideq*1 %define %%s3 strideq*0 %else %define %%s0 strideq*0 %define %%s1 strideq*1 %define %%s2 strideq*2 %define %%s3 r3 %if %1 add dstq, r4 sub r6, r4 %endif %endif %if %1 < 8 pmulhrsw m8, m11, m%1 pmulhrsw m9, m11, m%2 %else mova m9, [cq+64*%1] paddsw m8, m9, m%2 ; out 0+n, 1+n psubsw m9, m%2 ; out 63-n, 62-n pmulhrsw m8, m11 pmulhrsw m9, m11 %endif mova xm29, [dstq+%%s0] vinserti128 ym29, [dstq+%%s1], 1 mova xm%3, [r6 +%%s3] vinserti128 ym%3, [r6 +%%s2], 1 vpermb m29, m10, m29 vpermb m%3, m10, m%3 mova [cq+64*%1], m12 paddw m29, m8 paddw m%3, m9 packuswb m29, m%3 vpermd m29, m13, m29 mova [dstq+%%s0], xm29 vextracti128 [dstq+%%s1], ym29, 1 vextracti32x4 [r6 +%%s2], m29, 2 vextracti32x4 [r6 +%%s3], m29, 3 %endmacro IDCT_16x64_END 0, 29, 0 IDCT_16x64_END 1, 28, 28 IDCT_16x64_END 2, 27, 28 IDCT_16x64_END 3, 26, 28 IDCT_16x64_END 4, 25, 28 IDCT_16x64_END 5, 24, 28 IDCT_16x64_END 6, 23, 28 IDCT_16x64_END 7, 22, 28 IDCT_16x64_END 8, 21, 28 IDCT_16x64_END 9, 20, 28 IDCT_16x64_END 10, 19, 28 IDCT_16x64_END 11, 18, 28 IDCT_16x64_END 12, 17, 28 IDCT_16x64_END 13, 16, 28 IDCT_16x64_END 14, 15, 28 IDCT_16x64_END 15, 14, 28 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 64 imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 ALIGN function_align cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_101_4095x8)] vpbroadcastd m21, [o(pw_m1474_3822x8)] vpbroadcastd m14, [o(pw_897_3996x8)] vpbroadcastd m17, [o(pw_m700_4036x8)] vpbroadcastd m18, [o(pw_501_4065x8)] vpbroadcastd m19, [o(pw_m1092_3948x8)] vpbroadcastd m16, [o(pw_1285_3889x8)] vpbroadcastd m15, [o(pw_m301_4085x8)] pmulhrsw m8, m22 ; t32a t63a pmulhrsw m21, m29 ; t35a t60a pmulhrsw m14, m26 ; t36a t59a pmulhrsw m17, m25 ; t39a t56 pmulhrsw m18, m24 ; t40a t55a pmulhrsw m19, m27 ; t43a t52a pmulhrsw m16, m28 ; t44a t51a pmulhrsw m15, m23 ; t47a t48a mova m22, m8 mova m29, m21 mova m26, m14 mova m25, m17 mova m24, m18 mova m27, m19 mova m28, m16 mova m20, m15 jmp .main_oddhalf2 ALIGN function_align cglobal_label .main_oddhalf vpbroadcastd m8, [o(pw_101_4095x8)] vpbroadcastd m9, [o(pw_m2824_2967x8)] vpbroadcastd m11, [o(pw_1660_3745x8)] vpbroadcastd m12, [o(pw_m1474_3822x8)] pmulhrsw m22, m8 ; t32a t63a vpbroadcastd m8, [o(pw_897_3996x8)] pmulhrsw m21, m9 ; t33a t62a vpbroadcastd m9, [o(pw_m2191_3461x8)] pmulhrsw m14, m11 ; t34a t61a vpbroadcastd m11, [o(pw_2359_3349x8)] pmulhrsw m29, m12 ; t35a t60a vpbroadcastd m12, [o(pw_m700_4036x8)] pmulhrsw m26, m8 ; t36a t59a vpbroadcastd m8, [o(pw_501_4065x8)] pmulhrsw m17, m9 ; t37a t58a vpbroadcastd m9, [o(pw_m2520_3229x8)] pmulhrsw m18, m11 ; t38a t57a vpbroadcastd m11, [o(pw_2019_3564x8)] pmulhrsw m25, m12 ; t39a t56a vpbroadcastd m12, [o(pw_m1092_3948x8)] pmulhrsw m24, m8 ; t40a t55a vpbroadcastd m8, [o(pw_1285_3889x8)] pmulhrsw m19, m9 ; t41a t54a vpbroadcastd m9, [o(pw_m1842_3659x8)] pmulhrsw m16, m11 ; t42a t53a vpbroadcastd m11, [o(pw_2675_3102x8)] pmulhrsw m27, m12 ; t43a t52a vpbroadcastd m12, [o(pw_m301_4085x8)] pmulhrsw m28, m8 ; t44a t51a pmulhrsw m15, m9 ; t45a t50a pmulhrsw m20, m11 ; t46a t49a pmulhrsw m23, m12 ; t47a t48a psubsw m8, m22, m21 ; t33 t62 paddsw m22, m21 ; t32 t63 psubsw m21, m29, m14 ; t34 t61 paddsw m29, m14 ; t35 t60 
psubsw m14, m26, m17 ; t37 t58 paddsw m26, m17 ; t36 t59 psubsw m17, m25, m18 ; t38 t57 paddsw m25, m18 ; t39 t56 psubsw m18, m24, m19 ; t41 t54 paddsw m24, m19 ; t40 t55 psubsw m19, m27, m16 ; t42 t53 paddsw m27, m16 ; t43 t52 psubsw m16, m28, m15 ; t45 t50 paddsw m28, m15 ; t44 t51 psubsw m15, m23, m20 ; t46 t49 paddsw m20, m23 ; t47 t48 .main_oddhalf2: ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a vpbroadcastd m11, [o(pw_m4017_799)] psubsw m23, m25, m26 ; t36a t59a paddsw m25, m26 ; t39a t56a psubsw m26, m24, m27 ; t43a t52a paddsw m27, m24 ; t40a t55a psubsw m24, m20, m28 ; t44a t51a paddsw m20, m28 ; t47a t48a psubsw m28, m8, m21 ; t34 t61 paddsw m8, m21 ; t33 t62 psubsw m21, m17, m14 ; t37 t58 paddsw m17, m14 ; t38 t57 psubsw m14, m18, m19 ; t42 t53 paddsw m18, m19 ; t41 t54 psubsw m19, m15, m16 ; t45 t50 paddsw m15, m16 ; t46 t49 psubsw m16, m22, m29 ; t35a t60a paddsw m22, m29 ; t32a t63a ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60 ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a vpbroadcastd m11, [o(pw_m2276_3406)] ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a vpbroadcastd m11, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_m3784_1567)] psubsw m29, m22, m25 ; t39 t56 paddsw m22, m25 ; t32 t63 psubsw m25, m20, m27 ; t40 t55 paddsw m20, m27 ; t47 t48 psubsw m27, m8, m17 ; t38a t57a paddsw m8, m17 ; t33a t62a psubsw m17, m15, m18 ; t41a t54a paddsw m15, m18 ; t46a t49a paddsw m18, m16, m23 ; t35a t60a psubsw m16, m23 ; t36a t59a psubsw m23, m24, m26 ; t43a t52a paddsw m24, m26 ; t44a t51a paddsw m26, m28, m21 ; t34 t61 psubsw m28, m21 ; t37 t58 psubsw m21, m19, m14 ; t42 t53 paddsw m19, m14 ; t45 t50 ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a vpbroadcastd m11, [o(pw_m1567_m3784)] ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a vbroadcasti32x4 m13, [o(deint_shuf)] vpbroadcastd m11, [o(pw_2896_2896)] vpbroadcastd m12, [o(pw_m2896_2896)] paddsw m14, m22, m20 ; t32a t63a psubsw m22, m20 ; t47a t48a psubsw m20, m8, m15 ; t46 t49 paddsw m8, m15 ; t33 t62 paddsw m15, m18, m24 ; t35 t60 psubsw m18, m24 ; t44 t51 psubsw m24, m26, m19 ; t45a t50a paddsw m26, m19 ; t34a t61a REPX {pshufb x, m13}, m14, m8, m15, m26 psubsw m19, m29, m25 ; t40 t55 paddsw m25, m29 ; t39 t56 psubsw m29, m27, m17 ; t41a t54a paddsw m27, m17 ; t38a t57a psubsw m17, m16, m23 ; t43a t52a paddsw m16, m23 ; t36a t59a psubsw m9, m28, m21 ; t42 t53 paddsw m28, m21 ; t37 t58 REPX {pshufb x, m13}, m25, m27, m16, m28 ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 ITX_MUL2X_PACK 20, 23, 22, 
10, 11, 12, 8 ; t46a t49a packssdw m21, m22 ; t47 t46a packssdw m13, m23 ; t48 t49a ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 packssdw m20, m18 ; t44a t45 packssdw m22, m23 ; t51a t50 ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 packssdw m18, m19 ; t40a t41 packssdw m24, m23 ; t55a t54 ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a packssdw m19, m17 ; t43 t42a packssdw m23, m29 ; t52 t53a punpcklqdq m17, m25, m27 ; t39 t38a punpckhqdq m25, m27 ; t56 t57a punpckhqdq m27, m15, m26 ; t60 t61a punpcklqdq m15, m26 ; t35 t34a punpckhqdq m26, m16, m28 ; t59a t58 punpcklqdq m16, m28 ; t36a t37 punpckhqdq m28, m14, m8 ; t63a t62 punpcklqdq m14, m8 ; t32a t33 psubsw m29, m0, m28 ; out63 out62 paddsw m0, m28 ; out0 out1 psubsw m28, m1, m27 ; out60 out61 paddsw m1, m27 ; out3 out2 psubsw m27, m2, m26 ; out59 out58 paddsw m2, m26 ; out4 out5 psubsw m26, m3, m25 ; out56 out57 paddsw m3, m25 ; out7 out6 psubsw m25, m4, m24 ; out55 out54 paddsw m4, m24 ; out8 out9 psubsw m24, m5, m23 ; out52 out53 paddsw m5, m23 ; out11 out10 psubsw m23, m6, m22 ; out51 out50 paddsw m6, m22 ; out12 out13 psubsw m22, m7, m13 ; out48 out49 paddsw m7, m13 ; out15 out14 ret cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob %undef cmp lea r5, [o_base] test eobd, eobd jnz .normal movsx r6d, word [cq] mov [cq], eobd or r3d, 16 .dconly: imul r6d, 181 add r6d, 128+512 sar r6d, 8+2 .dconly2: imul r6d, 181 add r6d, 128+2048 sar r6d, 8+4 pxor m2, m2 vpbroadcastw m3, r6d .dconly_loop: mova m1, [dstq] punpcklbw m0, m1, m2 punpckhbw m1, m2 paddw m0, m3 paddw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, strideq dec r3d jg .dconly_loop RET .normal: WIN64_SPILL_XMM 31 mova m19, [o(dup16_perm)] mova m24, [cq+64* 2] mova m28, [cq+64* 6] mova m26, [cq+64* 4] mova m22, [cq+64* 0] mova m23, [cq+64* 1] mova m29, [cq+64* 7] mova m27, [cq+64* 5] mova m25, [cq+64* 3] vpermb m8, m19, m24 ; 4 vpermb m1, m19, m28 ; 12 vpermb m7, m19, m26 ; 8 vpermb m9, m19, m22 ; __ 0 vpermb m21, m19, m23 ; 2 vpermb m15, m19, m29 ; 14 vpermb m18, m19, m27 ; 10 vpermb m14, m19, m25 ; 6 pslld m9, 16 vpord m30, m19, [o(pb_32)] {1to16} REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 cmp eobd, 151 jb .fast vpermb m0, m19, [cq+64*14] ; 28 vpermb m5, m19, [cq+64*10] ; 20 vpermb m3, m19, [cq+64*12] ; 24 vpermb m6, m19, [cq+64* 8] ; __ 16 pslld m6, 16 call m(idct_16x16_internal_8bpc).main_fast vpermb m17, m19, [cq+64*15] ; 30 vpermb m20, m19, [cq+64* 9] ; 18 vpermb m16, m19, [cq+64*11] ; 22 vpermb m19, m19, [cq+64*13] ; 26 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 vpermb m21, m30, [cq+64*15] vpermb m14, m30, [cq+64* 8] vpermb m17, m30, [cq+64*11] vpermb m18, m30, [cq+64*12] vpermb m19, m30, [cq+64*13] vpermb m16, m30, [cq+64*10] vpermb m15, m30, [cq+64* 9] vpermb m20, m30, [cq+64*14] call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf jmp .end .fast: ; bottom half is zero call m(idct_16x16_internal_8bpc).main_fast2 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call 
m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast .end: mova [cq+64* 8], m4 mova [cq+64* 9], m5 mova [cq+64*10], m6 mova [cq+64*11], m7 mova [cq+64*12], m26 mova [cq+64*13], m27 mova [cq+64*14], m28 mova [cq+64*15], m29 vpbroadcastd m13, [o(pw_8192)] call .pass1_end call .pass2 mova [cq+64* 0], m0 mova [cq+64* 1], m1 mova [cq+64* 2], m2 mova [cq+64* 3], m3 mova [cq+64* 4], m4 mova [cq+64* 5], m5 mova [cq+64* 6], m6 mova [cq+64* 7], m7 pmulhrsw m0, m13, [cq+64* 8] pmulhrsw m1, m13, [cq+64* 9] pmulhrsw m2, m13, [cq+64*10] pmulhrsw m3, m13, [cq+64*11] vpbroadcastd m30, [o(pw_2048)] pmulhrsw m4, m13, m22 pmulhrsw m5, m13, m23 pmulhrsw m6, m13, m24 pmulhrsw m7, m13, m25 pmulhrsw m22, m30, m14 pmulhrsw m14, m13, m26 pmulhrsw m23, m30, m15 pmulhrsw m15, m13, m27 pmulhrsw m24, m30, m16 pmulhrsw m16, m13, m28 pmulhrsw m25, m30, m17 pmulhrsw m17, m13, m29 pmulhrsw m26, m30, m18 pmulhrsw m18, m13, [cq+64*12] pmulhrsw m27, m30, m19 pmulhrsw m19, m13, [cq+64*13] pmulhrsw m28, m30, m20 pmulhrsw m20, m13, [cq+64*14] pmulhrsw m29, m30, m21 pmulhrsw m21, m13, [cq+64*15] call .transpose_round call .pass2 pxor m10, m10 lea r3, [strideq*3] %macro IDCT_64x16_END 4 mova m9, [dstq+%4] %if %1 < 8 pmulhrsw m%3, m30, [cq+64*%1] %endif pmulhrsw m%2, m30 mova [cq+64*%1], m10 punpcklbw m8, m9, m10 punpckhbw m9, m10 paddw m8, m%3 paddw m9, m%2 packuswb m8, m9 mova [dstq+%4], m8 %if %1 == 3 || %1 == 7 || %1 == 11 lea dstq, [dstq+strideq*4] %endif %endmacro IDCT_64x16_END 0, 0, 11, strideq*0 IDCT_64x16_END 1, 1, 11, strideq*1 IDCT_64x16_END 2, 2, 11, strideq*2 IDCT_64x16_END 3, 3, 11, r3 IDCT_64x16_END 4, 4, 11, strideq*0 IDCT_64x16_END 5, 5, 11, strideq*1 IDCT_64x16_END 6, 6, 11, strideq*2 IDCT_64x16_END 7, 7, 11, r3 IDCT_64x16_END 8, 14, 22, strideq*0 IDCT_64x16_END 9, 15, 23, strideq*1 IDCT_64x16_END 10, 16, 24, strideq*2 IDCT_64x16_END 11, 17, 25, r3 IDCT_64x16_END 12, 18, 26, strideq*0 IDCT_64x16_END 13, 19, 27, strideq*1 IDCT_64x16_END 14, 20, 28, strideq*2 IDCT_64x16_END 15, 21, 29, r3 RET ALIGN function_align .pass1_end: mova m4, [cq+64* 0] mova m5, [cq+64* 1] mova m6, [cq+64* 2] mova m7, [cq+64* 3] mova m8, [cq+64* 4] mova m9, [cq+64* 5] mova m11, [cq+64* 6] mova m12, [cq+64* 7] psubsw m29, m4, m21 ; out47 out46 paddsw m4, m21 ; out16 out17 psubsw m28, m5, m20 ; out44 out45 paddsw m5, m20 ; out19 out18 REPX {pmulhrsw x, m13}, m0, m1, m2, m3 psubsw m27, m6, m19 ; out43 out42 paddsw m6, m19 ; out20 out21 psubsw m26, m7, m18 ; out40 out41 paddsw m7, m18 ; out23 out22 pmulhrsw m18, m13, m22 pmulhrsw m19, m13, m23 pmulhrsw m20, m13, m24 pmulhrsw m21, m13, m25 paddsw m25, m12, m14 ; out31 out30 psubsw m14, m12, m14 ; out32 out33 paddsw m24, m11, m15 ; out28 out29 psubsw m15, m11, m15 ; out35 out34 REPX {pmulhrsw x, m13}, m4, m5, m6, m7 paddsw m23, m9, m16 ; out27 out26 psubsw m16, m9, m16 ; out36 out37 paddsw m22, m8, m17 ; out24 out25 psubsw m17, m8, m17 ; out39 out38 REPX {pmulhrsw x, m13}, m14, m15, m16, m17 .transpose_round: %macro TRANSPOSE_8x4_PACKED 4 punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 punpcklwd m%3, m%4, m%2 ; 2 punpckhwd m%4, m%2 ; 3 punpckhwd m%2, m%1, m8 ; 1 punpcklwd m%1, m8 ; 0 %endmacro TRANSPOSE_8x4_PACKED 0, 1, 2, 3 TRANSPOSE_8x4_PACKED 18, 19, 20, 21 
TRANSPOSE_8x4_PACKED 4, 5, 6, 7 TRANSPOSE_8x4_PACKED 14, 15, 16, 17 vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 ret .pass2: vshufi32x4 m7, m5, m19, q3131 ; 14 vshufi32x4 m5, m19, q2020 ; 10 vshufi32x4 m21, m6, m20, q3131 ; 15 vshufi32x4 m19, m6, m20, q2020 ; 11 vshufi32x4 m20, m4, m18, q3131 ; 13 vshufi32x4 m18, m4, m18, q2020 ; 9 vshufi32x4 m6, m8, m2, q3131 ; 12 vshufi32x4 m4, m8, m2, q2020 ; 8 vshufi32x4 m2, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m1, m16, q3131 ; 6 vshufi32x4 m1, m16, q2020 ; 2 vshufi32x4 m16, m9, m15, q3131 ; 5 vshufi32x4 m14, m9, m15, q2020 ; 1 vshufi32x4 m15, m11, m17, q2020 ; 3 vshufi32x4 m17, m11, m17, q3131 ; 7 call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob vpbroadcastd m23, [o(pw_2896x8)] %undef cmp cmp eobd, 136 jb .fast pmulhrsw m5, m23, [cq+64*20] pmulhrsw m3, m23, [cq+64*12] pmulhrsw m1, m23, [cq+64* 4] pmulhrsw m7, m23, [cq+64*28] pmulhrsw m2, m23, [cq+64* 8] pmulhrsw m6, m23, [cq+64*24] pmulhrsw m0, m23, [cq+64* 0] pmulhrsw m4, m23, [cq+64*16] call m(inv_txfm_add_dct_dct_32x8_8bpc).main pmulhrsw m14, m23, [cq+64* 2] pmulhrsw m21, m23, [cq+64*30] pmulhrsw m18, m23, [cq+64*18] pmulhrsw m17, m23, [cq+64*14] pmulhrsw m16, m23, [cq+64*10] pmulhrsw m19, m23, [cq+64*22] pmulhrsw m20, m23, [cq+64*26] pmulhrsw m15, m23, [cq+64* 6] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64*12], m20 mova [cq+64*14], m21 pmulhrsw m22, m23, [cq+64* 1] pmulhrsw m21, m23, [cq+64*31] pmulhrsw m14, m23, [cq+64*17] pmulhrsw m29, m23, [cq+64*15] pmulhrsw m26, m23, [cq+64* 9] pmulhrsw m17, m23, [cq+64*23] pmulhrsw m18, m23, [cq+64*25] pmulhrsw m25, m23, [cq+64* 7] pmulhrsw m24, m23, [cq+64* 5] pmulhrsw m19, m23, [cq+64*27] pmulhrsw m16, m23, [cq+64*21] pmulhrsw m27, m23, [cq+64*11] pmulhrsw m28, m23, [cq+64*13] pmulhrsw m15, m23, [cq+64*19] pmulhrsw m20, m23, [cq+64*29] pmulhrsw m23, [cq+64* 3] call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf vpbroadcastd m12, [o(pw_16384)] psubsw m13, m0, m29 ; 31 paddsw m0, m29 ; 0 psubsw m29, m1, m28 ; 30 paddsw m1, m28 ; 1 psubsw m28, m2, m27 ; 29 paddsw m2, m27 ; 2 psubsw m27, m3, m26 ; 28 paddsw m3, m26 ; 3 psubsw m26, m4, m25 ; 27 paddsw m4, m25 ; 4 psubsw m25, m5, m24 ; 26 paddsw m5, m24 ; 5 psubsw m24, m6, m23 ; 25 paddsw m6, m23 ; 6 psubsw m23, m7, m22 ; 24 paddsw m7, m22 ; 7 pxor m9, m9 punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 REPX {mova 
[cq+64*x], m9}, 16, 17, 18, 19 punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 punpckhwd m3, m23, m24 punpcklwd m23, m24 punpckhwd m24, m25, m26 punpcklwd m25, m26 REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 punpckhwd m26, m27, m28 punpcklwd m27, m28 punpckhwd m28, m29, m13 punpcklwd m29, m13 REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 REPX {pmulhrsw x, m12}, m7, m0, m2, m4 punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 REPX {pmulhrsw x, m12}, m6, m8, m1, m22 punpckhdq m13, m23, m25 punpckldq m23, m25 punpckhdq m25, m27, m29 punpckldq m27, m29 REPX {pmulhrsw x, m12}, m13, m23, m25, m27 punpckhdq m9, m3, m24 punpckldq m3, m24 punpckhdq m24, m26, m28 punpckldq m26, m28 REPX {pmulhrsw x, m12}, m9, m3, m24, m26 punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 punpcklqdq m23, m27 ; d00 d08 d16 d24 punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 punpckhqdq m13, m25 ; d03 d11 d19 d27 punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 punpckhqdq m3, m26 ; d05 d13 d21 d29 punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 punpckhqdq m9, m24 ; d07 d15 d23 d31 mova [cq+64* 3], m23 mova [cq+64*13], m27 mova [cq+64* 7], m25 mova [cq+64*15], m26 punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 punpcklqdq m8, m22 ; a04 a12 a20 a28 punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 punpcklqdq m0, m4 ; a00 a08 a16 a24 punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 punpcklqdq m7, m2 ; a02 a10 a18 a26 punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 punpcklqdq m6, m1 ; a06 a14 a22 a30 mova [cq+64* 1], m0 mova [cq+64* 9], m7 mova [cq+64* 5], m8 mova [cq+64*11], m6 mova m2, [cq+64* 0] mova m11, [cq+64* 2] mova m8, [cq+64* 4] mova m29, [cq+64* 6] mova m27, [cq+64* 8] mova m26, [cq+64*10] mova m4, [cq+64*12] mova m28, [cq+64*14] psubsw m1, m2, m21 ; 23 paddsw m2, m21 ; 8 psubsw m21, m11, m20 ; 22 paddsw m11, m20 ; 9 psubsw m20, m8, m19 ; 21 paddsw m8, m19 ; 10 psubsw m19, m29, m18 ; 20 paddsw m29, m18 ; 11 psubsw m18, m27, m17 ; 19 paddsw m27, m17 ; 12 psubsw m17, m26, m16 ; 18 paddsw m26, m16 ; 13 psubsw m16, m4, m15 ; 17 paddsw m4, m15 ; 14 psubsw m15, m28, m14 ; 16 paddsw m28, m14 ; 15 punpcklwd m14, m15, m16 punpckhwd m15, m16 punpckhwd m16, m17, m18 punpcklwd m17, m18 punpckhwd m18, m19, m20 punpcklwd m19, m20 punpckhwd m20, m21, m1 punpcklwd m21, m1 punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 REPX {pmulhrsw x, m12}, m28, m2, m8, m27 punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 REPX {pmulhrsw x, m12}, m4, m1, m11, m29 punpckhdq m26, m19, m21 
punpckldq m19, m21 punpckhdq m21, m15, m16 punpckldq m15, m16 REPX {pmulhrsw x, m12}, m26, m19, m21, m15 punpckhdq m16, m18, m20 punpckldq m18, m20 punpckhdq m20, m14, m17 punpckldq m14, m17 REPX {pmulhrsw x, m12}, m16, m18, m20, m14 punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 punpcklqdq m28, m8 ; b02 b10 b18 b26 punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 punpcklqdq m2, m27 ; b00 b08 b16 b24 punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 punpckhqdq m1, m29 ; b05 b13 b21 b29 punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 punpckhqdq m4, m11 ; b07 b15 b23 b31 mova [cq+64* 0], m2 mova [cq+64* 8], m28 mova [cq+64* 4], m27 mova [cq+64*10], m29 punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 punpcklqdq m20, m26 ; c02 c10 c18 c26 punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 punpcklqdq m14, m19 ; c00 c08 c16 c24 punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 punpcklqdq m15, m18 ; c04 c12 c20 c28 punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 punpcklqdq m21, m16 ; c06 c14 c22 c30 mova [cq+64* 2], m14 mova [cq+64*12], m20 mova [cq+64* 6], m15 mova [cq+64*14], m21 vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 mov r4, rsp vshufi32x4 m0, m22, m19, q2020 ; 1 vshufi32x4 m1, m17, m29, q3131 ; 31 vshufi32x4 m2, m14, m26, q2020 ; 17 vshufi32x4 m3, m25, m18, q3131 ; 15 call .main_part1 vshufi32x4 m0, m25, m18, q2020 ; 7 vshufi32x4 m1, m14, m26, q3131 ; 25 vshufi32x4 m2, m17, m29, q2020 ; 23 vshufi32x4 m3, m22, m19, q3131 ; 9 call .main_part1 vshufi32x4 m0, m24, m21, q2020 ; 5 vshufi32x4 m1, m15, m27, q3131 ; 27 vshufi32x4 m2, m16, m28, q2020 ; 21 vshufi32x4 m3, m23, m20, q3131 ; 11 call .main_part1 vshufi32x4 m0, m23, m20, q2020 ; 3 vshufi32x4 m1, m16, m28, q3131 ; 29 vshufi32x4 m2, m15, m27, q2020 ; 19 vshufi32x4 m3, m24, m21, q3131 ; 13 call .main_part1 call .main_part2 mova m0, [cq+64* 1] ; a0 mova m15, [cq+64* 0] ; b0 mova m3, [cq+64* 2] ; c0 mova m16, [cq+64* 3] ; d0 mova m14, [cq+64* 5] ; a4 mova m8, [cq+64* 4] ; b4 mova m17, [cq+64* 6] ; c4 mova m1, [cq+64* 7] ; d4 vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 vshufi32x4 m1, m0, m3, q3131 ; 8 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m2, m15, q3131 ; 24 vshufi32x4 m2, m15, q2020 ; 16 vshufi32x4 m15, m14, m17, q3131 ; 12 vshufi32x4 m14, m17, q2020 ; 4 vshufi32x4 m17, m16, m8, q3131 ; 28 vshufi32x4 m16, m8, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m8, [cq+64* 8] mova m9, [cq+64*12] mova m11, [cq+64*10] mova m12, [cq+64*14] mova [cq+64* 0], m14 mova [cq+64* 2], m15 mova [cq+64* 4], m16 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova 
[cq+64*12], m20 mova [cq+64*14], m21 mova m22, [cq+64* 9] mova m27, [cq+64*13] mova m23, [cq+64*11] mova m24, [cq+64*15] vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 vshufi32x4 m28, m26, m8, q3131 ; 26 vshufi32x4 m26, m8, q2020 ; 18 vshufi32x4 m24, m22, m9, q3131 ; 10 vshufi32x4 m22, m9, q2020 ; 2 vshufi32x4 m29, m27, m11, q3131 ; 30 vshufi32x4 m27, m11, q2020 ; 22 vshufi32x4 m25, m23, m12, q3131 ; 14 vshufi32x4 m23, m12, q2020 ; 6 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast jmp .end .fast: ; bottom/right halves are zero pmulhrsw ym9, ym23, [cq+64* 0] pmulhrsw ym6, ym23, [cq+64* 8] mova m14, [o(dup16_perm)] pmulhrsw ym8, ym23, [cq+64* 2] pmulhrsw xm0, xm23, [cq+64*14] pmulhrsw xm5, xm23, [cq+64*10] pmulhrsw ym1, ym23, [cq+64* 6] pmulhrsw ym7, ym23, [cq+64* 4] pmulhrsw xm3, xm23, [cq+64*12] pmovzxwd m9, ym9 pmovzxwd m6, ym6 vpermb m8, m14, m8 punpcklwd xm0, xm0 vpermb ym5, ym14, ym5 vpermb m1, m14, m1 vpermb m7, m14, m7 punpcklwd xm3, xm3 pslld m9, 16 pslld m6, 16 call m(idct_16x16_internal_8bpc).main_fast vpmulhrsw ym21, ym23, [cq+64* 1] {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements {evex}vpmulhrsw ym18, ym23, [cq+64* 5] {evex}vpmulhrsw xm16, xm23, [cq+64*11] {evex}vpmulhrsw xm19, xm23, [cq+64*13] {evex}vpmulhrsw ym23, [cq+64* 3] vpermb m21, m14, m21 punpcklwd xm17, xm17 vpermb ym20, ym14, ym20 vpermb m15, m14, m15 vpermb m18, m14, m18 vpermb ym16, ym14, ym16 punpcklwd xm19, xm19 vpermb m14, m14, m23 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast vpbroadcastd m9, [o(pw_16384)] call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round vshufi32x4 m16, m0, m3, q2020 ; 0 vshufi32x4 m26, m0, m3, q3131 ; 4 vshufi32x4 m0, m14, m2, q2020 ; 1 vshufi32x4 m14, m2, q3131 ; 5 vshufi32x4 m3, m19, m7, q3131 ; 15 vshufi32x4 m19, m7, q2020 ; 11 vshufi32x4 m27, m17, m9, q2020 ; 3 vshufi32x4 m17, m9, q3131 ; 7 vshufi32x4 m28, m20, m6, q2020 ; 9 vshufi32x4 m20, m6, q3131 ; 13 vshufi32x4 m22, m1, m18, q2020 ; 2 vshufi32x4 m23, m1, m18, q3131 ; 6 vshufi32x4 m24, m5, m15, q2020 ; 10 vshufi32x4 m25, m5, m15, q3131 ; 14 vshufi32x4 m15, m21, m4, q3131 ; 12 vshufi32x4 m21, m21, m4, q2020 ; 8 mov r4, rsp call .main_part1_fast mova m0, m17 mova m3, m28 call .main_part1_fast mova m0, m14 mova m3, m19 call .main_part1_fast mova m0, m27 mova m3, m20 call .main_part1_fast call .main_part2 mova m0, m16 mova m1, m21 mova m14, m26 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [cq+64*14], m21 mova [cq+64* 0], m14 mova [cq+64* 6], m17 mova [cq+64* 8], m18 mova [cq+64*10], m19 mova [cq+64* 4], m16 mova [cq+64* 2], m15 mova [cq+64*12], m20 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 .end: lea r4, [strideq*3] vpbroadcastd m12, [o(pw_2048)] movshdup m13, [o(permD)] lea r5, [r4+strideq] ; stride*4 lea r3, [dstq+r4*8] lea r6, [strideq+r5*8] ; stride*33 lea r8, [r4+r5*8] ; stride*35 add r3, r5 ; dst+stride*28 lea r7, [r6+strideq] ; stride*34 %macro IDCT_32x64_END 6 ; src, mem, stride[1-4] %if %2 < 8 paddsw m10, m%2, m%1 psubsw m11, m%2, m%1 %else mova m11, [cq+64*(%2*2-16)] paddsw m10, m11, m%1 psubsw m11, 
m%1 %endif mova m9, [rsp+64*(31-%2)] mova m%1, [rsp+64*%2] paddsw m8, m10, m9 psubsw m10, m9 paddsw m9, m11, m%1 pmovzxbw m0, [dstq+%3] psubsw m11, m%1 pmovzxbw m%1, [r3 +%4] REPX {pmulhrsw x, m12}, m8, m10, m9, m11 paddw m8, m0 pmovzxbw m0, [r3 +%5] paddw m10, m%1 pmovzxbw m%1, [dstq+%6] paddw m9, m0 paddw m11, m%1 %if %2 >= 8 %if %2 == 8 pxor m1, m1 %endif mova [cq+64*(%2*2-16)], m1 mova [cq+64*(%2*2-15)], m1 %endif packuswb m8, m10 packuswb m9, m11 vpermq m8, m13, m8 vpermq m9, m13, m9 mova [dstq+%3], ym8 vextracti32x8 [r3 +%4], m8, 1 mova [r3 +%5], ym9 vextracti32x8 [dstq+%6], m9, 1 %if %2 == 3 || %2 == 7 || %2 == 11 add dstq, r5 sub r3, r5 %endif %endmacro IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 64 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 ALIGN function_align ; bottom three-quarters are zero cglobal_label .main_part1_fast2 vpbroadcastd m7, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] pmulhrsw m7, m0 ; t63a pmulhrsw m0, m8 ; t32a punpcklwd m4, m0, m7 punpckhwd m6, m0, m7 mova m1, m10 vpdpwssd m1, m4, [o(idct64_mul+4*9)] {bcstd} mova m9, m10 vpdpwssd m9, m6, [o(idct64_mul+4*9)] {bcstd} REPX {psrad x, 12}, m1, m9 packssdw m1, m9 mova m9, m10 vpdpwssd m9, m6, [o(idct64_mul+4*8)] {bcstd} mova m6, m10 vpdpwssd m6, m4, [o(idct64_mul+4*8)] {bcstd} REPX {psrad x, 12}, m9, m6 packssdw m6, m9 mova m4, m0 mova m3, m7 mova m5, m1 mova m2, m6 jmp .main_part1c cglobal_label .main_part1_fast vpbroadcastd m1, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] vpbroadcastd m2, [o(idct64_mul+4*6)] vpbroadcastd m9, [o(idct64_mul+4*7)] pmulhrsw m1, m0 ; t63a pmulhrsw m0, m8 ; t32a pmulhrsw m2, m3 ; t60a pmulhrsw m3, m9 ; t35a mova m8, m0 mova m7, m1 mova m6, m3 mova m5, m2 jmp .main_part1b cglobal_label .main_part1 ; idct64 steps 1-5: ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a vpbroadcastd m7, [o(idct64_mul+4*0)] vpbroadcastd m8, [o(idct64_mul+4*1)] vpbroadcastd m6, [o(idct64_mul+4*2)] vpbroadcastd m9, [o(idct64_mul+4*3)] pmulhrsw m7, m0 ; t63a vpbroadcastd m5, [o(idct64_mul+4*4)] pmulhrsw m0, m8 ; t32a vpbroadcastd m8, [o(idct64_mul+4*5)] pmulhrsw m6, m1 ; t62a vpbroadcastd m4, [o(idct64_mul+4*6)] pmulhrsw m1, m9 ; t33a vpbroadcastd m9, [o(idct64_mul+4*7)] pmulhrsw m5, m2 ; t61a pmulhrsw m2, m8 ; t34a pmulhrsw m4, m3 ; t60a pmulhrsw m3, m9 ; t35a psubsw m8, m0, m1 ; t33 paddsw m0, m1 ; t32 psubsw m1, m7, m6 ; t62 paddsw m7, m6 ; t63 psubsw m6, m3, m2 ; t34 paddsw m3, m2 ; t35 psubsw m2, m4, m5 ; t61 paddsw m5, m4 ; t60 .main_part1b: vpbroadcastd m11, 
[o(idct64_mul+4*8)] vpbroadcastd m12, [o(idct64_mul+4*9)] ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a vpbroadcastd m11, [o(idct64_mul+4*10)] ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a psubsw m4, m0, m3 ; t35a paddsw m0, m3 ; t32a psubsw m3, m7, m5 ; t60a paddsw m7, m5 ; t63a psubsw m5, m1, m2 ; t34 paddsw m1, m2 ; t33 psubsw m2, m8, m6 ; t61 paddsw m6, m8 ; t62 .main_part1c: vpbroadcastd m11, [o(idct64_mul+4*11)] vpbroadcastd m12, [o(idct64_mul+4*12)] add r5, 4*13 ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a mova [r4+64*0], m0 mova [r4+64*7], m7 mova [r4+64*1], m1 mova [r4+64*6], m6 mova [r4+64*3], m3 mova [r4+64*4], m4 mova [r4+64*2], m2 mova [r4+64*5], m5 add r4, 64*8 ret cglobal_label .main_part2 vpbroadcastd m11, [o(pw_1567_3784 -16*13)] vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] lea r6, [r4+64*7] vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] vpbroadcastd m18, [o(pw_2896_2896 -16*13)] vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] sub r5, 16*13 .main_part2_loop: mova m0, [r4-64*32] ; t32a mova m1, [r6-64*24] ; t39a mova m2, [r6-64*32] ; t63a mova m3, [r4-64*24] ; t56a mova m4, [r4-64*16] ; t40a mova m5, [r6-64* 8] ; t47a mova m6, [r6-64*16] ; t55a mova m7, [r4-64* 8] ; t48a psubsw m8, m0, m1 ; t39 paddsw m0, m1 ; t32 psubsw m1, m2, m3 ; t56 paddsw m2, m3 ; t63 psubsw m3, m5, m4 ; t40 paddsw m5, m4 ; t47 psubsw m4, m7, m6 ; t55 paddsw m7, m6 ; t48 ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a psubsw m6, m2, m7 ; t48a paddsw m2, m7 ; t63a psubsw m7, m0, m5 ; t47a paddsw m0, m5 ; t32a psubsw m5, m8, m3 ; t55 paddsw m8, m3 ; t56 psubsw m3, m1, m4 ; t40 paddsw m1, m4 ; t39 ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a mova [r6-64* 8], m2 mova [r4-64*32], m0 mova [r4-64* 8], m8 mova [r6-64*32], m1 mova [r6-64*24], m6 mova [r4-64*16], m7 mova [r4-64*24], m5 mova [r6-64*16], m3 add r4, 64 sub r6, 64 cmp r4, r6 jb .main_part2_loop ret cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob vpbroadcastd m23, [o(pw_2896x8)] %undef cmp cmp eobd, 136 jb .fast pmulhrsw m0, m23, [cq+64* 1] pmulhrsw m1, m23, [cq+64*31] pmulhrsw m2, m23, [cq+64*17] pmulhrsw m3, m23, [cq+64*15] vpbroadcastd m10, [o(pd_2048)] mov r4, rsp call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 7] pmulhrsw m1, m23, [cq+64*25] pmulhrsw m2, m23, [cq+64*23] pmulhrsw m3, m23, [cq+64* 9] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 5] pmulhrsw m1, m23, [cq+64*27] pmulhrsw m2, m23, [cq+64*21] pmulhrsw m3, m23, [cq+64*11] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 pmulhrsw m0, m23, [cq+64* 3] pmulhrsw m1, m23, [cq+64*29] pmulhrsw m2, m23, [cq+64*19] pmulhrsw m3, m23, [cq+64*13] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 pmulhrsw m3, m23, [cq+64*24] pmulhrsw m1, m23, [cq+64* 8] pmulhrsw m2, m23, [cq+64*16] pmulhrsw m0, m23, [cq+64* 0] pmulhrsw m14, m23, [cq+64* 4] pmulhrsw m17, m23, [cq+64*28] pmulhrsw m16, m23, [cq+64*20] pmulhrsw m15, m23, [cq+64*12] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast pmulhrsw m22, m23, [cq+64* 2] pmulhrsw m29, m23, [cq+64*30] pmulhrsw m26, m23, [cq+64*18] pmulhrsw m25, m23, [cq+64*14] pmulhrsw m24, m23, [cq+64*10] pmulhrsw m27, m23, [cq+64*22] pmulhrsw m28, m23, [cq+64*26] pmulhrsw m23, [cq+64* 6] mova 
[cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_16384)] call .pass1_end_part1 mova [cq+64*16], m1 mova [cq+64*17], m3 mova [cq+64*18], m5 mova [cq+64*19], m7 mova [cq+64*24], m23 mova [cq+64*25], m25 mova [cq+64*26], m27 mova [cq+64*27], m29 pmulhrsw m23, m13, m0 ; a0 pmulhrsw m25, m13, m2 ; a2 pmulhrsw m27, m13, m4 ; a4 pmulhrsw m29, m13, m6 ; a6 REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 call .pass1_end_part2 mova [cq+64*20], m15 mova [cq+64*21], m17 mova [cq+64*22], m19 mova [cq+64*23], m21 mova [cq+64*28], m1 mova [cq+64*29], m3 mova [cq+64*30], m5 mova [cq+64*31], m7 REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 vshufi32x4 m2, m3, m15, q3131 ; 8 vshufi32x4 m0, m3, m15, q2020 ; 0 vshufi32x4 m6, m23, m22, q3131 ; 24 vshufi32x4 m4, m23, m22, q2020 ; 16 vshufi32x4 m3, m1, m18, q3131 ; 12 vshufi32x4 m1, m18, q2020 ; 4 vshufi32x4 m7, m27, m26, q3131 ; 28 vshufi32x4 m5, m27, m26, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m16, m14, m17, q3131 ; 10 vshufi32x4 m14, m17, q2020 ; 2 vshufi32x4 m17, m19, m20, q3131 ; 14 vshufi32x4 m15, m19, m20, q2020 ; 6 vshufi32x4 m20, m25, m24, q3131 ; 26 vshufi32x4 m18, m25, m24, q2020 ; 18 vshufi32x4 m21, m29, m28, q3131 ; 30 vshufi32x4 m19, m29, m28, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf pmulhrsw m22, m13, [cq+64*16] ; a1 pmulhrsw m23, m13, [cq+64*20] ; c1 pmulhrsw m24, m13, [cq+64*24] ; e1 pmulhrsw m25, m13, [cq+64*28] ; g1 pmulhrsw m26, m13, [cq+64*17] ; a3 pmulhrsw m27, m13, [cq+64*21] ; c3 pmulhrsw m28, m13, [cq+64*25] ; e3 pmulhrsw m29, m13, [cq+64*29] ; g3 mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 pmulhrsw m14, m13, [cq+64*18] ; a5 pmulhrsw m15, m13, [cq+64*22] ; c5 pmulhrsw m16, m13, [cq+64*26] ; e5 pmulhrsw m17, m13, [cq+64*30] ; g5 pmulhrsw m18, m13, [cq+64*19] ; a7 pmulhrsw m19, m13, [cq+64*23] ; c7 pmulhrsw m20, m13, [cq+64*27] ; e7 pmulhrsw m21, m13, [cq+64*31] ; g7 vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 mova [cq+64* 0], m0 mova [cq+64* 1], m1 mova [cq+64* 2], m2 mova [cq+64* 3], m3 mova [cq+64* 4], m4 mova [cq+64* 5], m5 mova [cq+64* 6], m6 mova 
[cq+64* 7], m7 vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 vshufi32x4 m27, m23, m11, q3131 ; 11 m27 vshufi32x4 m23, m11, q2020 ; 3 m23 vshufi32x4 m19, m26, m28, q3131 ; 27 m19 vshufi32x4 m15, m26, m28, q2020 ; 19 m15 vshufi32x4 m29, m25, m17, q3131 ; 15 m29 vshufi32x4 m25, m17, q2020 ; 7 m25 vshufi32x4 m21, m18, m20, q3131 ; 31 m21 vshufi32x4 m17, m18, m20, q2020 ; 23 m17 vshufi32x4 m20, m14, m16, q3131 ; 29 m20 vshufi32x4 m16, m14, m16, q2020 ; 21 m16 vshufi32x4 m18, m22, m24, q3131 ; 25 m18 vshufi32x4 m14, m22, m24, q2020 ; 17 m14 vshufi32x4 m26, m8, m9, q3131 ; 9 m26 vshufi32x4 m22, m8, m9, q2020 ; 1 m22 vshufi32x4 m28, m12, m13, q3131 ; 13 m28 vshufi32x4 m24, m12, m13, q2020 ; 5 m24 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf vpbroadcastd m13, [o(pw_16384)] pmulhrsw m0, m13, [r4-64*21] pmulhrsw m1, m13, [r4-64*22] pmulhrsw m2, m13, [r4-64*23] pmulhrsw m3, m13, [r4-64*24] pmulhrsw m4, m13, [r4-64*25] pmulhrsw m5, m13, [r4-64*26] pmulhrsw m6, m13, [r4-64*27] pmulhrsw m7, m13, [r4-64*28] mova [cq+64*16], m14 mova [cq+64*17], m15 mova [cq+64*18], m16 mova [cq+64*19], m17 mova [cq+64*20], m18 mova [cq+64*21], m19 mova [cq+64*22], m20 mova [cq+64*23], m21 pmulhrsw m14, m13, [r4-64*12] pmulhrsw m15, m13, [r4-64*11] pmulhrsw m16, m13, [r4-64*10] pmulhrsw m17, m13, [r4-64* 9] pmulhrsw m18, m13, [r4-64* 8] pmulhrsw m19, m13, [r4-64* 7] pmulhrsw m20, m13, [r4-64* 6] pmulhrsw m21, m13, [r4-64* 5] mova [cq+64*24], m22 mova [cq+64*25], m23 mova [cq+64*26], m24 mova [cq+64*27], m25 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call .transpose_2x8x8_lo mova [r4-64*12], m1 mova [r4-64*11], m3 mova [r4-64*10], m5 mova [r4-64* 9], m7 mova [r4-64* 8], m15 mova [r4-64* 7], m17 mova [r4-64* 6], m19 mova [r4-64* 5], m21 vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 pmulhrsw m0, m13, [r4-64*20] pmulhrsw m1, m13, [r4-64*19] pmulhrsw m2, m13, [r4-64*18] pmulhrsw m3, m13, [r4-64*17] pmulhrsw m4, m13, [r4-64*16] pmulhrsw m5, m13, [r4-64*15] pmulhrsw m6, m13, [r4-64*14] pmulhrsw m7, m13, [r4-64*13] pmulhrsw m14, m13, [r4-64*29] pmulhrsw m15, m13, [r4-64*30] pmulhrsw m16, m13, [r4-64*31] pmulhrsw m17, m13, [r4-64*32] pmulhrsw m18, m13, [r4-64*33] pmulhrsw m19, m13, [r4-64*34] pmulhrsw m20, m13, [r4-64*35] pmulhrsw m21, m13, [r4-64*36] call .transpose_2x8x8_lo mova [r4-64*20], m1 mova [r4-64*19], m3 mova [r4-64*18], m5 mova [r4-64*17], m7 mova [r4-64*16], m15 mova [r4-64*15], m17 mova [r4-64*14], m19 mova [r4-64*13], m21 vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 vshufi32x4 m2, 
m0, m22, q3131 ; 8 vshufi32x4 m0, m22, q2020 ; 0 vshufi32x4 m3, m1, m26, q3131 ; 12 vshufi32x4 m1, m26, q2020 ; 4 vshufi32x4 m6, m4, m23, q3131 ; 24 vshufi32x4 m4, m23, q2020 ; 16 vshufi32x4 m7, m5, m27, q3131 ; 28 vshufi32x4 m5, m27, q2020 ; 20 call m(inv_txfm_add_dct_dct_32x8_8bpc).main vshufi32x4 m16, m14, m24, q3131 ; 10 vshufi32x4 m14, m24, q2020 ; 2 vshufi32x4 m17, m15, m28, q3131 ; 14 vshufi32x4 m15, m28, q2020 ; 6 vshufi32x4 m20, m18, m25, q3131 ; 26 vshufi32x4 m18, m25, q2020 ; 18 vshufi32x4 m21, m19, m29, q3131 ; 30 vshufi32x4 m19, m29, q2020 ; 22 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf mova m22, [r4-64*20] mova m26, [r4-64*16] mova m23, [r4-64*19] mova m27, [r4-64*15] mova m24, [r4-64*18] mova m28, [r4-64*14] mova m25, [r4-64*17] mova m29, [r4-64*13] mova [r4-64*20], m14 mova [r4-64*19], m15 mova [r4-64*18], m16 mova [r4-64*17], m17 mova [r4-64*16], m18 mova [r4-64*15], m19 mova [r4-64*14], m20 mova [r4-64*13], m21 mova m19, [r4-64*12] mova m11, [r4-64* 8] mova m20, [r4-64*11] mova m12, [r4-64* 7] mova m21, [r4-64*10] mova m8, [r4-64* 6] mova m9, [r4-64* 9] mova m18, [r4-64* 5] vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 vshufi32x4 m26, m22, m27, q3131 ; 9 vshufi32x4 m22, m27, q2020 ; 1 vshufi32x4 m27, m23, m28, q3131 ; 11 vshufi32x4 m23, m28, q2020 ; 3 vshufi32x4 m28, m24, m29, q3131 ; 13 vshufi32x4 m24, m29, q2020 ; 5 vshufi32x4 m29, m25, m8, q3131 ; 15 vshufi32x4 m25, m8, q2020 ; 7 vshufi32x4 m18, m14, m19, q3131 ; 25 vshufi32x4 m14, m19, q2020 ; 17 vshufi32x4 m19, m15, m20, q3131 ; 27 vshufi32x4 m15, m20, q2020 ; 19 vshufi32x4 m20, m16, m21, q3131 ; 29 vshufi32x4 m16, m21, q2020 ; 21 vshufi32x4 m21, m17, m9, q3131 ; 31 vshufi32x4 m17, m9, q2020 ; 23 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf jmp .end .fast: ; bottom/right halves are zero {evex}vpmulhrsw ym8, ym23, [cq+64* 4] {evex}vpmulhrsw xm1, xm23, [cq+64*12] mova m28, [o(dup16_perm)] {evex}vpmulhrsw ym7, ym23, [cq+64* 8] vpmulhrsw ym22, ym23, [cq+64* 0] vpermb m8, m28, m8 vpermb ym1, ym28, ym1 vpermb m7, m28, m7 pmovzxwd m9, ym22 pslld m9, 16 call m(idct_16x16_internal_8bpc).main_fast2 {evex}vpmulhrsw ym21, ym23, [cq+64* 2] {evex}vpmulhrsw xm15, xm23, [cq+64*14] {evex}vpmulhrsw xm18, xm23, [cq+64*10] {evex}vpmulhrsw ym14, ym23, [cq+64* 6] vpermb m21, m28, m21 punpcklwd xm15, xm15 vpermb ym18, ym28, ym18 vpermb m14, m28, m14 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 vpmulhrsw ym22, ym23, [cq+64* 1] {evex}vpmulhrsw xm29, xm23, [cq+64*15] {evex}vpmulhrsw xm26, xm23, [cq+64* 9] {evex}vpmulhrsw ym25, ym23, [cq+64* 7] {evex}vpmulhrsw ym24, ym23, [cq+64* 5] {evex}vpmulhrsw xm27, xm23, [cq+64*11] {evex}vpmulhrsw xm8, xm23, [cq+64*13] {evex}vpmulhrsw ym23, [cq+64* 3] vpermb m22, m28, m22 punpcklwd xm29, xm29 vpermb ym26, ym28, ym26 vpermb m25, m28, m25 mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova 
[cq+64* 2], m16 mova [cq+64* 3], m17 REPX {vpermb x, m28, x}, m24, m27, m23 punpcklwd xm28, xm8, xm8 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast mov r4, rsp vpbroadcastd m13, [o(pw_16384)] mova [r4+64*16], m4 mova [r4+64*17], m5 mova [r4+64*18], m6 mova [r4+64*19], m7 mova [r4+64*28], m26 mova [r4+64*29], m27 mova [r4+64*30], m28 mova [r4+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end mova [r4+64*20], m22 mova [r4+64*21], m23 mova [r4+64*22], m24 mova [r4+64*23], m25 mova [r4+64*24], m26 mova [r4+64*25], m27 mova [r4+64*26], m28 mova [r4+64*27], m29 call .pass2_fast mova [cq+64* 8], m14 mova [cq+64* 9], m15 mova [cq+64*10], m16 mova [cq+64*11], m17 mova [cq+64*12], m18 mova [cq+64*13], m19 mova [cq+64*14], m20 mova [cq+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast mova [cq+64* 0], m0 mova [cq+64* 1], m1 mova [cq+64* 2], m2 mova [cq+64* 3], m3 mova [cq+64* 4], m4 mova [cq+64* 5], m5 mova [cq+64* 6], m6 mova [cq+64* 7], m7 pmulhrsw m0, m13, [r4+64*16] pmulhrsw m1, m13, [r4+64*17] pmulhrsw m2, m13, [r4+64*18] pmulhrsw m3, m13, [r4+64*19] pmulhrsw m4, m13, [r4+64*20] pmulhrsw m5, m13, [r4+64*21] pmulhrsw m6, m13, [r4+64*22] pmulhrsw m7, m13, [r4+64*23] mova [cq+64*16], m14 mova [cq+64*17], m15 mova [cq+64*18], m16 mova [cq+64*19], m17 mova [cq+64*20], m18 mova [cq+64*21], m19 mova [cq+64*22], m20 mova [cq+64*23], m21 pmulhrsw m14, m13, [r4+64*24] pmulhrsw m15, m13, [r4+64*25] pmulhrsw m16, m13, [r4+64*26] pmulhrsw m17, m13, [r4+64*27] pmulhrsw m18, m13, [r4+64*28] pmulhrsw m19, m13, [r4+64*29] pmulhrsw m20, m13, [r4+64*30] pmulhrsw m21, m13, [r4+64*31] mova [cq+64*24], m22 mova [cq+64*25], m23 mova [cq+64*26], m24 mova [cq+64*27], m25 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round call .pass2_fast mova [r4+64*16], m14 mova [r4+64*17], m15 mova [r4+64*18], m16 mova [r4+64*19], m17 mova [r4+64*20], m18 mova [r4+64*21], m19 mova [r4+64*22], m20 mova [r4+64*23], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast .end: vpbroadcastd m13, [o(pw_2048)] lea r5, [strideq*3] pxor m12, m12 lea r3, [dstq+r5*8] lea r6, [strideq+r5] ; stride*4 add r3, r6 ; dst+stride*28 %macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi mova m11, [cq+64*( %3)] ; 0 mova m9, [cq+64*(31-%3)] ; 31 %if %3 >= 8 mova m%1, [rsp+64*(%1+16)] %endif mova m10, [dstq+%4] paddsw m8, m11, m9 psubsw m11, m9 paddsw m9, m%1, m%2 psubsw m%1, m%2 punpcklbw m%2, m10, m12 punpckhbw m10, m12 pmulhrsw m8, m13 pmulhrsw m9, m13 paddw m8, m%2 paddw m9, m10 mova m10, [r3+%5] pmulhrsw m11, m13 pmulhrsw m%1, m13 mova [cq+64*( %3)], m12 mova [cq+64*(31-%3)], m12 punpcklbw m%2, m10, m12 punpckhbw m10, m12 packuswb m8, m9 paddw m11, m%2 paddw m%1, m10 packuswb m11, m%1 mova [dstq+%4], m8 mova [r3 +%5], m11 %if %3 == 3 || %3 == 7 || %3 == 11 add dstq, r6 sub r3, r6 %endif %endmacro IDCT_64x32_END 0, 29, 0, strideq*0, r5 IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 IDCT_64x32_END 3, 26, 3, r5 , strideq*0 IDCT_64x32_END 4, 25, 4, strideq*0, r5 IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 IDCT_64x32_END 7, 22, 7, r5 , strideq*0 IDCT_64x32_END 0, 21, 8, strideq*0, r5 IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 IDCT_64x32_END 3, 18, 11, r5 , strideq*0 IDCT_64x32_END 4, 17, 12, 
strideq*0, r5 IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 IDCT_64x32_END 7, 14, 15, r5 , strideq*0 RET ALIGN function_align .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 32 imul r6d, 181 add r6d, 128 sar r6d, 8 imul r6d, 181 add r6d, 128+256 sar r6d, 8+1 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 ALIGN function_align .pass1_end_part1: %macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 %if %1 != %3 mova m%1, [cq+64*%1] %endif mova m9, [r4+64*(%3-36)] ; idct64 32+n mova m11, [r4+64*(-5-%3)] ; idct64 63-n psubsw m8, m%1, m%2 ; idct32 31-n paddsw m%1, m%2 ; idct32 0+n %if %1 == %3 psubsw m%2, m8, m9 ; out 32+n e paddsw m8, m9 ; out 31-n d psubsw m9, m%1, m11 ; out 63-n h paddsw m%1, m11 ; out 0+n a %else paddsw m%2, m8, m9 ; out 23-n c psubsw m8, m9 ; out 40+n f paddsw m9, m%1, m11 ; out 8+n b psubsw m%1, m11 ; out 55-n g %endif mova [r4+64*(%3-36)], m8 mova [r4+64*(-5-%3)], m9 %endmacro IDCT_64x32_PASS1_END 0, 29, 0 IDCT_64x32_PASS1_END 1, 28, 1 IDCT_64x32_PASS1_END 2, 27, 2 IDCT_64x32_PASS1_END 3, 26, 3 IDCT_64x32_PASS1_END 4, 25, 4 IDCT_64x32_PASS1_END 5, 24, 5 IDCT_64x32_PASS1_END 6, 23, 6 IDCT_64x32_PASS1_END 7, 22, 7 .transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 punpckhqdq m23, m22, m27 ; 1 23 punpcklqdq m22, m27 ; 0 22 punpckhqdq m27, m26, m28 ; 5 27 punpcklqdq m26, m28 ; 4 26 punpcklqdq m28, m29, m25 ; 6 28 punpckhqdq m29, m25 ; 7 29 punpckhqdq m25, m24, m8 ; 3 25 punpcklqdq m24, m8 ; 2 24 .transpose_8x8: punpckhwd m8, m4, m5 punpcklwd m4, m5 punpckhwd m5, m0, m1 punpcklwd m0, m1 punpckhwd m1, m6, m7 punpcklwd m6, m7 punpckhwd m7, m2, m3 punpcklwd m2, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckldq m2, m4, m6 punpckhdq m4, m6 punpckhdq m6, m5, m7 punpckldq m5, m7 punpckldq m7, m8, m1 punpckhdq m8, m1 punpckhqdq m1, m0, m2 punpcklqdq m0, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 punpcklqdq m4, m5, m7 punpckhqdq m5, m7 punpckhqdq m7, m6, m8 punpcklqdq m6, m8 ret .pass1_end_part2: IDCT_64x32_PASS1_END 0, 21, 8 IDCT_64x32_PASS1_END 1, 20, 9 IDCT_64x32_PASS1_END 2, 19, 10 IDCT_64x32_PASS1_END 3, 18, 11 IDCT_64x32_PASS1_END 4, 17, 12 IDCT_64x32_PASS1_END 5, 16, 13 IDCT_64x32_PASS1_END 6, 15, 14 IDCT_64x32_PASS1_END 7, 14, 15 .transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 punpcklwd m8, m3, m2 punpckhwd m3, m2 punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m7, m6 punpckhwd m7, m6 punpcklwd m6, m5, m4 punpckhwd m5, m4 punpckldq m4, m7, m5 punpckhdq m7, m5 punpckldq m5, m8, m2 punpckhdq m8, m2 punpckhdq m2, m0, m6 punpckldq m0, m6 punpckldq m6, m3, m1 punpckhdq m3, m1 punpckhqdq m1, m0, m5 punpcklqdq m0, m5 punpckhqdq m5, m4, m6 punpcklqdq m4, m6 punpcklqdq m6, m7, m3 punpckhqdq m7, m3 punpckhqdq m3, m2, m8 punpcklqdq m2, m8 punpckhwd m8, m18, m19 punpcklwd m18, m19 punpckhwd 
m19, m14, m15 punpcklwd m14, m15 punpckhwd m15, m20, m21 punpcklwd m20, m21 punpckhwd m21, m16, m17 punpcklwd m16, m17 punpckhdq m17, m14, m16 punpckldq m14, m16 punpckldq m16, m18, m20 punpckhdq m18, m20 punpckhdq m20, m19, m21 punpckldq m19, m21 punpckldq m21, m8, m15 punpckhdq m8, m15 punpckhqdq m15, m14, m16 punpcklqdq m14, m16 punpcklqdq m16, m17, m18 punpckhqdq m17, m18 punpcklqdq m18, m19, m21 punpckhqdq m19, m21 punpckhqdq m21, m20, m8 punpcklqdq m20, m8 ret .pass2_fast: vshufi32x4 m24, m9, m15, q3131 ; 5 vshufi32x4 m22, m9, m15, q2020 ; 1 vshufi32x4 m15, m1, m16, q3131 ; 6 vshufi32x4 m14, m1, m16, q2020 ; 2 vshufi32x4 m1, m0, m3, q3131 ; 4 vshufi32x4 m0, m3, q2020 ; 0 vshufi32x4 m3, m8, m2, q3131 ; 12 vshufi32x4 m2, m8, m2, q2020 ; 8 vshufi32x4 m25, m11, m17, q3131 ; 7 vshufi32x4 m23, m11, m17, q2020 ; 3 vshufi32x4 m17, m5, m19, q3131 ; 14 vshufi32x4 m16, m5, m19, q2020 ; 10 vshufi32x4 m29, m6, m20, q3131 ; 15 vshufi32x4 m27, m6, m20, q2020 ; 11 vshufi32x4 m28, m4, m18, q3131 ; 13 vshufi32x4 m26, m4, m18, q2020 ; 9 jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob lea r5, [o_base] test eobd, eobd jz .dconly PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob %undef cmp cmp eobd, 136 jb .fast mova m0, [cq+64* 1] mova m1, [cq+64*31] mova m2, [cq+64*17] mova m3, [cq+64*15] vpbroadcastd m10, [o(pd_2048)] mov r4, rsp call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 7] mova m1, [cq+64*25] mova m2, [cq+64*23] mova m3, [cq+64* 9] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 5] mova m1, [cq+64*27] mova m2, [cq+64*21] mova m3, [cq+64*11] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 mova m0, [cq+64* 3] mova m1, [cq+64*29] mova m2, [cq+64*19] mova m3, [cq+64*13] call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, [cq+64* 0] mova m1, [cq+64* 8] mova m2, [cq+64*16] mova m3, [cq+64*24] mova m14, [cq+64* 4] mova m15, [cq+64*12] mova m16, [cq+64*20] mova m17, [cq+64*28] call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast mova m22, [cq+64* 2] mova m29, [cq+64*30] mova m26, [cq+64*18] mova m25, [cq+64*14] mova m24, [cq+64*10] mova m27, [cq+64*22] mova m28, [cq+64*26] mova m23, [cq+64* 6] mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 mova [r4+64*36], m1 mova [r4+64*37], m3 mova [r4+64*38], m5 mova [r4+64*39], m7 mova [r4+64*44], m23 mova [r4+64*45], m25 mova [r4+64*46], m27 mova [r4+64*47], m29 pmulhrsw m23, m13, m0 ; a0 pmulhrsw m25, m13, m2 ; a2 pmulhrsw m27, m13, m4 ; a4 pmulhrsw m29, m13, m6 ; a6 call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 lea r6, [r4-64*4] add r4, 64*28 call .pass2_end mov r4, rsp mova m0, [r4+64*23] mova m1, [r4+64*22] mova m2, [r4+64*21] mova m3, [r4+64*20] mova m4, [r4+64*19] mova m5, [r4+64*18] mova m6, [r4+64*17] mova m7, [r4+64*16] mova m22, [r4+64*15] mova m23, [r4+64*14] mova m24, [r4+64*13] mova m25, [r4+64*12] mova m26, [r4+64*11] mova m27, [r4+64*10] mova m28, [r4+64* 9] mova m29, [r4+64* 8] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi vpbroadcastd m13, [o(pw_8192)] mova [r4+64* 8], m1 mova [r4+64* 9], m3 mova [r4+64*10], m5 mova [r4+64*11], m7 mova [r4+64*16], m23 mova [r4+64*17], m25 mova 
[r4+64*18], m27 mova [r4+64*19], m29 pmulhrsw m23, m13, m0 ; b0 pmulhrsw m25, m13, m2 ; b2 pmulhrsw m27, m13, m4 ; b4 pmulhrsw m29, m13, m6 ; b6 mova m0, [r4+64*31] mova m1, [r4+64*30] mova m2, [r4+64*29] mova m3, [r4+64*28] mova m4, [r4+64*27] mova m5, [r4+64*26] mova m6, [r4+64*25] mova m7, [r4+64*24] mova m14, [r4+64* 7] mova m15, [r4+64* 6] mova m16, [r4+64* 5] mova m17, [r4+64* 4] mova m18, [r4+64* 3] mova m19, [r4+64* 2] mova m20, [r4+64* 1] mova m21, [r4+64* 0] call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo mov r6, cq call .pass2_end jmp .end .fast: ; bottom/right halves are zero mova m28, [o(dup16_perm)] pmovzxwd m9, [cq+64* 0] vpermb m8, m28, [cq+64* 4] vpermb ym1, ym28, [cq+64*12] vpermb m7, m28, [cq+64* 8] pslld m9, 16 call m(idct_16x16_internal_8bpc).main_fast2 vpermb m21, m28, [cq+64* 2] vpermb ym15, ym28, [cq+64*14] vpermb ym18, ym28, [cq+64*10] vpermb m14, m28, [cq+64* 6] call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 vpermb m22, m28, [cq+64* 1] vpermb ym29, ym28, [cq+64*15] vpermb ym26, ym28, [cq+64* 9] vpermb m25, m28, [cq+64* 7] vpermb m24, m28, [cq+64* 5] vpermb ym27, ym28, [cq+64*11] vpermb m23, m28, [cq+64* 3] vpermb ym28, ym28, [cq+64*13] mova [cq+64* 0], m14 mova [cq+64* 1], m15 mova [cq+64* 2], m16 mova [cq+64* 3], m17 mova [cq+64* 4], m18 mova [cq+64* 5], m19 mova [cq+64* 6], m20 mova [cq+64* 7], m21 call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] mova [cq+64*16], m4 mova [cq+64*17], m5 mova [cq+64*18], m6 mova [cq+64*19], m7 mova [cq+64*28], m26 mova [cq+64*29], m27 mova [cq+64*30], m28 mova [cq+64*31], m29 call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end mova [cq+64*20], m22 mova [cq+64*21], m23 mova [cq+64*22], m24 mova [cq+64*23], m25 mova [cq+64*24], m26 mova [cq+64*25], m27 mova [cq+64*26], m28 mova [cq+64*27], m29 lea r4, [rsp+64*64] lea r3, [rsp+64*32] call .pass2_fast pmulhrsw m0, m13, [cq+64*16] pmulhrsw m1, m13, [cq+64*17] pmulhrsw m2, m13, [cq+64*18] pmulhrsw m3, m13, [cq+64*19] pmulhrsw m4, m13, [cq+64*20] pmulhrsw m5, m13, [cq+64*21] pmulhrsw m6, m13, [cq+64*22] pmulhrsw m7, m13, [cq+64*23] pmulhrsw m14, m13, [cq+64*24] pmulhrsw m15, m13, [cq+64*25] pmulhrsw m16, m13, [cq+64*26] pmulhrsw m17, m13, [cq+64*27] pmulhrsw m18, m13, [cq+64*28] pmulhrsw m19, m13, [cq+64*29] pmulhrsw m20, m13, [cq+64*30] pmulhrsw m21, m13, [cq+64*31] call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round mov r4, rsp mov r3, cq call .pass2_fast .end: vpbroadcastd m17, [o(pw_2048)] lea r5, [strideq*8] mov r3, dstq pxor m16, m16 sub r4, 64*5 ; rsp+64*31 mov r6, rsp .end_loop: mova m2, [r6+64*32] ; idct16 0+n lo mova m7, [r6+64*48] ; idct32 31-n lo mova m6, [cq+64* 0] ; idct16 0+n hi mova m0, [cq+64*16] ; idct32 31-n hi mova m4, [r4+64*64] ; idct64 63-n lo mova m1, [r4+64* 0] ; idct64 63-n hi mova m5, [r6+64*64] ; idct64 32+n lo mova m8, [r6+64* 0] ; idct64 32+n hi sub r3, strideq paddsw m3, m2, m7 ; idct32 0+n lo mova m12, [dstq+r5*0] psubsw m2, m7 ; idct32 31-n lo mova m15, [r3 +r5*8] paddsw m7, m6, m0 ; idct32 0+n hi mova m13, [r3 +r5*4] psubsw m6, m0 ; idct32 31-n hi mova m14, [dstq+r5*4] paddsw m0, m3, m4 ; out 0+n lo add r6, 64 psubsw m3, m4 ; out 63-n lo sub r4, 64 paddsw m4, m7, m1 ; out 0+n hi mova [cq+64* 0], m16 psubsw m7, m1 ; out 63-n hi mova [cq+64*16], m16 paddsw m1, m2, m5 ; out 31-n lo add cq, 64 psubsw m2, m5 ; out 32+n lo paddsw m5, m6, m8 ; out 31-n hi psubsw m6, m8 ; out 32+n hi pmulhrsw m0, m17 punpcklbw m8, m12, m16 pmulhrsw m4, m17 punpckhbw m12, m16 pmulhrsw m3, m17 punpcklbw m11, m15, m16 
pmulhrsw m7, m17 punpckhbw m15, m16 pmulhrsw m1, m17 punpcklbw m9, m13, m16 pmulhrsw m5, m17 punpckhbw m13, m16 pmulhrsw m2, m17 punpcklbw m10, m14, m16 pmulhrsw m6, m17 punpckhbw m14, m16 paddw m0, m8 paddw m4, m12 packuswb m0, m4 paddw m3, m11 paddw m7, m15 packuswb m3, m7 paddw m1, m9 paddw m5, m13 packuswb m1, m5 paddw m2, m10 paddw m6, m14 packuswb m2, m6 mova [dstq+r5*0], m0 mova [r3 +r5*8], m3 mova [r3 +r5*4], m1 mova [dstq+r5*4], m2 add dstq, strideq cmp r6, r4 jb .end_loop RET .dconly: movsx r6d, word [cq] mov [cq], eobd or r3d, 64 jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly ALIGN function_align .pass2_end: REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 mova [r4+64*20], m1 mova [r4+64*21], m3 mova [r4+64*22], m5 mova [r4+64*23], m7 vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 mova [r4+64*12], m15 mova [r4+64*13], m17 mova [r4+64*14], m19 mova [r4+64*15], m21 vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 vshufi32x4 m0, m1, m5, q2020 ; 0 vshufi32x4 m1, m5, q3131 ; 8 vshufi32x4 m2, m3, m14, q2020 ; 16 vshufi32x4 m3, m14, q3131 ; 24 vshufi32x4 m14, m15, m18, q2020 ; 4 vshufi32x4 m15, m18, q3131 ; 12 vshufi32x4 m16, m17, m19, q2020 ; 20 vshufi32x4 m17, m19, q3131 ; 28 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast vshufi32x4 m24, m22, m25, q3131 ; 10 vshufi32x4 m22, m25, q2020 ; 2 vshufi32x4 m25, m23, m28, q3131 ; 14 vshufi32x4 m23, m28, q2020 ; 6 vshufi32x4 m28, m26, m27, q3131 ; 26 vshufi32x4 m26, m27, q2020 ; 18 vshufi32x4 m27, m29, m13, q2020 ; 22 vshufi32x4 m29, m13, q3131 ; 30 mova [r6+64* 0], m0 mova [r6+64* 1], m1 mova [r6+64* 2], m2 mova [r6+64* 3], m3 mova [r6+64* 4], m4 mova [r6+64* 5], m5 mova [r6+64* 6], m6 mova [r6+64* 7], m7 mova [r6+64* 8], m14 mova [r6+64* 9], m15 mova [r6+64*10], m16 mova [r6+64*11], m17 mova [r6+64*12], m18 mova [r6+64*13], m19 mova [r6+64*14], m20 mova [r6+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast vpbroadcastd m13, [o(pw_8192)] mova [r6+64*16], m29 mova [r6+64*17], m28 mova [r6+64*18], m27 mova [r6+64*19], m26 mova [r6+64*20], m25 mova [r6+64*21], m24 mova [r6+64*22], m23 mova [r6+64*23], m22 mova [r6+64*24], m21 mova [r6+64*25], m20 mova [r6+64*26], m19 mova [r6+64*27], m18 mova [r6+64*28], m17 mova [r6+64*29], m16 mova [r6+64*30], m15 mova [r6+64*31], m14 pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 pmulhrsw m16, m13, [r4+64*12] pmulhrsw m17, m13, [r4+64*16] pmulhrsw m18, m13, [r4+64*20] pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 pmulhrsw m20, m13, [r4+64*15] pmulhrsw m21, m13, [r4+64*19] pmulhrsw m22, m13, [r4+64*23] vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 pmulhrsw m24, m13, [r4+64*14] pmulhrsw m25, m13, [r4+64*18] pmulhrsw m26, 
m13, [r4+64*22] vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 pmulhrsw m28, m13, [r4+64*13] pmulhrsw m29, m13, [r4+64*17] pmulhrsw m13, [r4+64*21] vshufi32x4 m0, m14, m16, q2020 ; 1 vshufi32x4 m1, m19, m21, q3131 ; 31 vshufi32x4 m2, m15, m17, q2020 ; 17 vshufi32x4 m3, m18, m20, q3131 ; 15 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vshufi32x4 m0, m18, m20, q2020 ; 7 vshufi32x4 m1, m15, m17, q3131 ; 25 vshufi32x4 m2, m19, m21, q2020 ; 23 vshufi32x4 m3, m14, m16, q3131 ; 9 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 vshufi32x4 m29, m13, q3232 ; e19 e17 g19 g27 vshufi32x4 m0, m22, m24, q2020 ; 5 vshufi32x4 m1, m27, m29, q3131 ; 27 vshufi32x4 m2, m23, m25, q2020 ; 21 vshufi32x4 m3, m26, m28, q3131 ; 11 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 vshufi32x4 m0, m26, m28, q2020 ; 3 vshufi32x4 m1, m23, m25, q3131 ; 29 vshufi32x4 m2, m27, m29, q2020 ; 19 vshufi32x4 m3, m22, m24, q3131 ; 13 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 ALIGN function_align .pass2_fast: vshufi32x4 m23, m1, m16, q3131 ; 6 vshufi32x4 m22, m1, m16, q2020 ; 2 vshufi32x4 m14, m0, m3, q3131 ; 4 vshufi32x4 m26, m0, m3, q2020 ; 0 vshufi32x4 m28, m9, m15, q3131 ; 5 vshufi32x4 m0, m9, m15, q2020 ; 1 vshufi32x4 m16, m11, m17, q3131 ; 7 vshufi32x4 m29, m11, m17, q2020 ; 3 vshufi32x4 m15, m8, m2, q3131 ; 12 vshufi32x4 m27, m8, m2, q2020 ; 8 vshufi32x4 m25, m5, m19, q3131 ; 14 vshufi32x4 m24, m5, m19, q2020 ; 10 vshufi32x4 m3, m6, m20, q3131 ; 15 vshufi32x4 m19, m6, m20, q2020 ; 11 vshufi32x4 m17, m4, m18, q3131 ; 13 vshufi32x4 m18, m4, m18, q2020 ; 9 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m16 mova m3, m18 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m28 mova m3, m19 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast mova m0, m29 mova m3, m17 call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 mova m0, m26 mova m1, m27 call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 mova [r3+64* 0], m0 mova [r3+64* 1], m1 mova [r3+64* 2], m2 mova [r3+64* 3], m3 mova [r3+64* 4], m4 mova [r3+64* 5], m5 mova [r3+64* 6], m6 mova [r3+64* 7], m7 mova [r3+64* 8], m14 mova [r3+64* 9], m15 mova [r3+64*10], m16 mova [r3+64*11], m17 mova [r3+64*12], m18 mova [r3+64*13], m19 mova [r3+64*14], m20 mova [r3+64*15], m21 call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 mova [r3+64*16], m29 mova [r3+64*17], m28 mova [r3+64*18], m27 mova [r3+64*19], m26 mova [r3+64*20], m25 mova [r3+64*21], m24 mova [r3+64*22], m23 mova [r3+64*23], m22 mova [r3+64*24], m21 mova [r3+64*25], m20 mova [r3+64*26], m19 mova [r3+64*27], m18 mova [r3+64*28], m17 mova [r3+64*29], m16 mova [r3+64*30], m15 mova [r3+64*31], m14 ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/itx_sse.asm000064400000000000000000010037101046102023000142450ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 %macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 pw_%1_m%2: times 4 dw %1, -%2 %if %3 != 2 pw_%2_%1: times 4 dw %2, %1 %endif %if %3 pw_m%1_m%2: times 4 dw -%1, -%2 %endif %endmacro ;adst4 pw_1321_3803: times 4 dw 1321, 3803 pw_2482_m1321: times 4 dw 2482, -1321 pw_3344_2482: times 4 dw 3344, 2482 pw_3344_m3803: times 4 dw 3344, -3803 pw_3344_m3344: times 4 dw 3344, -3344 pw_0_3344 times 4 dw 0, 3344 pw_m6688_m3803: times 4 dw -6688, -3803 COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 COEF_PAIR 401, 4076 COEF_PAIR 1931, 3612 COEF_PAIR 3166, 2598 COEF_PAIR 3920, 1189 COEF_PAIR 3784, 1567, 1 COEF_PAIR 995, 3973 COEF_PAIR 1751, 3703 COEF_PAIR 3513, 2106 COEF_PAIR 3857, 1380 COEF_PAIR 4017, 799, 1 COEF_PAIR 201, 4091 COEF_PAIR 2440, 3290 COEF_PAIR 3035, 2751 COEF_PAIR 4052, 601 COEF_PAIR 2276, 3406, 1 COEF_PAIR 4076, 401, 2 COEF_PAIR 2598, 3166, 2 COEF_PAIR 3612, 1931, 2 COEF_PAIR 1189, 3920, 2 pd_2048: times 4 dd 2048 pw_2048: times 8 dw 2048 pw_m2048: times 8 dw -2048 pw_4096: times 8 dw 4096 pw_16384: times 8 dw 16384 pw_m16384: times 8 dw -16384 pw_1697x16: times 8 dw 1697*16 pw_1697x8: times 8 dw 1697*8 pw_2896x8: times 8 dw 2896*8 pw_3344x8: times 8 dw 3344*8 pw_8192: times 8 dw 8192 pw_m8192: times 8 dw -8192 pw_5: times 8 dw 5 pw_201x8: times 8 dw 201*8 pw_4091x8: times 8 dw 4091*8 pw_m2751x8: times 8 dw -2751*8 pw_3035x8: times 8 dw 3035*8 pw_1751x8: times 8 dw 1751*8 pw_3703x8: times 8 dw 3703*8 pw_m1380x8: times 8 dw -1380*8 pw_3857x8: times 8 dw 3857*8 pw_995x8: times 8 dw 995*8 pw_3973x8: times 8 dw 3973*8 pw_m2106x8: times 8 dw -2106*8 pw_3513x8: times 8 dw 3513*8 pw_2440x8: times 8 dw 2440*8 pw_3290x8: times 8 dw 3290*8 pw_m601x8: times 8 dw -601*8 pw_4052x8: times 8 dw 4052*8 pw_4095x8: times 8 dw 4095*8 pw_101x8: times 8 dw 101*8 pw_2967x8: times 8 dw 2967*8 pw_m2824x8: times 8 dw -2824*8 pw_3745x8: times 8 dw 3745*8 pw_1660x8: times 8 dw 1660*8 pw_3822x8: times 8 dw 3822*8 pw_m1474x8: times 8 dw -1474*8 pw_3996x8: 
times 8 dw 3996*8 pw_897x8: times 8 dw 897*8 pw_3461x8: times 8 dw 3461*8 pw_m2191x8: times 8 dw -2191*8 pw_3349x8: times 8 dw 3349*8 pw_2359x8: times 8 dw 2359*8 pw_4036x8: times 8 dw 4036*8 pw_m700x8: times 8 dw -700*8 pw_4065x8: times 8 dw 4065*8 pw_501x8: times 8 dw 501*8 pw_3229x8: times 8 dw 3229*8 pw_m2520x8: times 8 dw -2520*8 pw_3564x8: times 8 dw 3564*8 pw_2019x8: times 8 dw 2019*8 pw_3948x8: times 8 dw 3948*8 pw_m1092x8: times 8 dw -1092*8 pw_3889x8: times 8 dw 3889*8 pw_1285x8: times 8 dw 1285*8 pw_3659x8: times 8 dw 3659*8 pw_m1842x8: times 8 dw -1842*8 pw_3102x8: times 8 dw 3102*8 pw_2675x8: times 8 dw 2675*8 pw_4085x8: times 8 dw 4085*8 pw_m301x8: times 8 dw -301*8 SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %if ARCH_X86_64 %define o(x) x %else %define o(x) r5-$$+x ; PIC %endif %macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] lea r2, [dstq+strideq*2] %assign %%i 1 %rotate 5 %rep 4 %if %1 & 2 CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) %else CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) %endif %assign %%i %%i + 1 %rotate 1 %endrep movd m%3, [%%row_adr1] ;dst0 movd m%5, [%%row_adr2] ;dst1 punpckldq m%3, m%5 ;high: dst1 :low: dst0 movd m%4, [%%row_adr3] ;dst2 movd m%5, [%%row_adr4] ;dst3 punpckldq m%4, m%5 ;high: dst3 :low: dst2 pxor m%5, m%5 punpcklbw m%3, m%5 ;extend byte to word punpcklbw m%4, m%5 ;extend byte to word paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 movd [%%row_adr1], m%3 ;store dst0 + out0 pshuflw m%4, m%3, q1032 movd [%%row_adr2], m%4 ;store dst1 + out1 punpckhqdq m%3, m%3 movd [%%row_adr3], m%3 ;store dst2 + out2 psrlq m%3, 32 movd [%%row_adr4], m%3 ;store dst3 + out3 %endmacro %macro ITX4_END 4-5 2048 ; row[1-4], rnd %if %5 mova m2, [o(pw_%5)] pmulhrsw m0, m2 pmulhrsw m1, m2 %endif WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 ret %endmacro ; flags: 1 = swap, 2: coef_regs, 4: no_pack %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags %if %6 & 2 pmaddwd m%2, m%4, m%1 pmaddwd m%1, m%5 %elif %6 & 1 pmaddwd m%2, m%1, [o(pw_%5_%4)] pmaddwd m%1, [o(pw_%4_m%5)] %else pmaddwd m%2, m%1, [o(pw_%4_m%5)] pmaddwd m%1, [o(pw_%5_%4)] %endif paddd m%2, m%3 paddd m%1, m%3 psrad m%2, 12 psrad m%1, 12 %if %6 & 4 == 0 packssdw m%1, m%2 %endif %endmacro %macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 mova m3, [o(pd_2048)] punpckhwd m2, m0, m1 ;unpacked in1 in3 punpcklwd m0, m1 ;unpacked in0 in2 ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 psubsw m1, m0, m2 ;high: out2 ;low: out3 paddsw m0, m2 ;high: out1 ;low: out0 %endmacro %macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 %define %%p1 m(i%1_%3_internal_8bpc) %if ARCH_X86_32 LEA r5, $$ %endif %if has_epilogue %ifidn %1_%2, dct_dct test eobd, eobd jz %%end %endif lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] call %%p1 RET %%end: %else lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] %ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ALIGN function_align %%end: %endif %endif %endmacro %macro INV_TXFM_4X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x4, 6 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd ;0 pmulhrsw m0, m1 mova m1, m0 TAIL_CALL m(iadst_4x4_internal_8bpc).end2 %endif %endmacro INIT_XMM ssse3 ; itx16 relies on dct_dct being the 
first function. If you change the order, adjust `itx8_start` in itx16. INV_TXFM_4X4_FN dct, dct INV_TXFM_4X4_FN dct, adst INV_TXFM_4X4_FN dct, flipadst INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 mova m1, [coeffq+16*1] ;high: in3 ;low in2 IDCT4_1D_PACKED mova m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 shufps m0, m1, q0220 pshufb m0, m2 ;high: in1 ;low: in0 pshufb m1, m3, m2 ;high: in3 ;low :in2 jmp tx2q .pass2: IDCT4_1D_PACKED pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); ITX4_END 0, 1, 3, 2 INV_TXFM_4X4_FN adst, dct INV_TXFM_4X4_FN adst, adst INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call .main punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 punpcklwd m0, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call .main .end: pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 .end2: ITX4_END 0, 1, 2, 3 ALIGN function_align cglobal_label .main punpcklwd m2, m0, m1 ;unpacked in0 in2 punpckhwd m0, m1 ;unpacked in1 in3 mova m3, m0 pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 paddd m1, m0 ;t2 pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 paddd m4, m0 ;t0 + t3 pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m0, [o(pd_2048)] paddd m1, m0 ;t2 + 2048 paddd m2, m0 paddd m0, m4 ;t0 + t3 + 2048 paddd m5, m2 ;t1 + t3 + 2048 paddd m2, m4 paddd m2, m3 ;t0 + t1 - t3 + 2048 REPX {psrad x, 12}, m1, m0, m5, m2 packssdw m0, m5 ;high: out1 ;low: out0 packssdw m1, m2 ;high: out3 ;low: out3 ret INV_TXFM_4X4_FN flipadst, dct INV_TXFM_4X4_FN flipadst, adst INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call m(iadst_4x4_internal_8bpc).main punpcklwd m2, m1, m0 punpckhwd m1, m0 punpcklwd m0, m1, m2 ;high: in3 ;low :in2 punpckhwd m1, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call m(iadst_4x4_internal_8bpc).main .end: pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 .end2: ITX4_END 3, 2, 1, 0 INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m3, [o(pw_1697x8)] pmulhrsw m2, m0, m3 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 punpckhwd m2, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m2 ;high: in3 ;low :in2 punpcklwd m0, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: mova m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 paddsw m0, m2 paddsw m1, m3 jmp m(iadst_4x4_internal_8bpc).end %macro IWHT4_1D_PACKED 0 punpckhqdq m3, m0, m1 ;low: in1 high: in3 punpcklqdq m0, m1 ;low: in0 high: in2 psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 paddw m0, m3 ;low: in0 + in1 high: in2 + in3 punpckhqdq m2, m2 ;t2 t2 punpcklqdq m0, m0 ;t0 t0 psubw m1, m0, m2 psraw m1, 1 ;t4 t4 psubw m1, m3 ;low: t1/out2 high: t3/out1 psubw m0, m1 ;high: out0 paddw m2, m1 ;low: out3 %endmacro INIT_XMM sse2 cglobal 
inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] pxor m2, m2 mova [coeffq+16*0], m2 mova [coeffq+16*1], m2 psraw m0, 2 psraw m1, 2 IWHT4_1D_PACKED punpckhwd m0, m1 punpcklwd m3, m1, m2 punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED shufpd m0, m2, 0x01 ITX4_END 0, 3, 2, 1, 0 %macro IDCT8_1D_PACKED 0 mova m6, [o(pd_2048)] punpckhwd m4, m0, m3 ;unpacked in1 in7 punpcklwd m0, m2 ;unpacked in0 in4 punpckhwd m2, m1 ;unpacked in5 in3 punpcklwd m1, m3 ;unpacked in2 in6 ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 psubsw m3, m4, m2 ;low: t6a high: t5a paddsw m4, m2 ;low: t7 high: t4 pshufb m3, [o(deint_shuf1)] ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 psubsw m2, m0, m1 ;low: tmp3 high: tmp2 paddsw m0, m1 ;low: tmp0 high: tmp1 punpcklqdq m1, m4, m3 ;low: t7 high: t6 punpckhqdq m4, m3 ;low: t4 high: t5 psubsw m3, m0, m1 ;low: out7 high: out6 paddsw m0, m1 ;low: out0 high: out1 paddsw m1, m2, m4 ;low: out3 high: out2 psubsw m2, m4 ;low: out4 high: out5 %endmacro ;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 ;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 %macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 punpckhwd m%4, m%1, m%2 punpcklwd m%1, m%2 %if %7 < 8 pmaddwd m%2, m%7, m%1 pmaddwd m%3, m%7, m%4 %else mova m%2, [o(pw_%7_%6)] %if %8 pmaddwd m%3, m%1, m%2 pmaddwd m%2, m%4 %else pmaddwd m%3, m%4, m%2 pmaddwd m%2, m%1 %endif %endif paddd m%3, m%5 paddd m%2, m%5 psrad m%3, 12 psrad m%2, 12 %if %8 packssdw m%3, m%2 %else packssdw m%2, m%3 ;dst2 %endif %if %7 < 8 pmaddwd m%4, m%6 pmaddwd m%1, m%6 %elif %8 mova m%2, [o(pw_%6_m%7)] pmaddwd m%4, m%2 pmaddwd m%1, m%2 %else mova m%3, [o(pw_%6_m%7)] pmaddwd m%4, m%3 pmaddwd m%1, m%3 %endif paddd m%4, m%5 paddd m%1, m%5 psrad m%4, 12 psrad m%1, 12 packssdw m%1, m%4 ;dst1 %endmacro %macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 psubsw m%3, m%1, m%2 ;out2 paddsw m%2, m%1 ;out1 paddsw m%1, m%5, m%4 ;out0 psubsw m%4, m%5 ;out3 %endmacro %macro WRITE_4X8 4 ;row[1-4] WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 lea dstq, [dstq+strideq*4] WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 %endmacro %macro INV_4X8 0 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhdq m1, m0, m2 ;low: in2 high: in3 punpckldq m0, m2 ;low: in0 high: in1 punpckldq m2, m3, m4 ;low: in4 high: in5 punpckhdq m3, m4 ;low: in6 high: in7 %endmacro %macro INV_TXFM_4X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x8, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd pmulhrsw m0, m1 pmulhrsw m0, m1 pmulhrsw m0, [o(pw_2048)] mova m1, m0 mova m2, m0 mova m3, m0 TAIL_CALL m(iadst_4x8_internal_8bpc).end3 %endif %endmacro INIT_XMM ssse3 INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(idct_8x4_internal_8bpc).main jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: call .main shufps m1, m1, q1032 shufps m3, m3, q1032 mova m4, [o(pw_2048)] jmp 
m(iadst_4x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main IDCT8_1D_PACKED ret INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(iadst_8x4_internal_8bpc).main .pass1_end: INV_4X8 jmp tx2q .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 call .main mova m4, [o(pw_2048)] pxor m5, m5 psubw m5, m4 .end: punpcklqdq m4, m5 .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m4 pmulhrsw m3, m4 pxor m5, m5 mova [coeffq+16*0], m5 mova [coeffq+16*1], m5 mova [coeffq+16*2], m5 mova [coeffq+16*3], m5 .end3: WRITE_4X8 0, 1, 2, 3 RET ALIGN function_align cglobal_label .main mova m6, [o(pd_2048)] punpckhwd m4, m3, m0 ;unpacked in7 in0 punpckhwd m5, m2, m1 ;unpacked in5 in2 punpcklwd m1, m2 ;unpacked in3 in4 punpcklwd m0, m3 ;unpacked in1 in6 ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a psubsw m3, m4, m1 ;low: t4 high: t5 paddsw m4, m1 ;low: t0 high: t1 psubsw m2, m5, m0 ;low: t6 high: t7 paddsw m5, m0 ;low: t2 high: t3 shufps m1, m3, m2, q1032 punpckhwd m2, m1 punpcklwd m3, m1 ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a psubsw m1, m4, m5 ;low: t2 high: t3 paddsw m4, m5 ;low: out0 high: -out7 psubsw m5, m3, m2 ;low: t7 high: t6 paddsw m3, m2 ;low: out6 high: -out1 shufps m0, m4, m3, q3210 ;low: out0 high: -out1 shufps m3, m4, q3210 ;low: out6 high: -out7 mova m2, [o(pw_2896_m2896)] mova m7, [o(pw_2896_2896)] shufps m4, m1, m5, q1032 ;low: t3 high: t7 shufps m1, m5, q3210 ;low: t2 high: t6 punpcklwd m5, m1, m4 punpckhwd m1, m4 pmaddwd m4, m2, m1 ;-out5 pmaddwd m2, m5 ; out4 pmaddwd m1, m7 ; out2 pmaddwd m5, m7 ;-out3 REPX {paddd x, m6}, m4, m2, m1, m5 REPX {psrad x, 12}, m4, m2, m1, m5 packssdw m1, m5 ;low: out2 high: -out3 packssdw m2, m4 ;low: out4 high: -out5 ret INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: call m(iadst_8x4_internal_8bpc).main punpcklwd m4, m3, m2 punpckhwd m3, m2 punpcklwd m5, m1, m0 punpckhwd m1, m0 punpckldq m2, m3, m1 ;low: in4 high: in5 punpckhdq m3, m1 ;low: in6 high: in7 punpckldq m0, m4, m5 ;low: in0 high: in1 punpckhdq m1, m4, m5 ;low: in2 high: in3 jmp tx2q .pass2: shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main mova m4, m0 mova m5, m1 pshufd m0, m3, q1032 pshufd m1, m2, q1032 pshufd m2, m5, q1032 pshufd m3, m4, q1032 mova m5, [o(pw_2048)] pxor m4, m4 psubw m4, m5 jmp m(iadst_4x8_internal_8bpc).end INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] .pass1: mova m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 
pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_4x8_internal_8bpc).pass1_end .pass2: mova m4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 %macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] movq m%3, [dstq ] movq m%4, [dstq+strideq] pxor m%5, m%5 punpcklbw m%3, m%5 ;extend byte to word punpcklbw m%4, m%5 ;extend byte to word %ifnum %1 paddw m%3, m%1 %else paddw m%3, %1 %endif %ifnum %2 paddw m%4, m%2 %else paddw m%4, %2 %endif packuswb m%3, m%4 movq [dstq ], m%3 punpckhqdq m%3, m%3 movq [dstq+strideq], m%3 %endmacro %macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] WRITE_8X2 %1, %2, %5, %6, %7 lea dstq, [dstq+strideq*2] WRITE_8X2 %3, %4, %5, %6, %7 %endmacro %macro INV_TXFM_8X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x4, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 pmulhrsw m0, m1 mova m2, [o(pw_2048)] pmulhrsw m0, m1 pmulhrsw m0, m2 mova m1, m0 mova m2, m0 mova m3, m0 TAIL_CALL m(iadst_8x4_internal_8bpc).end2 %endif %endmacro INV_TXFM_8X4_FN dct, dct INV_TXFM_8X4_FN dct, adst INV_TXFM_8X4_FN dct, flipadst INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] call m(idct_4x8_internal_8bpc).main mova m4, [o(deint_shuf1)] mova m5, [o(deint_shuf2)] pshufb m0, m4 pshufb m1, m5 pshufb m2, m4 pshufb m3, m5 punpckhdq m4, m0, m1 punpckldq m0, m1 punpckhdq m5, m2, m3 punpckldq m2, m3 punpckhqdq m1, m0, m2 ;in1 punpcklqdq m0, m2 ;in0 punpckhqdq m3, m4, m5 ;in3 punpcklqdq m2 ,m4, m5 ;in2 jmp tx2q .pass2: call .main jmp m(iadst_8x4_internal_8bpc).end ALIGN function_align cglobal_label .main mova m6, [o(pd_2048)] IDCT4_1D 0, 1, 2, 3, 4, 5, 6 ret INV_TXFM_8X4_FN adst, dct INV_TXFM_8X4_FN adst, adst INV_TXFM_8X4_FN adst, flipadst INV_TXFM_8X4_FN adst, identity cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 pxor m5, m5 psubsw m3, m5, m1 psubsw m5, m4 punpckhdq m4, m5, m3 punpckldq m5, m3 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhwd m1, m0, m5 ;in1 punpcklwd m0, m5 ;in0 punpcklwd m2, m3, m4 ;in2 punpckhwd m3, m4 ;in3 jmp tx2q .pass2: call .main .end: mova m4, [o(pw_2048)] pmulhrsw m0, m4 pmulhrsw m1, m4 pmulhrsw m2, m4 pmulhrsw m3, m4 .end2: pxor m6, m6 mova [coeffq+16*0], m6 mova [coeffq+16*1], m6 mova [coeffq+16*2], m6 mova [coeffq+16*3], m6 .end3: WRITE_8X4 0, 1, 2, 3, 4, 5, 6 RET ALIGN function_align cglobal_label .main punpckhwd m6, m0, m2 ;unpacked in0 in2 punpcklwd m0, m2 ;unpacked in0 in2 punpckhwd m7, m1, m3 ;unpacked in1 in3 punpcklwd m1, m3 ;unpacked in1 in3 mova m2, [o(pw_3344_m3344)] mova m4, [o(pw_0_3344)] pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 pmaddwd m5, m4, m7 ;3344 * in3 pmaddwd m2, m0 pmaddwd m4, m1 paddd m3, m5 paddd m2, m4 mova m4, [o(pd_2048)] paddd m3, m4 ;t2 + 2048 paddd m2, m4 psrad m3, 12 psrad m2, 12 packssdw m2, m3 ;out2 pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 
3803 * in3 paddd m3, m4 ;t0 + t3 pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m4, [o(pd_2048)] paddd m0, m4 paddd m4, m3 ;t0 + t3 + 2048 paddd m5, m0 ;t1 + t3 + 2048 paddd m3, m0 paddd m3, m1 ;t0 + t1 - t3 + 2048 psrad m4, 12 ;out0 psrad m5, 12 ;out1 psrad m3, 12 ;out3 packssdw m0, m4, m5 ;low: out0 high: out1 pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 paddd m1, m4 ;t0 + t3 pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 mova m4, [o(pd_2048)] paddd m6, m4 paddd m4, m1 ;t0 + t3 + 2048 paddd m5, m6 ;t1 + t3 + 2048 paddd m1, m6 paddd m1, m7 ;t0 + t1 - t3 + 2048 psrad m4, 12 ;out0 psrad m5, 12 ;out1 psrad m1, 12 ;out3 packssdw m3, m1 ;out3 packssdw m4, m5 ;low: out0 high: out1 punpckhqdq m1, m0, m4 ;out1 punpcklqdq m0, m4 ;out0 ret INV_TXFM_8X4_FN flipadst, dct INV_TXFM_8X4_FN flipadst, adst INV_TXFM_8X4_FN flipadst, flipadst INV_TXFM_8X4_FN flipadst, identity cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] shufps m0, m0, q1032 shufps m1, m1, q1032 call m(iadst_4x8_internal_8bpc).main punpckhwd m5, m3, m2 punpcklwd m3, m2 punpckhwd m2, m1, m0 punpcklwd m1, m0 pxor m0, m0 psubsw m4, m0, m2 psubsw m0, m5 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m4, m3, m1 punpckldq m3, m1 punpckhwd m1, m0, m3 ;in1 punpcklwd m0, m3 ;in0 punpckhwd m3, m2, m4 ;in3 punpcklwd m2, m4 ;in2 jmp tx2q .pass2: call m(iadst_8x4_internal_8bpc).main mova m4, m0 mova m5, m1 mova m0, m3 mova m1, m2 mova m2, m5 mova m3, m4 jmp m(iadst_8x4_internal_8bpc).end INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [coeffq+16*0] pmulhrsw m1, m3, [coeffq+16*1] pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m3, [coeffq+16*3] paddsw m0, m0 paddsw m1, m1 paddsw m2, m2 paddsw m3, m3 punpckhwd m4, m0, m1 punpcklwd m0, m1 punpckhwd m1, m2, m3 punpcklwd m2, m3 punpckhdq m5, m4, m1 punpckldq m4, m1 punpckhdq m3, m0, m2 punpckldq m0, m2 punpckhwd m1, m0, m4 ;in1 punpcklwd m0, m4 ;in0 punpcklwd m2, m3, m5 ;in2 punpckhwd m3, m5 ;in3 jmp tx2q .pass2: mova m7, [o(pw_1697x8)] pmulhrsw m4, m7, m0 pmulhrsw m5, m7, m1 pmulhrsw m6, m7, m2 pmulhrsw m7, m3 paddsw m0, m4 paddsw m1, m5 paddsw m2, m6 paddsw m3, m7 jmp m(iadst_8x4_internal_8bpc).end %macro INV_TXFM_8X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x8, 8, 16*4 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mova m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m2 psrlw m2, 3 pmulhrsw m0, m1 pmulhrsw m0, m2 .end: mov r3d, 2 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] .loop: WRITE_8X4 0, 0, 0, 0, 1, 2, 3 lea dstq, [dstq+strideq*2] dec r3d jg .loop jmp tx2q .end3: RET %endif %endmacro %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 %if %3 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [%1+%2*0] pmulhrsw m1, m7, [%1+%2*1] pmulhrsw m2, m7, [%1+%2*2] pmulhrsw m3, m7, [%1+%2*3] pmulhrsw m4, m7, [%1+%2*4] pmulhrsw m5, m7, [%1+%2*5] pmulhrsw m6, m7, [%1+%2*6] pmulhrsw m7, [%1+%2*7] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] 
mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] mova m7, [%1+%2*7] %endif %endmacro %macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a psubsw m%2, m%4, m%5 ;t6a paddsw m%4, m%5 ;t7 psubsw m%5, m%1, m%3 ;t5a paddsw m%1, m%3 ;t4 ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 %endmacro INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call .main .pass1_end: mova m7, [o(pw_16384)] .pass1_end1: REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 .pass1_end2: REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] cglobal_label .pass1_end3 punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 mova [rsp+gprsize+16*2], m6 mova m6, [rsp+gprsize+16*1] punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 mova [rsp+gprsize+16*0], m2 punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 mova m7, [rsp+gprsize+16*2] punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 mova m7, [rsp+gprsize+16*0] jmp tx2q .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 .end2: REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, [rsp+gprsize+16*0] mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*0], m7 .end3: WRITE_8X4 0, 1, 2, 3, 5, 6, 7 lea dstq, [dstq+strideq*2] WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 jmp tx2q .end4: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m1 mova m7, [o(pd_2048)] IDCT4_1D 0, 2, 4, 6, 1, 3, 7 mova m3, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m2 mova m2, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*1], m4 mova m4, [rsp+gprsize*2+16*0] mova [rsp+gprsize*2+16*0], m6 IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 mova m6, [rsp+gprsize*2+16*0] psubsw m7, m0, m4 ;out7 paddsw m0, m4 ;out0 mova [rsp+gprsize*2+16*0], m7 mova m1, [rsp+gprsize*2+16*2] psubsw m4, m6, m3 ;out4 paddsw m3, m6 ;out3 mova m7, [rsp+gprsize*2+16*1] psubsw m6, m1, m5 ;out6 paddsw m1, m5 ;out1 psubsw m5, m7, m2 ;out5 paddsw m2, m7 ;out2 ret INV_TXFM_8X8_FN adst, dct INV_TXFM_8X8_FN adst, adst INV_TXFM_8X8_FN adst, flipadst INV_TXFM_8X8_FN adst, identity cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call .main call .main_pass1_end 
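; Note: .main (further below) appears to leave the odd-numbered outputs negated
; (see its ";-out1"/";-out7" comments), so .pass1_end can fold the sign
; correction into the rounding step: the even rows are scaled by pw_16384
; (i.e. x0.5 with rounding via pmulhrsw), then idct's .pass1_end2 is reused
; with the factor negated (-16384) for the odd rows.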
.pass1_end: mova m7, [o(pw_16384)] .pass1_end1: REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 pxor m6, m6 psubw m6, m7 mova m7, m6 jmp m(idct_8x8_internal_8bpc).pass1_end2 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call .main call .main_pass2_end .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m2, m4, m6 mova [rsp+gprsize+16*1], m6 pxor m6, m6 psubw m6, m7 mova m7, m6 jmp m(idct_8x8_internal_8bpc).end2 ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m7 mova [rsp+gprsize*2+16*1], m3 mova [rsp+gprsize*2+16*2], m4 mova m7, [o(pd_2048)] ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a paddsw m3, m2, m6 ;t2 psubsw m2, m6 ;t6 paddsw m4, m5, m1 ;t3 psubsw m5, m1 ;t7 ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a mova m6, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m5 mova m1, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*1], m2 mova m5, [rsp+gprsize*2+16*0] mova [rsp+gprsize*2+16*0], m3 ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a psubsw m2, m0, m6 ;t4 paddsw m0, m6 ;t0 paddsw m3, m5, m1 ;t1 psubsw m5, m1 ;t5 ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a mova m7, [rsp+gprsize*2+16*0] paddsw m1, m3, m4 ;-out7 psubsw m3, m4 ;t3 mova [rsp+gprsize*2+16*0], m1 psubsw m4, m0, m7 ;t2 paddsw m0, m7 ;out0 mova m6, [rsp+gprsize*2+16*2] mova m7, [rsp+gprsize*2+16*1] paddsw m1, m5, m6 ;-out1 psubsw m5, m6 ;t6 paddsw m6, m2, m7 ;out6 psubsw m2, m7 ;t7 ret ALIGN function_align .main_pass1_end: mova [rsp+gprsize*2+16*1], m1 mova [rsp+gprsize*2+16*2], m6 punpckhwd m1, m4, m3 punpcklwd m4, m3 punpckhwd m7, m5, m2 punpcklwd m5, m2 mova m2, [o(pw_2896_2896)] mova m6, [o(pd_2048)] pmaddwd m3, m2, m7 pmaddwd m2, m5 paddd m3, m6 paddd m2, m6 psrad m3, 12 psrad m2, 12 packssdw m2, m3 ;out2 mova m3, [o(pw_2896_m2896)] pmaddwd m7, m3 pmaddwd m5, m3 paddd m7, m6 paddd m5, m6 psrad m7, 12 psrad m5, 12 packssdw m5, m7 ;-out5 mova m3, [o(pw_2896_2896)] pmaddwd m7, m3, m1 pmaddwd m3, m4 paddd m7, m6 paddd m3, m6 psrad m7, 12 psrad m3, 12 packssdw m3, m7 ;-out3 mova m7, [o(pw_2896_m2896)] pmaddwd m1, m7 pmaddwd m4, m7 paddd m1, m6 paddd m4, m6 psrad m1, 12 psrad m4, 12 packssdw m4, m1 ;-out5 mova m1, [rsp+gprsize*2+16*1] mova m6, [rsp+gprsize*2+16*2] ret ALIGN function_align cglobal_label .main_pass2_end paddsw m7, m4, m3 ;t2 + t3 psubsw m4, m3 ;t2 - t3 paddsw m3, m5, m2 ;t6 + t7 psubsw m5, m2 ;t6 - t7 mova m2, [o(pw_2896x8)] pmulhrsw m4, m2 ;out4 pmulhrsw m5, m2 ;-out5 pmulhrsw m7, m2 ;-out3 pmulhrsw m2, m3 ;out2 mova m3, m7 ret INV_TXFM_8X8_FN flipadst, dct INV_TXFM_8X8_FN flipadst, adst INV_TXFM_8X8_FN flipadst, flipadst INV_TXFM_8X8_FN flipadst, identity cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 .pass1: call m(iadst_8x8_internal_8bpc).main call m(iadst_8x8_internal_8bpc).main_pass1_end .pass1_end: mova m7, [o(pw_m16384)] .pass1_end1: pmulhrsw m1, m7 mova [rsp+gprsize+16*1], m1 mova m1, m6 mova m6, m2 pmulhrsw m2, m5, m7 mova m5, m6 mova m6, m4 pmulhrsw m4, m3, m7 mova m3, m6 mova m6, m0 mova m0, m7 pxor m7, m7 psubw m7, m0 pmulhrsw m0, [rsp+gprsize+16*0] REPX {pmulhrsw x, m7}, m1, m3, m5 pmulhrsw m7, m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .pass2_main: call m(iadst_8x8_internal_8bpc).main call m(iadst_8x8_internal_8bpc).main_pass2_end .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, 
m2, m4, m6 mova [rsp+gprsize+16*2], m2 mova m2, m0 pxor m0, m0 psubw m0, m7 mova m7, m2 pmulhrsw m1, m0 pmulhrsw m2, m5, m0 mova [rsp+gprsize+16*1], m1 mova m5, m4 mova m1, m6 pmulhrsw m4, m3, m0 pmulhrsw m0, [rsp+gprsize+16*0] mova m3, m5 mova [rsp+gprsize+16*0], m7 jmp m(idct_8x8_internal_8bpc).end3 INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 ALIGN function_align .pass2: lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] .end: pmulhrsw m7, [o(pw_4096)] mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_4096)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+gprsize+16*2], m5 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).end3 %macro INV_TXFM_4X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 4x16, 8 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mov [coeffq], eobd pmulhrsw m0, [o(pw_16384)] pmulhrsw m0, m1 pmulhrsw m0, [o(pw_2048)] .end: WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 RET %endif %endmacro INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] .pass1: mova m0, [coeffq+16*1] mova m1, [coeffq+16*3] mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] push tx2q lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] jmp r3 .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 mova [coeffq+16*5], m2 mova [coeffq+16*7], m3 mova m0, [coeffq+16*0] mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: pop tx2q mova m4, [coeffq+16*1] mova m5, [coeffq+16*3] mova m6, [coeffq+16*5] mova m7, [o(pw_16384)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*7], m7 jmp tx2q .pass2: call m(idct_16x4_internal_8bpc).main .end: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 .end1: mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 mov r3, coeffq WRITE_4X8 0, 1, 3, 2 mova m0, [r3+16*4] mova m1, [r3+16*5] mova m2, [r3+16*6] mova m3, m7 lea dstq, [dstq+strideq*4] WRITE_4X8 0, 1, 3, 2 .end2: pxor m7, m7 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] jmp m(idct_4x16_internal_8bpc).pass1 .pass2: call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass2_end punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 punpckhqdq m4, m5 ;low: out8 high: out10 punpcklqdq m5, m7, m2 ;low: out4 high: out6 punpckhqdq m2, m7 ;low: -out9 high: -out11 mova [coeffq+16*4], m2 mova [coeffq+16*5], m6 mova m2, [coeffq+16*6] mova m6, [coeffq+16*7] punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 punpcklqdq m0, m6 ;low: out0 high: out2 punpckhqdq m6, m3, m2 ;low: out12 high: out14 punpcklqdq m2, m3 ;low: -out1 high: -out3 mova 
m7, [o(pw_2048)] .end1: REPX {pmulhrsw x, m7}, m0, m5, m4, m6 pxor m3, m3 psubw m3, m7 mova m7, [coeffq+16*4] REPX {pmulhrsw x, m3}, m2, m7, m1 pmulhrsw m3, [coeffq+16*5] mova [coeffq+16*7], m5 punpckhqdq m5, m4, m7 ;low: out10 high: out11 punpcklqdq m4, m7 ;low: out8 high: out9 punpckhqdq m7, m6, m1 ;low: out14 high: out15 punpcklqdq m6, m1 ;low: out12 high: out13 punpckhqdq m1, m0, m2 ;low: out2 high: out3 punpcklqdq m0, m2 ;low: out0 high: out1 mova [coeffq+16*4], m4 mova m4, [coeffq+16*7] punpcklqdq m2, m4, m3 ;low: out4 high: out5 punpckhqdq m4, m3 ;low: out6 high: out7 mova m3, m4 .end2: mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 mov r3, coeffq WRITE_4X8 0, 1, 2, 3 mova m0, [r3+16*4] mova m1, [r3+16*5] mova m2, [r3+16*6] mova m3, m7 lea dstq, [dstq+strideq*4] WRITE_4X8 0, 1, 2, 3 .end3: pxor m7, m7 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] jmp m(idct_4x16_internal_8bpc).pass1 .pass2: call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass2_end punpckhqdq m6, m5, m4 ;low: out5 high: out7 punpcklqdq m4, m5 ;low: -out8 high: -out10 punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 punpcklqdq m2, m7 ;low: out9 high: out11 mova [coeffq+16*4], m2 mova [coeffq+16*5], m6 mova m2, [coeffq+16*6] mova m6, [coeffq+16*7] punpcklqdq m1, m6, m0 ;low: out13 high: out15 punpckhqdq m0, m6 ;low: -out0 high: -out2 punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 punpckhqdq m2, m3 ;low: out1 high: out3 mova m7, [o(pw_m2048)] jmp m(iadst_4x16_internal_8bpc).end1 INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity %macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 pmulhrsw m%2, m%4 %else paddsw m%1, m%1 %endif paddsw m%1, m%2 %endmacro cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*1] mova m6, [o(pw_1697x8)] mova m1, [coeffq+16*3] mova m2, [coeffq+16*5] mova m3, [coeffq+16*7] pcmpeqw m7, m7 mov r3, tx2q lea tx2q, [o(.pass1_2)] .pass1: pmulhrsw m4, m6, m0 pmulhrsw m5, m6, m1 pavgw m4, m0 pcmpeqw m0, m7 pavgw m5, m1 pcmpeqw m1, m7 pandn m0, m4 pmulhrsw m4, m6, m2 pandn m1, m5 pmulhrsw m5, m6, m3 pavgw m4, m2 pcmpeqw m2, m7 pavgw m5, m3 pcmpeqw m3, m7 pandn m2, m4 pandn m3, m5 jmp m(iadst_4x8_internal_8bpc).pass1_end .pass1_2: mova [coeffq+16*1], m0 mova [coeffq+16*3], m1 mova [coeffq+16*5], m2 mova [coeffq+16*7], m3 mova m0, [coeffq+16*0] mova m1, [coeffq+16*2] mova m2, [coeffq+16*4] mova m3, [coeffq+16*6] lea tx2q, [o(.pass1_end)] jmp .pass1 .pass1_end: mova m4, [coeffq+16*1] mova m5, [coeffq+16*3] mova m6, [coeffq+16*5] jmp r3 .pass2: mova m7, [o(pw_1697x16)] mova [coeffq+16*6], m6 REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 mova m6, [coeffq+16*7] IDTX16 6, 7, 7 mova [coeffq+16*7], m6 mova m6, [coeffq+16*6] pmulhrsw m7, m6, [o(pw_1697x16)] paddsw m6, m6 paddsw m6, m7 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*7] mova [coeffq+16*4], m4 jmp m(iadst_4x16_internal_8bpc).end2 %macro INV_TXFM_16X4_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x4, 8 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd mov r2d, 2 lea tx2q, 
[o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] .dconly: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m5, m5 .dconly_loop: mova m1, [dstq] mova m3, [dstq+strideq] punpckhbw m2, m1, m5 punpcklbw m1, m5 punpckhbw m4, m3, m5 punpcklbw m3, m5 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 packuswb m3, m4 mova [dstq], m1 mova [dstq+strideq], m3 lea dstq, [dstq+strideq*2] dec r2d jg .dconly_loop jmp tx2q .end: RET %endif %endmacro %macro LOAD_7ROWS 2 ;src, stride mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] mova m4, [%1+%2*4] mova m5, [%1+%2*5] mova m6, [%1+%2*6] %endmacro %macro SAVE_7ROWS 2 ;src, stride mova [%1+%2*0], m0 mova [%1+%2*1], m1 mova [%1+%2*2], m2 mova [%1+%2*3], m3 mova [%1+%2*4], m4 mova [%1+%2*5], m5 mova [%1+%2*6], m6 %endmacro %macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] punpckhwd m%5, m%4, m%1 ;packed in13 in3 punpcklwd m%1, m%4 ;packed in1 in15 punpcklwd m%4, m%3, m%2 ;packed in9 in7 punpckhwd m%2, m%3 ;packed in5 in11 mova m%7, [o(pd_2048)] ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a psubsw m%6, m%1, m%4 ;low: t9 high: t14 paddsw m%1, m%4 ;low: t8 high: t15 psubsw m%4, m%5, m%2 ;low: t10 high: t13 paddsw m%5, m%2 ;low: t11 high: t12 mova m%2, [o(deint_shuf2)] pshufb m%6, m%2 pshufb m%4, m%2 ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a psubsw m%3, m%1, m%5 ;low: t11a high: t12a paddsw m%1, m%5 ;low: t8a high: t15a psubsw m%5, m%6, m%4 ;low: t10 high: t13 paddsw m%6, m%4 ;low: t9 high: t14 pshufb m%3, m%2 pshufb m%5, m%2 ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a packssdw m%2, m%4 ;low: t11 high: t10a packssdw m%3, m%5 ;low: t12 high: t13a punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 punpcklqdq m%1, m%6 ;low: t8a high: t9 %endmacro INV_TXFM_16X4_FN dct, dct INV_TXFM_16X4_FN dct, adst INV_TXFM_16X4_FN dct, flipadst INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main .pass1_end: punpckhwd m7, m0, m2 ;packed out1, out5 punpcklwd m0, m2 ;packed out0, out4 punpcklwd m2, m1, m3 ;packed out3, out7 punpckhwd m1, m3 ;packed out2, out6 mova [coeffq+16*6], m7 mova m7, [coeffq+16*7] punpckhwd m3, m4, m6 ;packed out9, out13 punpcklwd m4, m6 ;packed out8, out12 punpcklwd m6, m5, m7 ;packed out11, out15 punpckhwd m5, m7 ;packed out10, out14 .pass1_end2: mova m7, [o(pw_16384)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [coeffq+16*6] mova [coeffq+16*6], m7 .pass1_end3: punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high punpcklwd m3, m6 ;packed 9, 10, 13, 15 low punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high punpcklwd m4, m5 ;packed 8, 10, 12, 14 low punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) mova [coeffq+16*7], m3 mova m3, [coeffq+16*6] punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high punpcklwd m3, m2 ;packed 1, 3, 5, 7 low punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high punpcklwd m0, m1 ;packed 0, 2, 4, 6 low punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 
5, 6, 7(1) punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) jmp tx2q .pass2: lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] .pass2_end: mova [coeffq+16*4], m4 mova [coeffq+16*5], m5 mova [coeffq+16*6], m6 lea r3, [dstq+8] call tx2q add coeffq, 16*4 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] mov dstq, r3 jmp tx2q ALIGN function_align cglobal_label .main punpckhqdq m7, m0, m1 ;low:in1 high:in3 punpcklqdq m0, m1 punpcklqdq m1, m2, m3 punpckhqdq m3, m2 ;low:in7 high:in5 mova [coeffq+16*4], m7 mova [coeffq+16*5], m3 mova m7, [coeffq+16*7] punpcklqdq m2, m4, m5 punpckhqdq m4, m5 ;low:in9 high:in11 punpcklqdq m3, m6, m7 punpckhqdq m7, m6 ;low:in15 high:in13 mova [coeffq+16*6], m4 IDCT8_1D_PACKED mova m6, [coeffq+16*4] mova m4, [coeffq+16*5] mova m5, [coeffq+16*6] mova [coeffq+16*4], m1 mova [coeffq+16*5], m2 mova [coeffq+16*6], m3 IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 mova m1, [coeffq+16*4] psubsw m3, m0, m7 ;low:out15 high:out14 paddsw m0, m7 ;low:out0 high:out1 psubsw m7, m1, m5 ;low:out12 high:out13 paddsw m1, m5 ;low:out3 high:out2 mova [coeffq+16*7], m3 mova m2, [coeffq+16*5] mova m3, [coeffq+16*6] psubsw m5, m2, m4 ;low:out11 high:out10 paddsw m2, m4 ;low:out4 high:out5 psubsw m4, m3, m6 ;low:out8 high:out9 paddsw m3, m6 ;low:out7 high:out6 mova m6, m7 ret INV_TXFM_16X4_FN adst, dct INV_TXFM_16X4_FN adst, adst INV_TXFM_16X4_FN adst, flipadst INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main call .main_pass1_end punpckhwd m6, m7, m0 ;packed -out11, -out15 punpcklwd m0, m7 ;packed out0, out4 punpcklwd m7, m3, m4 ;packed -out3, -out7 punpckhwd m4, m3 ;packed out8, out12 mova m1, [coeffq+16*6] punpcklwd m3, m1, m5 ;packed -out1, -out5 punpckhwd m5, m1 ;packed out10, out14 mova m1, [coeffq+16*7] mova [coeffq+16*6], m3 mova [coeffq+16*7], m7 punpckhwd m3, m2, m1 ;packed -out9, -out13 punpcklwd m1, m2 ;packed out2, out6 mova m7, [o(pw_16384)] .pass1_end: REPX {pmulhrsw x, m7}, m0, m1, m4, m5 pxor m2, m2 psubw m2, m7 mova m7, [coeffq+16*6] REPX {pmulhrsw x, m2}, m7, m3, m6 pmulhrsw m2, [coeffq+16*7] mova [coeffq+16*6], m7 jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end ALIGN function_align cglobal_label .main mova [coeffq+16*6], m0 pshufd m0, m1, q1032 pshufd m2, m2, q1032 punpckhwd m1, m6, m0 ;packed in13, in2 punpcklwd m0, m6 ;packed in3, in12 punpckhwd m7, m5, m2 ;packed in11, in4 punpcklwd m2, m5 ;packed in5, in10 mova m6, [o(pd_2048)] ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 psubsw m5, m1, m2 ;low:t10a high:t11a paddsw m1, m2 ;low:t2a high:t3a psubsw m2, m7, m0 ;low:t12a high:t13a paddsw m7, m0 ;low:t4a high:t5a punpcklqdq m0, m5 punpckhwd m0, m5 ;packed t10a, t11a punpcklqdq m5, m2 punpckhwd m2, m5 ;packed t13a, t12a ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 mova [coeffq+16*4], m1 mova [coeffq+16*5], m7 mova m1, [coeffq+16*6] mova m7, [coeffq+16*7] pshufd m1, m1, q1032 pshufd m3, m3, q1032 punpckhwd m5, m7, m1 ;packed in15, in0 punpcklwd m1, m7 ;packed in1, in14 punpckhwd m7, m4, m3 ;packed in9, in6 punpcklwd m3, m4 ;packed in7, in8 ITX_MUL2X_PACK 5, 
4, 6, 201, 4091 ;low:t0 high:t1 ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 psubsw m4, m5, m3 ;low:t8a high:t9a paddsw m5, m3 ;low:t0a high:t1a psubsw m3, m7, m1 ;low:t14a high:t15a paddsw m7, m1 ;low:t6a high:t7a punpcklqdq m1, m4 punpckhwd m1, m4 ;packed t8a, t9a punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t15a, t14a ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 paddsw m4, m1, m2 ;low:t12a high:t13a psubsw m1, m2 ;low:t8a high:t9a psubsw m2, m0, m3 ;low:t14a high:t15a paddsw m0, m3 ;low:t10a high:t11a punpcklqdq m3, m1 punpckhwd m3, m1 ;packed t12a, t13a punpcklqdq m1, m2 punpckhwd m2, m1 ;packed t15a, t14a ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 psubsw m1, m3, m2 ;low:t14a high:t15a paddsw m3, m2 ;low:out2 high:-out13 psubsw m2, m4, m0 ;low:t10 high:t11 paddsw m0, m4 ;low:-out1 high:out14 mova [coeffq+16*6], m0 mova [coeffq+16*7], m3 mova m0, [coeffq+16*4] mova m3, [coeffq+16*5] psubsw m4, m5, m3 ;low:t4 high:t5 paddsw m5, m3 ;low:t0 high:t1 psubsw m3, m0, m7 ;low:t6 high:t7 paddsw m0, m7 ;low:t2 high:t3 punpcklqdq m7, m4 punpckhwd m7, m4 ;packed t4, t5 punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t7, t6 ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a psubsw m4, m5, m0 ;low:t2a high:t3a paddsw m0, m5 ;low:out0 high:-out15 psubsw m5, m7, m3 ;low:t6 high:t7 paddsw m3, m7 ;low:-out3 high:out12 ret ALIGN function_align .main_pass1_end: mova m7, [o(deint_shuf1)] mova [coeffq+16*4], m0 mova [coeffq+16*5], m3 mova m0, [o(pw_2896_m2896)] mova m3, [o(pw_2896_2896)] pshufb m1, m7 ;t14a t15a pshufb m2, m7 ;t10 t11 pshufb m4, m7 ;t2a t3a pshufb m5, m7 ;t6 t7 pmaddwd m7, m0, m2 pmaddwd m2, m3 paddd m7, m6 paddd m2, m6 psrad m7, 12 psrad m2, 12 packssdw m2, m7 ;low:out6 high:-out9 pmaddwd m7, m0, m4 pmaddwd m4, m3 paddd m7, m6 paddd m4, m6 psrad m7, 12 psrad m4, 12 packssdw m4, m7 ;low:-out7 high:out8 pmaddwd m7, m3, m5 pmaddwd m5, m0 paddd m7, m6 paddd m5, m6 psrad m7, 12 psrad m5, 12 packssdw m7, m5 ;low:out4 high:-out11 pmaddwd m5, m3, m1 pmaddwd m1, m0 paddd m5, m6 paddd m1, m6 psrad m5, 12 psrad m1, 12 packssdw m5, m1 ;low:-out5 high:out10 mova m0, [coeffq+16*4] mova m3, [coeffq+16*5] ret ALIGN function_align cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] punpckhqdq m6, m2, m1 ;low:t11 high:t15a punpcklqdq m2, m1 ;low:t10 high:t14a psubsw m1, m2, m6 paddsw m2, m6 punpckhqdq m6, m4, m5 ;low:t3a high:t7 punpcklqdq m4, m5 ;low:t2a high:t6 psubsw m5, m4, m6 paddsw m4, m6 pmulhrsw m1, m7 ;low:-out9 high:out10 pmulhrsw m2, m7 ;low:out6 high:-out5 pmulhrsw m5, m7 ;low:out8 high:-out11 pmulhrsw m4, m7 ;low:-out7 high:out4 punpckhqdq m7, m4, m5 ;low:out4 high:-out11 punpcklqdq m4, m5 ;low:-out7 high:out8 punpckhqdq m5, m2, m1 ;low:-out5 high:out10 punpcklqdq m2, m1 ;low:out6 high:-out9 ret INV_TXFM_16X4_FN flipadst, dct INV_TXFM_16X4_FN flipadst, adst INV_TXFM_16X4_FN flipadst, flipadst INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call m(iadst_16x4_internal_8bpc).main call m(iadst_16x4_internal_8bpc).main_pass1_end punpcklwd m6, m7, m0 ;packed out11, out15 punpckhwd m0, m7 ;packed -out0, -out4 punpckhwd m7, m3, m4 ;packed out3, out7 punpcklwd m4, m3 ;packed -out8, -out12 mova m1, [coeffq+16*6] punpckhwd m3, m1, m5 
;packed out1, out5 punpcklwd m5, m1 ;packed -out10, -out14 mova m1, [coeffq+16*7] mova [coeffq+16*6], m3 mova [coeffq+16*7], m7 punpcklwd m3, m2, m1 ;packed out9, out13 punpckhwd m1, m2 ;packed -out2, -out6 mova m7, [o(pw_m16384)] jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m1, [coeffq+16*6] mova m0, [coeffq+16*5] mova m2, [coeffq+16*7] mova m6, [o(pw_1697x16)] mova m7, [o(pw_16384)] pmulhrsw m4, m6, m1 pmulhrsw m3, m6, m0 pmulhrsw m5, m6, m2 pmulhrsw m4, m7 pmulhrsw m3, m7 pmulhrsw m5, m7 paddsw m1, m4 paddsw m0, m3 paddsw m5, m2 mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] mova m4, [coeffq+16*4] mova [coeffq+16*6], m1 mova [coeffq+16*5], m0 mova [coeffq+16*7], m5 pmulhrsw m0, m6, m2 pmulhrsw m1, m6, m3 pmulhrsw m5, m6, m4 pmulhrsw m0, m7 pmulhrsw m1, m7 pmulhrsw m5, m7 paddsw m2, m0 paddsw m3, m1 paddsw m4, m5 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] pmulhrsw m5, m6, m0 pmulhrsw m6, m1 pmulhrsw m5, m7 pmulhrsw m6, m7 paddsw m0, m5 paddsw m1, m6 mova m6, [coeffq+16*6] mova m5, [coeffq+16*5] punpckhwd m7, m0, m2 ;packed out1, out5 punpcklwd m0, m2 ;packed out0, out4 punpckhwd m2, m1, m3 ;packed out3, out7 punpcklwd m1, m3 ;packed out2, out6 mova [coeffq+16*6], m7 mova m7, [coeffq+16*7] punpckhwd m3, m4, m6 ;packed out9, out13 punpcklwd m4, m6 ;packed out8, out12 punpckhwd m6, m5, m7 ;packed out11, out15 punpcklwd m5, m7 ;packed out10, out14 jmp m(idct_16x4_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] jmp m(idct_16x4_internal_8bpc).pass2_end %macro SAVE_8ROWS 2 ;src, stride mova [%1+%2*0], m0 mova [%1+%2*1], m1 mova [%1+%2*2], m2 mova [%1+%2*3], m3 mova [%1+%2*4], m4 mova [%1+%2*5], m5 mova [%1+%2*6], m6 mova [%1+%2*7], m7 %endmacro %macro INV_TXFM_8X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 8x16, 8, 16*16 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 mova m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 pmulhrsw m0, m2 psrlw m2, 3 ; pw_2048 pmulhrsw m0, m1 pmulhrsw m0, m2 mov r3d, 4 lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET %endif %endmacro INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] .pass1: LOAD_8ROWS coeffq+16*1, 32, 1 mov [rsp+gprsize+16*11], tx2q lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] jmp r3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, [rsp+gprsize+16*11] jmp r3 .pass2: lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] .pass2_pre: mova [coeffq+16*2 ], m1 mova [coeffq+16*6 ], m3 mova [coeffq+16*10], m5 mova [coeffq+16*14], m7 mova m1, m2 mova m2, m4 mova m3, m6 mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] .pass2_main: call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*2 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*10] mova m3, [coeffq+16*14] mova m4, [coeffq+16*3 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*11] mova m7, [coeffq+16*15] call m(idct_16x8_internal_8bpc).main mov r3, dstq lea 
dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ret INV_TXFM_8X16_FN adst, dct INV_TXFM_8X16_FN adst, adst INV_TXFM_8X16_FN adst, flipadst INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] jmp m(idct_8x16_internal_8bpc).pass1 .pass2: lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] .pass2_pre: mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 mova m0, m2 mova m1, m3 mova m2, m4 mova m3, m5 .pass2_main: mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*3 ] mova m6, [coeffq+16*13] mova m7, [coeffq+16*15] mova [rsp+gprsize+16*3], m4 mova [rsp+gprsize+16*4], m5 mova [rsp+gprsize+16*9], m6 mova [rsp+gprsize+32*5], m7 mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end mov r3, dstq lea dstq, [dstq+strideq*8] jmp m(iadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN flipadst, dct INV_TXFM_8X16_FN flipadst, adst INV_TXFM_8X16_FN flipadst, flipadst INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] jmp m(idct_8x16_internal_8bpc).pass1 .pass2: lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] lea r3, [dstq+strideq*8] .pass2_pre: mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 mova m0, m2 mova m1, m3 mova m2, m4 mova m3, m5 .pass2_main: mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*3 ] mova m6, [coeffq+16*13] mova m7, [coeffq+16*15] mova [rsp+gprsize+16*3], m4 mova [rsp+gprsize+16*4], m5 mova [rsp+gprsize+16*9], m6 mova [rsp+gprsize+32*5], m7 mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*7 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*11] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end jmp m(iflipadst_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iflipadst_8x8_internal_8bpc).end INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 32, 1 mov r3, tx2q lea tx2q, [o(.pass1_end)] mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32, 1 mov tx2q, r3 mova [rsp+gprsize+16*1], m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass2: lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*1], m6 mova m7, [o(pw_1697x16)] REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 mova m6, [rsp+gprsize+16*1] mova [rsp+gprsize+16*2], m5 IDTX16 6, 5, 7 mova m5, [rsp+gprsize+16*0] IDTX16 5, 7, 7 mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [rsp+gprsize+16*2] mova [rsp+gprsize+16*0], m5 mova [rsp+gprsize+16*1], m6 
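; The IDTX16 sequence above scales each row by roughly 2*sqrt(2): the pmulhrsw
; by pw_1697x16 contributes about x*1697/2048 (~0.8284*x) and the paddsw pair
; doubles x and adds that product, which appears to match the gain of the
; 16-point identity transform; the pmulhrsw by pw_2048 that follows is then a
; rounded shift right by 4 before the rows reach the shared idct_8x8 store path.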
mova [rsp+gprsize+16*2], m7 jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end %macro INV_TXFM_16X8_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x8, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 4 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif %endmacro INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 32, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*1, 32, 1 call .main mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).pass2_main ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova [rsp+gprsize*2+32*5], m5 mova m6, [o(pd_2048)] ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a psubsw m2, m0, m4 ;t9 paddsw m0, m4 ;t8 psubsw m4, m7, m3 ;t14 paddsw m7, m3 ;t15 ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a mova m3, [rsp+gprsize*2+16*1] mova m5, [rsp+gprsize*2+32*5] mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+32*5], m4 mova m2, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*2], m7 ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a psubsw m4, m2, m3 ;t10 paddsw m2, m3 ;t11 psubsw m3, m1, m5 ;t13 paddsw m1, m5 ;t12 ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a mova m7, [rsp+gprsize*2+32*5] psubsw m6, m0, m2 ;t11a paddsw m0, m2 ;t8a paddsw m2, m7, m3 ;t9 psubsw m7, m3 ;t10 mova m5, [rsp+gprsize*2+16*0] psubsw m3, m5, m0 ;out8 paddsw m0, m5 ;out7 mova [rsp+gprsize*2+32*5], m0 mova m5, [rsp+gprsize*2+16*9] psubsw m0, m5, m2 ;out9 paddsw m2, m5 ;out6 mova [rsp+gprsize*2+16*0], m0 mova [rsp+gprsize*2+16*9], m2 mova m0, [rsp+gprsize*2+16*1] mova m2, [rsp+gprsize*2+16*2] mova [rsp+gprsize*2+16*1], m3 psubsw m5, m0, m4 ;t13 paddsw m0, m4 ;t14 mova m3, [o(pd_2048)] psubsw m4, m2, m1 ;t12a paddsw m1, m2 ;t15a mova [rsp+gprsize*2+16*2], m1 ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 mova m3, [rsp+gprsize*2+16*8] psubsw m2, m3, m5 ;out10 paddsw m3, m5 ;out5 mova m5, [rsp+gprsize*2+16*7] mova [rsp+gprsize*2+16*8], m3 psubsw m3, m5, m4 ;out11 paddsw m5, m4 ;out4 mova m4, [rsp+gprsize*2+16*6] mova [rsp+gprsize*2+16*7], m5 paddsw m5, m4, m6 ;out3 psubsw m4, m6 ;out12 mova m6, [rsp+gprsize*2+16*5] mova [rsp+gprsize*2+16*6], m5 psubsw m5, m6, m7 ;out13 paddsw m6, m7 ;out2 mova m7, [rsp+gprsize*2+16*4] mova [rsp+gprsize*2+16*5], m6 psubsw m6, m7, m0 ;out14 paddsw m7, m0 ;out1 mova m1, [rsp+gprsize*2+16*2] mova m0, [rsp+gprsize*2+16*3] mova [rsp+gprsize*2+16*4], m7 psubsw m7, m0, m1 ;out15 paddsw m0, m1 ;out0 mova [rsp+gprsize*2+16*3], m0 mova m1, [rsp+gprsize*2+16*0] mova m0, [rsp+gprsize*2+16*1] mova [rsp+gprsize*2+16*0], m7 ret 
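; Fixed-point convention used by the butterflies in this file (a sketch of the
; visible pattern, not a spec): the pw_*/pd_* constants appear to be trig
; factors in Q12, e.g. 2896 ~= 4096/sqrt(2), 401 ~= 4096*sin(pi/32),
; 4076 ~= 4096*cos(pi/32).  Assuming ITX_MULSUB_2W (defined earlier in this
; file) follows the same scheme as the explicit pmaddwd/pd_2048/psrad 12
; sequences in the .main_pass1_end paths, each rotation of a register pair
; (a, b) by coefficients (c1, c2) computes, up to operand/sign order:
;
;     out0 = (a*c1 + b*c2 + 2048) >> 12
;     out1 = (a*c2 - b*c1 + 2048) >> 12
;
; The pmulhrsw paths land on the same scale because pmulhrsw(x, k) returns
; (x*k + 0x4000) >> 15, so a pw_*x8 constant (k = c*8) gives (x*c + 2048) >> 12.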
INV_TXFM_16X8_FN adst, dct INV_TXFM_16X8_FN adst, adst INV_TXFM_16X8_FN adst, flipadst INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] pmulhrsw m2, m7, [coeffq+16*14] pmulhrsw m3, m7, [coeffq+16*15] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 pmulhrsw m0, m7, [coeffq+16*6 ] pmulhrsw m1, m7, [coeffq+16*7 ] pmulhrsw m2, m7, [coeffq+16*8 ] pmulhrsw m3, m7, [coeffq+16*9 ] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 pmulhrsw m0, m7, [coeffq+16*2 ] pmulhrsw m1, m7, [coeffq+16*3 ] pmulhrsw m2, m7, [coeffq+16*4 ] pmulhrsw m3, m7, [coeffq+16*5 ] pmulhrsw m4, m7, [coeffq+16*10] pmulhrsw m5, m7, [coeffq+16*11] pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] call .main call .main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(iadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(iadst_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iadst_8x8_internal_8bpc).pass2_main ALIGN function_align cglobal_label .main mova [rsp+gprsize*2+16*0], m1 mova [rsp+gprsize*2+16*1], m2 mova [rsp+gprsize*2+16*2], m6 mova m6, [o(pd_2048)] ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 psubsw m1, m0, m4 ;t10a paddsw m0, m4 ;t2a psubsw m4, m7, m3 ;t11a paddsw m3, m7 ;t3a ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 mova m2, [rsp+gprsize*2+16*0] ;in3 mova m7, [rsp+gprsize*2+16*1] ;in4 mova [rsp+gprsize*2+16*0], m1 ;t11 mova [rsp+gprsize*2+16*1], m4 ;t10 mova m1, [rsp+gprsize*2+16*2] ;in12 mova [rsp+gprsize*2+16*2], m0 ;t2a ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 psubsw m0, m7, m1 ;t12a paddsw m1, m7 ;t4a psubsw m4, m5, m2 ;t13a paddsw m5, m2 ;t5a ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 mova m2, [rsp+gprsize*2+16*8] ;in1 mova m7, [rsp+gprsize*2+16*9] ;in14 mova [rsp+gprsize*2+16*8], m4 ;t12 mova [rsp+gprsize*2+16*9], m0 ;t13 mova m4, [rsp+gprsize*2+16*4] ;in9 mova m0, [rsp+gprsize*2+16*5] ;in6 mova [rsp+gprsize*2+16*4], m1 ;t4a mova [rsp+gprsize*2+16*5], m5 ;t5a ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 psubsw m1, m0, m7 ;t14a paddsw m0, m7 ;t6a psubsw m5, m4, m2 ;t15a paddsw m4, m2 ;t7a ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 mova m2, [rsp+gprsize*2+16*2] ;t2a mova [rsp+gprsize*2+16*2], m5 ;t14 psubsw m7, m2, m0 ;t6 paddsw m2, m0 ;t2 psubsw m0, m3, m4 ;t7 paddsw m3, m4 ;t3 ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a mova m4, [rsp+gprsize*2+16*7] ;in0 mova m5, [rsp+gprsize*2+32*5] ;in15 mova [rsp+gprsize*2+16*7], m3 ;t3 mova [rsp+gprsize*2+32*5], m1 ;t15 mova m1, [rsp+gprsize*2+16*6] ;in7 mova m3, [rsp+gprsize*2+16*3] ;in8 mova [rsp+gprsize*2+16*6], m7 ;t7a mova [rsp+gprsize*2+16*3], m0 ;t6a ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 psubsw m0, m4, m3 ;t8a paddsw m4, m3 ;t0a psubsw m3, m5, m1 ;t9a paddsw m5, m1 ;t1a ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 mova m1, [rsp+gprsize*2+16*4] ;t4a mova m7, [rsp+gprsize*2+16*5] ;t5a 
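; The 16-point ADST above runs with only the eight xmm registers available on
; x86-32, so intermediate t-values are continually staged in the
; [rsp+gprsize*2+16*n] scratch slots and reloaded as later butterflies need
; them; the ;tN / ;in / ;out annotations track which slot holds which value.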
mova [rsp+gprsize*2+16*4], m3 ;t8 mova [rsp+gprsize*2+16*5], m0 ;t9 psubsw m0, m4, m1 ;t4 paddsw m4, m1 ;t0 psubsw m3, m5, m7 ;t5 paddsw m5, m7 ;t1 ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a mova m7, [rsp+gprsize*2+16*3] ;t6a psubsw m1, m4, m2 ;t2a paddsw m4, m2 ;out0 mova [rsp+gprsize*2+16*3], m4 ;out0 mova m4, [rsp+gprsize*2+16*6] ;t7a psubsw m2, m3, m7 ;t6 paddsw m3, m7 ;-out3 mova [rsp+gprsize*2+16*6], m3 ;-out3 psubsw m3, m0, m4 ;t7 paddsw m0, m4 ;out12 mova [rsp+gprsize*2+16*12], m3 mova m3, [rsp+gprsize*2+16*7] ;t3 mova [rsp+gprsize*2+16* 7], m2 ;out4 psubsw m2, m5, m3 ;t3a paddsw m5, m3 ;-out15 mova [rsp+gprsize*2+16*11], m2 mova m2, [rsp+gprsize*2+32*5] ;t15 mova [rsp+gprsize*2+16*10], m1 ;-out7 mova m1, [rsp+gprsize*2+16*0] ;t11 mova [rsp+gprsize*2+16*0 ], m5 ;-out15 mova m3, [rsp+gprsize*2+16*1] ;t10 mova [rsp+gprsize*2+16*1 ], m4 ;-out11 mova m4, [rsp+gprsize*2+16*2] ;t14 mova [rsp+gprsize*2+16*2 ], m0 ;out12 psubsw m0, m3, m4 ;t14a paddsw m3, m4 ;t10a psubsw m5, m1, m2 ;t15a paddsw m1, m2 ;t11a ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 mova m2, [rsp+gprsize*2+16*4] ;t8 mova m4, [rsp+gprsize*2+16*5] ;t9 mova [rsp+gprsize*2+16*4], m3 ;t10a mova [rsp+gprsize*2+16*5], m1 ;t11a mova m3, [rsp+gprsize*2+16*8] ;t12 mova m1, [rsp+gprsize*2+16*9] ;t13 mova [rsp+gprsize*2+16*8], m5 ;t14 mova [rsp+gprsize*2+16*9], m0 ;t15 psubsw m5, m2, m3 ;t12a paddsw m2, m3 ;t8a psubsw m0, m4, m1 ;t13a paddsw m4, m1 ;t9a ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 mova m6, [rsp+gprsize*2+16*4] ;t10a mova m1, [rsp+gprsize*2+16*5] ;t11a psubsw m3, m2, m6 ;t10 paddsw m2, m6 ;-out1 paddsw m6, m4, m1 ;out14 psubsw m4, m1 ;t11 mova [rsp+gprsize*2+16*14], m4 mova [rsp+gprsize*2+16* 4], m2 ;-out1 mova m4, [rsp+gprsize*2+16*8] ;t14 mova m2, [rsp+gprsize*2+16*9] ;t15 mova [rsp+gprsize*2+16* 9], m3 ;out6 psubsw m3, m0, m4 ;t14a paddsw m0, m4 ;out2 psubsw m4, m5, m2 ;t15a paddsw m5, m2 ;-out13 mova [rsp+gprsize*2+16* 5], m0 ;out2 ret ALIGN function_align .main_pass1_end: mova m0, [rsp+gprsize*2+16*14] mova [rsp+gprsize*2+16*14], m5 mova [rsp+gprsize*2+16*15], m6 mova m5, [o(pw_2896_2896)] mova m6, [o(pw_2896_m2896)] mova m7, [o(pd_2048)] punpcklwd m2, m3, m4 punpckhwd m3, m4 pmaddwd m4, m5, m2 pmaddwd m2, m6 pmaddwd m1, m5, m3 pmaddwd m3, m6 REPX {paddd x, m7}, m4, m2, m1, m3 REPX {psrad x, 12}, m4, m1, m2, m3 packssdw m4, m1 ;-out5 packssdw m2, m3 ;out10 mova [rsp+gprsize*2+16* 8], m4 mova m3, [rsp+gprsize*2+16* 9] punpcklwd m1, m3, m0 punpckhwd m3, m0 pmaddwd m0, m5, m1 pmaddwd m1, m6 pmaddwd m4, m5, m3 pmaddwd m3, m6 REPX {paddd x, m7}, m0, m1, m4, m3 REPX {psrad x, 12}, m0, m4, m1, m3 packssdw m0, m4 ;out6 packssdw m1, m3 ;-out9 mova [rsp+gprsize*2+16* 9], m0 mova m0, [rsp+gprsize*2+16* 7] mova m4, [rsp+gprsize*2+16*12] punpcklwd m3, m0, m4 punpckhwd m0, m4 pmaddwd m4, m5, m3 pmaddwd m3, m6 pmaddwd m5, m0 pmaddwd m0, m6 REPX {paddd x, m7}, m4, m3, m5, m0 REPX {psrad x, 12}, m4, m5, m3, m0 packssdw m4, m5 ;out4 packssdw m3, m0 ;-out11 mova [rsp+gprsize*2+16* 7], m4 mova m4, [rsp+gprsize*2+16*10] mova m5, [rsp+gprsize*2+16*11] punpcklwd m0, m4, m5 punpckhwd m4, m5 pmaddwd m5, m0, [o(pw_2896_2896)] pmaddwd m0, m6 pmaddwd m6, m4 pmaddwd m4, [o(pw_2896_2896)] REPX {paddd x, m7}, m5, m0, m6, m4 REPX {psrad x, 12}, m0, m6, m5, m4 packssdw m0, m6 ;out8 packssdw m5, m4 ;-out7 mova [rsp+gprsize*2+16*10], m5 mova m4, [rsp+gprsize*2+16* 2] ;out12 mova m5, [rsp+gprsize*2+16*14] ;-out13 mova m6, [rsp+gprsize*2+16*15] ;out14 ret ALIGN function_align cglobal_label .main_pass2_end mova m7, [o(pw_2896x8)] 
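; Pass-2 epilogue: unlike .main_pass1_end above, which keeps extra precision by
; widening with pmaddwd against pw_2896_2896/pw_2896_m2896, adding pd_2048 and
; shifting right by 12, this path folds the final 1/sqrt(2) scaling into a
; 16-bit pmulhrsw by pw_2896x8, i.e. roughly x -> (x*2896 + 2048) >> 12.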
mova m1, [rsp+gprsize*2+16* 9] mova m2, [rsp+gprsize*2+16*14] paddsw m0, m1, m2 psubsw m1, m2 pmulhrsw m0, m7 ;out6 pmulhrsw m1, m7 ;-out9 mova [rsp+gprsize*2+16* 9], m0 psubsw m2, m3, m4 paddsw m3, m4 pmulhrsw m2, m7 ;out10 pmulhrsw m3, m7 ;-out5 mova [rsp+gprsize*2+16* 8], m3 mova m3, [rsp+gprsize*2+16* 7] mova m4, [rsp+gprsize*2+16*12] paddsw m0, m3, m4 psubsw m3, m4 pmulhrsw m0, m7 ;out4 pmulhrsw m3, m7 ;-out11 mova [rsp+gprsize*2+16* 7], m0 mova m0, [rsp+gprsize*2+16*10] paddsw m4, m0, [rsp+gprsize*2+16*11] psubsw m0, [rsp+gprsize*2+16*11] pmulhrsw m4, m7 ;-out7 pmulhrsw m0, m7 ;out8 mova [rsp+gprsize*2+16*10], m4 mova m4, [rsp+gprsize*2+16*2 ] ;out12 ret INV_TXFM_16X8_FN flipadst, dct INV_TXFM_16X8_FN flipadst, adst INV_TXFM_16X8_FN flipadst, flipadst INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [o(pw_2896x8)] pmulhrsw m0, m7, [coeffq+16*0 ] pmulhrsw m1, m7, [coeffq+16*1 ] pmulhrsw m2, m7, [coeffq+16*14] pmulhrsw m3, m7, [coeffq+16*15] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 pmulhrsw m0, m7, [coeffq+16*6 ] pmulhrsw m1, m7, [coeffq+16*7 ] pmulhrsw m2, m7, [coeffq+16*8 ] pmulhrsw m3, m7, [coeffq+16*9 ] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 pmulhrsw m0, m7, [coeffq+16*2 ] pmulhrsw m1, m7, [coeffq+16*3 ] pmulhrsw m2, m7, [coeffq+16*4 ] pmulhrsw m3, m7, [coeffq+16*5 ] pmulhrsw m4, m7, [coeffq+16*10] pmulhrsw m5, m7, [coeffq+16*11] pmulhrsw m6, m7, [coeffq+16*12] pmulhrsw m7, [coeffq+16*13] call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, tx2q lea tx2q, [o(.pass1_end)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 jmp m(iflipadst_8x8_internal_8bpc).pass1_end .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x8_internal_8bpc).pass2_main .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iflipadst_8x8_internal_8bpc).pass2_main INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*16 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mov r3, tx2q lea tx2q, [o(.pass1_end)] .pass1: mova m0, [o(pw_2896x8)] mova m2, [o(pw_1697x16)] mova m3, [o(pw_16384)] sub coeffq, 8*16 REPX {pmulhrsw x, m0}, m4, m5, m6, m7 pmulhrsw m1, m2, m4 pmulhrsw m1, m3 paddsw m1, m4 ; 1 pmulhrsw m4, m2, m5 pmulhrsw m4, m3 paddsw m4, m5 ; 3 pmulhrsw m5, m2, m6 pmulhrsw m5, m3 paddsw m5, m6 ; 5 pmulhrsw m6, m2, m7 pmulhrsw m6, m3 paddsw m7, m6 ; 7 pmulhrsw m6, m0, [coeffq+16*6] mova [rsp+gprsize+16*0], m4 pmulhrsw m4, m2, m6 pmulhrsw m4, m3 paddsw m6, m4 ; 6 pmulhrsw m4, m0, [coeffq+16*4] mova [rsp+gprsize+16*1], m6 pmulhrsw m6, m2, m4 pmulhrsw m6, m3 paddsw m4, m6 ; 4 pmulhrsw m6, m0, [coeffq+16*2] pmulhrsw m0, [coeffq+16*0] pmulhrsw m2, m6 pmulhrsw m2, m3 paddsw m2, m6 ; 2 pmulhrsw m6, m0, [o(pw_1697x16)] pmulhrsw m6, m3 mova m3, [rsp+gprsize+16*0] paddsw m0, m6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: mova [coeffq+16*1], m4 mova [coeffq+16*3], m5 mova 
[coeffq+16*5], m6 mova [coeffq+16*7], m7 mova m4, [coeffq-16*7] mova m5, [coeffq-16*5] mova m6, [coeffq-16*3] mova m7, [coeffq-16*1] mova [coeffq-16*7], m0 mova [coeffq-16*5], m1 mova [coeffq-16*3], m2 mova [coeffq-16*1], m3 mov tx2q, r3 jmp .pass1 .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iidentity_8x8_internal_8bpc).end .end: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] mov dstq, r3 jmp m(iidentity_8x8_internal_8bpc).end %macro INV_TXFM_16X16_FN 2 ; type1, type2 INV_TXFM_FN %1, %2, 16x16, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 8 lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly .end: RET %endif %endmacro INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*3, 64 call m(idct_16x8_internal_8bpc).main mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] jmp m(idct_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 mov dstq, r3 mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*4 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*12] mova m4, [coeffq+16*1 ] mova m5, [coeffq+16*5 ] mova m6, [coeffq+16*9 ] mova m7, [coeffq+16*13] lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] jmp m(idct_8x16_internal_8bpc).pass2_main %macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 mova m0, [coeffq+16*1 ] mova m1, [coeffq+16*3 ] mova m2, [coeffq+16*29] mova m3, [coeffq+16*31] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 mova m0, [coeffq+16*13] mova m1, [coeffq+16*15] mova m2, [coeffq+16*17] mova m3, [coeffq+16*19] mova [rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 mova m0, [coeffq+16*5 ] mova m1, [coeffq+16*7 ] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*11] mova m4, [coeffq+16*21] mova m5, [coeffq+16*23] mova m6, [coeffq+16*25] mova m7, [coeffq+16*27] %endmacro %macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*2 ] mova m2, [coeffq+16*28] mova m3, [coeffq+16*30] mova [rsp+gprsize+16*7], m0 mova [rsp+gprsize+16*8], m1 mova [rsp+gprsize+16*9], m2 mova [rsp+gprsize+32*5], m3 mova m0, [coeffq+16*12] mova m1, [coeffq+16*14] mova m2, [coeffq+16*16] mova m3, [coeffq+16*18] mova 
[rsp+gprsize+16*3], m2 mova [rsp+gprsize+16*4], m3 mova [rsp+gprsize+16*5], m0 mova [rsp+gprsize+16*6], m1 mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m4, [coeffq+16*20] mova m5, [coeffq+16*22] mova m6, [coeffq+16*24] mova m7, [coeffq+16*26] %endmacro INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, adst INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*17, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*1, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_8192)] jmp m(iadst_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] jmp m(iadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] mov dstq, r3 lea r3, [dstq+8] jmp m(iadst_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 mov dstq, r3 mova m4, [coeffq+16*0 ] mova m5, [coeffq+16*2 ] mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m6, [coeffq+16*12] mova m7, [coeffq+16*14] mova [rsp+gprsize+16*7], m4 mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] jmp m(iadst_8x16_internal_8bpc).pass2_main INV_TXFM_16X16_FN flipadst, dct INV_TXFM_16X16_FN flipadst, adst INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mov r3, tx2q lea tx2q, [o(.pass1_end)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+16*1, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+16*17, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS coeffq+16* 0, 32 mova [rsp+gprsize+16*0], m7 mov tx2q, r3 mova m7, [o(pw_m8192)] jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 .pass2: lea tx2q, [o(.end)] lea r3, [dstq+8] jmp m(iflipadst_8x16_internal_8bpc).pass2_pre .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add 
coeffq, 32*8 mova m4, [coeffq+16*0 ] mova m5, [coeffq+16*2 ] mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*6 ] mova m2, [coeffq+16*8 ] mova m3, [coeffq+16*10] mova m6, [coeffq+16*12] mova m7, [coeffq+16*14] mova [rsp+gprsize+16*7], m4 mova [rsp+gprsize+16*8], m5 mova [rsp+gprsize+16*5], m6 mova [rsp+gprsize+16*6], m7 lea tx2q, [o(.end2)] mov dstq, r3 jmp m(iflipadst_8x16_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp m(iflipadst_8x8_internal_8bpc).end %macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 pmulhrsw m%2, m%3, m%1 psraw m%2, 1 pavgw m%1, m%2 %endmacro INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16*17 mov r3, tx2q lea tx2q, [o(.pass1_end)] .pass1: mova m6, [o(pw_1697x16)] mova m7, [coeffq+32*6] mova m0, [coeffq+32*0] mova m1, [coeffq+32*1] mova m2, [coeffq+32*2] mova m3, [coeffq+32*3] mova m4, [coeffq+32*4] REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 mova m5, [coeffq+32*5] mova [rsp+gprsize+16*1], m7 IDTX16B 5, 7, 6 mova m7, [coeffq+32*7] IDTX16B 7, 6, 6 jmp m(idct_8x8_internal_8bpc).pass1_end3 .pass1_end: SAVE_8ROWS coeffq, 32 sub coeffq, 16 lea tx2q, [o(.pass1_end1)] jmp .pass1 .pass1_end1: SAVE_8ROWS coeffq, 32 sub coeffq, 15*16 lea tx2q, [o(.pass1_end2)] jmp .pass1 .pass1_end2: SAVE_8ROWS coeffq, 32 sub coeffq, 16 mov tx2q, r3 jmp .pass1 .pass2: lea r3, [dstq+8] lea tx2q, [o(.end1)] .end: mova [rsp+gprsize+16*0], m7 mova [rsp+gprsize+16*1], m4 mova m7, [o(pw_1697x16)] REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 mova m4, [o(pw_2048)] pmulhrsw m5, m4 pmulhrsw m6, m4 mova [rsp+gprsize+16*2], m5 mova m5, [rsp+gprsize+16*1] mova [rsp+gprsize+16*1], m6 IDTX16 5, 6, 7 mova m6, [rsp+gprsize+16*0] IDTX16 6, 7, 7 REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 pmulhrsw m4, m5 mova [rsp+gprsize+16*0], m6 jmp m(idct_8x8_internal_8bpc).end3 .end1: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*2] jmp .end .end2: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 32*8 LOAD_8ROWS coeffq, 32 lea tx2q, [o(.end3)] mov dstq, r3 jmp .end .end3: LOAD_8ROWS coeffq+16*1, 32 lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] lea dstq, [dstq+strideq*2] jmp .end cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_8x32_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd pmulhrsw m0, m2 psrlw m2, 2 ;pw_2048 pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 mov r3d, 8 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop .end: RET cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 cmp eobd, 106 jle .fast LOAD_8ROWS coeffq+16*3, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1: mova [rsp+gprsize+16*9 ], m0 ;in24 mova [rsp+gprsize+16*10], m4 ;in28 mova [rsp+gprsize+16*17], m2 ;in26 mova [rsp+gprsize+16*18], m6 ;in30 mova [rsp+gprsize+16*31], m1 ;in25 mova [rsp+gprsize+16*30], m3 ;in27 mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 LOAD_8ROWS coeffq+16*2, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_1: mova 
[rsp+gprsize+16*7 ], m0 ;in16 mova [rsp+gprsize+16*8 ], m4 ;in20 mova [rsp+gprsize+16*15], m2 ;in18 mova [rsp+gprsize+16*16], m6 ;in22 mova [rsp+gprsize+16*33], m1 ;in17 mova [rsp+gprsize+16*28], m3 ;in19 mova [rsp+gprsize+16*29], m5 ;in21 mova [rsp+gprsize+16*32], m7 ;in23 .fast: LOAD_8ROWS coeffq+16*1, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: mova [rsp+gprsize+16*5 ], m0 ;in8 mova [rsp+gprsize+16*6 ], m4 ;in12 mova [rsp+gprsize+16*13], m2 ;in10 mova [rsp+gprsize+16*14], m6 ;in14 mova [rsp+gprsize+16*21], m1 ;in9 mova [rsp+gprsize+16*24], m3 ;in11 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: mova [rsp+gprsize+16*11], m2 ;in2 mova [rsp+gprsize+16*12], m6 ;in6 mova [rsp+gprsize+16*19], m1 ;in1 mova [rsp+gprsize+16*26], m3 ;in3 mova [rsp+gprsize+16*23], m5 ;in5 mova [rsp+gprsize+16*22], m7 ;in7 mova m1, m4 ;in4 mova m2, [rsp+gprsize+16*5 ] ;in8 mova m3, [rsp+gprsize+16*6 ] ;in12 cmp eobd, 106 jg .full pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 mova m0, [rsp+gprsize+16*11] mova m1, [rsp+gprsize+16*12] mova m2, [rsp+gprsize+16*13] mova m3, [rsp+gprsize+16*14] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main_fast jmp .pass2 .full: mova m4, [rsp+gprsize+16*7 ] ;in16 mova m5, [rsp+gprsize+16*8 ] ;in20 mova m6, [rsp+gprsize+16*9 ] ;in24 mova m7, [rsp+gprsize+16*10] ;in28 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call .main .pass2: lea r3, [o(.end6)] .end: mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.end2)] .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ 8, 9, 10, 11, 12, 13, 14, 15, \ 16, 17, 18, 19, 20, 21, 22, 23, \ 24, 25, 26, 27, 28, 29, 30, 31 jmp tx2q .end2: lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).end .end3: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).end .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).end .end5: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea dstq, [dstq+strideq*2] mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).end .end6: ret ALIGN function_align cglobal_label .main_veryfast mova m0, [rsp+gprsize*2+16*19] ;in1 pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 pmulhrsw m0, [o(pw_201x8)] ;t16,t17 mova m7, [o(pd_2048)] mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*34], m3 ;t31 ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*20], m3 ;t17a mova [rsp+gprsize*2+16*33], m0 ;t30a mova m1, [rsp+gprsize*2+16*22] ;in7 pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 mova [rsp+gprsize*2+16*22], m1 ;t19 mova [rsp+gprsize*2+16*31], m2 ;t28 ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m2 ;t18a mova [rsp+gprsize*2+16*32], m1 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 pmulhrsw m0, [o(pw_995x8)] ;t20, t21 mova 
[rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*30], m3 ;t27 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*24], m3 ;t21a mova [rsp+gprsize*2+16*29], m0 ;t26a mova m2, [rsp+gprsize*2+16*26] ;in3 pxor m0, m0 mova m3, m0 pmulhrsw m1, m2, [o(pw_4052x8)] pmulhrsw m2, [o(pw_m601x8)] jmp .main2 ALIGN function_align cglobal_label .main_fast ;bottom half is zero mova m0, [rsp+gprsize*2+16*19] ;in1 mova m1, [rsp+gprsize*2+16*20] ;in15 pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a pmulhrsw m0, [o(pw_201x8)] ;t16a pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a pmulhrsw m1, [o(pw_m2751x8)] ;t17a mova m7, [o(pd_2048)] psubsw m4, m0, m1 ;t17 paddsw m0, m1 ;t16 psubsw m5, m3, m2 ;t30 paddsw m3, m2 ;t31 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*20], m5 ;t17a mova [rsp+gprsize*2+16*33], m4 ;t30a mova [rsp+gprsize*2+16*34], m3 ;t31 mova m0, [rsp+gprsize*2+16*21] ;in9 mova m1, [rsp+gprsize*2+16*22] ;in7 pmulhrsw m3, m0, [o(pw_3703x8)] pmulhrsw m0, [o(pw_1751x8)] pmulhrsw m2, m1, [o(pw_3857x8)] pmulhrsw m1, [o(pw_m1380x8)] psubsw m4, m1, m0 ;t18 paddsw m0, m1 ;t19 psubsw m5, m2, m3 ;t29 paddsw m3, m2 ;t28 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*22], m0 ;t19 mova [rsp+gprsize*2+16*31], m3 ;t28 mova [rsp+gprsize*2+16*32], m4 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 mova m1, [rsp+gprsize*2+16*24] ;in11 pmulhrsw m3, m0, [o(pw_3973x8)] pmulhrsw m0, [o(pw_995x8)] pmulhrsw m2, m1, [o(pw_3513x8)] pmulhrsw m1, [o(pw_m2106x8)] psubsw m4, m0, m1 ;t21 paddsw m0, m1 ;t20 psubsw m5, m3, m2 ;t26 paddsw m3, m2 ;t27 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m4 ;t26a mova [rsp+gprsize*2+16*30], m3 ;t27 mova m0, [rsp+gprsize*2+16*25] ;in13 mova m2, [rsp+gprsize*2+16*26] ;in3 pmulhrsw m3, m0, [o(pw_3290x8)] pmulhrsw m0, [o(pw_2440x8)] pmulhrsw m1, m2, [o(pw_4052x8)] pmulhrsw m2, [o(pw_m601x8)] jmp .main2 ALIGN function_align cglobal_label .main mova m7, [o(pd_2048)] mova m0, [rsp+gprsize*2+16*19] ;in1 mova m1, [rsp+gprsize*2+16*20] ;in15 mova m2, [rsp+gprsize*2+16*33] ;in17 mova m3, [rsp+gprsize*2+16*34] ;in31 ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a psubsw m4, m0, m2 ;t17 paddsw m0, m2 ;t16 psubsw m5, m3, m1 ;t30 paddsw m3, m1 ;t31 ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a mova [rsp+gprsize*2+16*19], m0 ;t16 mova [rsp+gprsize*2+16*20], m5 ;t17a mova [rsp+gprsize*2+16*33], m4 ;t30a mova [rsp+gprsize*2+16*34], m3 ;t31 mova m0, [rsp+gprsize*2+16*21] ;in9 mova m1, [rsp+gprsize*2+16*22] ;in7 mova m2, [rsp+gprsize*2+16*31] ;in25 mova m3, [rsp+gprsize*2+16*32] ;in23 ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a psubsw m4, m2, m0 ;t18 paddsw m0, m2 ;t19 psubsw m5, m1, m3 ;t29 paddsw m3, m1 ;t28 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*22], m0 ;t19 mova [rsp+gprsize*2+16*31], m3 ;t28 mova [rsp+gprsize*2+16*32], m4 ;t29a mova m0, [rsp+gprsize*2+16*23] ;in5 mova m1, [rsp+gprsize*2+16*24] ;in11 mova m2, [rsp+gprsize*2+16*29] ;in21 mova m3, [rsp+gprsize*2+16*30] ;in27 ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a psubsw m4, m0, m2 ;t21 paddsw m0, m2 ;t20 psubsw m5, m3, m1 ;t26 paddsw m3, m1 ;t27 ITX_MULSUB_2W 5, 4, 1, 2, 7, 
3406, 2276 ;t21a, t26a mova [rsp+gprsize*2+16*23], m0 ;t20 mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m4 ;t26a mova [rsp+gprsize*2+16*30], m3 ;t27 mova m0, [rsp+gprsize*2+16*25] ;in13 mova m1, [rsp+gprsize*2+16*26] ;in3 mova m2, [rsp+gprsize*2+16*27] ;in29 mova m3, [rsp+gprsize*2+16*28] ;in19 ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a .main2: psubsw m4, m2, m0 ;t22 paddsw m0, m2 ;t23 psubsw m5, m1, m3 ;t25 paddsw m3, m1 ;t24 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a mova m2, [rsp+gprsize*2+16*24] ;t21a psubsw m1, m5, m2 ;t21 paddsw m5, m2 ;t22 mova [rsp+gprsize*2+16*25], m5 ;t22 mova m2, [rsp+gprsize*2+16*29] ;t26a psubsw m5, m4, m2 ;t26 paddsw m4, m2 ;t25 mova [rsp+gprsize*2+16*28], m4 ;t25 ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a mova [rsp+gprsize*2+16*24], m5 ;t21a mova [rsp+gprsize*2+16*29], m1 ;t26a mova m1, [rsp+gprsize*2+16*23] ;t20 mova m5, [rsp+gprsize*2+16*30] ;t27 psubsw m2, m0, m1 ;t20a paddsw m0, m1 ;t23a psubsw m6, m3, m5 ;t27a paddsw m3, m5 ;t24a ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 mova [rsp+gprsize*2+16*26], m0 ;t23a mova [rsp+gprsize*2+16*27], m3 ;t24a mova [rsp+gprsize*2+16*30], m2 ;t27 mova m0, [rsp+gprsize*2+16*20] ;t17a mova m1, [rsp+gprsize*2+16*21] ;t18a mova m2, [rsp+gprsize*2+16*32] ;t29a mova m3, [rsp+gprsize*2+16*33] ;t30a psubsw m4, m0, m1 ;t18 paddsw m0, m1 ;t17 psubsw m5, m3, m2 ;t29 paddsw m3, m2 ;t30 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a mova [rsp+gprsize*2+16*20], m0 ;t17 mova [rsp+gprsize*2+16*21], m5 ;t18a mova [rsp+gprsize*2+16*32], m4 ;t29a mova [rsp+gprsize*2+16*33], m3 ;t30 mova m0, [rsp+gprsize*2+16*19] ;t16 mova m1, [rsp+gprsize*2+16*22] ;t19 mova m2, [rsp+gprsize*2+16*31] ;t28 mova m3, [rsp+gprsize*2+16*34] ;t31 psubsw m4, m0, m1 ;t19a paddsw m0, m1 ;t16a psubsw m5, m3, m2 ;t28a paddsw m3, m2 ;t31a ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 mova m2, [rsp+gprsize*2+16*15] ;tmp12 psubsw m1, m5, m6 ;t20a paddsw m5, m6 ;t19a psubsw m6, m2, m5 ;out19 paddsw m2, m5 ;out12 mova m5, [rsp+gprsize*2+16*30] ;t27 mova [rsp+gprsize*2+16*22], m6 ;out19 mova [rsp+gprsize*2+16*15], m2 ;out12 psubsw m6, m4, m5 ;t27a paddsw m4, m5 ;t28a ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 psubsw m5, m2, m4 ;out28 paddsw m2, m4 ;out3 mova m4, [rsp+gprsize*2+16*14] ;tmp11 mova [rsp+gprsize*2+16*31], m5 ;out28 mova [rsp+gprsize*2+16*6 ], m2 ;out3 psubsw m5, m4, m6 ;out20 paddsw m4, m6 ;out11 mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 mova [rsp+gprsize*2+16*23], m5 ;out20 mova [rsp+gprsize*2+16*14], m4 ;out11 psubsw m5, m2, m1 ;out27 paddsw m2, m1 ;out4 mova m1, [rsp+gprsize*2+16*26] ;t23a mova m4, [rsp+gprsize*2+16*27] ;t24a mova [rsp+gprsize*2+16*30], m5 ;out27 mova [rsp+gprsize*2+16*7 ], m2 ;out4 psubsw m5, m0, m1 ;t23 paddsw m0, m1 ;t16 psubsw m2, m3, m4 ;t24 paddsw m3, m4 ;t31 ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a mova m6, [rsp+gprsize*2+16*18] ;tmp15 psubsw m4, m6, m0 ;out16 paddsw m6, m0 ;out15 mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 mova m1, [rsp+gprsize*2+16*11] ;tmp8 mova [rsp+gprsize*2+16*18], m6 ;out15 mova [rsp+gprsize*2+16*19], m4 ;out16 psubsw m6, m0, m3 ;out31 paddsw m0, m3 ;out0 psubsw m4, m1, m2 ;out23 paddsw m1, m2 ;out8 mova m3, [rsp+gprsize*2+16*10] ;tmp7 mova [rsp+gprsize*2+16*34], m6 ;out31 mova [rsp+gprsize*2+16*11], m1 ;out8 mova [rsp+gprsize*2+16*26], m4 ;out23 paddsw m6, m3, m5 ;out7 psubsw m3, m5 ;out24 mova m1, [rsp+gprsize*2+16*20] ;t17 mova m5, 
[rsp+gprsize*2+16*25] ;t22 mova m2, [rsp+gprsize*2+16*17] ;tmp14 mova [rsp+gprsize*2+16*27], m3 ;out24 psubsw m4, m1, m5 ;t22a paddsw m1, m5 ;t17a psubsw m3, m2, m1 ;out17 paddsw m2, m1 ;out14 mova m5, [rsp+gprsize*2+16*28] ;t25 mova m1, [rsp+gprsize*2+16*33] ;t30 mova [rsp+gprsize*2+16*17], m2 ;out14 mova [rsp+gprsize*2+16*20], m3 ;out17 psubsw m2, m1, m5 ;t25a paddsw m1, m5 ;t30a ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 psubsw m3, m5, m1 ;out30 paddsw m5, m1 ;out1 mova m1, [rsp+gprsize*2+16*12] ;tmp9 mova [rsp+gprsize*2+16*33], m3 ;out30 mova [rsp+gprsize*2+16*4 ], m5 ;out1 psubsw m3, m1, m2 ;out22 paddsw m1, m2 ;out9 mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 mova [rsp+gprsize*2+16*25], m3 ;out22 mova [rsp+gprsize*2+16*12], m1 ;out9 psubsw m3, m5, m4 ;out25 paddsw m5, m4 ;out6 mova m4, [rsp+gprsize*2+16*21] ;t18a mova m1, [rsp+gprsize*2+16*24] ;t21a mova m2, [rsp+gprsize*2+16*16] ;tmp13 mova [rsp+gprsize*2+16*28], m3 ;out25 mova [rsp+gprsize*2+16*9 ], m5 ;out6 paddsw m3, m4, m1 ;t18 psubsw m4, m1 ;t21 psubsw m5, m2, m3 ;out18 paddsw m2, m3 ;out13 mova m1, [rsp+gprsize*2+16*29] ;t26a mova m3, [rsp+gprsize*2+16*32] ;t29a mova [rsp+gprsize*2+16*21], m5 ;out18 mova [rsp+gprsize*2+16*16], m2 ;out13 psubsw m5, m3, m1 ;t26 paddsw m3, m1 ;t29 ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 psubsw m1, m2, m3 ;out29 paddsw m2, m3 ;out2 mova m3, [rsp+gprsize*2+16*13] ;tmp10 mova [rsp+gprsize*2+16*32], m1 ;out29 psubsw m7, m3, m5 ;out21 paddsw m3, m5 ;out10 mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 mova [rsp+gprsize*2+16*24], m7 ;out21 mova [rsp+gprsize*2+16*13], m3 ;out10 psubsw m1, m5, m4 ;out26 paddsw m5, m4 ;out5 mova m7, m6 ;out7 mova m3, [rsp+gprsize*2+16*6 ] ;out3 mova m4, [rsp+gprsize*2+16*7 ] ;out4 mova [rsp+gprsize*2+16*29], m1 ;out26 mova m6, [rsp+gprsize*2+16*9 ] ;out6 mova m1, [rsp+gprsize*2+16*4 ] ;out1 ret cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x8_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 8 lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 movd m2, [o(pw_2048)] ;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m5, m5 .loop: mova m1, [dstq+16*0] mova m3, [dstq+16*1] punpckhbw m2, m1, m5 punpcklbw m1, m5 punpckhbw m4, m3, m5 punpcklbw m3, m5 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 packuswb m3, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m3 add dstq, strideq dec r3d jg .loop jmp tx2q .end: RET cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 64 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*2, 64 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+16*1, 32 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 cmp eobd, 106 jg .full call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: LOAD_8ROWS coeffq+16*17, 32 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova 
[rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main .pass2: mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.end)] jmp m(idct_8x32_internal_8bpc).end1 .end: mova m7, [o(pw_8192)] lea tx2q, [o(.end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end1: lea r3, [dstq+8] lea tx2q, [o(.end2)] jmp m(idct_8x8_internal_8bpc).pass2_main .end2: LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end3: mov dstq, r3 add r3, 8 lea tx2q, [o(.end4)] jmp m(idct_8x8_internal_8bpc).pass2_main .end4: LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end5: mov dstq, r3 add r3, 8 lea tx2q, [o(.end6)] jmp m(idct_8x8_internal_8bpc).pass2_main .end6: LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .end7: mov dstq, r3 lea tx2q, [o(.end8)] jmp m(idct_8x8_internal_8bpc).pass2_main .end8: ret cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 cmovns tx2d, r5d mov r3d, tx2d %if ARCH_X86_32 LEA r5, $$ %endif lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] .loop: LOAD_8ROWS coeffq+16*0, 64 paddsw m6, [o(pw_5)] mova [rsp+16*1], m6 mova m6, [o(pw_5)] REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 call m(idct_8x8_internal_8bpc).pass1_end3 REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 add coeffq, 16 dec r3d jg .loop RET cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r5d, 4 mov tx2d, 2 cmp eobd, 107 cmovns tx2d, r5d mov r3d, tx2d %if ARCH_X86_32 LEA r5, $$ %endif .loop: LOAD_8ROWS coeffq+16*0, 16 pmulhrsw m6, [o(pw_4096)] mova [rsp+16*1], m6 mova m6, [o(pw_4096)] REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] call m(idct_8x8_internal_8bpc).pass1_end3 mov [rsp+16*3], dstq mova [rsp+16*2], m5 mova [rsp+16*1], m6 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] call m(idct_8x8_internal_8bpc).end3 add coeffq, 16*8 mov dstq, [rsp+16*3] lea dstq, [dstq+8] dec r3d jg .loop jnc .loop RET cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_16x32_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r2d, 16 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*5, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: mova [coeffq+16*1 ], m0 ;in8 mova [coeffq+16*5 ], m4 ;in12 mova [rsp+gprsize+16*13], m2 ;in10 mova 
[rsp+gprsize+16*14], m6 ;in14 mova [rsp+gprsize+16*21], m1 ;in9 mova [rsp+gprsize+16*24], m3 ;in11 mova [rsp+gprsize+16*25], m5 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: mova [rsp+gprsize+16*11], m2 ;in2 mova [rsp+gprsize+16*12], m6 ;in6 mova [rsp+gprsize+16*19], m1 ;in1 mova [rsp+gprsize+16*26], m3 ;in3 mova [rsp+gprsize+16*23], m5 ;in5 mova [rsp+gprsize+16*22], m7 ;in7 cmp eobd, 150 jg .full mova m1, m4 ;in4 mova m2, [coeffq+16*1 ] ;in8 mova m3, [coeffq+16*5 ] ;in12 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [rsp+gprsize+16*11] ;in2 mova m1, [rsp+gprsize+16*12] ;in6 mova m2, [rsp+gprsize+16*13] ;in10 mova m3, [rsp+gprsize+16*14] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast jmp .pass2 .full: mova [coeffq+16*0 ], m0 ;in0 mova [coeffq+16*4 ], m4 ;in4 LOAD_8ROWS coeffq+16*2, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*6, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: mova [coeffq+16*2 ], m0 ;in16 mova [coeffq+16*6 ], m4 ;in20 mova [rsp+gprsize+16*15], m2 ;in18 mova [rsp+gprsize+16*16], m6 ;in22 mova [rsp+gprsize+16*33], m1 ;in17 mova [rsp+gprsize+16*28], m3 ;in19 mova [rsp+gprsize+16*29], m5 ;in21 mova [rsp+gprsize+16*32], m7 ;in23 LOAD_8ROWS coeffq+16*3, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*7, 128, 1 call m(idct_16x8_internal_8bpc).main lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: mova [rsp+gprsize+16*17], m2 ;in26 mova [rsp+gprsize+16*18], m6 ;in30 mova [rsp+gprsize+16*31], m1 ;in25 mova [rsp+gprsize+16*30], m3 ;in27 mova [rsp+gprsize+16*27], m5 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 mova m6, m0 ;in24 mova m7, m4 ;in28 mova m0, [coeffq+16*0 ] ;in0 mova m1, [coeffq+16*4 ] ;in4 mova m2, [coeffq+16*1 ] ;in8 mova m3, [coeffq+16*5 ] ;in12 mova m4, [coeffq+16*2 ] ;in16 mova m5, [coeffq+16*6 ] ;in20 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3 , 16 LOAD_8ROWS rsp+gprsize+16*11, 16 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main .pass2: mov [rsp+gprsize*1+16*35], eobd lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 lea r3, [o(.end)] jmp m(idct_8x32_internal_8bpc).end .end: mov dstq, [rsp+gprsize*2+16*35] mov eobd, [rsp+gprsize*1+16*35] add coeffq, 16*32 mova m0, [coeffq+16*4 ] ;in1 mova m1, [coeffq+16*12] ;in3 mova m2, [coeffq+16*20] ;in5 mova m3, [coeffq+16*28] ;in7 mova m4, [coeffq+16*5 ] 
;in9 mova m5, [coeffq+16*13] ;in11 mova m6, [coeffq+16*21] ;in13 mova m7, [coeffq+16*29] ;in15 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mova m0, [coeffq+16*0 ] ;in0 mova m1, [coeffq+16*16] ;in4 mova m2, [coeffq+16*1 ] ;in8 mova m3, [coeffq+16*17] ;in12 cmp eobd, 150 jg .full1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 mova m1, [coeffq+16*24] ;in6 mova m2, [coeffq+16*9 ] ;in10 mova m3, [coeffq+16*25] ;in14 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast jmp m(idct_8x32_internal_8bpc).pass2 .full1: mova m4, [coeffq+16*2 ] ;in16 mova m5, [coeffq+16*18] ;in20 mova m6, [coeffq+16*3 ] ;in24 mova m7, [coeffq+16*19] ;in26 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] ;in2 mova m1, [coeffq+16*24] ;in6 mova m2, [coeffq+16*9 ] ;in10 mova m3, [coeffq+16*25] ;in14 mova m4, [coeffq+16*10] ;in18 mova m5, [coeffq+16*26] ;in22 mova m6, [coeffq+16*11] ;in26 mova m7, [coeffq+16*27] ;in30 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*6 ] ;in17 mova m1, [coeffq+16*14] ;in19 mova m2, [coeffq+16*22] ;in21 mova m3, [coeffq+16*30] ;in23 mova m4, [coeffq+16*7 ] ;in25 mova m5, [coeffq+16*15] ;in27 mova m6, [coeffq+16*23] ;in29 mova m7, [coeffq+16*31] ;in31 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp m(idct_8x32_internal_8bpc).pass2 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x16_internal_8bpc) call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*11, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*19, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 add coeffq, 16*16 lea dstq, [r3+8] LOAD_8ROWS rsp+16*27, 16 mova [rsp+16*0], m7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end call m(idct_8x16_internal_8bpc).pass2 RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 16 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 add coeffq, 16 lea r3, [o(.pass1_end1)] .pass1: LOAD_8ROWS coeffq+16*0, 128, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+16*4, 128, 1 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+16*2, 64, 1 mova [rsp+gprsize+16*19], 
m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 LOAD_8ROWS coeffq+16*34, 64, 1 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main .pass1_end: mova [rsp+gprsize+16*0 ], m7 mov tx2q, r3 jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+16*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+16*16, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+16*32, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0 ], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+16*48, 32 sub coeffq, 16 lea r3, [o(.end)] jmp .pass1 .end: ret cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r4d, eobd cmp eobd, 43 ;if (eob > 43) sbb r3d, r3d ; iteration_count++ cmp r4d, 150 ;if (eob > 150) sbb r3d, 0 ; iteration_count++ cmp r4d, 278 ;if (eob > 278) sbb r3d, -4 ; iteration_count++ %if ARCH_X86_32 LEA r5, $$ %endif lea r4, [dstq+8] mov [rsp+16*3], r4 mov [rsp+gprsize+16*3], r3d mov [rsp+gprsize*2+16*3], coeffq .loop: LOAD_8ROWS coeffq, 64, 1 mova [rsp+16*1], m6 pxor m6, m6 REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*0], m2 mova [rsp+16*1], m3 mova [rsp+16*2], m4 mova m3, [o(pw_1697x16)] mova m4, [o(pw_16384)] REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 mova m2, [o(pw_8192)] REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 mova m2, [rsp+16*0] mova [rsp+16*0], m7 IDTX16 2, 7, 3, 4 mova m7, [rsp+16*2] mova [rsp+16*2], m5 IDTX16 7, 5, 3, 4 mova m5, [rsp+16*1] mova [rsp+16*1], m6 pmulhrsw m3, m5 pmulhrsw m3, m4 psrlw m4, 1 ; pw_8192 paddsw m3, m5 pmulhrsw m2, m4 pmulhrsw m3, m4 pmulhrsw m4, m7 call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] add coeffq, 16 dec r3d jg .loop mov coeffq, [rsp+gprsize*2+16*3] add coeffq, 64*8 mov r3d, [rsp+gprsize+16*3] xor dstq, dstq mov [rsp+gprsize+16*3], dstq mov dstq, [rsp+16*3] test r3d, r3d jnz .loop RET cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 mov r4d, 12 ;0100b mov r5d, 136 ;1000 1000b cmp eobd, 44 ;if (eob > 43) cmovns r4d, r5d ; iteration_count+2 cmp eobd, 151 ;if (eob > 150) mov r3d, 34952 ;1000 1000 1000 1000b cmovs r3d, r4d ; iteration_count += 4 %if ARCH_X86_32 LEA r5, $$ %endif lea r4, [dstq+8] mov [rsp+16*3], r4 .loop: LOAD_8ROWS coeffq, 32, 1 REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 mova [rsp+16*1], m6 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end3 mova [rsp+16*1], m5 mova [rsp+16*2], m6 mova m6, [o(pw_1697x16)] REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 pmulhrsw m7, [o(pw_2048)] mova m5, [rsp+16*1] mova [rsp+16*0], m7 IDTX16 5, 7, 6 mova m7, [rsp+16*2] IDTX16 7, 6, 6 mova m6, [o(pw_2048)] REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 mova [rsp+16*2], 
m5 mova [rsp+16*1], m7 call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 .loop_end: add coeffq, 16 shr r3d, 2 jz .ret test r3d, 2 jnz .loop mov r4d, r3d and r4d, 1 lea coeffq, [coeffq+r4*8+32*7] mov dstq, [rsp+16*3] lea r4, [dstq+8] mov [rsp+16*3], r4 jmp .loop .ret: RET cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x32_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 32 lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*35], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*35], coeffq .pass1_loop: LOAD_8ROWS coeffq+64*1, 64*2 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov tx2d, [rsp+gprsize*1+16*35] test tx2d, tx2d jl .fast .full: LOAD_8ROWS coeffq+64*0, 64*4 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*17, 64*2 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: mova m0, [coeffq+256*0] mova m1, [coeffq+256*1] mova m2, [coeffq+256*2] mova m3, [coeffq+256*3] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+128*1] mova m1, [coeffq+128*3] mova m2, [coeffq+128*5] mova m3, [coeffq+128*7] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 add coeffq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*2+16*35] mov r3d, 4 lea tx2q, [o(.pass2_end)] .pass2_loop: mov [rsp+gprsize*3+16*35], r3d lea r3, [dstq+8] mov [rsp+gprsize*2+16*35], r3 mova m0, [coeffq+16*4 ] mova m1, [coeffq+16*12] mova m2, [coeffq+16*20] mova m3, [coeffq+16*28] mova m4, [coeffq+16*5 ] mova m5, [coeffq+16*13] mova m6, [coeffq+16*21] mova m7, 
[coeffq+16*29] mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov eobd, [rsp+gprsize*1+16*35] test eobd, eobd jl .fast1 .full1: mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*16] mova m2, [coeffq+16*1 ] mova m3, [coeffq+16*17] mova m4, [coeffq+16*2 ] mova m5, [coeffq+16*18] mova m6, [coeffq+16*3 ] mova m7, [coeffq+16*19] call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova m4, [coeffq+16*10] mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, [coeffq+16*27] call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*6 ] mova m1, [coeffq+16*14] mova m2, [coeffq+16*22] mova m3, [coeffq+16*30] mova m4, [coeffq+16*7 ] mova m5, [coeffq+16*15] mova m6, [coeffq+16*23] mova m7, [coeffq+16*31] mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova [rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp tx2q .fast1: mova m0, [coeffq+16*0 ] mova m1, [coeffq+16*16] mova m2, [coeffq+16*1 ] mova m3, [coeffq+16*17] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast jmp tx2q .pass2_end: lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end .pass2_end1: lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d jg .pass2_loop ret cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 mov r4d, 2 cmp eobd, 136 mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif lea r4, [dstq+8] mov [rsp+gprsize*0+16*3], r4 mov [rsp+gprsize*1+16*3], r3d mov [rsp+gprsize*2+16*3], r3d mov [rsp+gprsize*3+16*3], coeffq .loop: LOAD_8ROWS coeffq, 64 mova [rsp+16*1], m6 lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] call m(idct_8x8_internal_8bpc).pass1_end3 pmulhrsw m7, [o(pw_8192)] mova [rsp+16*0], m7 mova m7, [o(pw_8192)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 mova [rsp+16*1], m6 mova [rsp+16*2], m5 call m(idct_8x8_internal_8bpc).end3 lea dstq, [dstq+strideq*2] pxor m7, m7 REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 add coeffq, 16 dec r3d jg .loop mov r4d, [rsp+gprsize*2+16*3] dec r4d jle .ret mov dstq, [rsp+gprsize*0+16*3] mov coeffq, [rsp+gprsize*3+16*3] mov [rsp+gprsize*2+16*3], r4 lea r3, [dstq+8] add coeffq, 64*8 mov [rsp+gprsize*0+16*3], r3 mov r3d, [rsp+gprsize*1+16*3] mov [rsp+gprsize*3+16*3], coeffq jmp .loop .ret: RET cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_16x64_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r2d, 32 lea tx2q, [o(.end)] jmp 
m(inv_txfm_add_dct_dct_16x4_8bpc).dconly cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 151 mov [rsp+gprsize*1+16*67], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*67], coeffq .pass1_loop: LOAD_8ROWS coeffq+64*0, 64*2 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 call m(idct_16x8_internal_8bpc).main mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 add coeffq, 16 dec r3d jg .pass1_loop mov coeffq, [rsp+gprsize*2+16*67] mov r3d, 2 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.end1)] .pass2_loop: mov [rsp+gprsize*3+16*67], r3d mov eobd, [rsp+gprsize*1+16*67] mova m0, [coeffq+16*4 ] ;in1 mova m1, [coeffq+16*12] ;in3 mova m2, [coeffq+16*20] ;in5 mova m3, [coeffq+16*28] ;in7 mova m4, [coeffq+16*5 ] ;in9 mova m5, [coeffq+16*13] ;in11 mova m6, [coeffq+16*21] ;in13 mova m7, [coeffq+16*29] ;in15 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 pxor m4, m4 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] test eobd, eobd jl .fast .full: mova m2, [coeffq+16*2] mova m3, [coeffq+16*3] REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 mova m0, [coeffq+16*16] mova m1, [coeffq+16*17] mova m2, [coeffq+16*18] mova m3, [coeffq+16*19] REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova m4, [coeffq+16*10] mova m5, [coeffq+16*26] mova m6, [coeffq+16*11] mova m7, [coeffq+16*27] mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 mova m0, [coeffq+16*6 ] ;in17 mova m1, [coeffq+16*14] ;in19 mova m2, [coeffq+16*22] ;in21 mova m3, [coeffq+16*30] ;in23 mova m4, [coeffq+16*7 ] ;in25 mova m5, [coeffq+16*15] ;in27 mova m6, [coeffq+16*23] ;in29 mova m7, [coeffq+16*31] ;in31 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call .main jmp .end .fast: REPX {mova x, m4}, m2, m3, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 mova m0, [coeffq+16*16] mova m1, [coeffq+16*17] REPX {mova x, m4}, m2, m3, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 mova m0, [coeffq+16*8 ] mova m1, [coeffq+16*24] mova m2, [coeffq+16*9 ] mova m3, [coeffq+16*25] mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 call 
m(idct_8x32_internal_8bpc).main_veryfast SAVE_8ROWS rsp+gprsize+16*3, 16 call .main_fast .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mov r3, r4 jmp m(idct_8x32_internal_8bpc).end2 .end1: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] lea r3, [rsp+16*32+gprsize] call .write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.end1)] dec r3d jg .pass2_loop ret .write: mova [r3+16*0], m7 mov r4, -16*32 pxor m7, m7 sub coeffq, r4 .zero_loop: mova [coeffq+r4+16*0], m7 mova [coeffq+r4+16*1], m7 add r4, 16*2 jl .zero_loop call .write_main2 LOAD_8ROWS r3+16*11, 16 call .write_main LOAD_8ROWS r3+16*19, 16 call .write_main LOAD_8ROWS r3+16*27, 16 .write_main: mova [r3+16*0], m7 .write_main2: mova m7, [o(pw_2048)] REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 pmulhrsw m7, [r3+16*0] mova [r3+16*2], m5 mova [r3+16*1], m6 mova [r3+16*0], m7 WRITE_8X4 0, 1, 2, 3, 5, 6, 7 lea dstq, [dstq+strideq*2] WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 lea dstq, [dstq+strideq*2] ret ALIGN function_align cglobal_label .main_fast mova m0, [rsp+gprsize*2+16*35] ;in1 pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 pmulhrsw m0, [o(pw_101x8)] ;t32,t33 mova m7, [o(pd_2048)] mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*66], m3 ;t63 ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a mova [rsp+gprsize*2+16*36], m3 ;t33a mova [rsp+gprsize*2+16*65], m0 ;t62a mova m1, [rsp+gprsize*2+16*37] ;in15 pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 mova [rsp+gprsize*2+16*38], m1 ;t35 mova [rsp+gprsize*2+16*63], m2 ;t60 ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a mova [rsp+gprsize*2+16*37], m2 ;t34a mova [rsp+gprsize*2+16*64], m1 ;t61a mova m0, [rsp+gprsize*2+16*39] ;in9 pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 pmulhrsw m0, [o(pw_897x8)] ;t36,t37 mova [rsp+gprsize*2+16*39], m0 ;t36 mova [rsp+gprsize*2+16*62], m3 ;t59 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a mova [rsp+gprsize*2+16*40], m3 ;t37a mova [rsp+gprsize*2+16*61], m0 ;t58a mova m1, [rsp+gprsize*2+16*41] ;in7 pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 mova [rsp+gprsize*2+16*42], m1 ;t39 mova [rsp+gprsize*2+16*59], m2 ;t56 ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a mova [rsp+gprsize*2+16*41], m2 ;t38a mova [rsp+gprsize*2+16*60], m1 ;t57a mova m0, [rsp+gprsize*2+16*43] ;in5 pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 pmulhrsw m0, [o(pw_501x8)] ;t40,t41 mova [rsp+gprsize*2+16*43], m0 ;t40 mova [rsp+gprsize*2+16*58], m3 ;t55 ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a mova [rsp+gprsize*2+16*44], m3 ;t41a mova [rsp+gprsize*2+16*57], m0 ;t54a mova m1, [rsp+gprsize*2+16*45] ;in11 pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 mova [rsp+gprsize*2+16*46], m1 ;t43 mova [rsp+gprsize*2+16*55], m2 ;t52 ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a mova [rsp+gprsize*2+16*45], m2 ;t42a mova [rsp+gprsize*2+16*56], m1 ;t53a mova m0, [rsp+gprsize*2+16*47] ;in13 pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 mova m6, m0 mova [rsp+gprsize*2+16*54], m3 ;t51 ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a mova [rsp+gprsize*2+16*48], m3 ;t45a mova [rsp+gprsize*2+16*53], m0 ;t50a mova m0, [rsp+gprsize*2+16*49] ;in3 pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 mova m4, m3 mova m5, m0 jmp .main2 ALIGN function_align cglobal_label .main mova m0, [rsp+gprsize*2+16*35] ;in1 mova m1, 
[rsp+gprsize*2+16*65] ;in31 pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a pmulhrsw m0, [o(pw_101x8)] ;t32a pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a pmulhrsw m1, [o(pw_m2824x8)] ;t33a mova m7, [o(pd_2048)] psubsw m4, m0, m1 ;t33 paddsw m0, m1 ;t32 psubsw m5, m3, m2 ;t62 paddsw m3, m2 ;t63 ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*36], m5 ;t33a mova [rsp+gprsize*2+16*65], m4 ;t62a mova [rsp+gprsize*2+16*66], m3 ;t63 mova m0, [rsp+gprsize*2+16*63] ;in17 mova m1, [rsp+gprsize*2+16*37] ;in15 pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a pmulhrsw m0, [o(pw_1660x8)] ;t34a pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a pmulhrsw m1, [o(pw_m1474x8)] ;t35a psubsw m4, m1, m0 ;t34 paddsw m0, m1 ;t35 psubsw m5, m2, m3 ;t61 paddsw m3, m2 ;t60 ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a mova [rsp+gprsize*2+16*37], m5 ;t34a mova [rsp+gprsize*2+16*38], m0 ;t35 mova [rsp+gprsize*2+16*63], m3 ;t60 mova [rsp+gprsize*2+16*64], m4 ;t61a mova m0, [rsp+gprsize*2+16*39] ;in9 mova m1, [rsp+gprsize*2+16*61] ;in23 pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a pmulhrsw m0, [o(pw_897x8)] ;t36a pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a pmulhrsw m1, [o(pw_m2191x8)] ;t37a psubsw m4, m0, m1 ;t37 paddsw m0, m1 ;t36 psubsw m5, m3, m2 ;t58 paddsw m3, m2 ;t59 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a mova [rsp+gprsize*2+16*39], m0 ;t36 mova [rsp+gprsize*2+16*40], m5 ;t37a mova [rsp+gprsize*2+16*61], m4 ;t58a mova [rsp+gprsize*2+16*62], m3 ;t59 mova m0, [rsp+gprsize*2+16*59] ;in25 mova m1, [rsp+gprsize*2+16*41] ;in7 pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a pmulhrsw m0, [o(pw_2359x8)] ;t38a pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a pmulhrsw m1, [o(pw_m700x8)] ;t39a psubsw m4, m1, m0 ;t38 paddsw m0, m1 ;t39 psubsw m5, m2, m3 ;t57 paddsw m3, m2 ;t56 ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a mova [rsp+gprsize*2+16*41], m5 ;t38a mova [rsp+gprsize*2+16*42], m0 ;t39 mova [rsp+gprsize*2+16*59], m3 ;t56 mova [rsp+gprsize*2+16*60], m4 ;t57a mova m0, [rsp+gprsize*2+16*43] ;in5 mova m1, [rsp+gprsize*2+16*57] ;in27 pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a pmulhrsw m0, [o(pw_501x8)] ;t40a pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a pmulhrsw m1, [o(pw_m2520x8)] ;t41a psubsw m4, m0, m1 ;t41 paddsw m0, m1 ;t40 psubsw m5, m3, m2 ;t54 paddsw m3, m2 ;t55 ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a mova [rsp+gprsize*2+16*43], m0 ;t40 mova [rsp+gprsize*2+16*44], m5 ;t41a mova [rsp+gprsize*2+16*57], m4 ;t54a mova [rsp+gprsize*2+16*58], m3 ;t55 mova m0, [rsp+gprsize*2+16*55] ;in21 mova m1, [rsp+gprsize*2+16*45] ;in11 pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a pmulhrsw m0, [o(pw_2019x8)] ;t42a pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a pmulhrsw m1, [o(pw_m1092x8)] ;t43a psubsw m4, m1, m0 ;t42 paddsw m0, m1 ;t43 psubsw m5, m2, m3 ;t53 paddsw m3, m2 ;t52 ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*46], m0 ;t43 mova [rsp+gprsize*2+16*55], m3 ;t52 mova [rsp+gprsize*2+16*56], m4 ;t53a mova m0, [rsp+gprsize*2+16*47] ;in13 mova m1, [rsp+gprsize*2+16*53] ;in19 pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a pmulhrsw m0, [o(pw_1285x8)] ;t44a pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a pmulhrsw m1, [o(pw_m1842x8)] ;t45a psubsw m4, m0, m1 ;t45 paddsw m0, m1 ;t44 psubsw m5, m3, m2 ;t50 paddsw m3, m2 ;t51 ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a mova m6, m0 mova [rsp+gprsize*2+16*48], m5 ;t45a mova [rsp+gprsize*2+16*53], m4 ;t50a mova [rsp+gprsize*2+16*54], m3 ;t51 mova m0, [rsp+gprsize*2+16*51] ;in29 mova m1, [rsp+gprsize*2+16*49] ;in3 pmulhrsw m3, m0, 
[o(pw_3102x8)] ;t49a pmulhrsw m0, [o(pw_2675x8)] ;t46a pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a pmulhrsw m1, [o(pw_m301x8)] ;t47a psubsw m5, m1, m0 ;t46 paddsw m0, m1 ;t47 psubsw m4, m2, m3 ;t49 paddsw m3, m2 ;t48 ALIGN function_align .main2: ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a mova m1, [rsp+gprsize*2+16*54] ;t51 psubsw m2, m0, m6 ;t44a paddsw m0, m6 ;t47a psubsw m6, m3, m1 ;t51a paddsw m3, m1 ;t48a mova [rsp+gprsize*2+16*50], m0 ;t47a mova [rsp+gprsize*2+16*51], m3 ;t48a ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 mova [rsp+gprsize*2+16*47], m6 ;t44 mova [rsp+gprsize*2+16*54], m2 ;t51 mova m0, [rsp+gprsize*2+16*48] ;t45a mova m3, [rsp+gprsize*2+16*53] ;t50a psubsw m2, m4, m0 ;t45 paddsw m4, m0 ;t46 psubsw m6, m5, m3 ;t50 paddsw m5, m3 ;t49 ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a mova [rsp+gprsize*2+16*48], m6 ;t45a mova [rsp+gprsize*2+16*49], m4 ;t46 mova [rsp+gprsize*2+16*52], m5 ;t49 mova [rsp+gprsize*2+16*53], m2 ;t50a mova m0, [rsp+gprsize*2+16*43] ;t40 mova m2, [rsp+gprsize*2+16*46] ;t43 mova m3, [rsp+gprsize*2+16*55] ;t52 mova m1, [rsp+gprsize*2+16*58] ;t55 psubsw m4, m0, m2 ;t43a paddsw m0, m2 ;t40a psubsw m5, m1, m3 ;t52a paddsw m1, m3 ;t55a ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 mova [rsp+gprsize*2+16*43], m0 ;t40a mova [rsp+gprsize*2+16*46], m5 ;t43 mova [rsp+gprsize*2+16*55], m4 ;t52 mova [rsp+gprsize*2+16*58], m1 ;t55a mova m0, [rsp+gprsize*2+16*44] ;t41a mova m2, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*57] ;t54a psubsw m4, m0, m2 ;t42 paddsw m0, m2 ;t41 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t54 ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a mova [rsp+gprsize*2+16*44], m0 ;t41 mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*56], m4 ;t53a mova [rsp+gprsize*2+16*57], m1 ;t54 mova m0, [rsp+gprsize*2+16*41] ;t38a mova m2, [rsp+gprsize*2+16*40] ;t37a mova m3, [rsp+gprsize*2+16*61] ;t58a mova m1, [rsp+gprsize*2+16*60] ;t57a psubsw m4, m0, m2 ;t37 paddsw m0, m2 ;t38 psubsw m5, m1, m3 ;t58 paddsw m1, m3 ;t57 ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a mova [rsp+gprsize*2+16*41], m0 ;t38 mova [rsp+gprsize*2+16*40], m5 ;t37a mova [rsp+gprsize*2+16*61], m4 ;t58a mova [rsp+gprsize*2+16*60], m1 ;t57 mova m0, [rsp+gprsize*2+16*42] ;t39 mova m2, [rsp+gprsize*2+16*39] ;t36 mova m3, [rsp+gprsize*2+16*62] ;t59 mova m1, [rsp+gprsize*2+16*59] ;t56 psubsw m4, m0, m2 ;t36a paddsw m0, m2 ;t39a psubsw m5, m1, m3 ;t59a paddsw m1, m3 ;t56a ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 mova [rsp+gprsize*2+16*42], m0 ;t39a mova [rsp+gprsize*2+16*39], m5 ;t36 mova [rsp+gprsize*2+16*62], m4 ;t59 mova [rsp+gprsize*2+16*59], m1 ;t56a mova m0, [rsp+gprsize*2+16*35] ;t32 mova m2, [rsp+gprsize*2+16*38] ;t35 mova m3, [rsp+gprsize*2+16*63] ;t60 mova m1, [rsp+gprsize*2+16*66] ;t63 psubsw m4, m0, m2 ;t35a paddsw m0, m2 ;t32a psubsw m5, m1, m3 ;t60a paddsw m1, m3 ;t63a ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 mova [rsp+gprsize*2+16*35], m0 ;t32a mova [rsp+gprsize*2+16*38], m5 ;t35 mova [rsp+gprsize*2+16*63], m4 ;t60 mova [rsp+gprsize*2+16*66], m1 ;t63a mova m0, [rsp+gprsize*2+16*36] ;t33a mova m2, [rsp+gprsize*2+16*37] ;t34a mova m3, [rsp+gprsize*2+16*64] ;t61a mova m1, [rsp+gprsize*2+16*65] ;t62a psubsw m4, m0, m2 ;t34 paddsw m0, m2 ;t33 psubsw m5, m1, m3 ;t61 paddsw m1, m3 ;t62 ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a mova m2, [rsp+gprsize*2+16*41] ;t38 mova m3, [rsp+gprsize*2+16*60] ;t57 psubsw m6, m0, m2 ;t38a paddsw m0, m2 ;t33a psubsw m2, m1, m3 ;t57a paddsw m1, m3 
;t62a mova [rsp+gprsize*2+16*36], m0 ;t33a mova [rsp+gprsize*2+16*65], m1 ;t62a ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 mova [rsp+gprsize*2+16*41], m2 ;t38 mova [rsp+gprsize*2+16*60], m6 ;t57 mova m2, [rsp+gprsize*2+16*40] ;t37 mova m3, [rsp+gprsize*2+16*61] ;t58 psubsw m0, m5, m2 ;t37 paddsw m5, m2 ;t34 psubsw m1, m4, m3 ;t58 paddsw m4, m3 ;t61 ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a mova [rsp+gprsize*2+16*37], m5 ;t34 mova [rsp+gprsize*2+16*64], m4 ;t61 mova [rsp+gprsize*2+16*40], m1 ;t37a mova [rsp+gprsize*2+16*61], m0 ;t58a mova m0, [rsp+gprsize*2+16*38] ;t35 mova m2, [rsp+gprsize*2+16*39] ;t36 mova m3, [rsp+gprsize*2+16*62] ;t59 mova m1, [rsp+gprsize*2+16*63] ;t60 psubsw m4, m0, m2 ;t36a paddsw m0, m2 ;t35a psubsw m5, m1, m3 ;t59a paddsw m1, m3 ;t60a ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 mova [rsp+gprsize*2+16*38], m0 ;t35a mova [rsp+gprsize*2+16*39], m5 ;t36 mova [rsp+gprsize*2+16*62], m4 ;t59 mova [rsp+gprsize*2+16*63], m1 ;t60a mova m0, [rsp+gprsize*2+16*35] ;t32a mova m2, [rsp+gprsize*2+16*42] ;t39a mova m3, [rsp+gprsize*2+16*59] ;t56a mova m1, [rsp+gprsize*2+16*66] ;t63a psubsw m4, m0, m2 ;t39 paddsw m0, m2 ;t32 psubsw m5, m1, m3 ;t56 paddsw m1, m3 ;t63 ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a mova [rsp+gprsize*2+16*35], m0 ;t32 mova [rsp+gprsize*2+16*42], m5 ;t39a mova [rsp+gprsize*2+16*59], m4 ;t56a mova [rsp+gprsize*2+16*66], m1 ;t63 mova m0, [rsp+gprsize*2+16*50] ;t47a mova m2, [rsp+gprsize*2+16*43] ;t40a mova m3, [rsp+gprsize*2+16*58] ;t55a mova m1, [rsp+gprsize*2+16*51] ;t48a psubsw m4, m0, m2 ;t40 paddsw m0, m2 ;t47 psubsw m5, m1, m3 ;t55 paddsw m1, m3 ;t48 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a mova [rsp+gprsize*2+16*50], m0 ;t47 mova [rsp+gprsize*2+16*43], m5 ;t40a mova [rsp+gprsize*2+16*58], m4 ;t55a mova [rsp+gprsize*2+16*51], m1 ;t48 mova m0, [rsp+gprsize*2+16*49] ;t46 mova m2, [rsp+gprsize*2+16*44] ;t41 mova m3, [rsp+gprsize*2+16*57] ;t54 mova m1, [rsp+gprsize*2+16*52] ;t49 psubsw m4, m0, m2 ;t41a paddsw m0, m2 ;t46a psubsw m5, m1, m3 ;t54a paddsw m1, m3 ;t49a ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 mova [rsp+gprsize*2+16*49], m0 ;t46a mova [rsp+gprsize*2+16*44], m5 ;t41 mova [rsp+gprsize*2+16*57], m4 ;t54 mova [rsp+gprsize*2+16*52], m1 ;t49a mova m0, [rsp+gprsize*2+16*48] ;t45a mova m2, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*53] ;t50a psubsw m4, m0, m2 ;t42 paddsw m0, m2 ;t45 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t50 ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a mova [rsp+gprsize*2+16*48], m0 ;t45 mova [rsp+gprsize*2+16*45], m5 ;t42a mova [rsp+gprsize*2+16*56], m4 ;t53a mova [rsp+gprsize*2+16*53], m1 ;t50 mova m0, [rsp+gprsize*2+16*47] ;t44 mova m2, [rsp+gprsize*2+16*46] ;t43 mova m3, [rsp+gprsize*2+16*55] ;t52 mova m1, [rsp+gprsize*2+16*54] ;t51 psubsw m4, m0, m2 ;t43a paddsw m0, m2 ;t44a psubsw m5, m1, m3 ;t52a paddsw m1, m3 ;t51a ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 mova m2, [rsp+gprsize*2+16*38] ;t35a mova m3, [rsp+gprsize*2+16*31] ;tmp[28] psubsw m6, m2, m0 ;t44 paddsw m2, m0 ;t35 psubsw m0, m3, m2 ;out35 paddsw m2, m3 ;out28 mova m3, [rsp+gprsize*2+16*63] ;t60a mova [rsp+gprsize*2+16*38], m0 ;out35 mova [rsp+gprsize*2+16*31], m2 ;out28 psubsw m0, m3, m1 ;t51 paddsw m3, m1 ;t60 ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] psubsw m1, m2, m3 ;out60 paddsw m2, m3 ;out3 mova m3, [rsp+gprsize*2+16*22] ;tmp[19] mova [rsp+gprsize*2+16*63], m1 ;out60 mova [rsp+gprsize*2+16*6 ], m2 
;out3 psubsw m1, m3, m0 ;out44 paddsw m3, m0 ;out19 mova m2, [rsp+gprsize*2+16*15] ;tmp[12] mova m0, [rsp+gprsize*2+16*39] ;t36 mova [rsp+gprsize*2+16*47], m1 ;out44 mova [rsp+gprsize*2+16*22], m3 ;out19 mova m1, [rsp+gprsize*2+16*62] ;t59 psubsw m3, m2, m6 ;out51 paddsw m2, m6 ;out12 mova [rsp+gprsize*2+16*54], m3 ;out51 mova [rsp+gprsize*2+16*15], m2 ;out12 psubsw m2, m0, m5 ;t43a paddsw m0, m5 ;t36a mova m5, [rsp+gprsize*2+16*30] ;tmp[27] psubsw m3, m1, m4 ;t52a paddsw m1, m4 ;t59a ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] psubsw m6, m5, m0 ;out36 paddsw m5, m0 ;out27 psubsw m0, m4, m1 ;out59 paddsw m4, m1 ;out4 mova [rsp+gprsize*2+16*39], m6 ;out36 mova [rsp+gprsize*2+16*30], m5 ;out27 mova [rsp+gprsize*2+16*62], m0 ;out59 mova [rsp+gprsize*2+16*7 ], m4 ;out4 mova m0, [rsp+gprsize*2+16*23] ;tmp[20] mova m5, [rsp+gprsize*2+16*14] ;tmp[11] psubsw m4, m0, m3 ;out43 paddsw m0, m3 ;out20 psubsw m6, m5, m2 ;out52 paddsw m5, m2 ;out11 mova [rsp+gprsize*2+16*46], m4 ;out43 mova [rsp+gprsize*2+16*23], m0 ;out20 mova [rsp+gprsize*2+16*55], m6 ;out52 mova [rsp+gprsize*2+16*14], m5 ;out11 mova m0, [rsp+gprsize*2+16*40] ;t37a mova m5, [rsp+gprsize*2+16*45] ;t42a mova m3, [rsp+gprsize*2+16*56] ;t53a mova m1, [rsp+gprsize*2+16*61] ;t58a mova m2, [rsp+gprsize*2+16*29] ;tmp[26] psubsw m4, m0, m5 ;t42 paddsw m0, m5 ;t37 psubsw m5, m1, m3 ;t53 paddsw m1, m3 ;t58 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] psubsw m6, m2, m0 ;out37 paddsw m2, m0 ;out26 psubsw m0, m3, m1 ;out58 paddsw m3, m1 ;out5 mova [rsp+gprsize*2+16*40], m6 ;out37 mova [rsp+gprsize*2+16*29], m2 ;out26 mova [rsp+gprsize*2+16*61], m0 ;out58 mova [rsp+gprsize*2+16*8 ], m3 ;out5 mova m0, [rsp+gprsize*2+16*24] ;tmp[21] mova m1, [rsp+gprsize*2+16*13] ;tmp[10] psubsw m2, m0, m5 ;out42 paddsw m0, m5 ;out21 psubsw m3, m1, m4 ;out53 paddsw m1, m4 ;out10 mova [rsp+gprsize*2+16*45], m2 ;out42 mova [rsp+gprsize*2+16*24], m0 ;out21 mova [rsp+gprsize*2+16*56], m3 ;out53 mova [rsp+gprsize*2+16*13], m1 ;out10 mova m0, [rsp+gprsize*2+16*41] ;t38 mova m5, [rsp+gprsize*2+16*44] ;t41 mova m3, [rsp+gprsize*2+16*57] ;t54 mova m1, [rsp+gprsize*2+16*60] ;t57 mova m2, [rsp+gprsize*2+16*28] ;tmp[25] psubsw m4, m0, m5 ;t41a paddsw m0, m5 ;t38a psubsw m5, m1, m3 ;t54a paddsw m1, m3 ;t57a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] psubsw m6, m2, m0 ;out38 paddsw m2, m0 ;out25 psubsw m0, m3, m1 ;out57 paddsw m3, m1 ;out6 mova [rsp+gprsize*2+16*41], m6 ;out38 mova [rsp+gprsize*2+16*28], m2 ;out25 mova [rsp+gprsize*2+16*60], m0 ;out57 mova [rsp+gprsize*2+16*9 ], m3 ;out6 mova m0, [rsp+gprsize*2+16*25] ;tmp[22] mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] psubsw m2, m0, m5 ;out41 paddsw m0, m5 ;out22 psubsw m3, m1, m4 ;out54 paddsw m1, m4 ;out9 mova [rsp+gprsize*2+16*44], m2 ;out41 mova [rsp+gprsize*2+16*25], m0 ;out22 mova [rsp+gprsize*2+16*57], m3 ;out54 mova [rsp+gprsize*2+16*12], m1 ;out9 mova m0, [rsp+gprsize*2+16*42] ;t39a mova m5, [rsp+gprsize*2+16*43] ;t40a mova m3, [rsp+gprsize*2+16*58] ;t55a mova m1, [rsp+gprsize*2+16*59] ;t56a mova m2, [rsp+gprsize*2+16*27] ;tmp[24] psubsw m4, m0, m5 ;t40 paddsw m0, m5 ;t39 psubsw m5, m1, m3 ;t55 paddsw m1, m3 ;t56 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] psubsw m6, m2, m0 ;out39 paddsw m2, m0 ;out24 psubsw m0, m3, m1 ;out56 paddsw m3, m1 ;out7 mova [rsp+gprsize*2+16*42], m6 ;out39 mova [rsp+gprsize*2+16*27], m2 ;out24 mova 
[rsp+gprsize*2+16*59], m0 ;out56 mova [rsp+gprsize*2+16*10], m3 ;out7 mova m0, [rsp+gprsize*2+16*26] ;tmp[23] mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] psubsw m2, m0, m5 ;out40 paddsw m0, m5 ;out23 psubsw m3, m1, m4 ;out55 paddsw m1, m4 ;out8 mova [rsp+gprsize*2+16*43], m2 ;out40 mova [rsp+gprsize*2+16*26], m0 ;out23 mova [rsp+gprsize*2+16*58], m3 ;out55 mova [rsp+gprsize*2+16*11], m1 ;out8 mova m0, [rsp+gprsize*2+16*37] ;t34 mova m5, [rsp+gprsize*2+16*48] ;t45 mova m3, [rsp+gprsize*2+16*53] ;t50 mova m1, [rsp+gprsize*2+16*64] ;t61 mova m2, [rsp+gprsize*2+16*32] ;tmp[29] psubsw m4, m0, m5 ;t45a paddsw m0, m5 ;t34a psubsw m5, m1, m3 ;t50a paddsw m1, m3 ;t61a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] psubsw m6, m2, m0 ;out34 paddsw m2, m0 ;out29 psubsw m0, m3, m1 ;out61 paddsw m3, m1 ;out2 mova [rsp+gprsize*2+16*37], m6 ;out34 mova [rsp+gprsize*2+16*32], m2 ;out29 mova [rsp+gprsize*2+16*64], m0 ;out61 mova [rsp+gprsize*2+16*5 ], m3 ;out2 mova m0, [rsp+gprsize*2+16*21] ;tmp[18] mova m1, [rsp+gprsize*2+16*16] ;tmp[13] psubsw m2, m0, m5 ;out45 paddsw m0, m5 ;out18 psubsw m3, m1, m4 ;out50 paddsw m1, m4 ;out13 mova [rsp+gprsize*2+16*48], m2 ;out45 mova [rsp+gprsize*2+16*21], m0 ;out18 mova [rsp+gprsize*2+16*53], m3 ;out50 mova [rsp+gprsize*2+16*16], m1 ;out13 mova m0, [rsp+gprsize*2+16*36] ;t33a mova m5, [rsp+gprsize*2+16*49] ;t46a mova m3, [rsp+gprsize*2+16*52] ;t49a mova m1, [rsp+gprsize*2+16*65] ;t62a mova m2, [rsp+gprsize*2+16*33] ;tmp[30] psubsw m4, m0, m5 ;t46 paddsw m0, m5 ;t33 psubsw m5, m1, m3 ;t49 paddsw m1, m3 ;t62 ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] psubsw m6, m2, m0 ;out33 paddsw m2, m0 ;out30 psubsw m0, m3, m1 ;out62 paddsw m3, m1 ;out1 mova [rsp+gprsize*2+16*36], m6 ;out33 mova [rsp+gprsize*2+16*33], m2 ;out30 mova [rsp+gprsize*2+16*65], m0 ;out62 mova [rsp+gprsize*2+16*4 ], m3 ;out1 mova m0, [rsp+gprsize*2+16*20] ;tmp[17] mova m1, [rsp+gprsize*2+16*17] ;tmp[14] psubsw m2, m0, m5 ;out46 paddsw m0, m5 ;out17 psubsw m3, m1, m4 ;out49 paddsw m1, m4 ;out14 mova [rsp+gprsize*2+16*49], m2 ;out46 mova [rsp+gprsize*2+16*20], m0 ;out17 mova [rsp+gprsize*2+16*52], m3 ;out49 mova [rsp+gprsize*2+16*17], m1 ;out14 mova m0, [rsp+gprsize*2+16*35] ;t32 mova m5, [rsp+gprsize*2+16*50] ;t47 mova m3, [rsp+gprsize*2+16*51] ;t48 mova m1, [rsp+gprsize*2+16*66] ;t63 mova m2, [rsp+gprsize*2+16*34] ;tmp[31] psubsw m4, m0, m5 ;t47a paddsw m0, m5 ;t32a psubsw m5, m1, m3 ;t48a paddsw m1, m3 ;t63a ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] psubsw m6, m2, m0 ;out32 paddsw m2, m0 ;out31 psubsw m0, m3, m1 ;out63 paddsw m3, m1 ;out0 mova [rsp+gprsize*2+16*35], m6 ;out32 mova [rsp+gprsize*2+16*34], m2 ;out31 mova [rsp+gprsize*2+16*66], m0 ;out63 mova [rsp+gprsize*2+16*3 ], m3 ;out0 mova m0, [rsp+gprsize*2+16*19] ;tmp[16] mova m1, [rsp+gprsize*2+16*18] ;tmp[15] psubsw m2, m0, m5 ;out47 paddsw m0, m5 ;out16 psubsw m3, m1, m4 ;out48 paddsw m1, m4 ;out15 mova [rsp+gprsize*2+16*50], m2 ;out47 mova [rsp+gprsize*2+16*19], m0 ;out16 mova [rsp+gprsize*2+16*51], m3 ;out48 mova [rsp+gprsize*2+16*18], m1 ;out15 ret cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_64x16_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 16 lea tx2q, [o(.end)] .body: pmulhrsw m0, m2 movd m2, [o(pw_2048)] 
;intentionally rip-relative pmulhrsw m0, m1 pmulhrsw m0, m2 pshuflw m0, m0, q0000 punpcklwd m0, m0 pxor m7, m7 .loop: mova m1, [dstq+16*0] mova m3, [dstq+16*1] mova m5, [dstq+16*2] mova m6, [dstq+16*3] punpckhbw m2, m1, m7 punpcklbw m1, m7 punpckhbw m4, m3, m7 punpcklbw m3, m7 paddw m2, m0 paddw m1, m0 paddw m4, m0 paddw m3, m0 packuswb m1, m2 packuswb m3, m4 punpckhbw m2, m5, m7 punpcklbw m5, m7 punpckhbw m4, m6, m7 punpcklbw m6, m7 paddw m2, m0 paddw m5, m0 paddw m4, m0 paddw m6, m0 packuswb m5, m2 packuswb m6, m4 mova [dstq+16*0], m1 mova [dstq+16*1], m3 mova [dstq+16*2], m5 mova [dstq+16*3], m6 add dstq, strideq dec r3d jg .loop jmp tx2q .end: RET %macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 %if %3 mova m3, [o(pw_2896x8)] pmulhrsw m0, m3, [%1+%2*0] pmulhrsw m1, m3, [%1+%2*1] pmulhrsw m2, m3, [%1+%2*2] pmulhrsw m3, [%1+%2*3] %else mova m0, [%1+%2*0] mova m1, [%1+%2*1] mova m2, [%1+%2*2] mova m3, [%1+%2*3] %endif %endmacro %macro LOAD_4ROWS_H 2 ;src, stride mova m4, [%1+%2*0] mova m5, [%1+%2*1] mova m6, [%1+%2*2] mova m7, [%1+%2*3] %endmacro cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r3d, 2 mov [rsp+gprsize*2+16*67], dstq lea dstq, [rsp+gprsize+16*68] .pass1_loop: LOAD_4ROWS coeffq+32*0, 32*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+32*4, 32*8 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+32*2, 32*4 mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+32*1, 32*2 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+32*17, 32*2 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+32*24, 32 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+32*0, 32 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea 
tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+32*8, 32 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+32*16, 32 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+32*24, 32 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov dstq, [rsp+gprsize*2+16*67] sub coeffq, 32 mov r3d, 4 .pass2_loop: mov [rsp+gprsize*1+16*67], r3d LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 call m(idct_16x8_internal_8bpc).main mov r3, dstq lea tx2q, [o(.end)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end1)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end .end1: pxor m7, m7 REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 add coeffq, 16*16 mov r3d, [rsp+gprsize*1+16*67] mov dstq, [rsp+gprsize*2+16*67] add dstq, 8 mov [rsp+gprsize*2+16*67], dstq dec r3d jg .pass2_loop mov r3d, 4 lea coeffq, [rsp+gprsize+16*68] .pass2_loop2: mov [rsp+gprsize*1+16*67], r3d LOAD_4ROWS coeffq+16*0, 32*2 LOAD_4ROWS_H coeffq+16*1, 32*2 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+16*2, 32*2 LOAD_4ROWS_H coeffq+16*3, 32*2 call m(idct_16x8_internal_8bpc).main mov r3, dstq lea tx2q, [o(.end2)] lea dstq, [dstq+strideq*8] jmp m(idct_8x8_internal_8bpc).end .end2: LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.end3)] mov dstq, r3 jmp m(idct_8x8_internal_8bpc).end .end3: add coeffq, 16*16 mov r3d, [rsp+gprsize*1+16*67] mov dstq, [rsp+gprsize*2+16*67] add dstq, 8 mov [rsp+gprsize*2+16*67], dstq dec r3d jg .pass2_loop2 ret cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_32x64_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] mov [coeffq], eobd pmulhrsw m0, m1 mov r3d, 64 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*67], coeffq .pass1_loop: LOAD_8ROWS coeffq+64*1, 64*2, 1 mova [rsp+gprsize+16*19], m0 ;in1 mova [rsp+gprsize+16*26], m1 ;in3 mova [rsp+gprsize+16*23], m2 ;in5 mova [rsp+gprsize+16*22], m3 ;in7 mova [rsp+gprsize+16*21], m4 ;in9 mova [rsp+gprsize+16*24], m5 ;in11 mova [rsp+gprsize+16*25], m6 ;in13 mova [rsp+gprsize+16*20], m7 ;in15 mov tx2d, [rsp+gprsize*1+16*67] test tx2d, tx2d jl .fast .full: LOAD_8ROWS coeffq+64*0, 64*4, 1 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*2, 64*4, 1 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*17, 64*2, 1 mova [rsp+gprsize+16*33], m0 ;in17 mova [rsp+gprsize+16*28], m1 ;in19 mova [rsp+gprsize+16*29], m2 ;in21 mova [rsp+gprsize+16*32], m3 ;in23 mova [rsp+gprsize+16*31], m4 ;in25 mova [rsp+gprsize+16*30], m5 ;in27 mova [rsp+gprsize+16*27], m6 ;in29 mova 
[rsp+gprsize+16*34], m7 ;in31 call m(idct_8x32_internal_8bpc).main jmp .pass1_end .fast: LOAD_4ROWS coeffq, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 LOAD_4ROWS coeffq+128*1, 256, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 call m(idct_8x32_internal_8bpc).main_fast .pass1_end: mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS coeffq+64*24, 64 add coeffq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*2+16*67] mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(m(idct_16x64_internal_8bpc).end1)] jmp m(idct_16x64_internal_8bpc).pass2_loop cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_64x32_internal_8bpc) .end: RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_16384)] pmulhrsw m0, m1 mov [coeffq], eobd mov r3d, 32 lea tx2q, [o(.end)] jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r4d, 2 sub eobd, 136 mov [rsp+gprsize*1+16*67], eobd mov r3d, 4 cmovs r3d, r4d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*2+16*67], coeffq mov [rsp+gprsize*3+16*67], dstq lea dstq, [rsp+gprsize+16*69] mov [rsp+gprsize*4+16*67], dstq .pass1_loop: LOAD_4ROWS coeffq+64*0, 64*8, 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8, 1 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*2, 64*4, 1 mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2, 1 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova [rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+64*17, 64*2, 1 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova 
[rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end .pass1_end7: SAVE_8ROWS dstq+64*24, 64 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov coeffq, [rsp+gprsize*4+16*67] mov dstq, [rsp+gprsize*3+16*67] mov eobd, [rsp+gprsize*1+16*67] lea dstq, [dstq+32] mov [rsp+gprsize*1+16*35], eobd lea tx2q, [o(.pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop .pass2_end: mova [rsp+gprsize+16*0], m7 lea r3, [o(.pass2_end1)] jmp m(idct_8x32_internal_8bpc).end2 .pass2_end1: lea tx2q, [o(.pass2_end)] add coeffq, 16*32 mov dstq, [rsp+gprsize*2+16*35] mov r3d, [rsp+gprsize*3+16*35] dec r3d jg m(idct_32x32_internal_8bpc).pass2_loop .pass2_end2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] mov r3d, 4 jmp m(idct_32x32_internal_8bpc).pass2_loop cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 %if ARCH_X86_32 LEA r5, $$ %endif test eobd, eobd jz .dconly call m(idct_64x64_internal_8bpc) RET .dconly: movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] movd m2, [o(pw_8192)] mov [coeffq], eobd mov r3d, 64 lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 mov r5d, 4 mov r4d, 2 sub eobd, 136 cmovns r4d, r5d %if ARCH_X86_32 LEA r5, $$ %endif mov [rsp+gprsize*1+16*67], eobd mov r3d, r4d mov [rsp+gprsize*4+16*67], coeffq mov [rsp+gprsize*3+16*67], dstq lea dstq, [rsp+gprsize+16*69] mov [rsp+gprsize*2+16*67], dstq .pass1_loop: LOAD_4ROWS coeffq+64*0, 64*8 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 call m(idct_8x8_internal_8bpc).main SAVE_7ROWS rsp+gprsize+16*3, 16 pxor m4, m4 LOAD_4ROWS coeffq+64*4, 64*8 REPX {mova x, m4}, m5, m6, m7 call m(idct_16x8_internal_8bpc).main mova m7, [rsp+gprsize+16*0] SAVE_8ROWS rsp+gprsize+16*11, 16 LOAD_8ROWS coeffq+64*2, 64*4 mova [rsp+gprsize+16*19], m0 mova [rsp+gprsize+16*26], m1 mova [rsp+gprsize+16*23], m2 mova [rsp+gprsize+16*22], m3 mova [rsp+gprsize+16*21], m4 mova [rsp+gprsize+16*24], m5 mova [rsp+gprsize+16*25], m6 mova [rsp+gprsize+16*20], m7 call m(idct_8x32_internal_8bpc).main_fast SAVE_8ROWS rsp+gprsize+16*3, 16 LOAD_8ROWS coeffq+64*1, 64*2 mova [rsp+gprsize+16*35], m0 ;in1 mova [rsp+gprsize+16*49], m1 ;in3 mova [rsp+gprsize+16*43], m2 ;in5 mova [rsp+gprsize+16*41], m3 ;in7 mova [rsp+gprsize+16*39], m4 ;in9 mova [rsp+gprsize+16*45], m5 ;in11 mova [rsp+gprsize+16*47], m6 ;in13 mova 
[rsp+gprsize+16*37], m7 ;in15 LOAD_8ROWS coeffq+64*17, 64*2 mova [rsp+gprsize+16*63], m0 ;in17 mova [rsp+gprsize+16*53], m1 ;in19 mova [rsp+gprsize+16*55], m2 ;in21 mova [rsp+gprsize+16*61], m3 ;in23 mova [rsp+gprsize+16*59], m4 ;in25 mova [rsp+gprsize+16*57], m5 ;in27 mova [rsp+gprsize+16*51], m6 ;in29 mova [rsp+gprsize+16*65], m7 ;in31 call m(idct_16x64_internal_8bpc).main LOAD_8ROWS rsp+gprsize+16*3, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end: SAVE_8ROWS coeffq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*11, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end1)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end1: SAVE_8ROWS coeffq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*19, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end2)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end2: SAVE_8ROWS coeffq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*27, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end3)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end3: SAVE_8ROWS coeffq+64*24, 64 LOAD_8ROWS rsp+gprsize+16*35, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end4)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end4: SAVE_8ROWS dstq+64*0, 64 LOAD_8ROWS rsp+gprsize+16*43, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end5)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end5: SAVE_8ROWS dstq+64*8, 64 LOAD_8ROWS rsp+gprsize+16*51, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end6)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end6: SAVE_8ROWS dstq+64*16, 64 LOAD_8ROWS rsp+gprsize+16*59, 16 mova [rsp+gprsize+16*0], m7 mova m7, [o(pw_8192)] lea tx2q, [o(.pass1_end7)] jmp m(idct_8x8_internal_8bpc).pass1_end1 .pass1_end7: SAVE_8ROWS dstq+64*24, 64 add coeffq, 16 add dstq, 16 dec r3d jg .pass1_loop .pass2: mov dstq, [rsp+gprsize*3+16*67] mov coeffq, [rsp+gprsize*2+16*67] lea dstq, [dstq+32] mov r3d, 4 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.pass2_end)] jmp m(idct_16x64_internal_8bpc).pass2_loop .pass2_end: LOAD_8ROWS rsp+gprsize+16*35, 16 lea dstq, [dstq+strideq*2] lea r3, [rsp+16*32+gprsize] mova [rsp+gprsize+16*0], m7 call m(idct_16x64_internal_8bpc).write mov dstq, [rsp+gprsize*2+16*67] mov r3d, [rsp+gprsize*3+16*67] lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(.pass2_end)] dec r3d jg m(idct_16x64_internal_8bpc).pass2_loop .pass2_end2: mov coeffq, [rsp+gprsize*4+16*67] mov dstq, [rsp+gprsize*2+16*67] mov r3d, 4 sub dstq, 72 lea r4, [dstq+8] mov [rsp+gprsize*2+16*67], r4 lea r4, [o(m(idct_16x64_internal_8bpc).end1)] jmp m(idct_16x64_internal_8bpc).pass2_loop rav1e-0.7.1/src/x86/loopfilter16_avx2.asm000064400000000000000000001036561046102023000160660ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8 pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 times 4 db 0, 1 times 4 db 8, 9 pw_1: times 16 dw 1 pw_2: times 16 dw 2 pw_3: times 16 dw 3 pw_4096: times 2 dw 4096 ; 10bpc/12bpc: pw_4: times 2 dw 4 times 2 dw 16 clip_max: times 2 dw 511 times 2 dw 2047 clip_min: times 2 dw -512 times 2 dw -2048 SECTION .text ; in: out: ; mm%1 a b c d a e i m ; mm%2 e f g h b f j n ; mm%3 i j k l -> c g k o ; mm%4 m n o p d h l p %macro TRANSPOSE4X4W 5 punpcklwd m%5, m%1, m%2 punpckhwd m%1, m%2 punpcklwd m%2, m%3, m%4 punpckhwd m%3, m%4 punpckldq m%4, m%5, m%2 punpckhdq m%5, m%2 punpckldq m%2, m%1, m%3 punpckhdq m%1, m%3 SWAP %1, %4 SWAP %2, %5, %3 %endmacro ; in: out: ; xmm%1 a b c d e f g h a i q y 6 E M U ; xmm%2 i j k l m n o p b j r z 7 F N V ; xmm%3 q r s t u v w x c k s 0 8 G O W ; xmm%4 y z 0 1 2 3 4 5 d l t 1 9 H P X ; xmm%5 6 7 8 9 A B C D -> e m u 2 A I Q Y ; xmm%6 E F G H I J K L f n v 3 B J R Z ; xmm%7 M N O P Q R S T g o w 4 C K S + ; xmm%8 U V W X Y Z + = h p x 5 D L T = %macro TRANSPOSE8X8W 9 ; xmm%1 a b c d e f g h a i q y b j r z ; xmm%2 i j k l m n o p c k s 0 d l t 1 ; xmm%3 q r s t u v w x -> e m u 2 f n v 3 ; xmm%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 TRANSPOSE4X4W %1, %2, %3, %4, %9 ; xmm%5 6 7 8 9 A B C D 6 E M U 7 F N V ; xmm%6 E F G H I J K L 8 G O W 9 H P X ; xmm%7 M N O P Q R S T -> A I Q Y B J R Z ; xmm%8 U V W X Y Z + = C K S + D L T = TRANSPOSE4X4W %5, %6, %7, %8, %9 ; xmm%1 a i q y b j r z a i q y 6 E M U ; xmm%2 c k s 0 d l t 1 b j r z 7 F N V ; xmm%3 e m u 2 f n v 3 c k s 0 8 G O W ; xmm%4 g o w 4 h p x 5 d l t 1 9 H P X ; xmm%5 6 E M U 7 F N V -> e m u 2 A I Q Y ; xmm%6 8 G O W 9 H P X f n v 3 B J R Z ; xmm%7 A I Q Y B J R Z g o w 4 C K S + ; xmm%8 C K S + D L T = h p x 5 D L T = punpckhqdq m%9, m%1, m%5 punpcklqdq m%1, m%5 punpckhqdq m%5, m%2, m%6 punpcklqdq m%2, m%6 punpckhqdq m%6, m%3, m%7 punpcklqdq m%3, m%7 punpckhqdq m%7, m%4, m%8 punpcklqdq m%4, m%8 SWAP %8, %7, %4, %5, %3, %2, %9 %endmacro ; transpose and write m3-6, everything else is scratch %macro TRANSPOSE_8x4_AND_WRITE_4x16 0 ; transpose 8x4 punpcklwd m0, m3, m4 punpckhwd m3, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpckldq m6, m0, m4 punpckhdq m0, m4 punpckldq m4, m3, m5 punpckhdq m3, m5 ; write out movq [dstq+strideq*0-4], xm6 movhps [dstq+strideq*1-4], xm6 movq [dstq+strideq*2-4], xm0 movhps [dstq+stride3q -4], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm4 movhps [dstq+strideq*1-4], xm4 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] vextracti128 xm6, m6, 1 vextracti128 xm0, m0, 1 vextracti128 xm4, m4, 1 vextracti128 xm3, m3, 1 movq [dstq+strideq*0-4], xm6 movhps 
[dstq+strideq*1-4], xm6 movq [dstq+strideq*2-4], xm0 movhps [dstq+stride3q -4], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm4 movhps [dstq+strideq*1-4], xm4 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %if %1 == 4 lea tmpq, [dstq+mstrideq*2] mova m3, [tmpq+strideq*0] ; p1 mova m4, [tmpq+strideq*1] ; p0 mova m5, [tmpq+strideq*2] ; q0 mova m6, [tmpq+stride3q] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] ; we load p3 later mova m13, [tmpq+strideq*1] mova m3, [tmpq+strideq*2] mova m4, [tmpq+stride3q] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m15, [dstq+stride3q] %endif %endif %else ; load lines %if %1 == 4 movq xm3, [dstq+strideq*0-4] movq xm4, [dstq+strideq*1-4] movq xm5, [dstq+strideq*2-4] movq xm6, [dstq+stride3q -4] lea tmpq, [dstq+strideq*4] movq xm11, [tmpq+strideq*0-4] movq xm13, [tmpq+strideq*1-4] movq xm14, [tmpq+strideq*2-4] movq xm15, [tmpq+stride3q -4] lea tmpq, [tmpq+strideq*4] ; this overreads by 8 bytes but the buffers are padded ; so that should be ok vinserti128 m3, [tmpq+strideq*0-4], 1 vinserti128 m4, [tmpq+strideq*1-4], 1 vinserti128 m5, [tmpq+strideq*2-4], 1 vinserti128 m6, [tmpq+stride3q -4], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m11, [tmpq+strideq*0-4], 1 vinserti128 m13, [tmpq+strideq*1-4], 1 vinserti128 m14, [tmpq+strideq*2-4], 1 vinserti128 m15, [tmpq+stride3q -4], 1 ; transpose 4x8 ; xm3: A-D0,A-D4 ; xm4: A-D1,A-D5 ; xm5: A-D2,A-D6 ; xm6: A-D3,A-D7 punpcklwd m7, m3, m4 punpcklwd m3, m11, m13 punpcklwd m4, m5, m6 punpcklwd m5, m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1 ; xm3: A4-5,B4-5,C4-5,D4-5 ; xm4: A2-3,B2-3,C2-3,D2-3 ; xm5: A6-7,B6-7,C6-7,D6-7 punpckldq m6, m7, m4 punpckhdq m7, m4 punpckldq m8, m3, m5 punpckhdq m5, m3, m5 ; xm6: A0-3,B0-3 ; xm7: C0-3,D0-3 ; xm8: A4-7,B4-7 ; xm5: C4-7,D4-7 punpcklqdq m3, m6, m8 punpckhqdq m4, m6, m8 punpckhqdq m6, m7, m5 punpcklqdq m5, m7, m5 ; xm3: A0-7 ; xm4: B0-7 ; xm5: C0-7 ; xm6: D0-7 %elif %1 == 6 || %1 == 8 movu xm3, [dstq+strideq*0-8] movu xm4, [dstq+strideq*1-8] movu xm5, [dstq+strideq*2-8] movu xm6, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu xm11, [tmpq+strideq*0-8] movu xm13, [tmpq+strideq*1-8] movu xm14, [tmpq+strideq*2-8] movu xm15, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] vinserti128 m3, [tmpq+strideq*0-8], 1 vinserti128 m4, [tmpq+strideq*1-8], 1 vinserti128 m5, [tmpq+strideq*2-8], 1 vinserti128 m6, [tmpq+stride3q -8], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m11, [tmpq+strideq*0-8], 1 vinserti128 m13, [tmpq+strideq*1-8], 1 vinserti128 m14, [tmpq+strideq*2-8], 1 vinserti128 m15, [tmpq+stride3q -8], 1 ; transpose 8x16 ; xm3: A-H0,A-H8 ; xm4: A-H1,A-H9 ; xm5: A-H2,A-H10 ; xm6: A-H3,A-H11 ; xm11: A-H4,A-H12 ; xm13: A-H5,A-H13 ; xm14: A-H6,A-H14 ; xm15: A-H7,A-H15 punpcklwd m7, m3, m4 punpckhwd m3, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpcklwd m6, m11, m13 punpckhwd m11, m13 punpcklwd m13, m14, m15 punpckhwd m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1 ; xm3: E0-1,F0-1,G0-1,H0-1 ; xm4: A2-3,B2-3,C2-3,D2-3 ; xm5: E2-3,F2-3,G2-3,H2-3 ; xm6: A4-5,B4-5,C4-5,D4-5 ; xm11: E4-5,F4-5,G4-5,H4-5 ; xm13: A6-7,B6-7,C6-7,D6-7 ; xm14: E6-7,F6-7,G6-7,H6-7 punpckldq m15, m7, m4 punpckhdq m7, m4 punpckldq m9, m3, m5 punpckhdq m8, m3, m5 punpckldq m3, m6, m13 punpckhdq m6, m13 punpckldq m10, m11, m14 punpckhdq m11, m14 ; xm15: A0-3,B0-3 ; xm7: C0-3,D0-3 ; xm9: E0-3,F0-3 ; xm8: G0-3,H0-3 ; 
xm3: A4-7,B4-7 ; xm6: C4-7,D4-7 ; xm10: E4-7,F4-7 ; xm11: G4-7,H4-7 %if %1 != 6 punpcklqdq m0, m15, m3 %endif punpckhqdq m13, m15, m3 punpcklqdq m3, m7, m6 punpckhqdq m4, m7, m6 punpcklqdq m5, m9, m10 punpckhqdq m6, m9, m10 punpcklqdq m14, m8, m11 %if %1 != 6 punpckhqdq m15, m8, m11 mova [rsp+5*32], m0 %endif %else ; We only use 14 pixels but we'll need the remainder at the end for ; the second transpose mova xm0, [dstq+strideq*0-16] mova xm1, [dstq+strideq*1-16] mova xm2, [dstq+strideq*2-16] mova xm3, [dstq+stride3q -16] lea tmpq, [dstq+strideq*4] mova xm4, [tmpq+strideq*0-16] mova xm5, [tmpq+strideq*1-16] mova xm6, [tmpq+strideq*2-16] mova xm7, [tmpq+stride3q -16] lea tmpq, [tmpq+strideq*4] vinserti128 m0, m0, [tmpq+strideq*0-16], 1 vinserti128 m1, m1, [tmpq+strideq*1-16], 1 vinserti128 m2, m2, [tmpq+strideq*2-16], 1 vinserti128 m3, m3, [tmpq+stride3q -16], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m4, m4, [tmpq+strideq*0-16], 1 vinserti128 m5, m5, [tmpq+strideq*1-16], 1 vinserti128 m6, m6, [tmpq+strideq*2-16], 1 vinserti128 m7, m7, [tmpq+stride3q -16], 1 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 mova [rsp+6*32], m0 mova [rsp+7*32], m1 mova [rsp+8*32], m2 mova [rsp+9*32], m3 mova [rsp+5*32], m4 mova xm0, [dstq+strideq*0] mova xm1, [dstq+strideq*1] mova xm2, [dstq+strideq*2] mova xm3, [dstq+stride3q ] lea tmpq, [dstq+strideq*4] mova xm8, [tmpq+strideq*0] mova xm9, [tmpq+strideq*1] mova xm10, [tmpq+strideq*2] mova xm11, [tmpq+stride3q ] lea tmpq, [tmpq+strideq*4] vinserti128 m0, m0, [tmpq+strideq*0], 1 vinserti128 m1, m1, [tmpq+strideq*1], 1 vinserti128 m2, m2, [tmpq+strideq*2], 1 vinserti128 m3, m3, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m8, m8, [tmpq+strideq*0], 1 vinserti128 m9, m9, [tmpq+strideq*1], 1 vinserti128 m10, m10, [tmpq+strideq*2], 1 vinserti128 m11, m11, [tmpq+stride3q ], 1 TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4 mova [rsp+10*32], m8 mova [rsp+11*32], m9 mova [rsp+12*32], m10 mova [rsp+13*32], m11 ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15 SWAP 13, 5, 0 SWAP 3, 6, 1, 15 SWAP 4, 7 SWAP 2, 14 %endif %endif ; load L/E/I/H %ifidn %2, v pmovzxbw m1, [lq] pmovzxbw m0, [lq+l_strideq] pxor m2, m2 %else vpbroadcastq m0, [lq] ; l0, l1 vpbroadcastq m1, [lq+l_strideq] ; l2, l3 vpbroadcastq m2, [lq+l_strideq*2] ; l4, l5 vpbroadcastq m10, [lq+l_stride3q] ; l6, l7 punpckldq m0, m1 ; l0, l2, l1, l3 [2x] punpckldq m2, m10 ; l4, l6, l5, l7 [2x] vpblendd m0, m0, m2, 11110000b ; l0, l2, l1, l3, l4, l6, l5, l7 pxor m2, m2 punpcklbw m1, m0, m2 ; l0, l2, l4, l6 punpckhbw m0, m2 ; l1, l3, l5, l7 %endif pcmpeqw m10, m2, m0 pand m1, m10 por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] pcmpeqw m10, m2, m0 ; !L psrlw m10, 1 psrlw m2, m0, [lutq+128] vpbroadcastw m1, [lutq+136] pminuw m2, m1 pmaxuw m2, [pw_1] ; I psrlw m1, m0, 4 ; H paddw m0, [pw_2] vpbroadcastd m8, [r11] paddw m0, m0 paddw m0, m2 ; E REPX {pmullw x, m8}, m0, m1, m2 psubw m8, m3, m4 ; p1-p0 psubw m9, m5, m6 ; q1-q0 REPX {pabsw x, x}, m8, m9 pmaxuw m8, m10 pmaxuw m8, m9 pcmpgtw m7, m8, m1 ; hev %if %1 != 4 psubw m9, m13, m4 ; p2-p0 pabsw m9, m9 pmaxuw m9, m8 %if %1 != 6 %ifidn %2, v mova m11, [tmpq+strideq*0] ; p3 %else mova m11, [rsp+5*32] ; p3 %endif psubw m10, m11, m4 ; p3-p0 pabsw m10, m10 pmaxuw m9, m10 %endif psubw m10, m5, m14 ; q2-q0 pabsw m10, m10 pmaxuw m9, m10 %if %1 != 6 psubw m10, m5, m15 ; q3-q0 pabsw m10, m10 pmaxuw m9, m10 %endif vpbroadcastd m10, [r11] pcmpgtw m9, m10 ; !flat8in psubw m10, m13, m3 ; p2-p1 pabsw m10, m10 %if %1 != 6 psubw m11, m13 ; p3-p2 pabsw m11, m11 pmaxuw m10, m11 psubw m11, m14, m15 ; q3-q2 pabsw m11, m11 pmaxuw m10, m11 %endif psubw m11, m14, m6 ; q2-q1 pabsw m11, m11 pmaxuw m10, m11 %if %1 == 16 vpbroadcastd m11, [maskq+8] vpbroadcastd m1, [maskq+4] por m11, m1 pand m11, m12 pcmpeqd m11, m12 pand m10, m11 %else vpbroadcastd m11, [maskq+4] pand m11, m12 pcmpeqd m11, m12 pand m10, m11 ; only apply fm-wide to wd>4 blocks %endif pmaxuw m8, m10 %endif pcmpgtw m8, m2 psubw m10, m3, m6 ; p1-q1 psubw m11, m4, m5 ; p0-q0 REPX {pabsw x, x}, m10, m11 paddw m11, m11 psrlw m10, 1 paddw m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pcmpgtw m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E por m8, m10 %if %1 == 16 %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] mova m1, [tmpq+strideq*2] mova m2, [tmpq+stride3q] %else mova m0, [rsp+7*32] mova m1, [rsp+8*32] mova m2, [rsp+9*32] %endif REPX {psubw x, m4}, m0, m1, m2 REPX {pabsw x, x}, m0, m1, m2 pmaxuw m1, m0 pmaxuw m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] mova m2, [tmpq+strideq*1] mova m10, [tmpq+strideq*2] %else mova m0, [rsp+10*32] mova m2, [rsp+11*32] mova m10, [rsp+12*32] %endif REPX {psubw x, m5}, m0, m2, m10 REPX {pabsw x, x}, m0, m2, m10 pmaxuw m0, m2 pmaxuw m1, m10 pmaxuw m1, m0 vpbroadcastd m0, [r11] pcmpgtw m1, m0 ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, m12 pcmpeqd m10, m12 pandn m1, m10 ; flat16 pandn m1, m8, m1 ; flat16 & fm vpbroadcastd m10, [maskq+4] por m10, m2 pand m2, m10, m12 pcmpeqd m2, m12 pandn m9, m2 ; flat8in pandn m9, m8, m9 vpbroadcastd m2, [maskq+0] por m2, m10 pand m2, m12 pcmpeqd m2, m12 pandn m8, m2 pandn m8, m9, m8 ; fm & !flat8 & !flat16 pandn m9, m1, m9 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] pand m2, m0, m12 pcmpeqd m2, m12 pandn m9, m2 pandn m9, m8, m9 ; flat8 & fm vpbroadcastd m2, [maskq+0] por m0, m2 pand m0, m12 pcmpeqd m0, m12 pandn m8, m0 pandn m8, m9, m8 ; fm & !flat8 %else vpbroadcastd m0, [maskq+0] pand m0, m12 pcmpeqd m0, m12 pandn m8, m0 ; fm %endif ; short filter vpbroadcastd m0, [r11+8*1] ; 511 or 2047 vpbroadcastd m2, [r11+8*2] ; -512 or -2048 psubw m10, m5, m4 paddw m11, m10, m10 paddw m11, m10 psubw m10, m3, m6 ; iclip_diff(p1-q1) pminsw m10, m0 pmaxsw m10, m2 pand m10, m7 ; f=iclip_diff(p1-q1)&hev paddw m10, m11 ; f=iclip_diff(3*(q0-p0)+f) pminsw m10, m0 pmaxsw m10, m2 pand m8, m10 ; f&=fm vpbroadcastd m10, [pw_4] paddw m10, m8 paddw m8, [pw_3] REPX {pminsw x, m0}, m10, m8 psraw m10, 3 ; f2 psraw m8, 3 ; f1 psubw m5, m10 paddw m4, m8 paddw m10, [pw_1] psraw m10, 1 ; f=(f1+1)>>1 pandn m8, m7, m10 ; f&=!hev paddw m3, m8 
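; scalar recap of the narrow (4-tap) filter above, following the inline
; comments (p1/p0/q0/q1 live in m3-m6 in this macro):
;   f   = iclip_diff(p1 - q1) & hev
;   f   = iclip_diff(3 * (q0 - p0) + f) & fm
;   q0 -= clip(f + 4) >> 3          ; "f2" above
;   p0 += clip(f + 3) >> 3          ; "f1" above
; the ops just before and after this comment then add/subtract
; ((clip(f + 4) >> 3) + 1) >> 1 to p1/q1 on lanes where !hev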
psubw m6, m8 pxor m8, m8 psubw m0, m2 ; 1023 or 4095 REPX {pminsw x, m0}, m3, m4, m5, m6 REPX {pmaxsw x, m8}, m3, m4, m5, m6 %if %1 == 16 ; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16 ; m12=filter bits mask ; m13-15=p2/q2/q3 ; m0,2,7-8,10-11 = free ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 mova m11, [tmpq+strideq*4] ; p3 %else mova m0, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] mova m11, [rsp+5*32] %endif mova [rsp+ 0*32], m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 paddw m8, m0, [pw_1] psllw m8, 3 ; p6*8+8 paddw m10, m2, m7 ; p5+p4 psubw m8, m0 paddw m10, m10 ; (p5+p4)*2 paddw m8, m11 ; p6*7+p3 paddw m10, m13 ; (p5+p4)*2+p2 paddw m8, m3 ; p6*7+p3+p1 paddw m10, m4 ; (p5+p4)*2+p2+p0 paddw m8, m5 ; p6*7+p3+p1+q0 paddw m8, m10 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m10, m8, 4 vpblendvb m10, m2, m10, m1 %ifidn %2, v mova [tmpq+strideq*2], m10 ; p5 %else mova [rsp+8*32], m10 %endif ; sub p6*2, add p3/q1 paddw m8, m11 paddw m10, m0, m0 paddw m8, m6 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m7, m10, m1 %ifidn %2, v mova [tmpq+stride3q], m10 ; p4 %else mova [rsp+9*32], m10 %endif ; sub p6/p5, add p2/q2 psubw m8, m0 paddw m10, m13, m14 psubw m8, m2 paddw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*4], m10 ; p3 lea tmpq, [dstq+strideq*4] %else mova [rsp+5*32], m10 %endif ; sub p6/p4, add p1/q3 paddw m8, m3 paddw m10, m0, m7 paddw m8, m15 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m13, m10, m1 mova [rsp+1*32], m10 ; don't clobber p2/m13 ; sub p6/p3, add p0/q4 paddw m8, m4 paddw m10, m0, m11 %ifidn %2, v paddw m8, [tmpq+strideq*0] %else paddw m8, [rsp+10*32] %endif psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m3, m10, m1 mova [rsp+2*32], m10 ; don't clobber p1/m3 ; sub p6/p2, add q0/q5 paddw m8, m5 paddw m10, m0, m13 %ifidn %2, v paddw m8, [tmpq+strideq*1] %else paddw m8, [rsp+11*32] %endif psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m4, m10, m1 mova [rsp+3*32], m10 ; don't clobber p0/m4 ; sub p6/p1, add q1/q6 paddw m8, m6 paddw m10, m0, m3 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 %else mova m0, [rsp+12*32] ; q6 %endif paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m5, m10, m1 mova [rsp+4*32], m10 ; don't clobber q0/m5 ; sub p5/p0, add q2/q6 paddw m8, m14 paddw m10, m2, m4 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m2, m6, m10, m1 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m8, m15 paddw m10, m7, m5 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m7, m14, m10, m1 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v paddw m8, [tmpq+strideq*0] %else paddw m8, [rsp+10*32] %endif paddw m10, m11, m6 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 vpblendvb m10, m15, m10, m1 %ifidn %2, v mova [tmpq+mstrideq], m10 ; q3 %else mova [rsp+14*32], m10 %endif ; sub p2/q2, add q5/q6 %ifidn %2, v paddw m8, [tmpq+strideq*1] %else paddw m8, [rsp+11*32] %endif paddw m10, m13, m14 paddw m8, m0 psubw m8, m10 psrlw m10, m8, 4 %ifidn %2, v mova m9, [tmpq+strideq*0] %else mova m9, [rsp+10*32] %endif vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*0], m10 ; q4 %else mova [rsp+10*32], m10 %endif ; sub p1/q3, add q6*2 psubw m8, m3 paddw m0, m0 psubw m8, m15 paddw m8, m0 psrlw m10, m8, 4 %ifidn %2, v mova m9, [tmpq+strideq*1] %else mova m9, [rsp+11*32] %endif vpblendvb m10, m9, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else mova [rsp+11*32], m10 %endif mova m9, [rsp+0*32] mova m13, [rsp+1*32] mova m3, [rsp+2*32] 
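; end of the flat16 pass: the loads around this comment restore the
; flat16-blended p2/p1/p0/q0 and the "flat8 & !flat16" mask (m9) that were
; stashed in scratch so the running sum could keep reading the original
; pixels; the SWAPs below place the blended q1/q2 back in m6/m14 before
; falling through to the flat8 filter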
mova m4, [rsp+3*32] mova m5, [rsp+4*32] SWAP 2, 6 SWAP 7, 14 %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %else mova m15, [rsp+14*32] %endif %endif %if %1 >= 8 ; flat8 filter vpbroadcastd m7, [pw_4096] %ifidn %2, v mova m0, [tmpq+strideq*0] ; p3 %else mova m0, [rsp+5*32] ; p3 %endif paddw m1, m0, m13 ; p3+p2 paddw m2, m3, m4 ; p1+p0 paddw m8, m1, m1 ; 2*(p3+p2) paddw m2, m0 ; p1+p0+p3 paddw m8, m5 ; 2*(p3+p2)+q0 paddw m2, m8 ; 3*p3+2*p2+p1+p0+q0 pmulhrsw m10, m2, m7 paddw m8, m3, m6 psubw m2, m1 paddw m2, m8 pmulhrsw m8, m2, m7 paddw m11, m0, m3 paddw m1, m4, m14 psubw m2, m11 paddw m2, m1 pmulhrsw m1, m2, m7 paddw m11, m0, m4 pblendvb m4, m1, m9 paddw m1, m5, m15 psubw m2, m11 paddw m2, m1 pmulhrsw m11, m2, m7 paddw m2, m6 paddw m2, m15 paddw m1, m13, m5 pblendvb m5, m11, m9 pblendvb m13, m10, m9 psubw m2, m1 pmulhrsw m1, m2, m7 psubw m2, m3 pblendvb m3, m8, m9 psubw m2, m6 pblendvb m6, m1, m9 paddw m1, m15, m14 paddw m2, m1 pmulhrsw m2, m7 pblendvb m14, m2, m9 %ifidn %2, v mova [tmpq+strideq*1], m13 ; p2 mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m4 ; p0 mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 mova [dstq+strideq*2], m14 ; q2 %elif %1 == 8 TRANSPOSE8X8W 0, 13, 3, 4, 5, 6, 14, 15, 1 ; write 8x16 movu [dstq+strideq*0-8], xm0 movu [dstq+strideq*1-8], xm13 movu [dstq+strideq*2-8], xm3 movu [dstq+stride3q -8], xm4 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm5 movu [dstq+strideq*1-8], xm6 movu [dstq+strideq*2-8], xm14 movu [dstq+stride3q -8], xm15 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m0, 1 vextracti128 [dstq+strideq*1-8], m13, 1 vextracti128 [dstq+strideq*2-8], m3, 1 vextracti128 [dstq+stride3q -8], m4, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m5, 1 vextracti128 [dstq+strideq*1-8], m6, 1 vextracti128 [dstq+strideq*2-8], m14, 1 vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %else mova m8, [rsp+6*32] mova m1, [rsp+7*32] mova m2, [rsp+8*32] mova m7, [rsp+9*32] TRANSPOSE8X8W 8, 1, 2, 7, 0, 13, 3, 4, 9 mova [dstq+strideq*0-16], xm8 mova [dstq+strideq*1-16], xm1 mova [dstq+strideq*2-16], xm2 mova [dstq+stride3q -16], xm7 lea tmpq, [dstq+strideq*4] mova [tmpq+strideq*0-16], xm0 mova [tmpq+strideq*1-16], xm13 mova [tmpq+strideq*2-16], xm3 mova [tmpq+stride3q -16], xm4 lea tmpq, [tmpq+strideq*4] vextracti128 [tmpq+strideq*0-16], m8, 1 vextracti128 [tmpq+strideq*1-16], m1, 1 vextracti128 [tmpq+strideq*2-16], m2, 1 vextracti128 [tmpq+stride3q -16], m7, 1 lea tmpq, [tmpq+strideq*4] vextracti128 [tmpq+strideq*0-16], m0, 1 vextracti128 [tmpq+strideq*1-16], m13, 1 vextracti128 [tmpq+strideq*2-16], m3, 1 vextracti128 [tmpq+stride3q -16], m4, 1 mova m0, [rsp+10*32] mova m1, [rsp+11*32] mova m2, [rsp+12*32] mova m3, [rsp+13*32] TRANSPOSE8X8W 5, 6, 14, 15, 0, 1, 2, 3, 4 mova [dstq+strideq*0], xm5 mova [dstq+strideq*1], xm6 mova [dstq+strideq*2], xm14 mova [dstq+stride3q ], xm15 lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm0 mova [dstq+strideq*1], xm1 mova [dstq+strideq*2], xm2 mova [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0], m5, 1 vextracti128 [dstq+strideq*1], m6, 1 vextracti128 [dstq+strideq*2], m14, 1 vextracti128 [dstq+stride3q ], m15, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0], m0, 1 vextracti128 [dstq+strideq*1], m1, 1 vextracti128 [dstq+strideq*2], m2, 1 vextracti128 [dstq+stride3q ], m3, 1 lea dstq, [dstq+strideq*4] %endif %elif %1 == 6 ; flat6 filter vpbroadcastd m7, [pw_4096] paddw m8, m3, m4 paddw m8, m13 ; p2+p1+p0 paddw m11, m13, m5 paddw 
m8, m8 paddw m8, m11 ; p2+2*(p2+p1+p0)+q0 pmulhrsw m2, m8, m7 paddw m8, m5 paddw m11, m13, m13 paddw m8, m6 psubw m8, m11 pmulhrsw m10, m8, m7 paddw m8, m6 paddw m11, m13, m3 paddw m8, m14 psubw m8, m11 pmulhrsw m11, m8, m7 psubw m8, m3 paddw m14, m14 psubw m8, m4 paddw m8, m14 pmulhrsw m8, m7 pblendvb m3, m2, m9 pblendvb m4, m10, m9 pblendvb m5, m11, m9 pblendvb m6, m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m4 ; p0 mova [dstq+strideq*0], m5 ; q0 mova [dstq+strideq*1], m6 ; q1 %else TRANSPOSE_8x4_AND_WRITE_4x16 %endif %else %ifidn %2, v mova [tmpq+strideq*0], m3 ; p1 mova [tmpq+strideq*1], m4 ; p0 mova [tmpq+strideq*2], m5 ; q0 mova [tmpq+stride3q ], m6 ; q1 %else TRANSPOSE_8x4_AND_WRITE_4x16 %endif %endif %endmacro INIT_YMM avx2 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, v jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call .v4 .end: pslld m12, 4 add lq, 16 add dstq, 32 shl mask_bitsd, 4 sub wd, 4 jg .loop RET ALIGN function_align .v4: FILTER 4, v ret INIT_YMM avx2 cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, h jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter call .h4 jmp .end .no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] .end: pslld m12, 4 lea lq, [lq+l_strideq*4] shl mask_bitsd, 4 sub hd, 4 jg .loop RET ALIGN function_align .h4: FILTER 4, h ret INIT_YMM avx2 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov wd, wm shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4 .end: pslld m12, 4 add lq, 16 add dstq, 32 shl mask_bitsd, 4 sub wd, 4 jg .loop RET INIT_YMM avx2 cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits mov r6d, r7m lea r11, [pw_4] shr r6d, 11 ; is_12bpc lea r11, [r11+r6*4] mov hd, hm shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] mov mask_bitsd, 0xf mova m12, [pb_mask] .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4 jmp .end .no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] .end: pslld m12, 4 lea lq, [lq+l_strideq*4] shl mask_bitsd, 4 sub hd, 4 
jg .loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/loopfilter16_avx512.asm000064400000000000000000001026201046102023000162220ustar 00000000000000; Copyright © 2022, VideoLAN and dav1d authors ; Copyright © 2022, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 l_shuf_v: times 2 db 0, 32 pw_1: times 2 dw 1 times 2 db 4, 36 pw_3: times 2 dw 3 times 2 db 8, 40 pw_4: times 2 dw 4 times 2 db 12, 44 pw_16: times 2 dw 16 times 2 db 16, 48 pw_4096: times 2 dw 4096 times 2 db 20, 52 pw_16384: times 2 dw 16384 times 2 db 24, 56 pw_32767: times 2 dw 32767 times 2 db 28, 60 times 2 dw 0 filter_mask: dd 1, 2, 4, 8, 16, 32, 64,128 stride_mul: dd 0, 1, 8, 9, 16, 17, 24, 25 l_shuf_h: db 4, -1, 4, -1, 4, -1, 4, -1, 12, -1, 12, -1, 12, -1, 12, -1 clip_max: dw 511, 511, 2047, 2047 clip_min: dw -512, -512, -2048, -2048 SECTION .text %macro TRANSPOSE8X8W 9 ; src/dst[1-8], tmp punpckhwd m%9, m%5, m%6 punpcklwd m%5, m%6 punpckhwd m%6, m%1, m%2 punpcklwd m%1, m%2 punpckhwd m%2, m%7, m%8 punpcklwd m%7, m%8 punpckhwd m%8, m%3, m%4 punpcklwd m%3, m%4 punpckhdq m%4, m%1, m%3 punpckldq m%1, m%3 punpckldq m%3, m%5, m%7 punpckhdq m%5, m%7 punpckhdq m%7, m%6, m%8 punpckldq m%6, m%8 punpckldq m%8, m%9, m%2 punpckhdq m%9, m%2 punpckhqdq m%2, m%1, m%3 punpcklqdq m%1, m%3 punpcklqdq m%3, m%4, m%5 punpckhqdq m%4, m%5 punpcklqdq m%5, m%6, m%8 punpckhqdq m%6, m%8 punpckhqdq m%8, m%7, m%9 punpcklqdq m%7, m%9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] %ifidn %2, v %if %1 == 16 lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1 ] mova m1, [tmpq+strideq*2 ] ; p5 mova m2, [tmpq+stride3q ] ; p4 mova m3, [tmpq+strideq*4 ] ; p3 mova m4, [tmpq+stride5q ] ; p2 %elif %1 == 6 || %1 == 8 lea tmpq, [dstq+mstrideq*4] %if %1 == 8 mova m3, [tmpq+strideq*0 ] %endif mova m4, [tmpq+strideq*1 ] %endif mova m5, [dstq+mstrideq*2] ; p1 mova m6, [dstq+mstrideq*1] ; p0 mova m7, [dstq+strideq*0 ] ; q0 mova m8, [dstq+strideq*1 ] ; q1 %if %1 != 4 mova m9, [dstq+strideq*2 ] ; q2 %endif %if %1 == 8 || %1 == 16 mova m10, [dstq+stride3q ] ; q3 %endif %if %1 == 16 mova m11, [dstq+strideq*4 ] ; q4 mova m22, [dstq+stride5q ] ; q5 mova m23, [dstq+stride3q*2] %endif %else ; h %if %1 == 16 movu ym16, [dstq+strideq*0 -16] 
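; wd=16 horizontal path: each row contributes the 16 pixels straddling the
; edge (p7..q7, hence the -16 byte offsets); 32 rows are packed two per
; register, transposed in 8x8-word blocks and recombined with vshufi32x4
; so that, as in the vertical path, each register ends up holding one
; column (one of p7..q7) for all rows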
movu ym17, [dstq+strideq*1 -16] movu ym18, [dstq+strideq*2 -16] movu ym19, [dstq+stride3q -16] movu ym20, [dstq+strideq*4 -16] movu ym22, [dstq+stride5q -16] movu ym23, [dstq+stride3q*2-16] movu ym28, [dstq+stride7q -16] lea tmpq, [dstq+strideq*8 -16] vinserti32x8 m7, m16, [tmpq+strideq*0 ], 1 vinserti32x8 m8, m17, [tmpq+strideq*1 ], 1 vinserti32x8 m9, m18, [tmpq+strideq*2 ], 1 vinserti32x8 m10, m19, [tmpq+stride3q ], 1 vinserti32x8 m11, m20, [tmpq+strideq*4 ], 1 vinserti32x8 m22, m22, [tmpq+stride5q ], 1 vinserti32x8 m23, m23, [tmpq+stride3q*2], 1 vinserti32x8 m28, m28, [tmpq+stride7q ], 1 lea tmpq, [tmpq+strideq*8] TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 27 movu ym16, [tmpq+strideq*0 ] movu ym17, [tmpq+strideq*1 ] movu ym18, [tmpq+strideq*2 ] movu ym19, [tmpq+stride3q ] movu ym24, [tmpq+strideq*4 ] movu ym25, [tmpq+stride5q ] movu ym26, [tmpq+stride3q*2] movu ym20, [tmpq+stride7q ] lea tmpq, [tmpq+strideq*8] vinserti32x8 m0, m16, [tmpq+strideq*0 ], 1 vinserti32x8 m1, m17, [tmpq+strideq*1 ], 1 vinserti32x8 m2, m18, [tmpq+strideq*2 ], 1 vinserti32x8 m3, m19, [tmpq+stride3q ], 1 vinserti32x8 m4, m24, [tmpq+strideq*4 ], 1 vinserti32x8 m5, m25, [tmpq+stride5q ], 1 vinserti32x8 m6, m26, [tmpq+stride3q*2], 1 vinserti32x8 m20, m20, [tmpq+stride7q ], 1 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 20, 27 vshufi32x4 m27, m7, m0, q2020 vshufi32x4 m7, m0, q3131 vshufi32x4 m0, m8, m1, q2020 vshufi32x4 m8, m1, q3131 vshufi32x4 m1, m9, m2, q2020 vshufi32x4 m9, m2, q3131 vshufi32x4 m2, m10, m3, q2020 vshufi32x4 m10, m3, q3131 vshufi32x4 m3, m11, m4, q2020 vshufi32x4 m11, m4, q3131 vshufi32x4 m4, m22, m5, q2020 vshufi32x4 m22, m5, q3131 vshufi32x4 m5, m23, m6, q2020 vshufi32x4 m23, m6, q3131 vshufi32x4 m6, m28, m20, q2020 vshufi32x4 m28, m20, q3131 %elif %1 == 6 || %1 == 8 %if %1 == 8 sub dstq, 8 movu xm16, [dstq+strideq*0 ] movu xm17, [dstq+strideq*1 ] movu xm18, [dstq+strideq*2 ] movu xm19, [dstq+stride3q ] movu xm24, [dstq+strideq*4 ] movu xm25, [dstq+stride5q ] movu xm26, [dstq+stride3q*2] movu xm27, [dstq+stride7q ] lea tmpq, [dstq+strideq*8 ] vinserti128 ym16, [tmpq+strideq*0 ], 1 vinserti128 ym17, [tmpq+strideq*1 ], 1 vinserti128 ym18, [tmpq+strideq*2 ], 1 vinserti128 ym19, [tmpq+stride3q ], 1 vinserti128 ym24, [tmpq+strideq*4 ], 1 vinserti128 ym25, [tmpq+stride5q ], 1 vinserti128 ym26, [tmpq+stride3q*2], 1 vinserti128 ym27, [tmpq+stride7q ], 1 lea tmpq, [tmpq+strideq*8 ] vinserti32x4 m10, m16, [tmpq+strideq*0 ], 2 vinserti32x4 m8, m17, [tmpq+strideq*1 ], 2 vinserti32x4 m5, m18, [tmpq+strideq*2 ], 2 vinserti32x4 m7, m19, [tmpq+stride3q ], 2 vinserti32x4 m2, m24, [tmpq+strideq*4 ], 2 vinserti32x4 m9, m25, [tmpq+stride5q ], 2 vinserti32x4 m3, m26, [tmpq+stride3q*2], 2 vinserti32x4 m4, m27, [tmpq+stride7q ], 2 lea tmpq, [tmpq+strideq*8 ] vinserti32x4 m10, [tmpq+strideq*0 ], 3 vinserti32x4 m8, [tmpq+strideq*1 ], 3 vinserti32x4 m5, [tmpq+strideq*2 ], 3 vinserti32x4 m7, [tmpq+stride3q ], 3 vinserti32x4 m2, [tmpq+strideq*4 ], 3 vinserti32x4 m9, [tmpq+stride5q ], 3 vinserti32x4 m3, [tmpq+stride3q*2], 3 vinserti32x4 m4, [tmpq+stride7q ], 3 %else ; %1 == 6 movu xm16, [dstq+strideq*0-8] movu xm17, [dstq+strideq*1-8] movu xm18, [dstq+strideq*2-8] movu xm19, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4-8] movu xm2, [tmpq+strideq*0] movu xm9, [tmpq+strideq*1] movu xm3, [tmpq+strideq*2] movu xm4, [tmpq+stride3q ] lea tmpq, [tmpq+strideq*4] vinserti128 ym16, [tmpq+strideq*0], 1 vinserti128 ym17, [tmpq+strideq*1], 1 vinserti128 ym18, [tmpq+strideq*2], 1 vinserti128 ym19, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] 
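; wd=6/8 horizontal path: the 8-pixel span around the edge (p3..q3) is
; gathered from four groups of eight rows into the four 128-bit lanes of
; each register (vinserti128/vinserti32x4), so the word transpose below
; yields p3..q3 columns for all 32 rows in one pass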
vinserti128 ym2, [tmpq+strideq*0], 1 vinserti128 ym9, [tmpq+strideq*1], 1 vinserti128 ym3, [tmpq+strideq*2], 1 vinserti128 ym4, [tmpq+stride3q ], 1 lea tmpq, [tmpq+strideq*4] vinserti32x4 m10, m16, [tmpq+strideq*0], 2 vinserti32x4 m8, m17, [tmpq+strideq*1], 2 vinserti32x4 m5, m18, [tmpq+strideq*2], 2 vinserti32x4 m7, m19, [tmpq+stride3q ], 2 lea tmpq, [tmpq+strideq*4] vinserti32x4 m2, [tmpq+strideq*0], 2 vinserti32x4 m9, [tmpq+strideq*1], 2 vinserti32x4 m3, [tmpq+strideq*2], 2 vinserti32x4 m4, [tmpq+stride3q ], 2 lea tmpq, [tmpq+strideq*4] vinserti32x4 m10, [tmpq+strideq*0], 3 vinserti32x4 m8, [tmpq+strideq*1], 3 vinserti32x4 m5, [tmpq+strideq*2], 3 vinserti32x4 m7, [tmpq+stride3q ], 3 lea tmpq, [tmpq+strideq*4] vinserti32x4 m2, [tmpq+strideq*0], 3 vinserti32x4 m9, [tmpq+strideq*1], 3 vinserti32x4 m3, [tmpq+strideq*2], 3 vinserti32x4 m4, [tmpq+stride3q ], 3 %endif punpcklwd m6, m10, m8 punpckhwd m10, m8 punpcklwd m8, m5, m7 punpckhwd m5, m7 punpcklwd m7, m2, m9 punpckhwd m2, m9 punpcklwd m9, m3, m4 punpckhwd m3, m4 punpckldq m4, m6, m8 punpckhdq m6, m8 punpckldq m8, m10, m5 punpckhdq m10, m5 punpckldq m5, m7, m9 punpckhdq m7, m9 punpckldq m9, m2, m3 punpckhdq m2, m3 %if %1 == 8 punpcklqdq m3, m4, m5 %endif punpckhqdq m4, m5 punpcklqdq m5, m6, m7 punpckhqdq m6, m7 punpcklqdq m7, m8, m9 punpckhqdq m8, m9 punpcklqdq m9, m10, m2 %if %1 == 8 punpckhqdq m10, m2 %endif %else ; %1 == 4 kxnorb k1, k1, k1 kmovb k2, k1 vpgatherdq m7{k1}, [dstq+ym12-4] lea tmpq, [dstq+strideq*2-4] kmovb k1, k2 vpgatherdq m4{k2}, [tmpq+ym12] lea tmpq, [tmpq+strideq*2] kmovb k2, k1 vpgatherdq m5{k1}, [tmpq+ym12] lea tmpq, [tmpq+strideq*2] vpgatherdq m6{k2}, [tmpq+ym12] punpcklwd m8, m7, m4 punpckhwd m7, m4 punpcklwd m4, m5, m6 punpckhwd m5, m6 punpcklwd m6, m8, m7 punpckhwd m8, m7 punpcklwd m7, m4, m5 punpckhwd m4, m5 punpcklqdq m5, m6, m7 punpckhqdq m6, m7 punpcklqdq m7, m8, m4 punpckhqdq m8, m4 %endif %endif ; load L/E/I/H %ifidn %2, v movu ym16, [lq+l_strideq*1] movsldup m17, [l_shuf_v] vptestnmb k1, ym16, ym16 vmovdqu8 ym16{k1}, [lq+l_strideq*0] ; l[x][] ? 
l[x][] : l[x-stride][] vpermb m16, m17, m16 ; l[x][1] %else movq xm16, [lq+l_strideq*0] movq xm17, [lq+l_strideq*1] vinserti128 ym16, [lq+l_strideq*2], 1 vinserti128 ym17, [lq+l_stride3q ], 1 lea tmpq, [lq+l_strideq*4] vinserti32x4 m16, [tmpq+l_strideq*0], 2 vinserti32x4 m17, [tmpq+l_strideq*1], 2 vinserti32x4 m16, [tmpq+l_strideq*2], 3 vinserti32x4 m17, [tmpq+l_stride3q ], 3 punpcklqdq m16, m17 vbroadcasti32x4 m17, [l_shuf_h] vptestnmb k1, m16, m16 vpalignr m16{k1}, m16, 12 pshufb m16, m17 ; l[x][1] %endif vpbroadcastd m20, [pw_32767] psubw m17, m5, m6 ; p1-p0 psubw m18, m7, m8 ; q1-q0 vptestmw k1, m16, m16 ; L pabsw m17, m17 pabsw m18, m18 vpmaxuw m20{k1}, m17, m18 vpbroadcastw m17, [lutq+136] psrlw m18, m16, [lutq+128] vpbroadcastd m19, [pw_1] pminuw m18, m17 psrlw m17, m16, 4 ; H paddw m16, m16 pmaxuw m18, m19 ; I vpaddd m16, [pw_4] {1to16} paddw m16, m18 ; E REPX {pmullw x, m13}, m17, m18, m16 vpcmpw k4, m20, m17, 6 ; hev %if %1 != 4 psubw m19, m4, m5 ; p2-p1 pabsw m19, m19 %if %1 == 8 || %1 == 16 psubw m17, m3, m4 ; p3-p2 pabsw m17, m17 pmaxuw m19, m17 psubw m17, m9, m10 ; q3-q2 pabsw m17, m17 pmaxuw m19, m17 %endif psubw m17, m9, m8 ; q2-q1 pabsw m17, m17 pmaxuw m19, m17 %if %1 == 16 vpbroadcastd ym17, [maskq+4] vpord ym17, [maskq+8] {1to8} vptestmd k1, ym17, ym21 %else vptestmd k1, ym21, [maskq+4] {1to8} %endif pmaxuw m19, m20 psubw m17, m4, m6 ; p2-p0 pabsw m17, m17 pmaxuw m17, m20 vmovdqa64 m20{k1}, m19 ; only apply fm-wide to wd>4 blocks %if %1 == 8 || %1 == 16 psubw m19, m3, m6 ; p3-p0 pabsw m19, m19 pmaxuw m17, m19 psubw m19, m7, m10 ; q3-q0 pabsw m19, m19 pmaxuw m17, m19 %endif psubw m19, m7, m9 ; q2-q0 pabsw m19, m19 pmaxuw m17, m19 %endif vpcmpw k1, m20, m18, 2 psubw m18, m5, m8 ; p1-q1 psubw m19, m6, m7 ; p0-q0 pabsw m18, m18 pabsw m19, m19 psrlw m18, 1 paddw m19, m19 paddw m18, m19 ; abs(p0-q0)*2+(abs(p1-q1)>>1) vpcmpw k1{k1}, m18, m16, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E %if %1 != 4 vpcmpw k2{k1}, m17, m13, 2 ; flat8in %endif %if %1 == 16 psubw m20, m0, m6 psubw m16, m1, m6 pabsw m20, m20 psubw m17, m2, m6 pabsw m16, m16 psubw m18, m11, m7 pabsw m17, m17 psubw m19, m22, m7 pabsw m18, m18 pmaxuw m20, m16 psubw m16, m23, m7 pabsw m19, m19 pmaxuw m17, m18 pabsw m16, m16 vpandd ym18, ym21, [maskq+8] {1to8} pmaxuw m20, m17 pmaxuw m19, m16 pcmpeqd ym16, ym21, ym18 vpternlogd ym18, ym21, [maskq+4] {1to8}, 0xc8 pmaxuw m20, m19 pcmpeqd ym17, ym21, ym18 vpternlogd ym18, ym21, [maskq+0] {1to8}, 0xc8 vpcmpw k3{k2}, m20, m13, 2 ; flat8in & flat8out pcmpeqd ym18, ym21 vptestmb k3{k3}, ym16, ym16 ; flat8 & fm vptestmb k2{k2}, ym17, ym17 ; flat8in vptestmb k1{k1}, ym18, ym18 kandnd k1, k2, k1 ; fm & !flat8 & !flat16 kandnd k2, k3, k2 ; flat8 & !flat16 %elif %1 == 6 || %1 == 8 vpandd ym17, ym21, [maskq+4] {1to8} pcmpeqd ym16, ym21, ym17 vpternlogd ym17, ym21, [maskq+0] {1to8}, 0xc8 pcmpeqd ym17, ym21 vptestmb k2{k2}, ym16, ym16 ; flat8 & fm vptestmb k1{k1}, ym17, ym17 kandnd k1, k2, k1 ; fm & !flat8 %else ; %1 == 4 vpandd ym16, ym21, [maskq+0] {1to8} pcmpeqd ym16, ym21 vptestmb k1{k1}, ym16, ym16 %endif ; short filter psubw m16, m7, m6 vpbroadcastd m17, [pw_3] paddw m18, m16, m16 paddw m18, m16 psubw m16, m5, m8 ; iclip_diff(p1-q1) pminsw m16, m14 vpmaxsw m16{k4}{z}, m15 ; f=iclip_diff(p1-q1)&hev knotd k4, k4 ; !hev paddw m16, m18 ; f=iclip_diff(3*(q0-p0)+f) vpbroadcastd m18, [pw_4] pminsw m16, m14 vpmaxsw m16{k1}{z}, m15 ; f&=fm paddw m17, m16 paddw m16, m18 vpbroadcastd m18, [pw_16384] pminsw m17, m14 pminsw m16, m14 psraw m17, 3 ; f2 psraw m16, 3 ; f1 paddw m6, m17 psubw m7, m16 
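; pmulhrsw against pw_16384 computes (x*16384*2 + 0x8000) >> 16 = (x+1)>>1,
; i.e. the rounded halving of f1 used for the p1/q1 taps; the {k4}{z}
; write-mask zeroes it on hev lanes so only !hev pixels are touched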
vpmulhrsw m16{k4}{z}, m18 ; (f=(f1+1)>>1) & !hev psubw m17, m14, m15 ; 1023 or 4095 pxor m18, m18 paddw m5, m16 psubw m8, m16 REPX {pminsw x, m17}, m6, m7, m5, m8 REPX {pmaxsw x, m18}, m6, m7, m5, m8 %if %1 == 16 ; flat16 filter vpaddd m19, m0, [pw_1] {1to16} paddw m16, m1, m2 ; p5+p4 paddw m26, m1, m6 ; p5+p0 paddw m24, m2, m7 ; p4+q0 paddw m16, m4 ; p5+p4+p3 paddw m17, m3, m5 ; p2+p1 psllw m19, 3 paddw m16, m26 ; p5*2+p4+p3+p0 paddw m17, m24 ; p4+p2+p1+q0 psubw m19, m0 ; p6*7+8 paddw m16, m17 ; p5*2+p4*2+p3+p2+p1+q0 paddw m18, m3, m8 paddw m19, m16 ; p6*7+p5+p4*2+p3+p2+p1+p0+q0 paddw m25, m1, m0 paddw m16, m0, m0 psrlw m1{k3}, m19, 4 paddw m19, m18 psubw m19, m16 ; +p3+q1-p6*2 paddw m16, m2, m0 psrlw m2{k3}, m19, 4 psubw m19, m25 paddw m25, m4, m9 paddw m20, m10, m5 paddw m19, m25 ; +p2+q2-p6-p5 paddw m17, m0, m3 psubw m16, m20, m16 psrlw m3{k3}, m19, 4 paddw m19, m16 ; +p1+q3-p6-p4 paddw m16, m11, m6 psubw m16, m17 paddw m17, m0, m4 psrlw m4{k3}, m19, 4 paddw m19, m16 ; +p0+q4-p6-p3 paddw m16, m22, m7 psubw m16, m17 paddw m17, m0, m5 psrlw m5{k3}, m19, 4 paddw m19, m16 ; +q0+q5-p6-p2 paddw m16, m23, m8 psrlw m6{k3}, m19, 4 psubw m16, m17 paddw m19, m16 ; +q1+q6-p6-p1 paddw m16, m23, m9 psrlw m7{k3}, m19, 4 psubw m16, m26 paddw m19, m16 ; +q2+q6-p5-p0 paddw m16, m23, m10 psrlw m8{k3}, m19, 4 psubw m16, m24 paddw m19, m16 ; +q3+q6-p4-p0 paddw m16, m23, m11 psrlw m9{k3}, m19, 4 psubw m16, m18 paddw m19, m16 ; +q4+q6-p3-q1 paddw m16, m23, m22 psrlw m10{k3}, m19, 4 psubw m16, m25 paddw m19, m16 ; +q5+q6-p2-q2 paddw m16, m23, m23 psrlw m11{k3}, m19, 4 psubw m16, m20 paddw m19, m16 ; +q6*2-p1-q3 psrlw m22{k3}, m19, 4 %endif %if %1 == 8 || %1 == 16 ; flat8 filter vpbroadcastd m20, [pw_4096] paddw m16, m3, m4 ; p3+p2 paddw m19, m5, m6 ; p1+p0 paddw m17, m16, m16 ; 2*(p3+p2) paddw m19, m3 ; p1+p0+p3 paddw m17, m7 ; 2*(p3+p2)+q0 paddw m19, m17 ; 3*p3+2*p2+p1+p0+q0 paddw m18, m4, m7 pmulhrsw m4{k2}, m19, m20 psubw m19, m16 paddw m17, m5, m8 paddw m16, m3, m5 paddw m19, m17 pmulhrsw m5{k2}, m19, m20 psubw m19, m16 paddw m16, m6, m9 paddw m19, m16 paddw m16, m3, m6 pmulhrsw m6{k2}, m19, m20 paddw m19, m10 psubw m16, m7, m16 paddw m19, m16 psubw m16, m10, m18 pmulhrsw m7{k2}, m19, m20 paddw m16, m8 paddw m19, m16 psubw m16, m10, m17 pmulhrsw m8{k2}, m19, m20 paddw m16, m9 paddw m19, m16 pmulhrsw m9{k2}, m19, m20 %elif %1 == 6 ; flat6 filter vpbroadcastd m10, [pw_4096] paddw m2, m5, m6 paddw m0, m4, m7 paddw m1, m2, m4 ; p2+p1+p0 paddw m3, m4, m4 paddw m1, m1 paddw m4, m5 paddw m1, m0 ; p2+2*(p2+p1+p0)+q0 psubw m3, m7, m3 pmulhrsw m5{k2}, m1, m10 paddw m3, m8 psubw m4, m8, m4 paddw m1, m3 pmulhrsw m6{k2}, m1, m10 paddw m4, m9 paddw m9, m9 paddw m1, m4 pmulhrsw m7{k2}, m1, m10 psubw m9, m2 paddw m1, m9 pmulhrsw m8{k2}, m1, m10 %endif %ifidn %2, v %if %1 == 16 mova [tmpq+strideq*2 ], m1 ; p5 mova [tmpq+stride3q ], m2 ; p4 mova [tmpq+strideq*4 ], m3 ; p3 mova [tmpq+stride5q ], m4 ; p2 %elif %1 == 8 mova [tmpq+strideq*1 ], m4 ; p2 %endif mova [dstq+mstrideq*2], m5 ; p1 mova [dstq+mstrideq ], m6 ; p0 mova [dstq+strideq*0 ], m7 ; q0 mova [dstq+strideq*1 ], m8 ; q1 %if %1 == 8 || %1 == 16 mova [dstq+strideq*2 ], m9 ; q2 %endif %if %1 == 16 mova [dstq+stride3q ], m10 ; q3 mova [dstq+strideq*4 ], m11 ; q4 mova [dstq+stride5q ], m22 ; q5 %endif %else %if %1 == 16 TRANSPOSE8X8W 27, 0, 1, 2, 3, 4, 5, 6, 20 TRANSPOSE8X8W 7, 8, 9, 10, 11, 22, 23, 28, 20 mova [dstq+strideq*0 -16], xm27 mova [dstq+strideq*0 ], xm7 mova [dstq+strideq*1 -16], xm0 mova [dstq+strideq*1 ], xm8 mova [dstq+strideq*2 -16], xm1 mova 
[dstq+strideq*2 ], xm9 mova [dstq+stride3q -16], xm2 mova [dstq+stride3q ], xm10 mova [dstq+strideq*4 -16], xm3 mova [dstq+strideq*4 ], xm11 mova [dstq+stride5q -16], xm4 mova [dstq+stride5q ], xm22 mova [dstq+stride3q*2-16], xm5 mova [dstq+stride3q*2 ], xm23 mova [dstq+stride7q -16], xm6 mova [dstq+stride7q ], xm28 lea dstq, [dstq+strideq*8] vextracti128 [dstq+strideq*0 -16], ym27, 1 vextracti128 [dstq+strideq*0 ], ym7, 1 vextracti128 [dstq+strideq*1 -16], ym0, 1 vextracti128 [dstq+strideq*1 ], ym8, 1 vextracti128 [dstq+strideq*2 -16], ym1, 1 vextracti128 [dstq+strideq*2 ], ym9, 1 vextracti128 [dstq+stride3q -16], ym2, 1 vextracti128 [dstq+stride3q ], ym10, 1 vextracti128 [dstq+strideq*4 -16], ym3, 1 vextracti128 [dstq+strideq*4 ], ym11, 1 vextracti128 [dstq+stride5q -16], ym4, 1 vextracti128 [dstq+stride5q ], ym22, 1 vextracti128 [dstq+stride3q*2-16], ym5, 1 vextracti128 [dstq+stride3q*2 ], ym23, 1 vextracti128 [dstq+stride7q -16], ym6, 1 vextracti128 [dstq+stride7q ], ym28, 1 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 -16], m27, 2 vextracti32x4 [dstq+strideq*0 ], m7, 2 vextracti32x4 [dstq+strideq*1 -16], m0, 2 vextracti32x4 [dstq+strideq*1 ], m8, 2 vextracti32x4 [dstq+strideq*2 -16], m1, 2 vextracti32x4 [dstq+strideq*2 ], m9, 2 vextracti32x4 [dstq+stride3q -16], m2, 2 vextracti32x4 [dstq+stride3q ], m10, 2 vextracti32x4 [dstq+strideq*4 -16], m3, 2 vextracti32x4 [dstq+strideq*4 ], m11, 2 vextracti32x4 [dstq+stride5q -16], m4, 2 vextracti32x4 [dstq+stride5q ], m22, 2 vextracti32x4 [dstq+stride3q*2-16], m5, 2 vextracti32x4 [dstq+stride3q*2 ], m23, 2 vextracti32x4 [dstq+stride7q -16], m6, 2 vextracti32x4 [dstq+stride7q ], m28, 2 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 -16], m27, 3 vextracti32x4 [dstq+strideq*0 ], m7, 3 vextracti32x4 [dstq+strideq*1 -16], m0, 3 vextracti32x4 [dstq+strideq*1 ], m8, 3 vextracti32x4 [dstq+strideq*2 -16], m1, 3 vextracti32x4 [dstq+strideq*2 ], m9, 3 vextracti32x4 [dstq+stride3q -16], m2, 3 vextracti32x4 [dstq+stride3q ], m10, 3 vextracti32x4 [dstq+strideq*4 -16], m3, 3 vextracti32x4 [dstq+strideq*4 ], m11, 3 vextracti32x4 [dstq+stride5q -16], m4, 3 vextracti32x4 [dstq+stride5q ], m22, 3 vextracti32x4 [dstq+stride3q*2-16], m5, 3 vextracti32x4 [dstq+stride3q*2 ], m23, 3 vextracti32x4 [dstq+stride7q -16], m6, 3 vextracti32x4 [dstq+stride7q ], m28, 3 %elif %1 == 8 TRANSPOSE8X8W 3, 4, 5, 6, 7, 8, 9, 10, 2 movu [dstq+strideq*0 ], xm3 movu [dstq+strideq*1 ], xm4 movu [dstq+strideq*2 ], xm5 movu [dstq+stride3q ], xm6 movu [dstq+strideq*4 ], xm7 movu [dstq+stride5q ], xm8 movu [dstq+stride3q*2], xm9 movu [dstq+stride7q ], xm10 lea dstq, [dstq+strideq*8] vextracti128 [dstq+strideq*0 ], ym3, 1 vextracti128 [dstq+strideq*1 ], ym4, 1 vextracti128 [dstq+strideq*2 ], ym5, 1 vextracti128 [dstq+stride3q ], ym6, 1 vextracti128 [dstq+strideq*4 ], ym7, 1 vextracti128 [dstq+stride5q ], ym8, 1 vextracti128 [dstq+stride3q*2], ym9, 1 vextracti128 [dstq+stride7q ], ym10, 1 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 ], m3, 2 vextracti32x4 [dstq+strideq*1 ], m4, 2 vextracti32x4 [dstq+strideq*2 ], m5, 2 vextracti32x4 [dstq+stride3q ], m6, 2 vextracti32x4 [dstq+strideq*4 ], m7, 2 vextracti32x4 [dstq+stride5q ], m8, 2 vextracti32x4 [dstq+stride3q*2], m9, 2 vextracti32x4 [dstq+stride7q ], m10, 2 lea dstq, [dstq+strideq*8] vextracti32x4 [dstq+strideq*0 ], m3, 3 vextracti32x4 [dstq+strideq*1 ], m4, 3 vextracti32x4 [dstq+strideq*2 ], m5, 3 vextracti32x4 [dstq+stride3q ], m6, 3 vextracti32x4 [dstq+strideq*4 ], m7, 3 vextracti32x4 [dstq+stride5q ], m8, 
3 vextracti32x4 [dstq+stride3q*2], m9, 3 vextracti32x4 [dstq+stride7q ], m10, 3 lea dstq, [dstq+strideq*8+8] %else ; %1 == 4 || %1 == 6 punpcklwd m9, m5, m6 punpckhwd m5, m6 kxnorb k1, k1, k1 punpcklwd m6, m7, m8 punpckhwd m7, m8 kmovb k2, k1 punpckldq m8, m9, m6 vpscatterdq [dstq+ym12-4]{k1}, m8 punpckhdq m9, m6 lea tmpq, [dstq+strideq*2-4] kmovb k1, k2 vpscatterdq [tmpq+ym12]{k2}, m9 punpckldq m6, m5, m7 lea tmpq, [tmpq+strideq*2] kmovb k2, k1 vpscatterdq [tmpq+ym12]{k1}, m6 punpckhdq m5, m7 lea tmpq, [tmpq+strideq*2] vpscatterdq [tmpq+ym12]{k2}, m5 %endif %endif %endmacro INIT_ZMM avx512icl cglobal lpf_v_sb_y_16bpc, 6, 12, 26, dst, stride, mask, l, l_stride, \ lut, w, stride3, mstride, tmp, \ mask_bits, stride5 %define base tmpq-filter_mask SWAP 12, 26 ; avoids clobbering xmm10 on WIN64 lea tmpq, [filter_mask] mov r6d, r7m ; bitdepth_max lea stride3q, [strideq*3] shl l_strideq, 2 lea stride5q, [strideq*5] shr r6d, 11 ; is_12bpc mova ym21, [base+filter_mask] mov mstrideq, strideq vpbroadcastd m13, [base+pw_4+r6*8] mov mask_bitsd, 0xff vpbroadcastd m14, [base+clip_max+r6*4] sub lq, l_strideq vpbroadcastd m15, [base+clip_min+r6*4] neg mstrideq mov wd, wm .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, v jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call .v4 .end: shl mask_bitsd, 8 add dstq, 64 pslld ym21, 8 add lq, 32 sub wd, 8 jg .loop RET ALIGN function_align .v4: ; called by both luma and chroma FILTER 4, v ret cglobal lpf_h_sb_y_16bpc, 6, 13, 29, dst, stride, mask, l, l_stride, \ lut, h, stride3, l_stride3, tmp, \ mask_bits, stride5, stride7 lea tmpq, [filter_mask] mov r6d, r7m ; bitdepth_max lea stride3q, [strideq*3] vpbroadcastd ym12, strided shl l_strideq, 2 lea stride5q, [strideq*5] shr r6d, 11 ; is_12bpc pmulld ym12, [base+stride_mul] lea stride7q, [strideq+stride3q*2] mova ym21, [base+filter_mask] mov mask_bitsd, 0xff vpbroadcastd m13, [base+pw_4+r6*8] sub lq, 4 vpbroadcastd m14, [base+clip_max+r6*4] lea l_stride3q, [l_strideq*3] vpbroadcastd m15, [base+clip_min+r6*4] mov hd, hm .loop: test [maskq+8], mask_bitsd ; vmask[2] jz .no_flat16 FILTER 16, h jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 8, h jmp .end2 .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .no_filter call .h4 .no_filter: lea dstq, [dstq+stride3q*8] .end: lea dstq, [dstq+strideq*8] .end2: shl mask_bitsd, 8 pslld ym21, 8 lea lq, [lq+l_strideq*8] sub hd, 8 jg .loop RET ALIGN function_align .h4: ; called by both luma and chroma FILTER 4, h ret cglobal lpf_v_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits lea tmpq, [filter_mask] mov r6d, r7m ; bitdepth_max shl l_strideq, 2 lea stride3q, [strideq*3] shr r6d, 11 ; is_12bpc mova ym21, [base+filter_mask] mov mstrideq, strideq vpbroadcastd m13, [base+pw_4+r6*8] mov mask_bitsd, 0xff vpbroadcastd m14, [base+clip_max+r6*4] sub lq, l_strideq vpbroadcastd m15, [base+clip_min+r6*4] neg mstrideq mov wd, wm .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx512icl).v4 .end: shl mask_bitsd, 8 add dstq, 64 pslld ym21, 8 add lq, 32 sub wd, 8 jg .loop RET cglobal lpf_h_sb_uv_16bpc, 6, 11, 22, dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits lea tmpq, [filter_mask] mov r6d, r7m ; bitdepth_max vpbroadcastd ym12, strided 
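; ym12 holds the stride broadcast per 32-bit lane; it is scaled by
; stride_mul below to form the per-row byte offsets that the wd=4
; horizontal filter (.h4) feeds to vpgatherdq/vpscatterdq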
shl l_strideq, 2 shr r6d, 11 ; is_12bpc pmulld ym12, [base+stride_mul] lea stride3q, [strideq*3] mova ym21, [base+filter_mask] mov mask_bitsd, 0xff vpbroadcastd m13, [base+pw_4+r6*8] sub lq, 4 vpbroadcastd m14, [base+clip_max+r6*4] lea l_stride3q, [l_strideq*3] vpbroadcastd m15, [base+clip_min+r6*4] mov hd, hm .loop: test [maskq+4], mask_bitsd ; vmask[1] jz .no_flat FILTER 6, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] jz .end call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx512icl).h4 .end: lea tmpq, [strideq+stride3q] shl mask_bitsd, 8 pslld ym21, 8 lea dstq, [dstq+tmpq*8] lea lq, [lq+l_strideq*8] sub hd, 8 jg .loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/loopfilter16_sse.asm000064400000000000000000001370771046102023000160040ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
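; SSE counterpart of the two AVX2/AVX-512 loop-filter files above. Unlike
; those it also targets x86-32, where only eight XMM registers exist, so
; the p/q rows and the relocated stack arguments are kept in memory slots
; (see the P*/Q* defines and the RELOC_ARGS macro below) rather than in
; registers.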
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 %if ARCH_X86_64 %define PIC_sym(a) a %else %define PIC_base $$ %define PIC_sym(a) pic_regq+a-PIC_base %endif pb_4x1_4x5_4x9_4x13: times 4 db 0, 1 times 4 db 8, 9 pw_1: times 8 dw 1 pw_2: times 8 dw 2 pw_3: times 8 dw 3 ; 4 and 16 need to be next to each other since they are used as alternates ; depending on whether bitdepth is 10 or 12 pw_4: times 8 dw 4 pw_16: times 8 dw 16 pw_8: times 8 dw 8 pw_4096: times 8 dw 4096 pb_mask: dd 1, 1, 2, 2 SECTION .text %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %define extra_stack 2 %else %define extra_stack 0 %endif %endif %macro RELOC_ARGS 2 ; h/v, off ASSERT ARCH_X86_32 %if STACK_ALIGNMENT < 16 mov r5d, [rstk + stack_offset + 4*4 + 4] %define lstridem [esp+%2+0*gprsize] mov lstridem, r5d mov r5d, [rstk + stack_offset + 4*5 + 4] %define lutm [esp+%2+1*gprsize] mov lutm, r5d mov r5d, [rstk + stack_offset + 4*6 + 4] %ifidn %1, v %define wm [esp+%2+2*gprsize] mov wm, r5d mov r5d, [rstk + stack_offset + 4*3 + 4] %define lm [esp+%2+3*gprsize] mov lm, r5d %else ; %1 == h %define hm [esp+%2+2*gprsize] mov hm, r5d %endif ; %1==v mov r5d, r7m %define bdmulm [esp+%2+4*gprsize] mov bdmulm, r5d %else %define lstridem r4m %define lutm r5m %ifidn %1, v %define wm r6m %define lm r3m %else %define hm r6m %endif %define bdmulm r7m %endif ; STACK_ALIGNMENT %endmacro %macro UNRELOC_ARGS 0 %if ARCH_X86_32 %undef lm %undef lstridem %undef wm %undef hm %undef lutm %endif %endmacro %macro SPLATD 2 movd %1, %2 pshufd %1, %1, q0000 %endmacro %macro SPLATW 2 movd %1, %2 pshuflw %1, %1, q0000 punpcklqdq %1, %1 %endmacro ; in: out: ; mm%1 a b c d a e i m ; mm%2 e f g h b f j n ; mm%3 i j k l -> c g k o ; mm%4 m n o p d h l p %macro TRANSPOSE4X4W 5 punpcklwd m%5, m%1, m%2 punpckhwd m%1, m%2 punpcklwd m%2, m%3, m%4 punpckhwd m%3, m%4 punpckldq m%4, m%5, m%2 punpckhdq m%5, m%2 punpckldq m%2, m%1, m%3 punpckhdq m%1, m%3 SWAP %1, %4 SWAP %2, %5, %3 %endmacro ; in: out: ; m%1 a b c d e f g h a i q y 6 E M U ; m%2 i j k l m n o p b j r z 7 F N V ; m%3 q r s t u v w x c k s 0 8 G O W ; m%4 y z 0 1 2 3 4 5 d l t 1 9 H P X ; m%5 6 7 8 9 A B C D -> e m u 2 A I Q Y ; m%6 E F G H I J K L f n v 3 B J R Z ; m%7 M N O P Q R S T g o w 4 C K S + ; m%8 U V W X Y Z + = h p x 5 D L T = %if ARCH_X86_64 %macro TRANSPOSE8X8W 9 ; m%1 a b c d e f g h a i q y b j r z ; m%2 i j k l m n o p c k s 0 d l t 1 ; m%3 q r s t u v w x -> e m u 2 f n v 3 ; m%4 y z 0 1 2 3 4 5 g o w 4 h p x 5 TRANSPOSE4X4W %1, %2, %3, %4, %9 ; m%5 6 7 8 9 A B C D 6 E M U 7 F N V ; m%6 E F G H I J K L 8 G O W 9 H P X ; m%7 M N O P Q R S T -> A I Q Y B J R Z ; m%8 U V W X Y Z + = C K S + D L T = TRANSPOSE4X4W %5, %6, %7, %8, %9 ; m%1 a i q y b j r z a i q y 6 E M U ; m%2 c k s 0 d l t 1 b j r z 7 F N V ; m%3 e m u 2 f n v 3 c k s 0 8 G O W ; m%4 g o w 4 h p x 5 d l t 1 9 H P X ; m%5 6 E M U 7 F N V -> e m u 2 A I Q Y ; m%6 8 G O W 9 H P X f n v 3 B J R Z ; m%7 A I Q Y B J R Z g o w 4 C K S + ; m%8 C K S + D L T = h p x 5 D L T = punpckhqdq m%9, m%1, m%5 punpcklqdq m%1, m%5 punpckhqdq m%5, m%2, m%6 punpcklqdq m%2, m%6 punpckhqdq m%6, m%3, m%7 punpcklqdq m%3, m%7 punpckhqdq m%7, m%4, m%8 punpcklqdq m%4, m%8 SWAP %8, %7, %4, %5, %3, %2, %9 %endmacro %else ; x86-32 ; input: 1-7 in registers, 8 in first memory [read-only] ; second memory is scratch, and may overlap with first or third memory ; output: 1-5,7-8 in registers, 6 in third memory [write-only] %macro TRANSPOSE8X8W 13 ; regs [8x], mem [3x], a/u [in/out alignment [2x] TRANSPOSE4X4W %1, %2, %3, %4, %8 %ifnidn %9, "" 
mov%12 m%8, %9 %else mova m%8, %10 %endif mova %10, m%4 TRANSPOSE4X4W %5, %6, %7, %8, %4 punpckhqdq m%4, m%1, m%5 punpcklqdq m%1, m%5 punpckhqdq m%5, m%2, m%6 punpcklqdq m%2, m%6 punpckhqdq m%6, m%3, m%7 punpcklqdq m%3, m%7 mova m%7, %10 %ifnidn %11, "" mov%13 %11, m%6 %else mova %10, m%6 %endif punpckhqdq m%6, m%7, m%8 punpcklqdq m%7, m%8 ; 1,4,2,5,3,8,7,6 -> 1,2,3,4,5,6,7,8 SWAP %2, %4, %5, %3 SWAP %6, %8 %endmacro %endif ; x86-32/64 ; transpose and write m8-11, everything else is scratch %macro TRANSPOSE_8x4_AND_WRITE_4x8 5 ; p1, p0, q0, q1, tmp ; transpose 8x4 punpcklwd %5, %1, %2 punpckhwd %1, %2 punpcklwd %2, %3, %4 punpckhwd %3, %4 punpckldq %4, %5, %2 punpckhdq %5, %2 punpckldq %2, %1, %3 punpckhdq %1, %3 ; write out movq [dstq+strideq*0-4], %4 movhps [dstq+strideq*1-4], %4 movq [dstq+strideq*2-4], %5 movhps [dstq+stride3q -4], %5 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], %2 movhps [dstq+strideq*1-4], %2 movq [dstq+strideq*2-4], %1 movhps [dstq+stride3q -4], %1 lea dstq, [dstq+strideq*4] %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %if %1 == 4 %if ARCH_X86_64 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 mova P1, [dstq+mstrideq*2] ; p1 mova P0, [dstq+mstrideq*1] ; p0 mova Q0, [dstq+strideq*0] ; q0 mova Q1, [dstq+strideq*1] ; q1 %else ; x86-32 %define P1 [dstq+mstrideq*2] %define P0 [dstq+mstrideq*1] %define Q0 [dstq+strideq*0] %define Q1 [dstq+strideq*1] %endif ; x86-32/64 %else ; %1 != 4 ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] %if ARCH_X86_64 ; we load p3 later %define P2 m13 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 %define Q2 m14 mova P2, [tmpq+strideq*1] mova P1, [tmpq+strideq*2] mova P0, [tmpq+stride3q] mova Q0, [dstq+strideq*0] mova Q1, [dstq+strideq*1] mova Q2, [dstq+strideq*2] %if %1 != 6 %define P3 [tmpq+strideq*0] %define Q3 m15 mova Q3, [dstq+stride3q] %endif ; %1 != 6 %else ; x86-32 %define P2 [tmpq+strideq*1] %define P1 [dstq+mstrideq*2] %define P0 [dstq+mstrideq*1] %define Q0 [dstq+strideq*0] %define Q1 [dstq+strideq*1] %define Q2 [dstq+strideq*2] %if %1 != 6 %define P3 [dstq+mstrideq*4] %define Q3 [dstq+stride3q] %endif ; %1 != 6 %endif ; x86-32/64 %endif ; %1 ==/!= 4 %else ; %2 != v ; load lines %if %1 == 4 movq m0, [dstq+strideq*0-4] movq m2, [dstq+strideq*1-4] movq m4, [dstq+strideq*2-4] movq m5, [dstq+stride3q -4] lea tmpq, [dstq+strideq*4] movq m3, [tmpq+strideq*0-4] movq m6, [tmpq+strideq*1-4] movq m1, [tmpq+strideq*2-4] movq m7, [tmpq+stride3q -4] ; transpose 4x8 ; m0: A-D0 ; m2: A-D1 ; m4: A-D2 ; m5: A-D3 ; m3: A-D4 ; m6: A-D5 ; m1: A-D6 ; m7: A-D7 punpcklwd m0, m2 punpcklwd m4, m5 punpcklwd m3, m6 punpcklwd m1, m7 ; m0: A0-1,B0-1,C0-1,D0-1 ; m4: A2-3,B2-3,C2-3,D2-3 ; m3: A4-5,B4-5,C4-5,D4-5 ; m1: A6-7,B6-7,C6-7,D6-7 punpckhdq m2, m0, m4 punpckldq m0, m4 punpckhdq m4, m3, m1 punpckldq m3, m1 ; m0: A0-3,B0-3 ; m2: C0-3,D0-3 ; m3: A4-7,B4-7 ; m4: C4-7,D4-7 punpckhqdq m1, m0, m3 punpcklqdq m0, m3 punpckhqdq m3, m2, m4 punpcklqdq m2, m4 ; m0: A0-7 ; m1: B0-7 ; m2: C0-7 ; m3: D0-7 %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 2, 10 SWAP 3, 11 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 %else %define P1 [esp+3*mmsize] %define P0 [esp+4*mmsize] %define Q0 [esp+5*mmsize] %define Q1 [esp+6*mmsize] mova P1, m0 mova P0, m1 mova Q0, m2 mova Q1, m3 %endif %elif %1 == 6 || %1 == 8 movu m0, [dstq+strideq*0-8] movu m1, [dstq+strideq*1-8] movu m2, [dstq+strideq*2-8] movu m3, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu m4, 
[tmpq+strideq*0-8] movu m5, [tmpq+strideq*1-8] movu m6, [tmpq+strideq*2-8] %if ARCH_X86_64 movu m7, [tmpq+stride3q -8] %endif ; transpose 8x16 ; m0: A-H0,A-H8 ; m1: A-H1,A-H9 ; m2: A-H2,A-H10 ; m3: A-H3,A-H11 ; m4: A-H4,A-H12 ; m5: A-H5,A-H13 ; m6: A-H6,A-H14 ; m7: A-H7,A-H15 %if ARCH_X86_64 punpcklwd m8, m0, m1 %else punpcklwd m7, m0, m1 %endif punpckhwd m0, m1 punpcklwd m1, m2, m3 punpckhwd m2, m3 punpcklwd m3, m4, m5 punpckhwd m4, m5 %if ARCH_X86_64 punpcklwd m5, m6, m7 punpckhwd m6, m7 %else mova [rsp+3*16], m4 movu m4, [tmpq+stride3q -8] punpcklwd m5, m6, m4 punpckhwd m6, m4 %endif ; m8: A0-1,B0-1,C0-1,D0-1 [m7 on x86-32] ; m0: E0-1,F0-1,G0-1,H0-1 ; m1: A2-3,B2-3,C2-3,D2-3 ; m2: E2-3,F2-3,G2-3,H2-3 ; m3: A4-5,B4-5,C4-5,D4-5 ; m4: E4-5,F4-5,G4-5,H4-5 [r3 on x86-32] ; m5: A6-7,B6-7,C6-7,D6-7 ; m6: E6-7,F6-7,G6-7,H6-7 %if ARCH_X86_64 punpckldq m7, m8, m1 punpckhdq m8, m1 %else punpckldq m4, m7, m1 punpckhdq m7, m1 %endif punpckldq m1, m0, m2 punpckhdq m0, m2 punpckldq m2, m3, m5 punpckhdq m3, m5 %if ARCH_X86_64 punpckldq m5, m4, m6 punpckhdq m4, m6 %else mova [rsp+4*16], m3 mova m3, [rsp+3*16] punpckldq m5, m3, m6 punpckhdq m3, m6 %endif ; m7: A0-3,B0-3 [m4 on x86-32] ; m8: C0-3,D0-3 [m7 on x86-32] ; m1: E0-3,F0-3 ; m0: G0-3,H0-3 ; m2: A4-7,B4-7 ; m3: C4-7,D4-7 [r4 on x86-32] ; m5: E4-7,F4-7 ; m4: G4-7,H4-7 [m3 on x86-32] %if ARCH_X86_64 %if %1 != 6 punpcklqdq m6, m7, m2 %endif punpckhqdq m7, m2 punpcklqdq m2, m8, m3 punpckhqdq m8, m3 punpcklqdq m3, m1, m5 punpckhqdq m1, m5 %if %1 != 6 punpckhqdq m5, m0, m4 %endif punpcklqdq m0, m4 %if %1 == 8 mova [rsp+1*16], m6 %define P3 [rsp+1*16] %endif ; 7,2,8,3,1,0,5 -> 13,8,9,10,11,14,15 SWAP 7, 13 SWAP 8, 2, 9 SWAP 3, 10 SWAP 1, 11 SWAP 0, 14 SWAP 5, 15 %define P2 m13 %define P1 m8 %define P0 m9 %define Q0 m10 %define Q1 m11 %define Q2 m14 %if %1 == 8 %define Q3 m15 %endif %else ; x86-32 %if %1 == 8 %define P3 [rsp+ 6*16] punpcklqdq m6, m4, m2 mova P3, m6 %endif mova m6, [rsp+4*16] punpckhqdq m4, m2 punpcklqdq m2, m7, m6 punpckhqdq m7, m6 punpcklqdq m6, m1, m5 punpckhqdq m1, m5 %if %1 == 8 %define Q3 [rsp+24*16] punpckhqdq m5, m0, m3 mova Q3, m5 %endif punpcklqdq m0, m3 %if %1 == 8 %define P2 [rsp+18*16] %define P1 [rsp+19*16] %define P0 [rsp+20*16] %define Q0 [rsp+21*16] %define Q1 [rsp+22*16] %define Q2 [rsp+23*16] %else %define P2 [rsp+3*16] %define P1 [rsp+4*16] %define P0 [rsp+5*16] %define Q0 [rsp+6*16] %define Q1 [rsp+7*16] %define Q2 [rsp+8*16] %endif mova P2, m4 mova P1, m2 mova P0, m7 mova Q0, m6 mova Q1, m1 mova Q2, m0 %endif ; x86-32/64 %else ; %1 == 16 ; We only use 14 pixels but we'll need the remainder at the end for ; the second transpose mova m0, [dstq+strideq*0-16] mova m1, [dstq+strideq*1-16] mova m2, [dstq+strideq*2-16] mova m3, [dstq+stride3q -16] lea tmpq, [dstq+strideq*4] mova m4, [tmpq+strideq*0-16] mova m5, [tmpq+strideq*1-16] mova m6, [tmpq+strideq*2-16] %if ARCH_X86_64 mova m7, [tmpq+stride3q -16] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8 SWAP 5, 13 SWAP 6, 8 SWAP 7, 9 %define P2 m13 %define P1 m8 %define P0 m9 %else ; x86-32 %define P2 [esp+18*16] %define P1 [esp+19*16] %define P0 [esp+20*16] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ [tmpq+stride3q -16], P2, "", a, a mova P1, m6 mova P0, m7 %endif ; x86-32/64 mova [rsp+ 7*16], m0 mova [rsp+ 8*16], m1 mova [rsp+ 9*16], m2 mova [rsp+10*16], m3 %define P3 [rsp+6*16] mova P3, m4 mova m0, [dstq+strideq*0] mova m1, [dstq+strideq*1] mova m2, [dstq+strideq*2] mova m3, [dstq+stride3q ] lea tmpq, [dstq+strideq*4] mova m4, [tmpq+strideq*0] mova m5, [tmpq+strideq*1] mova m6, 
[tmpq+strideq*2] %if ARCH_X86_64 mova m7, [tmpq+stride3q ] TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 10 SWAP 0, 10 SWAP 1, 11 SWAP 2, 14 SWAP 3, 15 %define Q0 m10 %define Q1 m11 %define Q2 m14 %define Q3 m15 %else ; x86-32 TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, \ [tmpq+stride3q ], [rsp+12*16], "", a, a %define Q0 [esp+21*16] %define Q1 [esp+22*16] %define Q2 [esp+23*16] %define Q3 [esp+24*16] mova Q0, m0 mova Q1, m1 mova Q2, m2 mova Q3, m3 %endif ; x86-32/64 mova [rsp+11*16], m4 %if ARCH_X86_64 mova [rsp+12*16], m5 %endif mova [rsp+13*16], m6 mova [rsp+14*16], m7 %endif ; %1 == 4/6/8/16 %endif ; %2 ==/!= v ; load L/E/I/H %if ARCH_X86_32 %define l_strideq r5 mov l_strideq, dword lstridem %ifidn %2, v %define lq r3 mov lq, dword lm %endif %endif %ifidn %2, v %if cpuflag(sse4) pmovzxbw m1, [lq] pmovzxbw m0, [lq+l_strideq] pxor m2, m2 %else ; ssse3 movq m1, [lq] movq m0, [lq+l_strideq] pxor m2, m2 REPX {punpcklbw x, m2}, m1, m0 %endif ; ssse3/sse4 %else ; %2 != v movq m0, [lq] ; l0, l1 movq m1, [lq+l_strideq] ; l2, l3 punpckldq m0, m1 ; l0, l2, l1, l3 pxor m2, m2 punpcklbw m1, m0, m2 ; l0, l2 punpckhbw m0, m2 ; l1, l3 %endif ; %2==/!=v %if ARCH_X86_32 %ifidn %2, v %undef lq mov mstrideq, mstridem %endif %endif pcmpeqw m5, m2, m0 pand m1, m5 por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] pshufb m0, [PIC_sym(pb_4x1_4x5_4x9_4x13)] ; l[x][1] pcmpeqw m5, m2, m0 ; !L psrlw m5, 1 %if ARCH_X86_64 psrlw m2, m0, [lutq+128] SPLATW m1, [lutq+136] %else ; x86-32 mov r5, lutm psrlw m2, m0, [r5+128] SPLATW m1, [r5+136] %endif ; x86-32/64 pminsw m2, m1 pmaxsw m2, [PIC_sym(pw_1)] ; I psrlw m1, m0, 4 ; H paddw m0, [PIC_sym(pw_2)] paddw m0, m0 paddw m0, m2 ; E REPX {pmullw x, [bdmulq]}, m0, m1, m2 %if ARCH_X86_32 %undef l_strideq lea stride3q, [strideq*3] %endif psubw m3, P1, P0 ; p1-p0 psubw m4, Q0, Q1 ; q0-q1 REPX {pabsw x, x}, m3, m4 pmaxsw m3, m5 pmaxsw m3, m4 pcmpgtw m7, m3, m1 ; hev %if %1 != 4 psubw m4, P2, P0 ; p2-p0 pabsw m4, m4 pmaxsw m4, m3 %if %1 != 6 mova m6, P3 ; p3 psubw m5, m6, P0 ; p3-p0 pabsw m5, m5 pmaxsw m4, m5 %endif ; %1 != 6 psubw m5, Q0, Q2 ; q0-q2 pabsw m5, m5 pmaxsw m4, m5 %if %1 != 6 psubw m5, Q0, Q3 ; q0-q3 pabsw m5, m5 pmaxsw m4, m5 %endif ; %1 != 6 pcmpgtw m4, [bdmulq] ; !flat8in psubw m5, P2, P1 ; p2-p1 pabsw m5, m5 %if %1 != 6 psubw m6, P2 ; p3-p2 pabsw m6, m6 pmaxsw m5, m6 psubw m6, Q2, Q3 ; q2-q3 pabsw m6, m6 pmaxsw m5, m6 %endif ; %1 != 6 psubw m6, Q2, Q1 ; q2-q1 pabsw m6, m6 pmaxsw m5, m6 %if %1 == 16 SPLATD m6, [maskq+8] SPLATD m1, [maskq+4] por m6, m1 pand m6, m12 pcmpeqd m6, m12 pand m5, m6 %else ; %1 != 16 SPLATD m6, [maskq+4] pand m6, m12 pcmpeqd m6, m12 pand m5, m6 ; only apply fm-wide to wd>4 blocks %endif ; %1==/!=16 pmaxsw m3, m5 %endif ; %1 != 4 pcmpgtw m3, m2 psubw m5, P1, Q1 ; p1-q1 psubw m6, P0, Q0 ; p0-q0 REPX {pabsw x, x}, m5, m6 paddw m6, m6 psrlw m5, 1 paddw m5, m6 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pcmpgtw m5, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E por m3, m5 %if %1 == 16 %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] mova m1, [tmpq+strideq*2] mova m2, [tmpq+stride3q] %else ; %2 != v mova m0, [rsp+ 8*16] mova m1, [rsp+ 9*16] mova m2, [rsp+10*16] %endif ; %2==/!=v REPX {psubw x, P0}, m0, m1, m2 REPX {pabsw x, x}, m0, m1, m2 pmaxsw m1, m0 pmaxsw m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] mova m2, [tmpq+strideq*1] mova m5, [tmpq+strideq*2] %else ; %2 != v mova m0, [rsp+11*16] mova m2, [rsp+12*16] mova m5, [rsp+13*16] %endif ; %2==/!=v REPX {psubw x, Q0}, m0, m2, m5 REPX {pabsw x, x}, m0, m2, m5 pmaxsw m0, m2 pmaxsw m1, m5 
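; (wd == 16 only) outer flatness test: the pmaxsw chain reduces
; |p6..p4 - p0| and |q4..q6 - q0| into m1; the pcmpgtw against [bdmulq]
; (pw_4 for 10bpc, pw_16 for 12bpc) just below then yields !flat8out.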
pmaxsw m1, m0 pcmpgtw m1, [bdmulq] ; !flat8out por m1, m4 ; !flat8in | !flat8out SPLATD m2, [maskq+8] pand m5, m2, m12 pcmpeqd m5, m12 pandn m1, m5 ; flat16 pandn m5, m3, m1 ; flat16 & fm SWAP 1, 5 SPLATD m5, [maskq+4] por m5, m2 pand m2, m5, m12 pcmpeqd m2, m12 pandn m4, m2 ; flat8in pandn m2, m3, m4 SWAP 2, 4 SPLATD m2, [maskq+0] por m2, m5 pand m2, m12 pcmpeqd m2, m12 pandn m3, m2 pandn m0, m4, m3 ; fm & !flat8 & !flat16 SWAP 0, 3 pandn m0, m1, m4 ; flat8 & !flat16 SWAP 0, 4 %elif %1 != 4 SPLATD m0, [maskq+4] pand m2, m0, m12 pcmpeqd m2, m12 pandn m4, m2 pandn m2, m3, m4 ; flat8 & fm SWAP 2, 4 SPLATD m2, [maskq+0] por m0, m2 pand m0, m12 pcmpeqd m0, m12 pandn m3, m0 pandn m0, m4, m3 ; fm & !flat8 SWAP 0, 3 %else ; %1 == 4 SPLATD m0, [maskq+0] pand m0, m12 pcmpeqd m0, m12 pandn m3, m0 ; fm %endif ; %1==/!=4 ; short filter %if ARCH_X86_64 SPLATW m0, r7m %else SPLATW m0, bdmulm %endif pcmpeqw m2, m2 psrlw m0, 1 ; 511 or 2047 pxor m2, m0 ; -512 or -2048 psubw m5, Q0, P0 ; q0-p0 paddw m6, m5, m5 paddw m6, m5 ; 3*(q0-p0) psubw m5, P1, Q1 ; iclip_diff(p1-q1) pminsw m5, m0 pmaxsw m5, m2 pand m5, m7 ; f=iclip_diff(p1-q1)&hev paddw m5, m6 ; f=iclip_diff(3*(q0-p0)+f) pminsw m5, m0 pmaxsw m5, m2 pand m3, m5 ; f&=fm paddw m5, m3, [PIC_sym(pw_3)] paddw m3, [PIC_sym(pw_4)] REPX {pminsw x, m0}, m5, m3 psraw m5, 3 ; f2 psraw m3, 3 ; f1 psubw m0, m2 ; 1023 or 4095 pxor m2, m2 %if ARCH_X86_64 paddw P0, m5 psubw Q0, m3 %else paddw m5, P0 psubw m6, Q0, m3 REPX {pminsw x, m0}, m5, m6 REPX {pmaxsw x, m2}, m5, m6 %endif paddw m3, [PIC_sym(pw_1)] psraw m3, 1 ; f=(f1+1)>>1 pandn m7, m3 ; f&=!hev SWAP 7, 3 %if ARCH_X86_64 paddw P1, m3 psubw Q1, m3 REPX {pminsw x, m0}, P1, P0, Q0, Q1 REPX {pmaxsw x, m2}, P1, P0, Q0, Q1 %else psubw m7, Q1, m3 paddw m3, P1 REPX {pminsw x, m0}, m7, m3 REPX {pmaxsw x, m2}, m7, m3 %if %1 > 4 mova P1, m3 mova P0, m5 mova Q0, m6 mova Q1, m7 %endif %endif %if %1 == 16 ; m8-11 = p1/p0/q0/q1, m4=flat8, m1=flat16 ; m12=filter bits mask ; m13-15=p2/q2/q3 ; m0,2-3,5-7 = free ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 mova m6, [tmpq+strideq*4] ; p3 lea tmpq, [dstq+mstrideq*4] %else ; %2 != v mova m0, [rsp+ 8*16] mova m2, [rsp+ 9*16] mova m7, [rsp+10*16] mova m6, [rsp+ 6*16] %endif ; %2==/!=v mova [rsp+ 0*16], m4 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psllw m3, m0, 3 ; p6*8 paddw m3, [PIC_sym(pw_8)] paddw m5, m2, m7 ; p5+p4 psubw m3, m0 paddw m5, m5 ; (p5+p4)*2 paddw m3, m6 ; p6*7+p3 paddw m5, P2 ; (p5+p4)*2+p2 paddw m3, P1 ; p6*7+p3+p1 paddw m5, P0 ; (p5+p4)*2+p2+p0 paddw m3, Q0 ; p6*7+p3+p1+q0 paddw m3, m5 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m2 por m5, m4 %ifidn %2, v mova [tmpq+mstrideq*2], m5 ; p5 %else ; %2 != v mova [rsp+9*16], m5 %endif ; %2==/!=v ; sub p6*2, add p3/q1 paddw m3, m6 paddw m5, m0, m0 paddw m3, Q1 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m7 por m5, m4 %ifidn %2, v mova [tmpq+mstrideq*1], m5 ; p4 %else ; %2 != v mova [rsp+10*16], m5 %endif ; %2==/!=v ; sub p6/p5, add p2/q2 psubw m3, m0 paddw m5, P2, Q2 psubw m3, m2 paddw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, m6 por m5, m4 %ifidn %2, v mova [tmpq+strideq*0], m5 ; p3 %else ; %2 != v mova [rsp+6*16], m5 %endif ; %2==/!=v %define WRITE_IN_PLACE 0 %ifidn %2, v %if ARCH_X86_64 %define WRITE_IN_PLACE 1 %endif %endif ; sub p6/p4, add p1/q3 paddw m3, P1 paddw m5, m0, m7 paddw m3, Q3 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P2 por m5, m4 %if WRITE_IN_PLACE mova 
[tmpq+strideq*1], m5 %else mova [rsp+1*16], m5 ; don't clobber p2/m13 %endif ; sub p6/p3, add p0/q4 paddw m3, P0 paddw m5, m0, m6 %ifidn %2, v paddw m3, [dstq+strideq*4] %else ; %2 != v paddw m3, [rsp+11*16] %endif ; %2==/!=v psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P1 por m5, m4 %if WRITE_IN_PLACE mova [dstq+mstrideq*2], m5 %else mova [rsp+2*16], m5 ; don't clobber p1/m3 %endif ; sub p6/p2, add q0/q5 paddw m3, Q0 paddw m5, m0, P2 %ifidn %2, v %if ARCH_X86_32 lea r4, P2 %endif lea tmpq, [dstq+strideq*4] paddw m3, [tmpq+strideq*1] %else ; %2 != v paddw m3, [rsp+12*16] %endif ; %2==/!=v psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, P0 por m5, m4 %if WRITE_IN_PLACE mova [dstq+mstrideq*1], m5 %else mova [rsp+3*16], m5 ; don't clobber p0/m4 %endif ; sub p6/p1, add q1/q6 paddw m3, Q1 paddw m5, m0, P1 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 %else ; %2 != v mova m0, [rsp+13*16] ; q6 %endif ; %2==/!=v paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, Q0 por m5, m4 %if WRITE_IN_PLACE mova [dstq], m5 %else mova [rsp+4*16], m5 ; don't clobber q0/m5 %endif ; sub p5/p0, add q2/q6 paddw m3, Q2 paddw m5, m2, P0 paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 pandn m4, m1, Q1 por m2, m5, m4 ; don't clobber q1/m6 ; sub p4/q0, add q3/q6 paddw m3, Q3 paddw m7, Q0 paddw m3, m0 psubw m3, m7 psrlw m7, m3, 4 pand m7, m1 pandn m4, m1, Q2 por m7, m4 ; don't clobber q2/m14 ; sub p3/q1, add q4/q6 %ifidn %2, v paddw m3, [tmpq+strideq*0] %else ; %2 != v paddw m3, [rsp+11*16] %endif ; %2==/!=v paddw m6, Q1 paddw m3, m0 psubw m3, m6 psrlw m6, m3, 4 pand m6, m1 pandn m4, m1, Q3 por m6, m4 %if WRITE_IN_PLACE mova [tmpq+mstrideq], m6 ; q3 %else ; %2 != v mova [rsp+5*16], m6 %endif ; %2==/!=v ; sub p2/q2, add q5/q6 %ifidn %2, v paddw m3, [tmpq+strideq*1] %if ARCH_X86_64 paddw m5, P2, Q2 %else ; because tmpq is clobbered, so we use a backup pointer for P2 instead paddw m5, [r4], Q2 mov pic_regq, pic_regm %endif %else ; %2 != v paddw m3, [rsp+12*16] paddw m5, P2, Q2 %endif ; %2==/!=v paddw m3, m0 psubw m3, m5 psrlw m5, m3, 4 pand m5, m1 %ifidn %2, v pandn m4, m1, [tmpq+strideq*0] %else ; %2 != v pandn m4, m1, [rsp+11*16] %endif ; %2==/!=v por m5, m4 %ifidn %2, v mova [tmpq+strideq*0], m5 ; q4 %else ; %2 != v mova [rsp+11*16], m5 %endif ; %2==/!=v ; sub p1/q3, add q6*2 psubw m3, P1 paddw m0, m0 psubw m3, Q3 paddw m3, m0 psrlw m5, m3, 4 pand m5, m1 %ifidn %2, v pandn m4, m1, [tmpq+strideq*1] %else ; %2 != v pandn m4, m1, [rsp+12*16] %endif ; %2==/!=v por m5, m4 %ifidn %2, v mova [tmpq+strideq*1], m5 ; q5 %else ; %2 != v mova [rsp+12*16], m5 %endif ; %2==/!=v mova m4, [rsp+0*16] %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %endif %if ARCH_X86_64 SWAP 2, 11 SWAP 7, 14 SWAP 6, 15 %else ; x86-32 mova Q1, m2 mova Q2, m7 %endif ; x86-32/64 %if WRITE_IN_PLACE mova P2, [tmpq+strideq*1] mova P1, [tmpq+strideq*2] mova P0, [tmpq+stride3q] mova Q0, [dstq] %elif ARCH_X86_64 mova P2, [rsp+1*16] mova P1, [rsp+2*16] mova P0, [rsp+3*16] mova Q0, [rsp+4*16] %else ; !WRITE_IN_PLACE & x86-32 mova m0, [rsp+1*16] mova m1, [rsp+2*16] mova m2, [rsp+3*16] mova m3, [rsp+4*16] mova m7, [rsp+5*16] mova P2, m0 mova P1, m1 mova P0, m2 mova Q0, m3 mova Q3, m7 %endif ; WRITE_IN_PLACE / x86-32/64 %undef WRITE_IN_PLACE %endif ; %1 == 16 %if %1 >= 8 ; flat8 filter mova m0, P3 ; p3 paddw m1, m0, P2 ; p3+p2 paddw m2, P1, P0 ; p1+p0 paddw m3, m1, m1 ; 2*(p3+p2) paddw m2, m0 ; p1+p0+p3 paddw m3, Q0 ; 2*(p3+p2)+q0 paddw m2, m3 ; 3*p3+2*p2+p1+p0+q0 pmulhrsw m7, m2, [PIC_sym(pw_4096)] psubw m7, P2 pand m7, m4 paddw m3, P1, 
Q1 ; p1+q1 psubw m2, m1 ; 2*p3+p2+p1+p0+q0 paddw m2, m3 ; 2*p3+p2+2*p1+p0+q0+q1 pmulhrsw m3, m2, [PIC_sym(pw_4096)] psubw m3, P1 pand m3, m4 paddw m5, m0, P1 ; p3+p1 paddw m6, P0, Q2 ; p0+q2 psubw m2, m5 ; p3+p2+p1+p0+q0+q1 paddw m2, m6 ; p3+p2+p1+2*p0+q0+q1+q2 pmulhrsw m5, m2, [PIC_sym(pw_4096)] psubw m5, P0 pand m5, m4 paddw m6, m0, P0 ; p3+p0 paddw m1, Q0, Q3 ; q0+q3 psubw m2, m6 ; p2+p1+p0+q0+q1+q2 paddw m2, m1 ; p2+p1+p0+2*q0+q1+q2+q3 pmulhrsw m6, m2, [PIC_sym(pw_4096)] psubw m6, Q0 pand m6, m4 paddw m2, Q1 ; p2+p1+p0+2*q0+2*q1+q2+q3 paddw m2, Q3 ; p2+p1+p0+2*q0+2*q1+q2+2*q3 paddw m1, P2, Q0 ; p2+q0 psubw m2, m1 ; p1+p0+q0+2*q1+q2+2*q3 pmulhrsw m1, m2, [PIC_sym(pw_4096)] psubw m1, Q1 pand m1, m4 psubw m2, P1 ; p0+q0+2*q1+q2+2*q3 psubw m2, Q1 ; p0+q0+q1+q2+2*q3 paddw m0, Q3, Q2 ; q3+q2 paddw m2, m0 ; p0+q0+q1+2*q2+3*q3 pmulhrsw m2, [PIC_sym(pw_4096)] psubw m2, Q2 pand m2, m4 paddw m7, P2 paddw m3, P1 paddw m5, P0 paddw m6, Q0 paddw m1, Q1 paddw m2, Q2 %ifidn %2, v mova [tmpq+strideq*1], m7 ; p2 mova [tmpq+strideq*2], m3 ; p1 mova [tmpq+stride3q ], m5 ; p0 mova [dstq+strideq*0], m6 ; q0 mova [dstq+strideq*1], m1 ; q1 mova [dstq+strideq*2], m2 ; q2 %else ; %2 != v mova m0, P3 %if %1 == 8 lea tmpq, [dstq+strideq*4] %if ARCH_X86_64 SWAP 4, 15 TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, 8 %else TRANSPOSE8X8W 0, 7, 3, 5, 6, 1, 2, 4, "", \ Q3, [tmpq+strideq*1-8], a, u %endif ; write 8x8 movu [dstq+strideq*0-8], m0 movu [dstq+strideq*1-8], m7 movu [dstq+strideq*2-8], m3 movu [dstq+stride3q -8], m5 movu [tmpq+strideq*0-8], m6 %if ARCH_X86_64 movu [tmpq+strideq*1-8], m1 %endif movu [tmpq+strideq*2-8], m2 movu [tmpq+stride3q -8], m4 lea dstq, [dstq+strideq*8] %else ; %1 != 8 %if ARCH_X86_64 SWAP 6, 8 SWAP 1, 9 SWAP 2, 10 %else mova [rsp+1*16], m6 mova [rsp+2*16], m1 mova [rsp+3*16], m2 %endif mova m1, [rsp+ 7*16] mova m2, [rsp+ 8*16] mova m4, [rsp+ 9*16] mova m6, [rsp+10*16] lea tmpq, [dstq+strideq*4] %if ARCH_X86_64 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, 11 %else mova [rsp+7*16], m5 TRANSPOSE8X8W 1, 2, 4, 6, 0, 7, 3, 5, "", \ [rsp+7*16], [tmpq+strideq*1-16], a, a %endif mova [dstq+strideq*0-16], m1 mova [dstq+strideq*1-16], m2 mova [dstq+strideq*2-16], m4 mova [dstq+stride3q -16], m6 mova [tmpq+strideq*0-16], m0 %if ARCH_X86_64 mova [tmpq+strideq*1-16], m7 %endif mova [tmpq+strideq*2-16], m3 mova [tmpq+stride3q -16], m5 %if ARCH_X86_64 SWAP 6, 8 SWAP 1, 9 SWAP 2, 10 SWAP 4, 15 %else mova m6, [rsp+1*16] mova m1, [rsp+2*16] mova m2, [rsp+3*16] mova m4, Q3 %endif mova m0, [rsp+11*16] mova m3, [rsp+12*16] mova m5, [rsp+13*16] %if ARCH_X86_64 mova m7, [rsp+14*16] TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, 8 %else TRANSPOSE8X8W 6, 1, 2, 4, 0, 3, 5, 7, "", \ [rsp+14*16], [tmpq+strideq*1], a, a %endif mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m4 mova [tmpq+strideq*0], m0 %if ARCH_X86_64 mova [tmpq+strideq*1], m3 %endif mova [tmpq+strideq*2], m5 mova [tmpq+stride3q ], m7 lea dstq, [dstq+strideq*8] %endif ; %1==/!=8 %endif ; %2==/!=v %elif %1 == 6 ; flat6 filter paddw m3, P1, P0 ; p1+p0 paddw m3, P2 ; p2+p1+p0 paddw m6, P2, Q0 ; p2+q0 paddw m3, m3 ; 2*(p2+p1+p0) paddw m3, m6 ; p2+2*(p2+p1+p0)+q0 pmulhrsw m2, m3, [PIC_sym(pw_4096)] psubw m2, P1 pand m2, m4 paddw m3, Q0 ; p2+2*(p2+p1+p0+q0) paddw m6, P2, P2 ; 2*p2 paddw m3, Q1 ; p2+2*(p2+p1+p0+q0)+q1 psubw m3, m6 ; p2+2*(p1+p0+q0)+q1 pmulhrsw m5, m3, [PIC_sym(pw_4096)] psubw m5, P0 pand m5, m4 paddw m3, Q1 ; p2+2*(p1+p0+q0+q1) paddw m6, P2, P1 ; p2+p1 paddw m3, Q2 ; p2+2*(p1+p0+q0+q1)+q2 psubw m3, m6 ; 
p1+2*(p0+q0+q1)+q2 pmulhrsw m6, m3, [PIC_sym(pw_4096)] psubw m6, Q0 pand m6, m4 psubw m3, P1 ; 2*(p0+q0+q1)+q2 %if ARCH_X86_64 paddw Q2, Q2 ; q2*2 %else mova m0, Q2 paddw m0, m0 %endif psubw m3, P0 ; p0+2*(q0+q1)+q2 %if ARCH_X86_64 paddw m3, Q2 ; p0+q*(q0+q1+q2)+q2 %else paddw m3, m0 %endif pmulhrsw m3, [PIC_sym(pw_4096)] psubw m3, Q1 pand m3, m4 paddw m2, P1 paddw m5, P0 paddw m6, Q0 paddw m3, Q1 %ifidn %2, v mova [dstq+mstrideq*2], m2 ; p1 mova [dstq+mstrideq*1], m5 ; p0 mova [dstq+strideq*0], m6 ; q0 mova [dstq+strideq*1], m3 ; q1 %else ; %2 != v TRANSPOSE_8x4_AND_WRITE_4x8 m2, m5, m6, m3, m0 %endif ; %2==/!=v %else ; %1 == 4 %if ARCH_X86_64 %ifidn %2, v mova [dstq+mstrideq*2], P1 ; p1 mova [dstq+mstrideq*1], P0 ; p0 mova [dstq+strideq*0], Q0 ; q0 mova [dstq+strideq*1], Q1 ; q1 %else ; %2 != v TRANSPOSE_8x4_AND_WRITE_4x8 P1, P0, Q0, Q1, m0 %endif ; %2==/!=v %else ; x86-32 %ifidn %2, v mova [dstq+mstrideq*2], m3 mova [dstq+mstrideq*1], m5 mova [dstq+strideq*0], m6 mova [dstq+strideq*1], m7 %else ; %2 != v TRANSPOSE_8x4_AND_WRITE_4x8 m3, m5, m6, m7, m0 %endif ; %2==/!=v %endif ; x86-32/64 %endif ; %1 %undef P3 %undef P2 %undef P1 %undef P0 %undef Q0 %undef Q1 %undef Q2 %undef Q3 %endmacro INIT_XMM ssse3 ; stack layout: ; r0 - flat8 backup inside flat16 code %if ARCH_X86_64 cglobal lpf_v_sb_y_16bpc, 6, 12, 16, -16 * 1, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq %else ; stack layout [32bit only]: ; r1-4 - p2-q0 post-filter16 ; r5 - p3 ; r6 - q3 post-filter16 ; r7 - GPRs [mask_bitsm, mstridem] ; r8 - m12/pb_mask ; r9 - bdmulq cglobal lpf_v_sb_y_16bpc, 4, 7, 8, -16 * (10 + extra_stack), \ dst, stride, mask, mstride, pic_reg, stride3, tmp RELOC_ARGS v, 10*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base %define pic_regm dword [esp+7*16+2*gprsize] mov pic_regm, pic_regq mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+9*16 mova [bdmulq], m0 shl dword lstridem, 2 sub r3, dword lstridem mov dword lm, r3 %endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mstridem dword [esp+7*16+1*gprsize] mov mstridem, mstrideq %define mask_bitsm dword [esp+7*16+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+8*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+8], mask_bitsd ; vmask[2] %else mov r6d, mask_bitsm test [maskq+8], r6d %endif jz .no_flat16 FILTER 16, v jmp .end .no_flat16: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else test [maskq+4], r6d %endif jz .no_flat FILTER 8, v jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .end FILTER 4, v .end: %if ARCH_X86_64 pslld m12, 2 add lq, 8 %else mova m0, m12 pslld m0, 2 mova m12, m0 add dword lm, 8 %endif add dstq, 16 %if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 %else shl mask_bitsm, 2 sub dword wm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 ; stack layout: ; r0 - flat8 backup inside flat16 ; r1-4 - p2-q0 post-filter16 backup ; r5 - q3 post-filter16 backup ; r6 - p3 ; r7-10 - p7-4 ; r11-14 - q4-7 %if ARCH_X86_64 cglobal lpf_h_sb_y_16bpc, 6, 11, 16, -16 * 15, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 
10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov hd, hm shl l_strideq, 2 %else ; stack layout [32bit only]: ; r15 - GPRs [mask_bitsm] ; r16 - m12/pb_mask ; r17 - bdmulq ; r18-24 - p2-q3 cglobal lpf_h_sb_y_16bpc, 4, 7, 8, -16 * (25 + extra_stack), \ dst, stride, mask, l, pic_reg, stride3, tmp RELOC_ARGS h, 25*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+17*16 mova [bdmulq], m0 shl dword lstridem, 2 %endif sub lq, 4 lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+15*16+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+16*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+8], mask_bitsd ; vmask[2] %else mov r6d, mask_bitsm test [maskq+8], r6d %endif jz .no_flat16 FILTER 16, h jmp .end .no_flat16: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else test [maskq+4], r6d %endif jz .no_flat FILTER 8, h jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .no_filter FILTER 4, h jmp .end .no_filter: lea dstq, [dstq+strideq*8] .end: %if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 %else mova m0, m12 pslld m0, 2 mova m12, m0 add lq, dword lstridem add lq, dword lstridem shl mask_bitsm, 2 sub dword hm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov wd, wm shl l_strideq, 2 sub lq, l_strideq %else ; stack layout [32bit only]: ; r0 - GPRs [mask_bitsm, mstridem] ; r1 - m12/pb_mask ; r2 - bdmulq cglobal lpf_v_sb_uv_16bpc, 4, 7, 8, -16 * (3 + extra_stack), \ dst, stride, mask, mstride, pic_reg, stride3, tmp RELOC_ARGS v, 3*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+2*16 mova [bdmulq], m0 shl dword lstridem, 2 sub r3, dword lstridem mov dword lm, r3 %endif mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+0*gprsize] %define mstridem dword [esp+1*gprsize] mov mask_bitsm, 0x3 mov mstridem, mstrideq mova m0, [PIC_sym(pb_mask)] %define m12 [esp+1*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else mov r6d, mask_bitsm test [maskq+4], r6d %endif jz .no_flat FILTER 6, v jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .end FILTER 4, v .end: %if ARCH_X86_64 pslld m12, 2 add lq, 8 %else mova m0, m12 pslld m0, 2 mova m12, m0 add dword lm, 8 %endif add dstq, 16 %if ARCH_X86_64 shl mask_bitsd, 2 sub wd, 2 %else shl mask_bitsm, 2 sub dword wm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_h_sb_uv_16bpc, 6, 11, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, tmp, mask_bits, bdmul mov r6d, r7m sar r6d, 7 and r6d, 16 ; 0 for 10bpc, 16 for 12bpc lea bdmulq, [pw_4] add bdmulq, r6 mov hd, hm shl l_strideq, 2 %else ; stack layout [32bit only]: ; r0 - GPRs [mask_bitsm] ; r1 - m12/pb_mask ; r2 - bdmulq ; r3-8 - p2-q2 cglobal 
lpf_h_sb_uv_16bpc, 4, 7, 8, -16 * (9 + extra_stack), \ dst, stride, mask, l, pic_reg, stride3, tmp RELOC_ARGS h, 9*16 %if STACK_ALIGNMENT >= 16 mov r5d, r7m %endif sar r5d, 7 and r5d, 16 ; 0 for 10bpc, 16 for 12bpc LEA pic_regq, PIC_base mova m0, [PIC_sym(pw_4)+r5] %define bdmulq esp+2*16 mova [bdmulq], m0 shl dword lstridem, 2 %endif sub lq, 4 lea stride3q, [strideq*3] %if ARCH_X86_64 mov mask_bitsd, 0x3 mova m12, [pb_mask] %else %define mask_bitsm dword [esp+0*gprsize] mov mask_bitsm, 0x3 mova m0, [PIC_sym(pb_mask)] %define m12 [esp+1*16] mova m12, m0 %endif .loop: %if ARCH_X86_64 test [maskq+4], mask_bitsd ; vmask[1] %else mov r6d, mask_bitsm test [maskq+4], r6d %endif jz .no_flat FILTER 6, h jmp .end .no_flat: %if ARCH_X86_64 test [maskq+0], mask_bitsd ; vmask[0] %else test [maskq+0], r6d %endif jz .no_filter FILTER 4, h jmp .end .no_filter: lea dstq, [dstq+strideq*8] .end: %if ARCH_X86_64 pslld m12, 2 lea lq, [lq+l_strideq*2] shl mask_bitsd, 2 sub hd, 2 %else mova m0, m12 pslld m0, 2 mova m12, m0 add lq, dword lstridem add lq, dword lstridem shl mask_bitsm, 2 sub dword hm, 2 %endif jg .loop %undef mask_bitsm %undef bdmulq UNRELOC_ARGS RET rav1e-0.7.1/src/x86/loopfilter_avx2.asm000064400000000000000000001370061046102023000157130ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
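; 8bpc AVX2 loop filter, imported from dav1d (see the copyright block above).
; The lpf_{v,h}_sb_{y,uv}_8bpc entry points below walk one superblock edge at
; a time, testing vmask[2]/vmask[1]/vmask[0] to pick the wd=16, wd=8 (wd=6 for
; chroma) or wd=4 variant of the FILTER macro for each group of columns/rows.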
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 pb_7_1: times 16 db 7, 1 pb_3_1: times 16 db 3, 1 pb_2_1: times 16 db 2, 1 pb_m1_0: times 16 db -1, 0 pb_m1_1: times 16 db -1, 1 pb_m1_2: times 16 db -1, 2 pb_1: times 32 db 1 pb_2: times 32 db 2 pb_3: times 32 db 3 pb_4: times 32 db 4 pb_16: times 32 db 16 pb_63: times 32 db 63 pb_64: times 32 db 64 pb_128: times 32 db 0x80 pb_129: times 32 db 0x81 pb_240: times 32 db 0xf0 pb_248: times 32 db 0xf8 pb_254: times 32 db 0xfe pw_2048: times 16 dw 2048 pw_4096: times 16 dw 4096 pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 SECTION .text %macro ABSSUB 4 ; dst, a, b, tmp psubusb %1, %2, %3 psubusb %4, %3, %2 por %1, %4 %endmacro %macro TRANSPOSE_16x4_AND_WRITE_4x32 5 ; transpose 16x4 punpcklbw m%5, m%1, m%2 punpckhbw m%1, m%2 punpcklbw m%2, m%3, m%4 punpckhbw m%3, m%4 punpcklwd m%4, m%5, m%2 punpckhwd m%5, m%2 punpcklwd m%2, m%1, m%3 punpckhwd m%1, m%3 ; write out movd [dstq+strideq*0-2], xm%4 pextrd [dstq+strideq*1-2], xm%4, 1 pextrd [dstq+strideq*2-2], xm%4, 2 pextrd [dstq+stride3q-2], xm%4, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%5 pextrd [dstq+strideq*1-2], xm%5, 1 pextrd [dstq+strideq*2-2], xm%5, 2 pextrd [dstq+stride3q-2], xm%5, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%2 pextrd [dstq+strideq*1-2], xm%2, 1 pextrd [dstq+strideq*2-2], xm%2, 2 pextrd [dstq+stride3q-2], xm%2, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%1 pextrd [dstq+strideq*1-2], xm%1, 1 pextrd [dstq+strideq*2-2], xm%1, 2 pextrd [dstq+stride3q-2], xm%1, 3 lea dstq, [dstq+strideq*4] vextracti128 xm%4, m%4, 1 vextracti128 xm%5, m%5, 1 vextracti128 xm%2, m%2, 1 vextracti128 xm%1, m%1, 1 movd [dstq+strideq*0-2], xm%4 pextrd [dstq+strideq*1-2], xm%4, 1 pextrd [dstq+strideq*2-2], xm%4, 2 pextrd [dstq+stride3q-2], xm%4, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%5 pextrd [dstq+strideq*1-2], xm%5, 1 pextrd [dstq+strideq*2-2], xm%5, 2 pextrd [dstq+stride3q-2], xm%5, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%2 pextrd [dstq+strideq*1-2], xm%2, 1 pextrd [dstq+strideq*2-2], xm%2, 2 pextrd [dstq+stride3q-2], xm%2, 3 lea dstq, [dstq+strideq*4] movd [dstq+strideq*0-2], xm%1 pextrd [dstq+strideq*1-2], xm%1, 1 pextrd [dstq+strideq*2-2], xm%1, 2 pextrd [dstq+stride3q-2], xm%1, 3 lea dstq, [dstq+strideq*4] %endmacro %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem %if %1 == 0 mova %3, m15 %endif ; input in m0-15 punpcklbw m15, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw m5, m6, m7 punpckhbw m6, m7 punpcklbw m7, m8, m9 punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 punpcklbw m11, m12, m13 punpckhbw m12, m13 mova m13, %3 mova %3, m12 punpcklbw m12, m14, m13 punpckhbw m13, m14, m13 ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 punpcklwd m14, m15, m1 punpckhwd m15, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpcklwd m6, m7, m9 punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 punpcklwd m10, m11, m12 punpckhwd m11, m12 mova m12, %3 mova %3, m11 punpcklwd m11, m12, m13 punpckhwd m12, m13 ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 punpckldq m13, m14, m2 punpckhdq m14, m2 punpckldq m2, m15, m3 punpckhdq m15, m3 punpckldq m3, m1, m5 punpckhdq m1, m5 punpckldq m5, m0, m4 punpckhdq m0, m4 punpckldq m4, m6, m10 
punpckhdq m6, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 punpckldq m11, m8, m12 punpckhdq m8, m12 mova m12, %3 mova %3, m8 punpckldq m8, m7, m12 punpckhdq m7, m12 ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 punpcklqdq m12, m13, m4 punpckhqdq m13, m4 punpcklqdq m4, m14, m6 punpckhqdq m14, m6 punpcklqdq m6, m2, m8 punpckhqdq m2, m8 punpcklqdq m8, m15, m7 punpckhqdq m15, m7 punpcklqdq m7, m3, m10 punpckhqdq m3, m10 punpcklqdq m10, m1, m9 punpckhqdq m1, m9 punpcklqdq m9, m5, m11 punpckhqdq m5, m11 mova m11, %3 mova %3, m12 punpcklqdq m12, m0, m11 punpckhqdq m0, m11 %if %2 == 0 mova m11, %3 %endif ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 SWAP 3, 14, 12, 9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %if %1 == 4 lea tmpq, [dstq+mstrideq*2] mova m3, [tmpq+strideq*0] ; p1 mova m4, [tmpq+strideq*1] ; p0 mova m5, [tmpq+strideq*2] ; q0 mova m6, [tmpq+stride3q] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] %if %1 != 6 mova m12, [tmpq+strideq*0] %endif mova m13, [tmpq+strideq*1] mova m3, [tmpq+strideq*2] mova m4, [tmpq+stride3q] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m15, [dstq+stride3q] %endif %endif %else ; load lines %if %1 == 4 movd xm3, [dstq+strideq*0-2] movd xm4, [dstq+strideq*1-2] movd xm5, [dstq+strideq*2-2] movd xm6, [dstq+stride3q -2] lea tmpq, [dstq+strideq*4] pinsrd xm3, [tmpq+strideq*0-2], 2 pinsrd xm4, [tmpq+strideq*1-2], 2 pinsrd xm5, [tmpq+strideq*2-2], 2 pinsrd xm6, [tmpq+stride3q -2], 2 lea tmpq, [tmpq+strideq*4] pinsrd xm3, [tmpq+strideq*0-2], 1 pinsrd xm4, [tmpq+strideq*1-2], 1 pinsrd xm5, [tmpq+strideq*2-2], 1 pinsrd xm6, [tmpq+stride3q -2], 1 lea tmpq, [tmpq+strideq*4] pinsrd xm3, [tmpq+strideq*0-2], 3 pinsrd xm4, [tmpq+strideq*1-2], 3 pinsrd xm5, [tmpq+strideq*2-2], 3 pinsrd xm6, [tmpq+stride3q -2], 3 lea tmpq, [tmpq+strideq*4] movd xm12, [tmpq+strideq*0-2] movd xm13, [tmpq+strideq*1-2] movd xm14, [tmpq+strideq*2-2] movd xm15, [tmpq+stride3q -2] lea tmpq, [tmpq+strideq*4] pinsrd xm12, [tmpq+strideq*0-2], 2 pinsrd xm13, [tmpq+strideq*1-2], 2 pinsrd xm14, [tmpq+strideq*2-2], 2 pinsrd xm15, [tmpq+stride3q -2], 2 lea tmpq, [tmpq+strideq*4] pinsrd xm12, [tmpq+strideq*0-2], 1 pinsrd xm13, [tmpq+strideq*1-2], 1 pinsrd xm14, [tmpq+strideq*2-2], 1 pinsrd xm15, [tmpq+stride3q -2], 1 lea tmpq, [tmpq+strideq*4] pinsrd xm12, [tmpq+strideq*0-2], 3 pinsrd xm13, [tmpq+strideq*1-2], 3 pinsrd xm14, [tmpq+strideq*2-2], 3 pinsrd xm15, [tmpq+stride3q -2], 3 vinserti128 m3, xm12, 1 vinserti128 m4, xm13, 1 vinserti128 m5, xm14, 1 vinserti128 m6, xm15, 1 ; transpose 4x16 ; xm3: A-D0,A-D8,A-D4,A-D12 ; xm4: A-D1,A-D9,A-D5,A-D13 ; xm5: A-D2,A-D10,A-D6,A-D14 ; xm6: A-D3,A-D11,A-D7,A-D15 punpcklbw m7, m3, m4 punpckhbw m3, m4 punpcklbw m4, m5, m6 punpckhbw m5, m6 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 punpcklwd m6, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 ; xm6: A0-3,B0-3,C0-3,D0-3 ; xm7: A8-11,B8-11,C8-11,D8-11 ; xm4: A4-7,B4-7,C4-7,D4-7 ; xm3: A12-15,B12-15,C12-15,D12-15 punpckldq m5, m6, m4 punpckhdq m6, m4 punpckldq m4, m7, m3 punpckhdq m7, m3 ; xm5: A0-7,B0-7 ; xm6: C0-7,D0-7 ; xm4: A8-15,B8-15 ; xm7: C8-15,D8-15 punpcklqdq m3, m5, m4 punpckhqdq m4, m5, m4 punpcklqdq m5, m6, m7 punpckhqdq m6, m7 ; 
xm3: A0-15 ; xm5: B0-15 ; xm4: C0-15 ; xm6: D0-15 %elif %1 == 6 || %1 == 8 movq xm3, [dstq+strideq*0-%1/2] movq xm4, [dstq+strideq*1-%1/2] movq xm5, [dstq+strideq*2-%1/2] movq xm6, [dstq+stride3q -%1/2] lea tmpq, [dstq+strideq*8] movhps xm3, [tmpq+strideq*0-%1/2] movhps xm4, [tmpq+strideq*1-%1/2] movhps xm5, [tmpq+strideq*2-%1/2] movhps xm6, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] movq xm7, [tmpq+strideq*0-%1/2] movq xm8, [tmpq+strideq*1-%1/2] movq xm9, [tmpq+strideq*2-%1/2] movq xm11, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] movhps xm7, [tmpq+strideq*0-%1/2] movhps xm8, [tmpq+strideq*1-%1/2] movhps xm9, [tmpq+strideq*2-%1/2] movhps xm11, [tmpq+stride3q -%1/2] vinserti128 m3, xm7, 1 vinserti128 m4, xm8, 1 vinserti128 m5, xm9, 1 vinserti128 m6, xm11, 1 lea tmpq, [dstq+strideq*4] movq xm12, [tmpq+strideq*0-%1/2] movq xm13, [tmpq+strideq*1-%1/2] movq xm14, [tmpq+strideq*2-%1/2] movq xm15, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] movhps xm12, [tmpq+strideq*0-%1/2] movhps xm13, [tmpq+strideq*1-%1/2] movhps xm14, [tmpq+strideq*2-%1/2] movhps xm15, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] movq xm7, [tmpq+strideq*0-%1/2] movq xm8, [tmpq+strideq*1-%1/2] movq xm9, [tmpq+strideq*2-%1/2] movq xm11, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] movhps xm7, [tmpq+strideq*0-%1/2] movhps xm8, [tmpq+strideq*1-%1/2] movhps xm9, [tmpq+strideq*2-%1/2] movhps xm11, [tmpq+stride3q -%1/2] vinserti128 m12, xm7, 1 vinserti128 m13, xm8, 1 vinserti128 m14, xm9, 1 vinserti128 m15, xm11, 1 ; transpose 8x16 ; xm3: A-H0,A-H8 ; xm4: A-H1,A-H9 ; xm5: A-H2,A-H10 ; xm6: A-H3,A-H11 ; xm12: A-H4,A-H12 ; xm13: A-H5,A-H13 ; xm14: A-H6,A-H14 ; xm15: A-H7,A-H15 punpcklbw m7, m3, m4 punpckhbw m3, m4 punpcklbw m4, m5, m6 punpckhbw m5, m6 punpcklbw m6, m12, m13 punpckhbw m12, m13 punpcklbw m13, m14, m15 punpckhbw m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 punpcklwd m15, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 punpcklwd m5, m6, m13 punpckhwd m6, m13 punpcklwd m13, m12, m14 punpckhwd m12, m14 ; xm15: A0-3,B0-3,C0-3,D0-3 ; xm7: E0-3,F0-3,G0-3,H0-3 ; xm4: A8-11,B8-11,C8-11,D8-11 ; xm3: E8-11,F8-11,G8-11,H8-11 ; xm5: A4-7,B4-7,C4-7,D4-7 ; xm6: E4-7,F4-7,G4-7,H4-7 ; xm13: A12-15,B12-15,C12-15,D12-15 ; xm12: E12-15,F12-15,G12-15,H12-15 punpckldq m14, m15, m5 punpckhdq m15, m5 punpckldq m5, m7, m6 %if %1 != 6 punpckhdq m7, m6 %endif punpckldq m6, m4, m13 punpckhdq m4, m13 punpckldq m13, m3, m12 %if %1 != 6 punpckhdq m12, m3, m12 %endif ; xm14: A0-7,B0-7 ; xm15: C0-7,D0-7 ; xm5: E0-7,F0-7 ; xm7: G0-7,H0-7 ; xm6: A8-15,B8-15 ; xm4: C8-15,D8-15 ; xm13: E8-15,F8-15 ; xm12: G8-15,H8-15 punpcklqdq m3, m14, m6 punpckhqdq m14, m6 punpckhqdq m6, m15, m4 punpcklqdq m15, m4 punpcklqdq m4, m5, m13 punpckhqdq m13, m5, m13 %if %1 == 8 punpcklqdq m5, m7, m12 punpckhqdq m12, m7, m12 ; xm3: A0-15 ; xm14: B0-15 ; xm15: C0-15 ; xm6: D0-15 ; xm4: E0-15 ; xm13: F0-15 ; xm5: G0-15 ; xm12: H0-15 SWAP 12, 3, 15 SWAP 13, 14, 5, 4, 6 ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 %else SWAP 13, 3, 14 SWAP 6, 4, 15, 5 ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 %endif %else ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the ; remainder at the end for the second transpose movu xm0, [dstq+strideq*0-8] movu xm1, [dstq+strideq*1-8] movu xm2, [dstq+strideq*2-8] movu xm3, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu xm4, [tmpq+strideq*0-8] movu xm5, [tmpq+strideq*1-8] movu xm6, [tmpq+strideq*2-8] movu xm7, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] movu xm8, [tmpq+strideq*0-8] movu xm9, [tmpq+strideq*1-8] movu xm10, [tmpq+strideq*2-8] movu xm11, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] movu xm12, [tmpq+strideq*0-8] movu xm13, [tmpq+strideq*1-8] movu xm14, [tmpq+strideq*2-8] movu xm15, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] vinserti128 m0, [tmpq+strideq*0-8], 1 vinserti128 m1, [tmpq+strideq*1-8], 1 vinserti128 m2, [tmpq+strideq*2-8], 1 vinserti128 m3, [tmpq+stride3q -8], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m4, [tmpq+strideq*0-8], 1 vinserti128 m5, [tmpq+strideq*1-8], 1 vinserti128 m6, [tmpq+strideq*2-8], 1 vinserti128 m7, [tmpq+stride3q -8], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m8, [tmpq+strideq*0-8], 1 vinserti128 m9, [tmpq+strideq*1-8], 1 vinserti128 m10, [tmpq+strideq*2-8], 1 vinserti128 m11, [tmpq+stride3q -8], 1 lea tmpq, [tmpq+strideq*4] vinserti128 m12, [tmpq+strideq*0-8], 1 vinserti128 m13, [tmpq+strideq*1-8], 1 vinserti128 m14, [tmpq+strideq*2-8], 1 vinserti128 m15, [tmpq+stride3q -8], 1 TRANSPOSE_16X16B 0, 1, [rsp+11*32] mova [rsp+12*32], m1 mova [rsp+13*32], m2 mova [rsp+14*32], m3 mova [rsp+15*32], m12 mova [rsp+16*32], m13 mova [rsp+17*32], m14 mova [rsp+18*32], m15 ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 SWAP 12, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 SWAP 11, 15 %endif %endif ; load L/E/I/H %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else movq xm1, [lq] movq xm2, [lq+l_strideq*2] movhps xm1, [lq+l_strideq] movhps xm2, [lq+l_stride3q] lea lq, [lq+l_strideq*4] movq xm10, [lq] movq xm0, [lq+l_strideq*2] movhps xm10, [lq+l_strideq] movhps xm0, [lq+l_stride3q] lea lq, [lq+l_strideq*4] vinserti128 m1, xm10, 1 vinserti128 m2, xm0, 1 shufps m0, m1, m2, q3131 shufps m1, m2, q2020 %endif pxor m2, m2 pcmpeqb m10, m2, m0 pand m1, m10 por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] pcmpeqb m10, m2, m0 ; !L psrlq m2, m0, [lutq+128] pand m2, [pb_63] vpbroadcastb m1, [lutq+136] pminub m2, m1 pmaxub m2, [pb_1] ; I pand m1, m0, [pb_240] psrlq m1, 4 ; H paddb m0, [pb_2] paddb m0, m0 paddb m0, m2 ; E pxor m1, [pb_128] pxor m2, [pb_128] pxor m0, [pb_128] ABSSUB m8, m3, m4, m9 ; abs(p1-p0) pmaxub m8, m10 ABSSUB m9, m5, m6, m10 ; abs(q1-q0) pmaxub m8, m9 %if %1 == 4 pxor m8, [pb_128] pcmpgtb m7, m8, m1 ; hev %else pxor m7, m8, [pb_128] pcmpgtb m7, m1 ; hev %if %1 == 6 ABSSUB m9, m13, m4, m10 ; abs(p2-p0) pmaxub m9, m8 %else ABSSUB m9, m12, m4, m10 ; abs(p3-p0) pmaxub m9, m8 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) pmaxub m9, m10 %endif ABSSUB m10, m5, m14, m11 ; abs(q2-q0) pmaxub m9, m10 %if %1 != 6 ABSSUB m10, m5, m15, m11 ; abs(q3-q0) pmaxub m9, m10 %endif pxor m9, [pb_128] pcmpgtb m9, [pb_129] ; !flat8in %if %1 == 6 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) %else ABSSUB m10, m12, m13, m11 ; abs(p3-p2) ABSSUB m11, m13, m3, m1 ; abs(p2-p1) pmaxub m10, m11 ABSSUB m11, m14, m15, m1 ; abs(q3-q2) pmaxub m10, m11 %endif ABSSUB m11, m14, m6, m1 ; abs(q2-q1) pmaxub m10, m11 %if %1 == 16 vpbroadcastd m11, [maskq+8] vpbroadcastd m1, [maskq+4] por m11, m1 pand m11, [pb_mask] pcmpeqd m11, [pb_mask] pand m10, m11 %else vpbroadcastd m11, [maskq+4] pand m11, [pb_mask] pcmpeqd m11, [pb_mask] pand m10, m11 ; only apply fm-wide to wd>4 blocks %endif pmaxub m8, m10 pxor m8, [pb_128] %endif pcmpgtb m8, m2 ABSSUB m10, m3, m6, m11 ; abs(p1-q1) ABSSUB m11, m4, m5, m2 ; abs(p0-q0) paddusb m11, m11 pand m10, [pb_254] psrlq m10, 1 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pxor m10, [pb_128] pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E por m8, m10 %if %1 == 16 %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] %else mova m0, [rsp+12*32] %endif ABSSUB m1, m0, m4, m2 %ifidn %2, v mova m0, [tmpq+strideq*2] %else mova m0, [rsp+13*32] %endif ABSSUB m2, m0, m4, m10 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+stride3q] %else mova m0, [rsp+14*32] %endif ABSSUB m2, m0, m4, m10 pmaxub m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] %else mova m0, [rsp+15*32] %endif ABSSUB m2, m0, m5, m10 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+strideq*1] %else mova m0, [rsp+16*32] %endif ABSSUB m2, m0, m5, m10 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+strideq*2] %else mova m0, [rsp+17*32] %endif ABSSUB m2, m0, m5, m10 pmaxub m1, m2 pxor m1, [pb_128] pcmpgtb m1, [pb_129] ; !flat8out por m1, m9 ; !flat8in | !flat8out vpbroadcastd m2, [maskq+8] pand m10, m2, [pb_mask] pcmpeqd m10, [pb_mask] pandn m1, m10 ; flat16 pandn m1, m8, m1 ; flat16 & fm vpbroadcastd m10, [maskq+4] por m10, m2 pand m2, m10, [pb_mask] pcmpeqd m2, [pb_mask] pandn m9, m2 ; flat8in pandn m9, m8, m9 vpbroadcastd m2, [maskq+0] por m2, m10 pand m2, [pb_mask] pcmpeqd m2, [pb_mask] pandn m8, m2 pandn m8, m9, m8 ; fm & !flat8 & !flat16 pandn m9, m1, m9 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] pand m2, m0, [pb_mask] pcmpeqd m2, [pb_mask] pandn m9, m2 pandn m9, m8, m9 ; flat8 & fm vpbroadcastd m2, [maskq+0] por m0, m2 pand m0, [pb_mask] pcmpeqd m0, [pb_mask] pandn m8, m0 pandn m8, m9, m8 ; fm & !flat8 %else vpbroadcastd m0, [maskq+0] pand m0, [pb_mask] pcmpeqd m0, [pb_mask] pandn m8, m0 ; fm %endif ; short filter pxor m3, [pb_128] pxor m6, [pb_128] psubsb m10, m3, m6 ; iclip_diff(p1-q1) pand m10, m7 ; f=iclip_diff(p1-q1)&hev pxor m4, [pb_128] pxor m5, [pb_128] psubsb m11, m5, m4 paddsb m10, m11 paddsb m10, m11 paddsb m10, m11 ; 
f=iclip_diff(3*(q0-p0)+f) pand m8, m10 ; f&=fm paddsb m10, m8, [pb_3] paddsb m8, [pb_4] pand m10, [pb_248] pand m8, [pb_248] psrlq m10, 3 psrlq m8, 3 pxor m10, [pb_16] pxor m8, [pb_16] psubb m10, [pb_16] ; f2 psubb m8, [pb_16] ; f1 paddsb m4, m10 psubsb m5, m8 pxor m4, [pb_128] pxor m5, [pb_128] pxor m8, [pb_128] pxor m10, m10 pavgb m8, m10 ; f=(f1+1)>>1 psubb m8, [pb_64] pandn m8, m7, m8 ; f&=!hev paddsb m3, m8 psubsb m6, m8 pxor m3, [pb_128] pxor m6, [pb_128] %if %1 == 16 ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 %else mova m0, [rsp+12*32] mova m2, [rsp+13*32] mova m7, [rsp+14*32] %endif mova [rsp+0*32], m9 mova [rsp+1*32], m14 mova [rsp+2*32], m15 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 punpcklbw m14, m0, m12 punpckhbw m15, m0, m12 pmaddubsw m10, m14, [pb_7_1] pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 punpcklbw m8, m2, m7 punpckhbw m9, m2, m7 pmaddubsw m8, [pb_2] pmaddubsw m9, [pb_2] paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 punpcklbw m8, m13, m3 punpckhbw m9, m13, m3 pmaddubsw m8, [pb_1] pmaddubsw m9, [pb_1] paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m8, m4, m5 punpckhbw m9, m4, m5 pmaddubsw m8, [pb_1] pmaddubsw m9, [pb_1] paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 pand m8, m1 pandn m9, m1, m2 por m8, m9 %ifidn %2, v mova [tmpq+strideq*2], m8 ; p5 %else mova [rsp+13*32], m8 %endif ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, [pb_m1_1] pmaddubsw m15, [pb_m1_1] paddw m10, m14 paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 punpcklbw m8, m0, m6 punpckhbw m9, m0, m6 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] mova [rsp+3*32], m8 mova [rsp+4*32], m9 paddw m10, m8 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m7, m8, m1 %ifidn %2, v mova [tmpq+stride3q], m8 ; p4 %else mova [rsp+14*32], m8 %endif ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 mova m14, [rsp+1*32] punpcklbw m8, m0, m13 punpckhbw m9, m0, m13 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 punpcklbw m8, m2, m14 punpckhbw m2, m14 pmaddubsw m8, [pb_m1_1] pmaddubsw m2, [pb_m1_1] mova [rsp+1*32], m8 paddw m10, m8 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m12, m8, m1 %ifidn %2, v mova [tmpq+strideq*4], m8 ; p3 %else mova [rsp+19*32], m8 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 mova m15, [rsp+2*32] punpcklbw m8, m0, m3 punpckhbw m9, m0, m3 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 punpcklbw m8, m7, m15 punpckhbw m7, m15 pmaddubsw m8, [pb_m1_1] pmaddubsw m7, [pb_m1_1] mova [rsp+2*32], m8 paddw m10, m8 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m13, m8, m1 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 %ifidn %2, v lea tmpq, [dstq+strideq*4] %endif punpcklbw m8, m0, m4 punpckhbw m9, m0, m4 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; 
p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 %ifidn %2, v mova m9, [tmpq+strideq*0] ; q4 %else mova m9, [rsp+15*32] %endif punpcklbw m8, m12, m9 punpckhbw m9, m12, m9 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] mova [rsp+7*32], m8 mova [rsp+5*32], m9 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m3, m8, m1 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, v mova m9, [tmpq+strideq*1] ; q5 %else mova m9, [rsp+16*32] %endif punpcklbw m8, m0, m5 punpckhbw m0, m5 pmaddubsw m8, [pb_m1_1] pmaddubsw m0, [pb_m1_1] paddw m10, m8 paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 punpcklbw m0, m13, m9 punpckhbw m9, m13, m9 mova m13, [rsp+6*32] pmaddubsw m0, [pb_m1_1] pmaddubsw m9, [pb_m1_1] mova [rsp+ 9*32], m0 mova [rsp+10*32], m9 paddw m10, m0 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 pmulhrsw m0, m10, [pw_2048] pmulhrsw m8, m11, [pw_2048] packuswb m0, m8 vpblendvb m0, m4, m0, m1 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 %ifidn %2, v mova m0, [tmpq+strideq*2] ; q6 %else mova m0, [rsp+17*32] %endif paddw m10, [rsp+3*32] paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 punpcklbw m8, m3, m0 punpckhbw m9, m3, m0 mova m3, [rsp+8*32] pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] mova [rsp+3*32], m8 mova [rsp+4*32], m9 paddw m10, m8 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m5, m8, m1 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 paddw m10, [rsp+1*32] paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 punpcklbw m8, m4, m0 punpckhbw m2, m4, m0 mova m4, [rsp+6*32] pmaddubsw m8, [pb_m1_1] pmaddubsw m2, [pb_m1_1] paddw m10, m8 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 pmulhrsw m2, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m2, m9 vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 paddw m10, [rsp+2*32] paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 punpcklbw m8, m5, m0 punpckhbw m9, m5, m0 mova m5, [rsp+8*32] pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 pmulhrsw m7, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m7, m9 vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 paddw m10, [rsp+7*32] paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 punpcklbw m8, m6, m0 punpckhbw m9, m6, m0 SWAP 2, 6 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 vpblendvb m8, m15, m8, m1 %ifidn %2, v mova [tmpq+mstrideq], m8 ; q3 %else mova [rsp+20*32], m8 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 paddw m10, [rsp+ 9*32] paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 punpcklbw m8, m14, m0 punpckhbw m9, m14, m0 SWAP 14, 7 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; 
p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m8, m10, [pw_2048] pmulhrsw m9, m11, [pw_2048] packuswb m8, m9 %ifidn %2, v mova m9, [tmpq+strideq*0] %else mova m9, [rsp+15*32] %endif vpblendvb m8, m9, m8, m1 %ifidn %2, v mova [tmpq+strideq*0], m8 ; q4 %else mova [rsp+15*32], m8 %endif ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, [rsp+3*32] paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 punpcklbw m8, m15, m0 punpckhbw m9, m15, m0 pmaddubsw m8, [pb_m1_1] pmaddubsw m9, [pb_m1_1] paddw m10, m8 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m10, [pw_2048] pmulhrsw m11, [pw_2048] packuswb m10, m11 %ifidn %2, v mova m11, [tmpq+strideq*1] %else mova m11, [rsp+16*32] %endif vpblendvb m10, m11, m10, m1 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else mova [rsp+16*32], m10 %endif mova m9, [rsp+0*32] %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %endif %endif %if %1 >= 8 ; flat8 filter punpcklbw m0, m12, m3 punpckhbw m1, m12, m3 pmaddubsw m2, m0, [pb_3_1] pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 punpcklbw m8, m13, m4 punpckhbw m11, m13, m4 pmaddubsw m8, [pb_2_1] pmaddubsw m11, [pb_2_1] paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 punpcklbw m8, m5, [pb_4] punpckhbw m11, m5, [pb_4] pmaddubsw m8, [pb_1] pmaddubsw m11, [pb_1] paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendvb m10, m13, m8, m9 ; p2 %ifidn %2, v mova [tmpq+strideq*1], m10 ; p2 %endif pmaddubsw m8, m0, [pb_m1_1] pmaddubsw m11, m1, [pb_m1_1] paddw m2, m8 paddw m7, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 pmaddubsw m8, [pb_m1_1] pmaddubsw m11, [pb_m1_1] paddw m2, m8 paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendvb m8, m3, m8, m9 ; p1 %ifidn %2, v mova [tmpq+strideq*2], m8 ; p1 %else mova [rsp+0*32], m8 %endif pmaddubsw m0, [pb_1] pmaddubsw m1, [pb_1] psubw m2, m0 psubw m7, m1 punpcklbw m8, m4, m14 punpckhbw m11, m4, m14 pmaddubsw m8, [pb_1] pmaddubsw m11, [pb_1] paddw m2, m8 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendvb m8, m4, m8, m9 ; p0 %ifidn %2, v mova [tmpq+stride3q ], m8 ; p0 %else mova [rsp+1*32], m8 %endif punpcklbw m0, m5, m15 punpckhbw m1, m5, m15 pmaddubsw m8, m0, [pb_1] pmaddubsw m11, m1, [pb_1] paddw m2, m8 paddw m7, m11 punpcklbw m8, m4, m12 punpckhbw m11, m4, m12 pmaddubsw m8, [pb_1] pmaddubsw m11, [pb_1] psubw m2, m8 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendvb m11, m5, m8, m9 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 ; q0 %endif pmaddubsw m0, [pb_m1_1] pmaddubsw m1, [pb_m1_1] paddw m2, m0 paddw m7, m1 punpcklbw m8, m13, m6 punpckhbw m13, m6 pmaddubsw m8, [pb_m1_1] pmaddubsw m13, [pb_m1_1] paddw m2, m8 paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 psrlw m8, m2, 3 psrlw m13, m7, 3 packuswb m8, m13 vpblendvb m13, m6, m8, m9 ; q1 %ifidn %2, v mova [dstq+strideq*1], m13 ; q1 %endif punpcklbw m0, m3, m6 punpckhbw m1, m3, m6 pmaddubsw m0, [pb_1] pmaddubsw m1, [pb_1] psubw m2, m0 psubw m7, m1 punpcklbw m0, m14, m15 punpckhbw m1, m14, m15 pmaddubsw m0, [pb_1] pmaddubsw m1, [pb_1] paddw m2, m0 paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 vpblendvb m2, m14, m2, m9 ; q2 %ifidn %2, v mova [dstq+strideq*2], m2 ; q2 %else mova m0, [rsp+0*32] mova m1, [rsp+1*32] %if %1 == 8 ; 16x8 transpose punpcklbw m3, m12, m10 punpckhbw m12, m10 
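; (h path, wd == 8) the eight output columns p3, p2', p1', p0', q0', q1', q2',
; q3 are re-transposed byte -> word -> dword here and then stored 8 bytes per
; line at dstq-4 ("write 8x32" below), since the horizontal filter operates on
; transposed data.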
punpcklbw m10, m0, m1 punpckhbw m0, m1 punpcklbw m1, m11, m13 punpckhbw m11, m13 punpcklbw m13, m2, m15 punpckhbw m2, m15 punpcklwd m15, m3, m10 punpckhwd m3, m10 punpcklwd m10, m12, m0 punpckhwd m12, m0 punpcklwd m0, m1, m13 punpckhwd m1, m13 punpcklwd m13, m11, m2 punpckhwd m11, m2 punpckldq m2, m15, m0 punpckhdq m15, m0 punpckldq m0, m3, m1 punpckhdq m3, m1 punpckldq m1, m10, m13 punpckhdq m10, m13 punpckldq m13, m12, m11 punpckhdq m12, m11 ; write 8x32 movq [dstq+strideq*0-4], xm2 movhps [dstq+strideq*1-4], xm2 movq [dstq+strideq*2-4], xm15 movhps [dstq+stride3q -4], xm15 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm0 movhps [dstq+strideq*1-4], xm0 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm1 movhps [dstq+strideq*1-4], xm1 movq [dstq+strideq*2-4], xm10 movhps [dstq+stride3q -4], xm10 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm13 movhps [dstq+strideq*1-4], xm13 movq [dstq+strideq*2-4], xm12 movhps [dstq+stride3q -4], xm12 lea dstq, [dstq+strideq*4] vextracti128 xm2, m2, 1 vextracti128 xm15, m15, 1 vextracti128 xm0, m0, 1 vextracti128 xm3, m3, 1 vextracti128 xm1, m1, 1 vextracti128 xm10, m10, 1 vextracti128 xm13, m13, 1 vextracti128 xm12, m12, 1 movq [dstq+strideq*0-4], xm2 movhps [dstq+strideq*1-4], xm2 movq [dstq+strideq*2-4], xm15 movhps [dstq+stride3q -4], xm15 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm0 movhps [dstq+strideq*1-4], xm0 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm1 movhps [dstq+strideq*1-4], xm1 movq [dstq+strideq*2-4], xm10 movhps [dstq+stride3q -4], xm10 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm13 movhps [dstq+strideq*1-4], xm13 movq [dstq+strideq*2-4], xm12 movhps [dstq+stride3q -4], xm12 lea dstq, [dstq+strideq*4] %else ; 16x16 transpose and store SWAP 5, 10, 2 SWAP 6, 0 SWAP 7, 1 SWAP 8, 11 SWAP 9, 13 mova m0, [rsp+11*32] mova m1, [rsp+12*32] mova m2, [rsp+13*32] mova m3, [rsp+14*32] mova m4, [rsp+19*32] mova m11, [rsp+20*32] mova m12, [rsp+15*32] mova m13, [rsp+16*32] mova m14, [rsp+17*32] TRANSPOSE_16X16B 1, 0, [rsp+18*32] movu [dstq+strideq*0-8], xm0 movu [dstq+strideq*1-8], xm1 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm4 movu [dstq+strideq*1-8], xm5 movu [dstq+strideq*2-8], xm6 movu [dstq+stride3q -8], xm7 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm8 movu [dstq+strideq*1-8], xm9 movu [dstq+strideq*2-8], xm10 movu [dstq+stride3q -8], xm11 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm12 movu [dstq+strideq*1-8], xm13 movu [dstq+strideq*2-8], xm14 movu [dstq+stride3q -8], xm15 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m0, 1 vextracti128 [dstq+strideq*1-8], m1, 1 vextracti128 [dstq+strideq*2-8], m2, 1 vextracti128 [dstq+stride3q -8], m3, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m4, 1 vextracti128 [dstq+strideq*1-8], m5, 1 vextracti128 [dstq+strideq*2-8], m6, 1 vextracti128 [dstq+stride3q -8], m7, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m8, 1 vextracti128 [dstq+strideq*1-8], m9, 1 vextracti128 [dstq+strideq*2-8], m10, 1 vextracti128 [dstq+stride3q -8], m11, 1 lea dstq, [dstq+strideq*4] vextracti128 [dstq+strideq*0-8], m12, 1 vextracti128 [dstq+strideq*1-8], m13, 1 vextracti128 [dstq+strideq*2-8], m14, 1 vextracti128 [dstq+stride3q -8], m15, 1 lea dstq, [dstq+strideq*4] %endif %endif %elif %1 == 6 ; flat6 filter punpcklbw m8, m13, m5 
punpckhbw m11, m13, m5 pmaddubsw m0, m8, [pb_3_1] pmaddubsw m1, m11, [pb_3_1] punpcklbw m7, m4, m3 punpckhbw m10, m4, m3 pmaddubsw m2, m7, [pb_2] pmaddubsw m12, m10, [pb_2] paddw m0, m2 paddw m1, m12 pmulhrsw m2, m0, [pw_4096] pmulhrsw m12, m1, [pw_4096] packuswb m2, m12 vpblendvb m2, m3, m2, m9 %ifidn %2, v mova [tmpq+strideq*2], m2 ; p1 %endif pmaddubsw m8, [pb_m1_1] pmaddubsw m11, [pb_m1_1] paddw m0, m8 paddw m1, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 pmaddubsw m8, [pb_m1_1] pmaddubsw m11, [pb_m1_1] paddw m0, m8 paddw m1, m11 pmulhrsw m12, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m12, m13 vpblendvb m12, m4, m12, m9 %ifidn %2, v mova [tmpq+stride3q], m12 ; p0 %endif paddw m0, m8 paddw m1, m11 punpcklbw m8, m3, m14 punpckhbw m11, m3, m14 pmaddubsw m14, m8, [pb_m1_1] pmaddubsw m13, m11, [pb_m1_1] paddw m0, m14 paddw m1, m13 pmulhrsw m14, m0, [pw_4096] pmulhrsw m13, m1, [pw_4096] packuswb m14, m13 vpblendvb m14, m5, m14, m9 %ifidn %2, v mova [dstq+strideq*0], m14 ; q0 %endif pmaddubsw m8, [pb_m1_2] pmaddubsw m11, [pb_m1_2] paddw m0, m8 paddw m1, m11 pmaddubsw m7, [pb_m1_0] pmaddubsw m10, [pb_m1_0] paddw m0, m7 paddw m1, m10 pmulhrsw m0, [pw_4096] pmulhrsw m1, [pw_4096] packuswb m0, m1 vpblendvb m0, m6, m0, m9 %ifidn %2, v mova [dstq+strideq*1], m0 ; q1 %else TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 %endif %else %ifidn %2, v mova [tmpq+strideq*0], m3 ; p1 mova [tmpq+strideq*1], m4 ; p0 mova [tmpq+strideq*2], m5 ; q0 mova [tmpq+stride3q ], m6 ; q1 %else TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 %endif %endif %endmacro INIT_YMM avx2 cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] .loop: cmp byte [maskq+8], 0 ; vmask[2] je .no_flat16 FILTER 16, v jmp .end .no_flat16: cmp byte [maskq+4], 0 ; vmask[1] je .no_flat FILTER 8, v jmp .end .no_flat: cmp byte [maskq+0], 0 ; vmask[0] je .end call .v4 .end: add lq, 32 add dstq, 32 add maskq, 1 sub wd, 8 jg .loop RET ALIGN function_align .v4: FILTER 4, v ret INIT_YMM avx2 cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] .loop: cmp byte [maskq+8], 0 ; vmask[2] je .no_flat16 FILTER 16, h jmp .end .no_flat16: cmp byte [maskq+4], 0 ; vmask[1] je .no_flat FILTER 8, h jmp .end .no_flat: cmp byte [maskq+0], 0 ; vmask[0] je .no_filter call .h4 jmp .end .no_filter: lea dstq, [dstq+stride3q*8] lea lq, [lq+l_strideq*8] lea dstq, [dstq+strideq*8] .end: add maskq, 1 sub hd, 8 jg .loop RET ALIGN function_align .h4: FILTER 4, h ret INIT_YMM avx2 cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] .loop: cmp byte [maskq+4], 0 ; vmask[1] je .no_flat FILTER 6, v jmp .end .no_flat: cmp byte [maskq+0], 0 ; vmask[0] je .end call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4 .end: add lq, 32 add dstq, 32 add maskq, 1 sub wd, 8 jg .loop RET INIT_YMM avx2 cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] .loop: cmp byte [maskq+4], 0 ; vmask[1] je .no_flat FILTER 6, h jmp .end .no_flat: cmp byte [maskq+0], 0 ; vmask[0] je .no_filter call mangle(private_prefix %+ 
_lpf_h_sb_y_8bpc_avx2).h4 jmp .end .no_filter: lea dstq, [dstq+stride3q*8] lea lq, [lq+l_strideq*8] lea dstq, [dstq+strideq*8] .end: add maskq, 1 sub hd, 8 jg .loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/loopfilter_avx512.asm000064400000000000000000001412071046102023000160570ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080 dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000 hmulA: dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60 hmulB: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51 hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49 hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 pb_1: times 4 db 1 pb_2: times 4 db 2 pb_3: times 4 db 3 pb_4: times 4 db 4 pb_16: times 4 db 16 pb_63: times 4 db 63 pb_64: times 4 db 64 pb_128: times 4 db 0x80 pb_240: times 4 db 0xf0 pb_248: times 4 db 0xf8 pb_254: times 4 db 0xfe pb_2_1: times 2 db 2, 1 pb_3_1: times 2 db 3, 1 pb_7_1: times 2 db 7, 1 pb_m1_0: times 2 db -1, 0 pb_m1_1: times 2 db -1, 1 pb_m1_2: times 2 db -1, 2 pw_2048: times 2 dw 2048 pw_4096: times 2 dw 4096 SECTION .text %macro ABSSUB 4 ; dst, a, b, tmp psubusb %1, %2, %3 psubusb %4, %3, %2 por %1, %4 %endmacro %macro TRANSPOSE_16x4_AND_WRITE_4x32 5 punpcklbw m%5, m%1, m%2 punpckhbw m%1, m%2 punpcklbw m%2, m%3, m%4 punpckhbw m%3, m%4 punpcklwd m%4, m%5, m%2 punpckhwd m%5, m%2 punpcklwd m%2, m%1, m%3 punpckhwd m%1, m%3 kmovw k1, k6 lea t0, [dstq+strideq*4] vpscatterdd [dstq+m19-2]{k1}, m%4 kmovw k1, k6 lea t1, [dstq+strideq*8] vpscatterdd [t0 +m19-2]{k1}, m%5 kmovw k1, k6 lea t2, [t0 +strideq*8] vpscatterdd [t1 +m19-2]{k1}, m%2 kmovw k1, k6 vpscatterdd [t2 +m19-2]{k1}, m%1 %endmacro %macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem %if %1 == 0 SWAP m16, m22 %endif punpcklbw m22, m24, m26 punpckhbw m24, m26 punpcklbw m26, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 punpcklbw 
m5, m6, m7 punpckhbw m6, m7 punpcklbw m7, m8, m9 punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 punpcklbw m11, m25, m13 punpckhbw m25, m13 %if %1 == 0 SWAP m13, m16 %else mova m13, %3 %endif SWAP m16, m25 punpcklbw m25, m14, m13 punpckhbw m13, m14, m13 ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13 punpcklwd m14, m22, m26 punpckhwd m22, m26 punpcklwd m26, m24, m2 punpckhwd m24, m2 punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m6 punpckhwd m4, m6 punpcklwd m6, m7, m9 punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 punpcklwd m10, m11, m25 punpckhwd m11, m25 SWAP m25, m16, m11 punpcklwd m11, m25, m13 punpckhwd m25, m13 ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25 punpckldq m13, m14, m2 punpckhdq m14, m2 punpckldq m2, m22, m3 punpckhdq m22, m3 punpckldq m3, m26, m5 punpckhdq m26, m5 punpckldq m5, m24, m4 punpckhdq m24, m4 punpckldq m4, m6, m10 punpckhdq m6, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 punpckldq m11, m8, m25 punpckhdq m8, m25 SWAP m25, m16, m8 punpckldq m8, m7, m25 punpckhdq m7, m25 ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3 punpcklqdq m25, m13, m4 punpckhqdq m13, m4 punpcklqdq m4, m14, m6 punpckhqdq m14, m6 punpcklqdq m6, m2, m8 punpckhqdq m2, m8 punpcklqdq m8, m22, m7 punpckhqdq m22, m7 punpcklqdq m7, m3, m10 punpckhqdq m3, m10 punpcklqdq m10, m26, m9 punpckhqdq m26, m9 punpcklqdq m9, m5, m11 punpckhqdq m5, m11 SWAP m11, m16 %if %2 == 0 SWAP m16, m25 %else mova %3, m25 %endif punpcklqdq m25, m24, m11 punpckhqdq m24, m11 %if %2 == 0 SWAP m11, m16 %endif ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24 SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22 SWAP 3, 14, 25, 9 %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] ; load data %ifidn %2, v %define is_h 0 %if %1 == 4 lea t0, [dstq+mstrideq*2] mova m3, [t0 +strideq*0] ; p1 mova m4, [t0 +strideq*1] ; p0 mova m5, [t0 +strideq*2] ; q0 mova m6, [t0 +stride3q ] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline %if %1 == 16 lea t0, [dstq+mstrideq*8] mova m16, [t0 +strideq*1] mova m17, [t0 +strideq*2] mova m18, [t0 +stride3q ] %endif lea t0, [dstq+mstrideq*4] %if %1 != 6 mova m25, [t0 +strideq*0] %endif mova m13, [t0 +strideq*1] mova m3, [t0 +strideq*2] mova m4, [t0 +stride3q ] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m22, [dstq+stride3q ] %endif %if %1 == 16 lea t0, [dstq+strideq*4] mova m29, [t0 +strideq*0] mova m30, [t0 +strideq*1] mova m31, [t0 +strideq*2] %endif %endif %else ; h %define is_h 1 ; load lines %if %1 == 4 vbroadcasti32x4 m0, [hshuf4] kmovw k1, k6 lea t0, [dstq+strideq*4] vpgatherdd m3{k1}, [dstq+m19-2] kmovw k1, k6 lea t1, [dstq+strideq*8] vpgatherdd m4{k1}, [t0 +m19-2] kmovw k1, k6 lea t2, [t0 +strideq*8] vpgatherdd m5{k1}, [t1 +m19-2] kmovw k1, k6 vpgatherdd m6{k1}, [t2 +m19-2] pshufb m3, m0 pshufb m4, m0 pshufb m5, m0 pshufb m6, m0 punpckldq m7, m3, m4 punpckhdq m3, m4 punpckldq m4, m5, m6 punpckhdq m5, m6 punpcklqdq m6, m7, m4 punpckhqdq m7, m4 punpcklqdq m4, m3, m5 punpckhqdq m3, m5 SWAP 3, 6 SWAP 5, 4, 7 ; 6,7,4,3 -> 3,4,5,6 %elif %1 == 6 || %1 == 8 kmovb k1, k7 lea t0, [dstq+strideq*1] vpgatherdq m3{k1}, [dstq+ym21-%1/2] kmovb k1, k7 lea t1, [dstq+strideq*2] vpgatherdq m4{k1}, [t0 +ym21-%1/2] kmovb k1, k7 lea t2, [dstq+stride3q ] vpgatherdq m5{k1}, [t1 +ym21-%1/2] kmovb k1, k7 vextracti32x8 ym0, m21, 1 vpgatherdq m6{k1}, [t2 +ym21-%1/2] kmovb k1, k7 vpgatherdq m12{k1}, [dstq+ym0 -%1/2] kmovb k1, k7 vpgatherdq m13{k1}, [t0 +ym0 -%1/2] kmovb k1, k7 
vpgatherdq m14{k1}, [t1 +ym0 -%1/2] kmovb k1, k7 vpgatherdq m15{k1}, [t2 +ym0 -%1/2] ; transpose 8x16 ; xm3: A-H0,A-H8 ; xm4: A-H1,A-H9 ; xm5: A-H2,A-H10 ; xm6: A-H3,A-H11 ; xm12: A-H4,A-H12 ; xm13: A-H5,A-H13 ; xm14: A-H6,A-H14 ; xm15: A-H7,A-H15 punpcklbw m7, m3, m4 punpckhbw m3, m4 punpcklbw m4, m5, m6 punpckhbw m5, m6 punpcklbw m6, m12, m13 punpckhbw m12, m13 punpcklbw m13, m14, m15 punpckhbw m14, m15 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 punpcklwd m15, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 punpcklwd m5, m6, m13 punpckhwd m6, m13 punpcklwd m13, m12, m14 punpckhwd m12, m14 ; xm15: A0-3,B0-3,C0-3,D0-3 ; xm7: E0-3,F0-3,G0-3,H0-3 ; xm4: A8-11,B8-11,C8-11,D8-11 ; xm3: E8-11,F8-11,G8-11,H8-11 ; xm5: A4-7,B4-7,C4-7,D4-7 ; xm6: E4-7,F4-7,G4-7,H4-7 ; xm13: A12-15,B12-15,C12-15,D12-15 ; xm12: E12-15,F12-15,G12-15,H12-15 punpckldq m14, m15, m5 punpckhdq m15, m5 punpckldq m5, m7, m6 %if %1 != 6 punpckhdq m7, m6 %endif punpckldq m6, m4, m13 punpckhdq m4, m13 punpckldq m13, m3, m12 %if %1 != 6 punpckhdq m12, m3, m12 %endif ; xm14: A0-7,B0-7 ; xm15: C0-7,D0-7 ; xm5: E0-7,F0-7 ; xm7: G0-7,H0-7 ; xm6: A8-15,B8-15 ; xm4: C8-15,D8-15 ; xm13: E8-15,F8-15 ; xm12: G8-15,H8-15 punpcklqdq m3, m14, m6 punpckhqdq m14, m6 punpckhqdq m6, m15, m4 punpcklqdq m15, m4 punpcklqdq m4, m5, m13 punpckhqdq m13, m5, m13 %if %1 == 8 punpcklqdq m5, m7, m12 punpckhqdq m25, m7, m12 ; xm3: A0-15 ; xm14: B0-15 ; xm15: C0-15 ; xm6: D0-15 ; xm4: E0-15 ; xm13: F0-15 ; xm5: G0-15 ; xm25: H0-15 SWAP 25, 3, 15 SWAP 13, 14, 5, 4, 6 SWAP 15, 22 ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22 %else SWAP 13, 3, 14 SWAP 6, 4, 15, 5 ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 %endif %else ; 16, h ; load and 16x16 transpose. 
We only use 14 pixels but we'll need the ; remainder at the end for the second transpose movu xm24, [dstq+strideq*0-8] movu xm26, [dstq+strideq*1-8] movu xm2, [dstq+strideq*2-8] movu xm3, [dstq+stride3q -8] lea t0, [dstq+strideq*4] movu xm4, [t0 +strideq*0-8] movu xm5, [t0 +strideq*1-8] movu xm6, [t0 +strideq*2-8] movu xm7, [t0 +stride3q -8] lea t0, [t0 +strideq*4] movu xm8, [t0 +strideq*0-8] movu xm9, [t0 +strideq*1-8] movu xm10, [t0 +strideq*2-8] movu xm11, [t0 +stride3q -8] lea t0, [t0 +strideq*4] movu xm25, [t0 +strideq*0-8] movu xm13, [t0 +strideq*1-8] movu xm14, [t0 +strideq*2-8] movu xm22, [t0 +stride3q -8] lea t0, [t0 +strideq*4] vinserti32x4 ym24, [t0 +strideq*0-8], 1 vinserti32x4 ym26, [t0 +strideq*1-8], 1 vinserti32x4 ym2, [t0 +strideq*2-8], 1 vinserti32x4 ym3, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym4, [t0 +strideq*0-8], 1 vinserti32x4 ym5, [t0 +strideq*1-8], 1 vinserti32x4 ym6, [t0 +strideq*2-8], 1 vinserti32x4 ym7, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym8, [t0 +strideq*0-8], 1 vinserti32x4 ym9, [t0 +strideq*1-8], 1 vinserti32x4 ym10, [t0 +strideq*2-8], 1 vinserti32x4 ym11, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 ym25, [t0 +strideq*0-8], 1 vinserti32x4 ym13, [t0 +strideq*1-8], 1 vinserti32x4 ym14, [t0 +strideq*2-8], 1 vinserti32x4 ym22, [t0 +stride3q -8], 1 lea t0, [t0 +strideq*4] vinserti32x4 m24, [t0 +strideq*0-8], 2 vinserti32x4 m26, [t0 +strideq*1-8], 2 vinserti32x4 m2, [t0 +strideq*2-8], 2 vinserti32x4 m3, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m4, [t0 +strideq*0-8], 2 vinserti32x4 m5, [t0 +strideq*1-8], 2 vinserti32x4 m6, [t0 +strideq*2-8], 2 vinserti32x4 m7, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m8, [t0 +strideq*0-8], 2 vinserti32x4 m9, [t0 +strideq*1-8], 2 vinserti32x4 m10, [t0 +strideq*2-8], 2 vinserti32x4 m11, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m25, [t0 +strideq*0-8], 2 vinserti32x4 m13, [t0 +strideq*1-8], 2 vinserti32x4 m14, [t0 +strideq*2-8], 2 vinserti32x4 m22, [t0 +stride3q -8], 2 lea t0, [t0 +strideq*4] vinserti32x4 m24, [t0 +strideq*0-8], 3 vinserti32x4 m26, [t0 +strideq*1-8], 3 vinserti32x4 m2, [t0 +strideq*2-8], 3 vinserti32x4 m3, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m4, [t0 +strideq*0-8], 3 vinserti32x4 m5, [t0 +strideq*1-8], 3 vinserti32x4 m6, [t0 +strideq*2-8], 3 vinserti32x4 m7, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m8, [t0 +strideq*0-8], 3 vinserti32x4 m9, [t0 +strideq*1-8], 3 vinserti32x4 m10, [t0 +strideq*2-8], 3 vinserti32x4 m11, [t0 +stride3q -8], 3 lea t0, [t0 +strideq*4] vinserti32x4 m25, [t0 +strideq*0-8], 3 vinserti32x4 m13, [t0 +strideq*1-8], 3 vinserti32x4 m14, [t0 +strideq*2-8], 3 vinserti32x4 m22, [t0 +stride3q -8], 3 ; TRANSPOSE_16X16B 0, 1, [rsp+0*64] SWAP m16, m26 SWAP m17, m2 SWAP m18, m3 SWAP m29, m25 SWAP m30, m13 SWAP m31, m14 mova [rsp+4*64], m22 ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22 SWAP 25, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 SWAP 11, 22 %endif %endif ; load L/E/I/H vpbroadcastd m15, [pb_1] %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else kmovw k1, k6 vpgatherdd m0{k1}, [lq+m20+4] kmovw k1, k6 vpgatherdd m1{k1}, [lq+m20+0] %endif pxor m2, m2 pcmpeqb k1, m0, m2 vmovdqu8 m0{k1}, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, pbshuf ; l[x][0] vpcmpub k3, m0, m2, 4 ; neq ; L psrlq m2, m0, [lutq+128] pand m2, [pb_63]{bcstd} vpbroadcastb m1, [lutq+136] pminub m2, m1 pmaxub m2, m15 ; I pand m1, m0, [pb_240]{bcstd} psrlq m1, 4 ; H paddd m0, [pb_2]{bcstd} paddb m0, m0 paddb m0, m2 ; E ABSSUB m8, m3, m4, m9 ; abs(p1-p0) ABSSUB m9, m5, m6, m10 ; abs(q1-q0) pmaxub m8, m9 vpcmpub k1, m8, m1, 6 ; gt ; hev %if %1 != 4 %if %1 == 6 ABSSUB m9, m13, m4, m10 ; abs(p2-p0) pmaxub m9, m8 %else ABSSUB m9, m25, m4, m10 ; abs(p3-p0) pmaxub m9, m8 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) pmaxub m9, m10 %endif ABSSUB m10, m5, m14, m11 ; abs(q2-q0) pmaxub m9, m10 %if %1 != 6 ABSSUB m10, m5, m22, m11 ; abs(q3-q0) pmaxub m9, m10 %endif vpcmpub k2{k3}, m9, m15, 2 ; le ; flat8in %if %1 == 6 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) %else ABSSUB m10, m25, m13, m11 ; abs(p3-p2) ABSSUB m11, m13, m3, m1 ; abs(p2-p1) pmaxub m10, m11 ABSSUB m11, m14, m22, m1 ; abs(q3-q2) pmaxub m10, m11 %endif ABSSUB m11, m14, m6, m1 ; abs(q2-q1) pmaxub m10, m11 %if %1 == 16 vpbroadcastd m11, [maskq+8] por m11, [maskq+4]{bcstd} %else vpbroadcastd m11, [maskq+4] %endif vptestmd k4, m11, pbmask vmovdqa32 m10{k4}{z}, m10 ; only apply fm-wide to wd>4 blocks pmaxub m8, m10 %endif vpcmpub k3{k3}, m8, m2, 2 ; le ABSSUB m10, m3, m6, m11 ; abs(p1-q1) ABSSUB m11, m4, m5, m2 ; abs(p0-q0) paddusb m11, m11 pand m10, [pb_254]{bcstd} psrlq m10, 1 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) vpcmpub k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E %if %1 == 16 ABSSUB m1, m16, m4, m2 ABSSUB m2, m17, m4, m10 pmaxub m1, m2 ABSSUB m2, m18, m4, m10 pmaxub m1, m2 ABSSUB m2, m29, m5, m10 pmaxub m1, m2 ABSSUB m2, m30, m5, m10 pmaxub m1, m2 ABSSUB m2, m31, m5, m10 pmaxub m1, m2 kandq k2, k2, k3 vpcmpub k4{k2}, m1, m15, 2 ; flat8in & flat8out vpbroadcastd m2, [maskq+8] vptestmd k5, m2, pbmask vpmovm2d m7, k5 vptestmb k4{k4}, m7, m7 ; flat16 & fm por m10, m2, [maskq+4]{bcstd} vptestmd k5, m10, pbmask vpmovm2d m7, k5 vptestmb k2{k2}, m7, m7 ; flat8in por m2, m10, [maskq+0]{bcstd} vptestmd k5, m2, pbmask vpmovm2d m7, k5 vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 & !flat16 kandnq k2, k4, k2 ; flat8 & !flat16 %elif %1 != 4 vpbroadcastd m0, [maskq+4] vptestmd k4, m0, pbmask vpmovm2d m7, k4 vptestmb k2{k2}, m7, m7 kandq k2, k2, k3 ; flat8 & fm por m0, [maskq+0]{bcstd} vptestmd k4, m0, pbmask vpmovm2d m7, k4 vptestmb k3{k3}, m7, m7 kandnq k3, k2, k3 ; fm & !flat8 %else %ifidn %2, v vptestmd k4, pbmask, [maskq+0]{bcstd} %else vpbroadcastd m0, [maskq+0] vptestmd k4, m0, pbmask %endif vpmovm2d m7, k4 vptestmb k3{k3}, m7, m7 ; fm %endif ; short filter %if %1 >= 8 SWAP m23, m15 %endif vpbroadcastd m15, [pb_3] vpbroadcastd m0, [pb_4] vpbroadcastd m12, [pb_16] vpbroadcastd m1, [pb_64] pxor m3, pb128 pxor m6, pb128 psubsb m10{k1}{z}, m3, m6 ; f=iclip_diff(p1-q1)&hev pxor m4, pb128 pxor m5, pb128 psubsb m11, m5, m4 paddsb m10, m11 paddsb m10, m11 paddsb m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm paddsb m8, m10, m15 paddsb m10, m0 pand m8, [pb_248]{bcstd} pand m10, [pb_248]{bcstd} psrlq m8, 3 psrlq m10, 3 pxor m8, m12 pxor m10, m12 psubb m8, m12 ; f2 psubb m10, m12 ; f1 paddsb m4, m8 psubsb m5, m10 pxor m4, pb128 pxor m5, pb128 ; pxor m10, pb128 pxor m8, m8 pavgb m8, m10 ; f=(f1+1)>>1 psubb m8, m1 knotq k1, k1 paddsb m3{k1}, m3, m8 psubsb m6{k1}, m6, m8 pxor m3, pb128 pxor m6, pb128 %if %1 == 16 ; flat16 filter %ifidn %2, v lea t0, [dstq+mstrideq*8] %endif SWAP m24, m16, m14 SWAP m2, m17, m22 SWAP m7, m18 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 
[p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 vpbroadcastd m1, [pb_7_1] vpbroadcastd m12, [pb_2] punpcklbw m14, m24, m25 punpckhbw m22, m24, m25 pmaddubsw m10, m14, m1 pmaddubsw m11, m22, m1 ; p6*7+p3 punpcklbw m8, m2, m7 punpckhbw m9, m2, m7 pmaddubsw m8, m12 pmaddubsw m9, m12 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 %ifidn %2, h vpbroadcastd m27, [pw_2048] vpbroadcastd m1, [pb_m1_1] %define pw2048 m27 %define pbm1_1 m1 %endif punpcklbw m8, m13, m3 punpckhbw m9, m13, m3 pmaddubsw m8, m23 pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m8, m4, m5 punpckhbw m9, m4, m5 pmaddubsw m8, m23 pmaddubsw m9, m23 paddw m10, m8 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*2]{k4}, m8 ; p5 %else vpblendmb m8{k4}, m2, m8 mova [rsp+1*64], m8 %endif ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, pbm1_1 pmaddubsw m22, pbm1_1 paddw m10, m14 paddw m11, m22 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 punpcklbw m8, m24, m6 punpckhbw m9, m24, m6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 SWAP m18, m8 SWAP m23, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+stride3q]{k4}, m8 ; p4 %else vpblendmb m8{k4}, m7, m8 mova [rsp+2*64], m8 %endif ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 SWAP m14, m16 punpcklbw m8, m24, m13 punpckhbw m9, m24, m13 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 punpcklbw m8, m2, m14 punpckhbw m2, m14 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 SWAP m16, m8 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*4]{k4}, m8 ; p3 %else vpblendmb m8{k4}, m25, m8 mova [rsp+3*64], m8 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 SWAP m22, m17 punpcklbw m8, m24, m3 punpckhbw m9, m24, m3 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 punpcklbw m8, m7, m22 punpckhbw m7, m22 pmaddubsw m8, pbm1_1 pmaddubsw m7, pbm1_1 paddw m10, m8 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 SWAP m17, m8 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 vpblendmb m15{k4}, m13, m8 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 %ifidn %2, v lea t0, [dstq+strideq*4] %endif punpcklbw m8, m24, m4 punpckhbw m9, m24, m4 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 punpcklbw m8, m25, m29 punpckhbw m9, m25, m29 SWAP m26, m29 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 SWAP m29, m8 SWAP m0, m9 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 vpblendmb m12{k4}, m3, m8 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, h SWAP m28, m24 punpcklbw m8, m28, m5 punpckhbw m24, m28, m5 %else punpcklbw m8, m24, m5 punpckhbw m24, m5 %endif pmaddubsw m8, pbm1_1 pmaddubsw m24, pbm1_1 paddw m10, m8 paddw m11, m24 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 punpcklbw m24, m13, m30 punpckhbw m9, m13, m30 %ifidn %2, h SWAP m27, m30 %endif SWAP m13, m15 pmaddubsw m24, pbm1_1 pmaddubsw m9, pbm1_1 
paddw m10, m24 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 SWAP m30, m24 SWAP m15, m9 %ifidn %2, h SWAP m9, m24 %define pw2048 m9 %endif pmulhrsw m24, m10, pw2048 pmulhrsw m8, m11, pw2048 paddw m10, m18 ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 paddw m11, m23 packuswb m24, m8 punpcklbw m8, m3, m31 pmaddubsw m8, pbm1_1 paddw m10, m8 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m18, m8 pmulhrsw m8, m10, pw2048 paddw m10, m16 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h SWAP m16, m9 %define pw2048 m16 %endif punpckhbw m9, m3, m31 SWAP m3, m12 pmaddubsw m9, pbm1_1 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 SWAP m23, m9 pmulhrsw m9, m11, pw2048 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 %ifidn %2, h SWAP m2, m1 %define pbm1_1 m2 %endif vpblendmb m1{k4}, m4, m24 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 SWAP m24, m31 ; q6 packuswb m8, m9 %ifidn %2, h SWAP m31, m2 %define pbm1_1 m31 %endif vpblendmb m12{k4}, m5, m8 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 punpcklbw m8, m4, m24 punpckhbw m2, m4, m24 SWAP m4, m1 pmaddubsw m8, pbm1_1 pmaddubsw m2, pbm1_1 paddw m10, m8 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 pmulhrsw m2, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m2, m9 vpblendmb m2{k4}, m6, m2 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 paddw m10, m17 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 paddw m11, m7 punpcklbw m8, m5, m24 punpckhbw m9, m5, m24 SWAP m5, m12 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 pmulhrsw m7, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m7, m9 vpblendmb m7{k4}, m14, m7 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 paddw m10, m29 ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 paddw m11, m0 punpcklbw m8, m6, m24 punpckhbw m9, m6, m24 SWAP 2, 6 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+mstrideq]{k4}, m8 %else SWAP m29, m16 %define pw2048 m29 vpblendmb m16{k4}, m22, m8 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 paddw m10, m30 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 paddw m11, m15 %ifidn %2, h SWAP m15, m8 %endif punpcklbw m8, m14, m24 punpckhbw m9, m14, m24 SWAP 14, 7 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m8, m10, pw2048 pmulhrsw m9, m11, pw2048 packuswb m8, m9 %ifidn %2, v vmovdqu8 [t0+strideq*0]{k4}, m8 ; q4 %else vpblendmb m17{k4}, m26, m8 %endif ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, m18 ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 paddw m11, m23 punpcklbw m8, m22, m24 punpckhbw m9, m22, m24 SWAP m30, m24 pmaddubsw m8, pbm1_1 pmaddubsw m9, pbm1_1 paddw m10, m8 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m10, pw2048 pmulhrsw m11, pw2048 packuswb m10, m11 %ifidn %2, v vmovdqu8 [t0+strideq*1]{k4}, m10 ; q5 %else vmovdqu8 m27{k4}, m10 %endif %ifidn %2, v lea t0, [dstq+mstrideq*4] %endif %endif %if %1 >= 8 ; flat8 filter vpbroadcastd m9, [pb_3_1] vpbroadcastd m10, [pb_2_1] %if %1 == 16 vpbroadcastd m23, [pb_1] vpbroadcastd m0, 
[pb_4] %elifidn %2, h vpbroadcastd m31, [pb_m1_1] %define pbm1_1 m31 %endif punpcklbw m24, m25, m3 punpckhbw m26, m25, m3 pmaddubsw m2, m24, m9 pmaddubsw m7, m26, m9 ; 3 * p3 + p1 punpcklbw m8, m13, m4 punpckhbw m11, m13, m4 pmaddubsw m8, m10 pmaddubsw m11, m10 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 punpcklbw m8, m5, m0 punpckhbw m11, m5, m0 pmaddubsw m8, m23 pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 %if is_h || %1 == 16 vpblendmb m10{k2}, m13, m8 ; p2 %endif %ifidn %2, v %if %1 == 8 vmovdqu8 [t0+strideq*1]{k2}, m8 %else mova [t0+strideq*1], m10 %endif %endif pmaddubsw m8, m24, pbm1_1 pmaddubsw m11, m26, pbm1_1 paddw m2, m8 paddw m7, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 pmaddubsw m8, pbm1_1 pmaddubsw m11, pbm1_1 paddw m2, m8 paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m8{k2}, m3, m8 ; p1 %ifidn %2, v mova [t0+strideq*2], m8 %else SWAP m18, m8 %endif pmaddubsw m24, m23 pmaddubsw m26, m23 psubw m2, m24 psubw m7, m26 punpcklbw m8, m4, m14 punpckhbw m11, m4, m14 pmaddubsw m8, m23 pmaddubsw m11, m23 paddw m2, m8 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m8{k2}, m4, m8 ; p0 %ifidn %2, v mova [t0+stride3q], m8 %else SWAP m29, m8 %endif punpcklbw m24, m5, m22 punpckhbw m26, m5, m22 pmaddubsw m8, m24, m23 pmaddubsw m11, m26, m23 paddw m2, m8 paddw m7, m11 punpcklbw m8, m4, m25 punpckhbw m11, m4, m25 pmaddubsw m8, m23 pmaddubsw m11, m23 psubw m2, m8 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m8, m2, 3 psrlw m11, m7, 3 packuswb m8, m11 vpblendmb m11{k2}, m5, m8 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 %endif pmaddubsw m24, pbm1_1 pmaddubsw m26, pbm1_1 paddw m2, m24 paddw m7, m26 punpcklbw m8, m13, m6 punpckhbw m13, m6 pmaddubsw m8, pbm1_1 pmaddubsw m13, pbm1_1 paddw m2, m8 paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 psrlw m8, m2, 3 psrlw m13, m7, 3 packuswb m8, m13 vpblendmb m13{k2}, m6, m8 ; q1 %ifidn %2, v mova [dstq+strideq*1], m13 %endif punpcklbw m24, m3, m6 punpckhbw m26, m3, m6 pmaddubsw m24, m23 pmaddubsw m26, m23 psubw m2, m24 psubw m7, m26 punpcklbw m24, m14, m22 punpckhbw m26, m14, m22 pmaddubsw m24, m23 pmaddubsw m26, m23 paddw m2, m24 paddw m7, m26 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 %if is_h || %1 == 16 vpblendmb m2{k2}, m14, m2 ; q2 %endif %ifidn %2, v %if %1 == 8 vmovdqu8 [dstq+strideq*2]{k2}, m2 %else mova [dstq+strideq*2], m2 %endif %endif %ifidn %2, h SWAP m24, m18 SWAP m26, m29 %if %1 == 8 ; 16x8 transpose punpcklbw m3, m25, m10 punpckhbw m25, m10 punpcklbw m10, m24, m26 punpckhbw m24, m26 punpcklbw m26, m11, m13 punpckhbw m11, m13 punpcklbw m13, m2, m22 punpckhbw m2, m22 ; punpcklwd m22, m3, m10 punpckhwd m3, m10 punpcklwd m10, m25, m24 punpckhwd m25, m24 punpcklwd m24, m26, m13 punpckhwd m26, m13 punpcklwd m13, m11, m2 punpckhwd m11, m2 ; punpckldq m2, m22, m24 punpckhdq m22, m24 punpckldq m24, m3, m26 punpckhdq m3, m26 punpckldq m26, m10, m13 punpckhdq m10, m13 punpckldq m13, m25, m11 punpckhdq m25, m11 ; write 8x32 vpbroadcastd ym16, strided pmulld ym16, [hmulD] lea t1, [dstq+strideq*2] lea t2, [dstq+strideq*4] lea t3, [t1 +strideq*4] lea t0, [dstq+strideq*8] kmovb k1, k6 kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 vpscatterdq [dstq+ym16-4]{k1}, m2 vpscatterdq [t1 +ym16-4]{k2}, m22 vpscatterdq [t2 +ym16-4]{k3}, m24 vpscatterdq [t3 
+ym16-4]{k4}, m3 lea t1, [t0+strideq*2] lea t2, [t0+strideq*4] lea t3, [t1+strideq*4] kmovb k1, k6 kmovb k2, k6 kmovb k3, k6 kmovb k4, k6 vpscatterdq [t0+ym16-4]{k1}, m26 vpscatterdq [t1+ym16-4]{k2}, m10 vpscatterdq [t2+ym16-4]{k3}, m13 vpscatterdq [t3+ym16-4]{k4}, m25 %else ; 16x16 transpose and store SWAP 5, 10, 2 SWAP 6, 24 SWAP 7, 26 SWAP 8, 11 SWAP 9, 13 mova m24, [rsp+0*64] SWAP m26, m28 mova m2, [rsp+1*64] mova m3, [rsp+2*64] mova m4, [rsp+3*64] SWAP m11, m16 SWAP m25, m17 SWAP m13, m27 SWAP m14, m30 TRANSPOSE_16X16B 1, 0, [rsp+4*64] movu [dstq+strideq*0-8], xm24 movu [dstq+strideq*1-8], xm26 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea t0, [dstq+strideq*4] movu [t0+strideq*0-8], xm4 movu [t0+strideq*1-8], xm5 movu [t0+strideq*2-8], xm6 movu [t0+stride3q -8], xm7 lea t0, [t0+strideq*4] movu [t0+strideq*0-8], xm8 movu [t0+strideq*1-8], xm9 movu [t0+strideq*2-8], xm10 movu [t0+stride3q -8], xm11 lea t0, [t0+strideq*4] movu [t0+strideq*0-8], xm25 movu [t0+strideq*1-8], xm13 movu [t0+strideq*2-8], xm14 movu [t0+stride3q -8], xm22 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym24, 1 vextracti128 [t0+strideq*1-8], ym26, 1 vextracti128 [t0+strideq*2-8], ym2, 1 vextracti128 [t0+stride3q -8], ym3, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym4, 1 vextracti128 [t0+strideq*1-8], ym5, 1 vextracti128 [t0+strideq*2-8], ym6, 1 vextracti128 [t0+stride3q -8], ym7, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym8, 1 vextracti128 [t0+strideq*1-8], ym9, 1 vextracti128 [t0+strideq*2-8], ym10, 1 vextracti128 [t0+stride3q -8], ym11, 1 lea t0, [t0+strideq*4] vextracti128 [t0+strideq*0-8], ym25, 1 vextracti128 [t0+strideq*1-8], ym13, 1 vextracti128 [t0+strideq*2-8], ym14, 1 vextracti128 [t0+stride3q -8], ym22, 1 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m24, 2 vextracti32x4 [t0+strideq*1-8], m26, 2 vextracti32x4 [t0+strideq*2-8], m2, 2 vextracti32x4 [t0+stride3q -8], m3, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m4, 2 vextracti32x4 [t0+strideq*1-8], m5, 2 vextracti32x4 [t0+strideq*2-8], m6, 2 vextracti32x4 [t0+stride3q -8], m7, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m8, 2 vextracti32x4 [t0+strideq*1-8], m9, 2 vextracti32x4 [t0+strideq*2-8], m10, 2 vextracti32x4 [t0+stride3q -8], m11, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m25, 2 vextracti32x4 [t0+strideq*1-8], m13, 2 vextracti32x4 [t0+strideq*2-8], m14, 2 vextracti32x4 [t0+stride3q -8], m22, 2 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m24, 3 vextracti32x4 [t0+strideq*1-8], m26, 3 vextracti32x4 [t0+strideq*2-8], m2, 3 vextracti32x4 [t0+stride3q -8], m3, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m4, 3 vextracti32x4 [t0+strideq*1-8], m5, 3 vextracti32x4 [t0+strideq*2-8], m6, 3 vextracti32x4 [t0+stride3q -8], m7, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m8, 3 vextracti32x4 [t0+strideq*1-8], m9, 3 vextracti32x4 [t0+strideq*2-8], m10, 3 vextracti32x4 [t0+stride3q -8], m11, 3 lea t0, [t0+strideq*4] vextracti32x4 [t0+strideq*0-8], m25, 3 vextracti32x4 [t0+strideq*1-8], m13, 3 vextracti32x4 [t0+strideq*2-8], m14, 3 vextracti32x4 [t0+stride3q -8], m22, 3 %endif %endif %elif %1 == 6 ; flat6 filter vpbroadcastd m15, [pb_3_1] vpbroadcastd m12, [pb_2] punpcklbw m8, m13, m5 punpckhbw m11, m13, m5 pmaddubsw m0, m8, m15 pmaddubsw m1, m11, m15 punpcklbw m7, m4, m3 punpckhbw m10, m4, m3 pmaddubsw m2, m7, m12 pmaddubsw m12, m10, m12 %ifidn %2, h vpbroadcastd m15, [pb_m1_1] %define pbm1_1 m15 %endif paddw m0, m2 paddw m1, 
m12 pmulhrsw m2, m0, m16 pmulhrsw m12, m1, m16 packuswb m2, m12 vpblendmb m2{k2}, m3, m2 ; p1 %ifidn %2, v mova [t0+strideq*2], m2 %endif pmaddubsw m8, pbm1_1 pmaddubsw m11, pbm1_1 paddw m0, m8 paddw m1, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 pmaddubsw m8, pbm1_1 pmaddubsw m11, pbm1_1 paddw m0, m8 paddw m1, m11 pmulhrsw m12, m0, m16 pmulhrsw m13, m1, m16 packuswb m12, m13 vpblendmb m12{k2}, m4, m12 ; p0 %ifidn %2, v mova [t0+stride3q], m12 %endif vpbroadcastd m9, [pb_m1_2] vpbroadcastd m4, [pb_m1_0] paddw m0, m8 paddw m1, m11 punpcklbw m8, m3, m14 punpckhbw m11, m3, m14 pmaddubsw m14, m8, pbm1_1 pmaddubsw m13, m11, pbm1_1 paddw m0, m14 paddw m1, m13 pmulhrsw m14, m0, m16 pmulhrsw m13, m1, m16 packuswb m14, m13 vpblendmb m14{k2}, m5, m14 ; q0 %ifidn %2, v mova [dstq+strideq*0], m14 %endif pmaddubsw m8, m9 pmaddubsw m11, m9 paddw m0, m8 paddw m1, m11 pmaddubsw m7, m4 pmaddubsw m10, m4 paddw m0, m7 paddw m1, m10 pmulhrsw m0, m16 pmulhrsw m1, m16 packuswb m0, m1 vpblendmb m0{k2}, m6, m0 ; q1 %ifidn %2, v mova [dstq+strideq*1], m0 %else TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1 %endif %else ; %1 == 4 %ifidn %2, v mova [t0+strideq*0], m3 ; p1 mova [t0+strideq*1], m4 ; p0 mova [t0+strideq*2], m5 ; q0 mova [t0+stride3q ], m6 ; q1 %else TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7 %endif %endif %endmacro %define k7 k6 INIT_ZMM avx512icl cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \ lut, w, stride3, mstride DECLARE_REG_TMP 9 shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mova m21, [pb_4x0_4x4_4x8_4x12] mova m20, [pb_mask] vpbroadcastd m19, [pb_128] vpbroadcastd m28, [pb_m1_1] vpbroadcastd m27, [pw_2048] %define pbshuf m21 %define pbmask m20 %define pb128 m19 %define pbm1_1 m28 %define pw2048 m27 .loop: cmp word [maskq+8], 0 ; vmask[2] je .no_flat16 FILTER 16, v jmp .end .no_flat16: cmp word [maskq+4], 0 ; vmask[1] je .no_flat FILTER 8, v jmp .end .no_flat: cmp word [maskq+0], 0 ; vmask[0] je .end call .v4 .end: add lq, 64 add dstq, 64 add maskq, 2 sub wd, 16 jg .loop RET ALIGN function_align RESET_MM_PERMUTATION .v4: FILTER 4, v ret cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \ lut, h, stride3, stride8 DECLARE_REG_TMP 9, 10, 11, 12 shl l_strideq, 2 sub lq, 4 lea stride3q, [strideq*3] lea stride8q, [strideq*8] kxnorw k6, k6, k6 vpbroadcastd m19, strided vpbroadcastd m20, l_strided pmulld m21, m19, [hmulA] pmulld m20, [hmulB] pmulld m19, [hmulC] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask [pb_mask] %define pb128 [pb_128]{bcstd} shl l_strideq, 1 .loop: cmp word [maskq+8], 0 ; vmask[2] je .no_flat16 FILTER 16, h jmp .end .no_flat16: cmp word [maskq+4], 0 ; vmask[1] je .no_flat FILTER 8, h jmp .end .no_flat: cmp word [maskq+0], 0 ; vmask[0] je .end call .h4 .end: lea lq, [lq+l_strideq*8] lea dstq, [dstq+stride8q*8] add maskq, 2 sub hd, 16 jg .loop RET ALIGN function_align RESET_MM_PERMUTATION .h4: FILTER 4, h ret cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \ lut, w, stride3, mstride DECLARE_REG_TMP 9 shl l_strideq, 2 sub lq, l_strideq mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] mova m21, [pb_4x0_4x4_4x8_4x12] mova m20, [pb_mask] vpbroadcastd m19, [pb_128] vpbroadcastd m17, [pb_m1_1] vpbroadcastd m16, [pw_4096] %define pbshuf m21 %define pbmask m20 %define pb128 m19 %define pbm1_1 m17 .loop: cmp word [maskq+4], 0 ; vmask[1] je .no_flat FILTER 6, v jmp .end .no_flat: cmp word [maskq+0], 0 ; vmask[0] je .end call mangle(private_prefix %+ 
_lpf_v_sb_y_8bpc_avx512icl).v4 .end: add lq, 64 add dstq, 64 add maskq, 2 sub wd, 16 jg .loop RET %undef k7 cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \ lut, h, stride3, stride8 DECLARE_REG_TMP 9, 10, 11 mov r7d, 0xffff movzx r8d, r7b cmp hd, 9 cmovb r7d, r8d kmovw k6, r7d ; h > 8 ? 0xffff : 0x00ff shl l_strideq, 2 sub lq, 4 kshiftrw k7, k6, 4 ; h > 8 ? 0xff : 0xf0 lea stride3q, [strideq*3] lea stride8q, [strideq*8] vpbroadcastd m19, strided vpbroadcastd m20, l_strided pmulld m21, m19, [hmulA] pmulld m20, [hmulB] pmulld m19, [hmulC] mova m18, [pb_mask] vpbroadcastd m17, [pb_128] vpbroadcastd m16, [pw_4096] %define pbshuf [pb_4x0_4x4_4x8_4x12] %define pbmask m18 %define pb128 m17 add l_strideq, l_strideq .loop: cmp word [maskq+4], 0 ; vmask[1] je .no_flat FILTER 6, h jmp .end .no_flat: cmp word [maskq+0], 0 ; vmask[0] je .end call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4 .end: lea lq, [lq+l_strideq*8] lea dstq, [dstq+stride8q*8] add maskq, 2 sub hd, 16 jg .loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/loopfilter_sse.asm000064400000000000000000001732111046102023000156230ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
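; ---------------------------------------------------------------------------
; Orientation note (informal sketch, not part of the original dav1d sources):
; the SSE path below mirrors the AVX2/AVX-512 loop filters above. Its FILTER
; macro covers filter widths 4/6/8/16 in both directions, with [esp+N*16]
; spills on x86-32 where only 8 XMM registers are available. Two recurring
; idioms used throughout all three files:
;   ABSSUB dst, a, b, tmp       ; per-byte |a - b|, using the identity
;                               ;   |a-b| = satsub_u8(a,b) | satsub_u8(b,a)
;   pmulhrsw x, [pw_2048]       ; (x*2048 + 0x4000) >> 15 == (x + 8) >> 4 for
;                               ; the non-negative flat16 accumulator sums
;   pmulhrsw x, [pw_4096]       ; likewise (x + 4) >> 3 for the flat6 sums
; ---------------------------------------------------------------------------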
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 pb_4x0_4x4_4x8_4x12: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 pb_7_1: times 8 db 7, 1 pb_3_1: times 8 db 3, 1 pb_2_1: times 8 db 2, 1 pb_m1_0: times 8 db -1, 0 pb_m1_1: times 8 db -1, 1 pb_m1_2: times 8 db -1, 2 pb_1: times 16 db 1 pb_2: times 16 db 2 pb_3: times 16 db 3 pb_4: times 16 db 4 pb_16: times 16 db 16 pb_63: times 16 db 63 pb_64: times 16 db 64 pb_128: times 16 db 0x80 pb_129: times 16 db 0x81 pb_240: times 16 db 0xf0 pb_248: times 16 db 0xf8 pb_254: times 16 db 0xfe pw_2048: times 8 dw 2048 pw_4096: times 8 dw 4096 pd_mask: dd 1, 2, 4, 8 SECTION .text %macro ABSSUB 4 ; dst, a, b, tmp psubusb %1, %2, %3 psubusb %4, %3, %2 por %1, %4 %endmacro %macro TRANSPOSE_16x4_AND_WRITE_4x16 5 ; transpose 16x4 punpcklbw m%5, m%1, m%2 punpckhbw m%1, m%2 punpcklbw m%2, m%3, m%4 punpckhbw m%3, m%4 punpcklwd m%4, m%5, m%2 punpckhwd m%5, m%2 punpcklwd m%2, m%1, m%3 punpckhwd m%1, m%3 ; write out %assign %%n 0 %rep 4 movd [dstq+strideq *0-2], xm%4 movd [dstq+strideq *4-2], xm%5 movd [dstq+strideq *8-2], xm%2 movd [dstq+stride3q*4-2], xm%1 add dstq, strideq %if %%n < 3 psrldq xm%4, 4 psrldq xm%5, 4 psrldq xm%2, 4 psrldq xm%1, 4 %endif %assign %%n (%%n+1) %endrep lea dstq, [dstq+stride3q*4] %endmacro %macro TRANSPOSE_16X16B 2 ; output_transpose, mem %if %1 == 0 mova %2, m15 ; m7 in 32-bit %endif ; input in m0-7 punpcklbw m15, m0, m1 punpckhbw m0, m1 punpcklbw m1, m2, m3 punpckhbw m2, m3 punpcklbw m3, m4, m5 punpckhbw m4, m5 %if ARCH_X86_64 SWAP 4, 5, 7 %else %if %1 == 0 mova m5, %2 %else mova m5, [esp+1*16] %endif mova %2, m4 %endif punpcklbw m4, m6, m5 punpckhbw m6, m5 ; interleaved in m15,0,1,2,3,7,4,6 punpcklwd m5, m15, m1 punpckhwd m15, m1 punpcklwd m1, m0, m2 punpckhwd m0, m2 punpcklwd m2, m3, m4 punpckhwd m3, m4 %if ARCH_X86_64 SWAP 3, 4, 7 %else mova m4, %2 mova %2, m3 %endif punpcklwd m3, m4, m6 punpckhwd m4, m6 ; interleaved in m5,15,1,0,2,7,3,4 punpckldq m6, m5, m2 punpckhdq m5, m2 %if ARCH_X86_64 SWAP 2, 7, 5 %else mova m2, %2 mova [esp+1*16], m5 %endif punpckldq m5, m15, m2 punpckhdq m15, m2 punpckldq m2, m1, m3 punpckhdq m1, m3 punpckldq m3, m0, m4 punpckhdq m0, m4 %if ARCH_X86_32 mova [esp+0*16], m6 mova [esp+2*16], m5 mova [esp+3*16], m15 mova [esp+4*16], m2 mova [esp+5*16], m1 mova [esp+6*16], m3 mova [esp+7*16], m0 mova m8, [esp+ 8*16] mova m9, [esp+ 9*16] mova m10, [esp+10*16] %if %1 == 0 mova m11, [esp+11*16] mova m12, [esp+12*16] mova m13, [esp+13*16] mova m14, [esp+14*16] %else mova m11, [esp+20*16] mova m12, [esp+15*16] mova m13, [esp+16*16] mova m14, [esp+17*16] %endif %endif ; input in m8-m15 %if ARCH_X86_64 SWAP 7, 4 %endif punpcklbw m7, m8, m9 punpckhbw m8, m9 punpcklbw m9, m10, m11 punpckhbw m10, m11 punpcklbw m11, m12, m13 punpckhbw m12, m13 %if ARCH_X86_64 mova m13, %2 %else %if %1 == 0 mova m13, [esp+15*16] %else mova m13, [esp+18*16] %endif %endif mova %2, m12 punpcklbw m12, m14, m13 punpckhbw m14, m14, m13 ; interleaved in m7,8,9,10,11,rsp%2,12,14 punpcklwd m13, m7, m9 punpckhwd m7, m9 punpcklwd m9, m8, m10 punpckhwd m8, m10 punpcklwd m10, m11, m12 punpckhwd m11, m12 mova m12, %2 mova %2, m11 punpcklwd m11, m12, m14 punpckhwd m12, m14 ; interleaved in m13,7,9,8,10,rsp%2,11,12 punpckldq m14, m13, m10 punpckhdq m13, m10 punpckldq m10, m9, m11 punpckhdq m9, m11 punpckldq m11, m8, m12 punpckhdq m8, m12 mova m12, %2 mova %2, m8 punpckldq m8, m7, m12 punpckhdq m7, m12 %if ARCH_X86_32 mova [esp+ 8*16], m10 mova [esp+ 9*16], m9 mova [esp+10*16], m11 SWAP 6, 1 SWAP 4, 2 SWAP 5, 3 mova m6, 
[esp+0*16] mova m4, [esp+1*16] mova m5, [esp+2*16] %endif ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7 punpcklqdq m12, m6, m14 punpckhqdq m6, m14 punpcklqdq m14, m4, m13 punpckhqdq m4, m13 punpcklqdq m13, m5, m8 punpckhqdq m5, m8 %if ARCH_X86_64 SWAP 8, 5 %else mova m8, [esp+3*16] mova [esp+27*16], m5 %define m15 m8 %endif punpcklqdq m5, m15, m7 punpckhqdq m15, m7 %if ARCH_X86_32 mova [esp+11*16], m12 mova [esp+12*16], m6 mova [esp+13*16], m14 mova [esp+14*16], m4 mova [esp+26*16], m13 mova [esp+ 0*16], m5 mova [esp+ 1*16], m15 mova m2, [esp+ 4*16] mova m10, [esp+ 8*16] mova m1, [esp+ 5*16] mova m9, [esp+ 9*16] mova m3, [esp+ 6*16] mova m11, [esp+10*16] mova m0, [esp+ 7*16] %endif punpcklqdq m7, m2, m10 punpckhqdq m2, m10 punpcklqdq m10, m1, m9 punpckhqdq m1, m9 punpcklqdq m9, m3, m11 punpckhqdq m3, m11 mova m11, %2 %if ARCH_X86_32 %define m12 m3 %endif mova %2, m12 punpcklqdq m12, m0, m11 punpckhqdq m0, m11 %if %1 == 1 mova m11, %2 %endif %if ARCH_X86_64 ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0 SWAP 0, 11, 1, 6, 5, 8, 7, 15 SWAP 2, 14, 12, 9 SWAP 3, 4, 13 %else %if %1 == 0 mova [esp+15*16], m9 mova [esp+17*16], m12 mova [esp+18*16], m0 mova [esp+28*16], m10 mova [esp+29*16], m1 mova m3, [esp+0*16] mova m4, [esp+1*16] SWAP m5, m7 SWAP m6, m2 %else SWAP 0, 7 SWAP 3, 1, 2, 4, 6 %endif %endif %endmacro %macro FILTER 2 ; width [4/6/8/16], dir [h/v] %if ARCH_X86_64 %define %%flat8mem [rsp+0*16] %define %%q2mem [rsp+1*16] %define %%q3mem [rsp+2*16] %else %if %1 == 4 || %1 == 6 %define %%p2mem [esp+ 8*16] %define %%q2mem [esp+ 9*16] %define %%flat8mem [esp+10*16] %else %ifidn %2, v %define %%p2mem [esp+16*16] %define %%q2mem [esp+ 1*16] %define %%q3mem [esp+18*16] %define %%flat8mem [esp+ 0*16] %define %%flat16mem [esp+20*16] %else %define %%p2mem [esp+27*16] %define %%q2mem [esp+28*16] %define %%q3mem [esp+29*16] %define %%flat8mem [esp+21*16] %define %%flat16mem [esp+30*16] %endif %endif %xdefine m12reg m12 %endif %if ARCH_X86_32 lea stride3q, [strideq*3] %endif ; load data %ifidn %2, v %if ARCH_X86_32 mov mstrideq, strideq neg mstrideq %endif %if %1 == 4 lea tmpq, [dstq+mstrideq*2] mova m3, [tmpq+strideq*0] ; p1 mova m4, [tmpq+strideq*1] ; p0 mova m5, [tmpq+strideq*2] ; q0 mova m6, [tmpq+stride3q] ; q1 %else ; load 6-8 pixels, remainder (for wd=16) will be read inline lea tmpq, [dstq+mstrideq*4] ; we load p3 later %define %%p3mem [dstq+mstrideq*4] %if ARCH_X86_32 %define m13 m0 %define m14 m1 %define m15 m2 %endif mova m13, [tmpq+strideq*1] mova m3, [tmpq+strideq*2] mova m4, [tmpq+stride3q] mova m5, [dstq+strideq*0] mova m6, [dstq+strideq*1] mova m14, [dstq+strideq*2] %if %1 != 6 mova m15, [dstq+stride3q] %endif %if ARCH_X86_32 mova %%p2mem, m13 mova %%q2mem, m14 %define m13 %%p2mem %define m14 %%q2mem %if %1 != 6 mova %%q3mem, m15 %define m15 %%q3mem %endif %endif %endif %else ; %2 == h ; load lines %if %1 == 4 ; transpose 4x16 movd m7, [dstq+strideq*0-2] movd m3, [dstq+strideq*1-2] movd m4, [dstq+strideq*2-2] movd m5, [dstq+stride3q -2] lea tmpq, [dstq+strideq*4] punpcklbw m7, m3 punpcklbw m4, m5 movd m3, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] movd m5, [tmpq+strideq*2-2] movd m6, [tmpq+stride3q -2] lea tmpq, [tmpq+strideq*4] punpcklbw m3, m1 punpcklbw m5, m6 movd m0, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] punpcklbw m0, m1 movd m1, [tmpq+strideq*2-2] movd m2, [tmpq+stride3q -2] punpcklbw m1, m2 punpcklqdq m7, m0 punpcklqdq m4, m1 lea tmpq, [tmpq+strideq*4] movd m0, [tmpq+strideq*0-2] movd m1, [tmpq+strideq*1-2] punpcklbw m0, m1 movd m1, 
[tmpq+strideq*2-2] movd m2, [tmpq+stride3q -2] punpcklbw m1, m2 punpcklqdq m3, m0 punpcklqdq m5, m1 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 punpcklwd m6, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 ; xm6: A0-3,B0-3,C0-3,D0-3 ; xm7: A8-11,B8-11,C8-11,D8-11 ; xm4: A4-7,B4-7,C4-7,D4-7 ; xm3: A12-15,B12-15,C12-15,D12-15 punpckldq m5, m6, m4 punpckhdq m6, m4 punpckldq m4, m7, m3 punpckhdq m7, m3 ; xm5: A0-7,B0-7 ; xm6: C0-7,D0-7 ; xm4: A8-15,B8-15 ; xm7: C8-15,D8-15 punpcklqdq m3, m5, m4 punpckhqdq m5, m5, m4 punpcklqdq m4, m6, m7 punpckhqdq m6, m7 ; xm3: A0-15 ; xm5: B0-15 ; xm4: C0-15 ; xm6: D0-15 SWAP 4, 5 %elif %1 == 6 || %1 == 8 ; transpose 8x16 movq m7, [dstq+strideq*0-%1/2] movq m3, [dstq+strideq*1-%1/2] movq m4, [dstq+strideq*2-%1/2] movq m5, [dstq+stride3q -%1/2] lea tmpq, [dstq+strideq*8] punpcklbw m7, m3 punpcklbw m4, m5 movq m3, [tmpq+strideq*0-%1/2] movq m1, [tmpq+strideq*1-%1/2] movq m5, [tmpq+strideq*2-%1/2] movq m6, [tmpq+stride3q -%1/2] lea tmpq, [dstq+strideq*4] punpcklbw m3, m1 punpcklbw m5, m6 movq m6, [tmpq+strideq*0-%1/2] movq m0, [tmpq+strideq*1-%1/2] movq m1, [tmpq+strideq*2-%1/2] movq m2, [tmpq+stride3q -%1/2] lea tmpq, [tmpq+strideq*8] punpcklbw m6, m0 punpcklbw m1, m2 movq m2, [tmpq+strideq*2-%1/2] movq m0, [tmpq+stride3q -%1/2] punpcklbw m2, m0 %if ARCH_X86_64 SWAP m15, m2 %else %define m15 [esp+3*16] mova m15, m2 %endif movq m0, [tmpq+strideq*0-%1/2] movq m2, [tmpq+strideq*1-%1/2] punpcklbw m0, m2 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 punpcklwd m2, m7, m4 punpckhwd m7, m4 punpcklwd m4, m3, m5 punpckhwd m3, m5 punpcklwd m5, m6, m1 punpckhwd m6, m1 punpcklwd m1, m0, m15 punpckhwd m0, m15 %if ARCH_X86_64 SWAP m15, m0 %else mova m15, m0 %endif ; xm2: A0-3,B0-3,C0-3,D0-3 ; xm7: E0-3,F0-3,G0-3,H0-3 ; xm4: A8-11,B8-11,C8-11,D8-11 ; xm3: E8-11,F8-11,G8-11,H8-11 ; xm5: A4-7,B4-7,C4-7,D4-7 ; xm6: E4-7,F4-7,G4-7,H4-7 ; xm1: A12-15,B12-15,C12-15,D12-15 ; xm0: E12-15,F12-15,G12-15,H12-15 punpckldq m0, m2, m5 punpckhdq m2, m5 punpckldq m5, m7, m6 %if %1 != 6 punpckhdq m7, m6 %endif punpckldq m6, m4, m1 punpckhdq m4, m1 punpckldq m1, m3, m15 %if %1 != 6 punpckhdq m3, m15 %if ARCH_X86_64 SWAP m15, m3 %else mova m15, m3 %endif %endif ; xm0: A0-7,B0-7 ; xm2: C0-7,D0-7 ; xm5: E0-7,F0-7 ; xm7: G0-7,H0-7 ; xm6: A8-15,B8-15 ; xm4: C8-15,D8-15 ; xm1: E8-15,F8-15 ; xm3: G8-15,H8-15 punpcklqdq m3, m0, m6 punpckhqdq m0, m6 punpckhqdq m6, m2, m4 punpcklqdq m2, m4 punpcklqdq m4, m5, m1 punpckhqdq m5, m1 %if %1 == 8 punpcklqdq m1, m7, m15 punpckhqdq m7, m15 ; xm3: A0-15 ; xm0: B0-15 ; xm2: C0-15 ; xm6: D0-15 ; xm4: E0-15 ; xm5: F0-15 ; xm1: G0-15 ; xm7: H0-15 %if ARCH_X86_64 SWAP 11, 3, 2 SWAP 13, 0 SWAP 6, 5, 4 SWAP 14, 1 SWAP 15, 7 ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15 mova [rsp+21*16], m11 %define %%p3mem [rsp+21*16] %else %define m11 [esp+26*16] %define m13 [esp+27*16] %define m14 [esp+28*16] %define m15 [esp+29*16] mova m11, m3 mova m13, m0 SWAP 3, 2 SWAP 6, 5, 4 mova m14, m1 mova m15, m7 %define %%p3mem [esp+26*16] 
%endif %else %if ARCH_X86_64 SWAP 13, 3, 0 SWAP 14, 5, 6, 4, 2 ; 3,0,2,6,4,5 -> 13,3,4,5,6,14 %else %define m13 %%p2mem %define m14 %%q2mem mova m13, m3 mova m14, m5 SWAP 3, 0 SWAP 5, 6, 4, 2 ; 0,2,6,4 -> 3,4,5,6 %endif %endif %else %if ARCH_X86_64 mova [rsp+20*16], m12 %endif ; load and 16x16 transpose. We only use 14 pixels but we'll need the ; remainder at the end for the second transpose %if ARCH_X86_32 %xdefine m8 m0 %xdefine m9 m1 %xdefine m10 m2 %xdefine m11 m3 %xdefine m12 m4 %xdefine m13 m5 %xdefine m14 m6 %xdefine m15 m7 lea tmpq, [dstq+strideq*8] movu m8, [tmpq+strideq*0-8] movu m9, [tmpq+strideq*1-8] movu m10, [tmpq+strideq*2-8] movu m11, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] movu m12, [tmpq+strideq*0-8] movu m13, [tmpq+strideq*1-8] movu m14, [tmpq+strideq*2-8] movu m15, [tmpq+stride3q -8] mova [esp+ 8*16], m8 mova [esp+ 9*16], m9 mova [esp+10*16], m10 mova [esp+11*16], m11 mova [esp+12*16], m12 mova [esp+13*16], m13 mova [esp+14*16], m14 mova [esp+15*16], m15 %endif movu m0, [dstq+strideq*0-8] movu m1, [dstq+strideq*1-8] movu m2, [dstq+strideq*2-8] movu m3, [dstq+stride3q -8] lea tmpq, [dstq+strideq*4] movu m4, [tmpq+strideq*0-8] movu m5, [tmpq+strideq*1-8] movu m6, [tmpq+strideq*2-8] movu m7, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] %if ARCH_X86_64 movu m8, [tmpq+strideq*0-8] movu m9, [tmpq+strideq*1-8] movu m10, [tmpq+strideq*2-8] movu m11, [tmpq+stride3q -8] lea tmpq, [tmpq+strideq*4] movu m12, [tmpq+strideq*0-8] movu m13, [tmpq+strideq*1-8] movu m14, [tmpq+strideq*2-8] movu m15, [tmpq+stride3q -8] %endif %if ARCH_X86_64 TRANSPOSE_16X16B 0, [rsp+11*16] mova [rsp+12*16], m1 mova [rsp+13*16], m2 mova [rsp+14*16], m3 mova [rsp+15*16], m12 mova [rsp+16*16], m13 mova [rsp+17*16], m14 mova [rsp+18*16], m15 ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 SWAP 12, 4, 7 SWAP 13, 5, 8 SWAP 3, 6, 9 SWAP 10, 14 SWAP 11, 15 mova [rsp+21*16], m12 %define %%p3mem [rsp+21*16] mova m12, [rsp+20*16] %else TRANSPOSE_16X16B 0, [esp+16*16] %define %%p3mem [esp+26*16] %define m11 %%p3mem %define m13 %%p2mem %define m14 %%q2mem %define m15 %%q3mem %endif %endif ; if 4 elif 6 or 8 else 16 %endif ; if v else h ; load L/E/I/H %if ARCH_X86_32 mov l_strideq, l_stridem %endif %ifidn %2, v movu m1, [lq] movu m0, [lq+l_strideq] %else %if ARCH_X86_32 lea l_stride3q, [l_strideq*3] %endif movq xm1, [lq] movq xm2, [lq+l_strideq*2] movhps xm1, [lq+l_strideq] movhps xm2, [lq+l_stride3q] shufps m0, m1, m2, q3131 shufps m1, m2, q2020 %if ARCH_X86_32 lea stride3q, [strideq*3] %endif %endif %if ARCH_X86_32 %ifidn %2, v mov lutd, lutm %endif %endif pxor m2, m2 pcmpeqb m7, m2, m0 pand m1, m7 por m0, m1 ; l[x][] ? 
l[x][] : l[x-stride][] pshufb m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1] pcmpeqb m2, m0 ; !L psrlq m7, m0, [lutq+128] pand m7, [PIC_sym(pb_63)] pminub m7, minlvl pmaxub m7, [PIC_sym(pb_1)] ; I pand m1, m0, [PIC_sym(pb_240)] psrlq m1, 4 ; H paddb m0, [PIC_sym(pb_2)] paddb m0, m0 paddb m0, m7 ; E pxor m1, [PIC_sym(pb_128)] pxor m7, [PIC_sym(pb_128)] pxor m0, [PIC_sym(pb_128)] SWAP 2, 7 %if ARCH_X86_64 SWAP 0, 8 SWAP 2, 10 %else %ifidn %2, v mov mstrideq, strideq neg mstrideq %if %1 == 4 lea tmpq, [dstq+mstrideq*2] %elif %1 == 6 || %1 == 8 lea tmpq, [dstq+mstrideq*4] %endif %endif mova [esp+3*16], m0 mova [esp+4*16], m2 %endif ABSSUB m0, m3, m4, m2 ; abs(p1-p0) pmaxub m0, m7 ABSSUB m2, m5, m6, m7 ; abs(q1-q0) pmaxub m0, m2 %if %1 == 4 pxor m0, [PIC_sym(pb_128)] pcmpgtb m7, m0, m1 ; hev %if ARCH_X86_64 SWAP 7, 11 %else mova [esp+5*16], m7 %endif %else pxor m7, m0, [PIC_sym(pb_128)] pcmpgtb m7, m1 ; hev %if ARCH_X86_64 SWAP 7, 11 %else mova [esp+5*16], m7 %endif %if %1 == 6 ABSSUB m1, m13, m4, m7 ; abs(p2-p0) pmaxub m1, m0 %else mova m2, %%p3mem ABSSUB m1, m2, m4, m7 ; abs(p3-p0) pmaxub m1, m0 ABSSUB m7, m13, m4, m2 ; abs(p2-p0) pmaxub m1, m7 %endif ABSSUB m7, m5, m14, m2 ; abs(p2-p0) pmaxub m1, m7 %if %1 != 6 ABSSUB m7, m5, m15, m2 ; abs(q3-q0) pmaxub m1, m7 %endif pxor m1, [PIC_sym(pb_128)] pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8in %if ARCH_X86_64 SWAP 1, 9 %else mova [esp+6*16], m1 %endif %if %1 == 6 ABSSUB m7, m13, m3, m1 ; abs(p2-p1) %else mova m2, %%p3mem ABSSUB m7, m2, m13, m1 ; abs(p3-p2) ABSSUB m2, m13, m3, m1 ; abs(p2-p1) pmaxub m7, m2 ABSSUB m2, m14, m15, m1 ; abs(q3-q2) pmaxub m7, m2 %endif ABSSUB m2, m14, m6, m1 ; abs(q2-q1) pmaxub m7, m2 %if ARCH_X86_32 %define m12 m1 mova m12, maskmem %endif pand m2, m12, mask1 pcmpeqd m2, m12 pand m7, m2 ; only apply fm-wide to wd>4 blocks pmaxub m0, m7 pxor m0, [PIC_sym(pb_128)] %endif ; %if %1 == 4 else %if ARCH_X86_64 SWAP 2, 10 pcmpgtb m0, m2 %else pcmpgtb m0, [esp+4*16] %endif ABSSUB m1, m3, m6, m7 ; abs(p1-q1) ABSSUB m7, m4, m5, m2 ; abs(p0-q0) paddusb m7, m7 pand m1, [PIC_sym(pb_254)] psrlq m1, 1 paddusb m1, m7 ; abs(p0-q0)*2+(abs(p1-q1)>>1) pxor m1, [PIC_sym(pb_128)] %if ARCH_X86_64 pcmpgtb m1, m8 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E %else pcmpgtb m1, [esp+3*16] %endif por m0, m1 %if %1 == 16 %if ARCH_X86_64 SWAP 0, 8 %else mova [esp+3*16], m0 %endif %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] %else mova m0, [rsp+12*16] %endif ABSSUB m1, m0, m4, m2 %ifidn %2, v mova m0, [tmpq+strideq*2] %else mova m0, [rsp+13*16] %endif ABSSUB m2, m0, m4, m7 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+stride3q] %else mova m0, [rsp+14*16] %endif ABSSUB m2, m0, m4, m7 pmaxub m1, m2 %ifidn %2, v lea tmpq, [dstq+strideq*4] mova m0, [tmpq+strideq*0] %else mova m0, [rsp+15*16] %endif ABSSUB m2, m0, m5, m7 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+strideq*1] %else mova m0, [rsp+16*16] %endif ABSSUB m2, m0, m5, m7 pmaxub m1, m2 %ifidn %2, v mova m0, [tmpq+strideq*2] %else mova m0, [rsp+17*16] %endif ABSSUB m2, m0, m5, m7 pmaxub m1, m2 pxor m1, [PIC_sym(pb_128)] pcmpgtb m1, [PIC_sym(pb_129)] ; !flat8out %if ARCH_X86_64 por m1, m9 ; !flat8in | !flat8out %else por m1, [esp+6*16] %define m12 m7 mova m12, maskmem %endif pand m2, m12, mask2 pcmpeqd m2, m12 pandn m1, m2 ; flat16 %if ARCH_X86_64 pandn m2, m8, m1 ; flat16 & fm %else pandn m2, [esp+3*16], m1 ; flat16 & fm mova %%flat16mem, m2 %endif SWAP 1, 2 pand m2, m12, mask1 pcmpeqd m2, m12 %if ARCH_X86_64 pandn m9, m2 ; flat8in pandn m2, m8, m9 SWAP 2, 9 %else pandn m0, [esp+6*16], m2 pandn m2, 
[esp+3*16], m0 mova [esp+6*16], m2 %endif pand m2, m12, mask0 pcmpeqd m2, m12 %if ARCH_X86_64 pandn m8, m2 pandn m2, m9, m8 ; fm & !flat8 & !flat16 SWAP 2, 8 pandn m2, m1, m9 ; flat8 & !flat16 SWAP 2, 9 SWAP 0, 8 SWAP 1, 10 %else pandn m0, [esp+3*16], m2 pandn m2, [esp+6*16], m0 SWAP 2, 0 pandn m2, m1, [esp+6*16] mova %%flat8mem, m2 %endif %elif %1 != 4 %if ARCH_X86_64 SWAP 1, 9 %else %define m12 m7 mova m12, maskmem mova m1, [esp+6*16] %endif pand m2, m12, mask1 pcmpeqd m2, m12 pandn m1, m2 pandn m2, m0, m1 ; flat8 & fm pand m1, m12, mask0 pcmpeqd m1, m12 pandn m0, m1 pandn m1, m2, m0 ; fm & !flat8 SWAP 1, 2, 0 %if ARCH_X86_64 SWAP 1, 9 %else mova %%flat8mem, m1 %endif %else %if ARCH_X86_32 %define m12 m1 mova m12, maskmem %endif pand m2, m12, mask0 pcmpeqd m2, m12 pandn m0, m2 ; fm %endif ; short filter mova m1, [PIC_sym(pb_128)] %if ARCH_X86_64 SWAP 7, 11 %else mova m7, [esp+5*16] %endif pxor m3, m1 pxor m6, m1 pxor m4, m1 pxor m5, m1 psubsb m1, m3, m6 ; iclip_diff(p1-q1) pand m1, m7 ; f=iclip_diff(p1-q1)&hev psubsb m2, m5, m4 paddsb m1, m2 paddsb m1, m2 paddsb m1, m2 ; f=iclip_diff(3*(q0-p0)+f) mova m2, [PIC_sym(pb_16)] pand m0, m1 ; f&=fm paddsb m1, m0, [PIC_sym(pb_3)] paddsb m0, [PIC_sym(pb_4)] pand m1, [PIC_sym(pb_248)] pand m0, [PIC_sym(pb_248)] psrlq m1, 3 psrlq m0, 3 pxor m1, m2 pxor m0, m2 psubb m1, m2 ; f2 psubb m0, m2 ; f1 mova m2, [PIC_sym(pb_128)] paddsb m4, m1 psubsb m5, m0 pxor m4, m2 pxor m5, m2 pxor m0, m2 pxor m1, m1 pavgb m0, m1 ; f=(f1+1)>>1 psubb m0, [PIC_sym(pb_64)] pandn m7, m0 ; f&=!hev paddsb m3, m7 psubsb m6, m7 pxor m3, m2 pxor m6, m2 %if %1 == 16 ; flat16 filter %ifidn %2, v lea tmpq, [dstq+mstrideq*8] mova m0, [tmpq+strideq*1] ; p6 mova m2, [tmpq+strideq*2] ; p5 mova m7, [tmpq+stride3q] ; p4 %else mova m0, [rsp+12*16] mova m2, [rsp+13*16] mova m7, [rsp+14*16] %endif %if ARCH_X86_64 SWAP 1, 10 mova %%flat8mem, m9 mova %%q2mem, m14 mova %%q3mem, m15 SWAP 0, 8 SWAP 1, 9 %else %ifidn %2, v mova [esp+17*16], m0 mova [esp+19*16], m3 mova [esp+21*16], m4 mova [esp+22*16], m5 mova [esp+23*16], m6 %xdefine m11 m3 %xdefine m14 m4 %xdefine m15 m5 %xdefine m10 m6 %define m13 %%p2mem %define m8 [esp+17*16] %define m9 %%flat16mem %define m3 [esp+19*16] %define m4 [esp+21*16] %define m5 [esp+22*16] %define m6 [esp+23*16] %else mova [esp+31*16], m0 mova [esp+32*16], m3 mova [esp+33*16], m4 mova [esp+34*16], m5 mova [esp+35*16], m6 %xdefine m11 m3 %xdefine m14 m4 %xdefine m15 m5 %xdefine m10 m6 %define m13 %%p2mem %define m8 [esp+31*16] %define m9 %%flat16mem %define m3 [esp+32*16] %define m4 [esp+33*16] %define m5 [esp+34*16] %define m6 [esp+35*16] %endif %endif ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A ; write -6 mova m11, %%p3mem %if ARCH_X86_64 punpcklbw m14, m8, m11 punpckhbw m15, m8, m11 %else punpcklbw m14, m0, m11 punpckhbw m15, m0, m11 %endif %ifidn %2, v mova [rsp+5*16], m11 %endif pmaddubsw m10, m14, [PIC_sym(pb_7_1)] pmaddubsw m11, m15, [PIC_sym(pb_7_1)] ; p6*7+p3 punpcklbw m0, m2, m7 punpckhbw m1, m2, m7 pmaddubsw m0, [PIC_sym(pb_2)] pmaddubsw m1, [PIC_sym(pb_2)] paddw m10, m0 paddw m11, m1 ; p6*7+p5*2+p4*2+p3 punpcklbw m0, m13, m3 punpckhbw m1, m13, m3 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] paddw m10, m0 paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1 punpcklbw m0, m4, m5 punpckhbw m1, m4, m5 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] paddw m10, m0 paddw m11, m1 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m2 por 
m0, m1 %ifidn %2, v mova [tmpq+strideq*2], m0 ; p5 %else mova [rsp+13*16], m0 %endif ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B ; write -5 pmaddubsw m14, [PIC_sym(pb_m1_1)] pmaddubsw m15, [PIC_sym(pb_m1_1)] paddw m10, m14 paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 punpcklbw m0, m8, m6 punpckhbw m1, m8, m6 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] mova [rsp+3*16], m0 mova [rsp+4*16], m1 paddw m10, m0 paddw m11, m1 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m7 por m0, m1 %ifidn %2, v mova [tmpq+stride3q], m0 ; p4 %else mova [rsp+14*16], m0 %endif ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C ; write -4 mova m14, %%q2mem punpcklbw m0, m8, m13 punpckhbw m1, m8, m13 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 punpcklbw m0, m2, m14 punpckhbw m2, m14 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m2, [PIC_sym(pb_m1_1)] mova [rsp+1*16], m0 paddw m10, m0 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, %%p3mem por m0, m1 %ifidn %2, v mova [tmpq+strideq*4], m0 ; p3 %else mova [rsp+19*16], m0 %endif ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D ; write -3 mova m15, %%q3mem punpcklbw m0, m8, m3 punpckhbw m1, m8, m3 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 punpcklbw m0, m7, m15 punpckhbw m7, m15 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m7, [PIC_sym(pb_m1_1)] mova [rsp+2*16], m0 %if ARCH_X86_32 %ifidn %2, v mova [esp+24*16], m7 %else mova [esp+36*16], m7 %endif %endif paddw m10, m0 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m13 por m0, m1 mova [rsp+6*16], m0 ; don't clobber p2/m13 since we need it in F ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E ; write -2 punpcklbw m0, m8, m4 punpckhbw m1, m8, m4 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 %if ARCH_X86_64 SWAP 7, 8 %endif %ifidn %2, v mova m1, [dstq+strideq*4] ; q4 mova m7, [rsp+5*16] ; (pre-filter) p3 %else mova m1, [rsp+15*16] mova m7, %%p3mem ; (pre-filter) p3 %endif punpcklbw m0, m1, m7 punpckhbw m1, m1, m7 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] mova [rsp+7*16], m0 mova [rsp+5*16], m1 psubw m10, m0 psubw m11, m1 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m3 por m0, m1 mova [rsp+8*16], m0 ; don't clobber p1/m3 since we need it in G ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F ; write -1 %ifidn %2, v mova m7, [tmpq+strideq*1] ; p6 lea tmpq, [dstq+strideq*4] mova m1, [tmpq+strideq*1] ; q5 %else mova m7, [rsp+12*16] ; p6 mova m1, [rsp+16*16] %endif punpcklbw m0, m7, m5 punpckhbw m7, m5 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m7, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m7 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 punpcklbw m7, m13, m1 pmaddubsw m7, [PIC_sym(pb_m1_1)] mova [rsp+9*16], m7 paddw m10, m7 %if ARCH_X86_64 punpckhbw m13, m1 mova m1, [rsp+6*16] SWAP 1, 13 %else punpckhbw m7, m13, m1 mova m1, [esp+6*16] 
mova m13, m1 SWAP 1, 7 %endif pmaddubsw m1, [PIC_sym(pb_m1_1)] mova [rsp+10*16], m1 paddw m11, m1 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 pmulhrsw m7, m10, [PIC_sym(pw_2048)] pmulhrsw m0, m11, [PIC_sym(pw_2048)] packuswb m7, m0 pand m7, m9 pandn m0, m9, m4 por m7, m0 mova [rsp+6*16], m7 ; don't clobber p0/m4 since we need it in H ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G ; write +0 %ifidn %2, v mova m7, [tmpq+strideq*2] ; q6 %else mova m7, [rsp+17*16] %endif paddw m10, [rsp+3*16] paddw m11, [rsp+4*16] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 punpcklbw m0, m3, m7 punpckhbw m1, m3, m7 %if ARCH_X86_64 mova m3, [rsp+8*16] %endif pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] mova [rsp+3*16], m0 mova [rsp+4*16], m1 paddw m10, m0 paddw m11, m1 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m5 por m0, m1 %if ARCH_X86_32 mova m1, [esp+8*16] mova m3, m1 %endif mova [rsp+8*16], m0 ; don't clobber q0/m5 since we need it in I ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H ; write +1 paddw m10, [rsp+1*16] paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 punpcklbw m0, m4, m7 punpckhbw m2, m4, m7 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m2, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 %if ARCH_X86_64 mova m4, [rsp+6*16] %else %define m4 [esp+6*16] %endif pmulhrsw m2, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m2, m1 pand m2, m9 pandn m1, m9, m6 por m2, m1 ; don't clobber q1/m6 since we need it in K ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I ; write +2 paddw m10, [rsp+2*16] %if ARCH_X86_64 SWAP 7, 8 paddw m11, m7 %else mova m8, m7 %ifidn %2, v paddw m11, [esp+24*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 %else paddw m11, [esp+36*16] ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 %endif %endif punpcklbw m0, m5, m8 punpckhbw m1, m5, m8 %if ARCH_X86_64 mova m5, [rsp+8*16] %else %define m5 [esp+8*16] %endif pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 pmulhrsw m7, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m7, m1 pand m7, m9 pandn m1, m9, m14 por m7, m1 ; don't clobber q2/m14 since we need it in K ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J ; write +3 psubw m10, [rsp+7*16] psubw m11, [rsp+5*16] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 punpcklbw m0, m6, m8 punpckhbw m1, m6, m8 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m15 por m0, m1 %ifidn %2, v mova [tmpq+mstrideq], m0 ; q3 %else mova [rsp+20*16], m0 %endif ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K ; write +4 paddw m10, [rsp+ 9*16] paddw m11, [rsp+10*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 punpcklbw m0, m14, m8 punpckhbw m1, m14, m8 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m0, m10, [PIC_sym(pw_2048)] pmulhrsw m1, m11, [PIC_sym(pw_2048)] packuswb m0, m1 pand m0, m9 %ifidn %2, v pandn m1, m9, [tmpq+strideq*0] %else pandn m1, m9, [rsp+15*16] %endif por m0, m1 %ifidn %2, v mova [tmpq+strideq*0], m0 ; q4 %else mova [rsp+15*16], m0 %endif 
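; Note: the scalar sketch below is an editorial illustration of the
; running-sum scheme used by steps A..L above; it is not dav1d reference
; code and the variable names are hypothetical.
; m10/m11 hold the box sum for the low/high 8 pixels as words. Step A
; builds the first 16-weight sum, and each later step only subtracts the
; taps leaving the window and adds the ones entering it (pmaddubsw with
; pb_m1_1 pairs), then rounds via pmulhrsw(pw_2048), i.e. (sum + 8) >> 4:
;   sum     = 7*p6 + 2*p5 + 2*p4 + p3 + p2 + p1 + p0 + q0; // A, write -6
;   out[-6] = flat16 ? (sum + 8) >> 4 : p5;
;   sum    += -2*p6 + p3 + q1;                              // B, write -5
;   out[-5] = flat16 ? (sum + 8) >> 4 : p4;
;   ...                                                     // C..L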
; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L ; write +5 paddw m10, [rsp+3*16] paddw m11, [rsp+4*16] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 punpcklbw m0, m15, m8 punpckhbw m1, m15, m8 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m10, m0 paddw m11, m1 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 pmulhrsw m10, [PIC_sym(pw_2048)] pmulhrsw m11, [PIC_sym(pw_2048)] packuswb m10, m11 pand m10, m9 %ifidn %2, v pandn m11, m9, [tmpq+strideq*1] %else pandn m11, m9, [rsp+16*16] %endif por m10, m11 %ifidn %2, v mova [tmpq+strideq*1], m10 ; q5 %else mova [rsp+16*16], m10 %endif %if ARCH_X86_64 SWAP 0, 8 SWAP 1, 9 SWAP 14, 7 %else %xdefine m3 m11 %xdefine m4 m14 %xdefine m5 m15 %xdefine m6 m10 mova %%q2mem, m7 %ifidn %2, v mova m3, [esp+19*16] %else mova m3, [esp+32*16] %endif mova m4, [esp+ 6*16] mova m5, [esp+ 8*16] %endif SWAP m6, m2 %if ARCH_X86_64 mova m9, %%flat8mem %endif %ifidn %2, v lea tmpq, [dstq+mstrideq*4] %endif %endif ; if %1 == 16 %if %1 >= 8 ; flat8 filter %if ARCH_X86_32 %define m9 %%flat8mem %define m11 m1 %define m13 %%p2mem %define m14 %%q2mem %define m15 %%q3mem %endif mova m11, %%p3mem punpcklbw m0, m11, m3 punpcklbw m7, m13, m4 pmaddubsw m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 pmaddubsw m7, [PIC_sym(pb_2_1)] paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 punpcklbw m7, m5, [PIC_sym(pb_4)] pmaddubsw m7, [PIC_sym(pb_1)] paddw m2, m7 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 punpckhbw m1, m11, m3 pmaddubsw m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1 punpckhbw m0, m13, m4 pmaddubsw m0, [PIC_sym(pb_2_1)] paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 punpckhbw m0, m5, [PIC_sym(pb_4)] pmaddubsw m0, [PIC_sym(pb_1)] paddw m7, m0 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 psrlw m0, m2, 3 psrlw m1, m7, 3 packuswb m0, m1 pand m0, m9 pandn m1, m9, m13 por m0, m1 ; p2 %ifidn %2, v mova [tmpq+strideq*1], m0 %else %if ARCH_X86_64 SWAP 0, 10 %else mova [esp+2*16], m0 %endif %endif %if ARCH_X86_32 mova m11, %%p3mem %endif punpcklbw m0, m11, m3 punpckhbw m1, m11, m3 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m2, m0 paddw m7, m1 punpcklbw m0, m13, m6 punpckhbw m1, m13, m6 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m2, m0 paddw m7, m1 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 psrlw m0, m2, 3 psrlw m1, m7, 3 packuswb m0, m1 pand m0, m9 pandn m1, m9, m3 por m0, m1 ; p1 %ifidn %2, v mova [tmpq+strideq*2], m0 %else mova [rsp+0*16], m0 %endif %if ARCH_X86_32 mova m11, %%p3mem %endif punpcklbw m0, m11, m3 punpckhbw m1, m11, m3 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] psubw m2, m0 psubw m7, m1 punpcklbw m0, m4, m14 punpckhbw m1, m4, m14 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] paddw m2, m0 paddw m7, m1 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 psrlw m0, m2, 3 psrlw m1, m7, 3 packuswb m0, m1 pand m0, m9 pandn m1, m9, m4 por m0, m1 ; p0 %ifidn %2, v mova [tmpq+stride3q], m0 %else mova [rsp+1*16], m0 %endif punpcklbw m0, m5, m15 punpckhbw m1, m5, m15 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] paddw m2, m0 paddw m7, m1 %if ARCH_X86_32 mova m11, %%p3mem %endif punpcklbw m0, m11, m4 punpckhbw m11, m11, m4 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m11, [PIC_sym(pb_1)] psubw m2, m0 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 psrlw m0, m2, 3 psrlw m11, m7, 3 packuswb m0, m11 pand m0, m9 pandn m11, m9, m5 por m11, m0 ; q0 %ifidn %2, v mova [dstq+strideq*0], m11 %elif ARCH_X86_32 mova [esp+8*16], m11 %endif punpcklbw m0, m5, m15 punpckhbw m1, m5, m15 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw 
m1, [PIC_sym(pb_m1_1)] paddw m2, m0 paddw m7, m1 punpcklbw m0, m13, m6 punpckhbw m1, m13, m6 pmaddubsw m0, [PIC_sym(pb_m1_1)] pmaddubsw m1, [PIC_sym(pb_m1_1)] paddw m2, m0 paddw m7, m1 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 psrlw m0, m2, 3 psrlw m1, m7, 3 packuswb m0, m1 pand m0, m9 pandn m1, m9, m6 por m0, m1 ; q1 %ifidn %2, v mova [dstq+strideq*1], m0 %else %if ARCH_X86_64 SWAP 0, 13 %else mova [esp+9*16], m0 %endif %endif punpcklbw m0, m3, m6 punpckhbw m1, m3, m6 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] psubw m2, m0 psubw m7, m1 punpcklbw m0, m14, m15 punpckhbw m1, m14, m15 pmaddubsw m0, [PIC_sym(pb_1)] pmaddubsw m1, [PIC_sym(pb_1)] paddw m2, m0 paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 psrlw m2, 3 psrlw m7, 3 packuswb m2, m7 pand m2, m9 pandn m7, m9, m14 por m2, m7 ; q2 %ifidn %2, v mova [dstq+strideq*2], m2 %else mova m0, [rsp+0*16] %if %1 == 8 mova m1, [rsp+1*16] mova m4, %%p3mem %if ARCH_X86_32 %define m10 [esp+2*16] %define m11 [esp+8*16] %define m13 [esp+9*16] %endif ; 16x8 transpose punpcklbw m3, m4, m10 punpckhbw m4, m10 punpcklbw m5, m0, m1 punpckhbw m0, m1 punpcklbw m1, m11, m13 punpckhbw m6, m11, m13 punpcklbw m7, m2, m15 punpckhbw m2, m15 %if ARCH_X86_64 SWAP 2, 15 %else mova m15, m2 %endif punpcklwd m2, m3, m5 punpckhwd m3, m5 punpcklwd m5, m4, m0 punpckhwd m4, m0 punpcklwd m0, m1, m7 punpckhwd m1, m7 punpcklwd m7, m6, m15 punpckhwd m6, m15 %if ARCH_X86_64 SWAP 6, 15 %else mova m15, m6 %endif punpckldq m6, m2, m0 punpckhdq m2, m0 punpckldq m0, m3, m1 punpckhdq m3, m1 punpckldq m1, m5, m7 punpckhdq m5, m7 punpckldq m7, m4, m15 punpckhdq m4, m15 ; write 8x16 movq [dstq+strideq*0-4], xm6 movhps [dstq+strideq*1-4], xm6 movq [dstq+strideq*2-4], xm2 movhps [dstq+stride3q -4], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm0 movhps [dstq+strideq*1-4], xm0 movq [dstq+strideq*2-4], xm3 movhps [dstq+stride3q -4], xm3 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm1 movhps [dstq+strideq*1-4], xm1 movq [dstq+strideq*2-4], xm5 movhps [dstq+stride3q -4], xm5 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0-4], xm7 movhps [dstq+strideq*1-4], xm7 movq [dstq+strideq*2-4], xm4 movhps [dstq+stride3q -4], xm4 lea dstq, [dstq+strideq*4] %else ; 16x16 transpose and store SWAP 6, 0 SWAP 7, 1 %if ARCH_X86_64 SWAP 5, 10, 2 SWAP 8, 11 SWAP 9, 13 mova [rsp+21*16], m12 %else mova [esp+10*16], m2 %xdefine m8 m0 %xdefine m9 m1 %xdefine m10 m2 %xdefine m11 m3 %xdefine m12 m4 %xdefine m13 m5 %xdefine m14 m6 %xdefine m15 m7 %endif mova m0, [rsp+11*16] mova m1, [rsp+12*16] mova m2, [rsp+13*16] mova m3, [rsp+14*16] mova m4, [rsp+19*16] %if ARCH_X86_64 mova m7, [rsp+ 1*16] mova m11, [rsp+20*16] mova m12, [rsp+15*16] mova m13, [rsp+16*16] mova m14, [rsp+17*16] TRANSPOSE_16X16B 1, [rsp+18*16] %else mova m5, [esp+ 2*16] TRANSPOSE_16X16B 1, [esp+32*16] mov tmpq, dstq lea dstq, [dstq+strideq*8] %endif movu [dstq+strideq*0-8], xm0 movu [dstq+strideq*1-8], xm1 movu [dstq+strideq*2-8], xm2 movu [dstq+stride3q -8], xm3 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm4 movu [dstq+strideq*1-8], xm5 movu [dstq+strideq*2-8], xm6 movu [dstq+stride3q -8], xm7 %if ARCH_X86_64 lea dstq, [dstq+strideq*4] %else %xdefine m8 m0 %xdefine m9 m1 %xdefine m10 m2 %xdefine m11 m3 %xdefine m12 m4 %xdefine m13 m5 %xdefine m14 m6 %xdefine m15 m7 mova m8, [esp+11*16] mova m9, [esp+12*16] mova m10, [esp+13*16] mova m11, [esp+14*16] mova m12, [esp+26*16] mova m13, [esp+27*16] mova m14, [esp+ 0*16] mova m15, [esp+ 1*16] mov dstq, tmpq %endif movu [dstq+strideq*0-8], xm8 movu 
[dstq+strideq*1-8], xm9 movu [dstq+strideq*2-8], xm10 movu [dstq+stride3q -8], xm11 lea dstq, [dstq+strideq*4] movu [dstq+strideq*0-8], xm12 movu [dstq+strideq*1-8], xm13 movu [dstq+strideq*2-8], xm14 movu [dstq+stride3q -8], xm15 lea dstq, [dstq+strideq*4] %if ARCH_X86_32 lea dstq, [dstq+strideq*8] %else mova m12, [rsp+21*16] %endif %endif ; if %1 == 8 %endif ; ifidn %2, v %elif %1 == 6 ; flat6 filter %if ARCH_X86_32 mova [esp+3*16], m3 mova [esp+4*16], m4 mova [esp+5*16], m5 mova [esp+6*16], m6 %xdefine m8 m3 %xdefine m10 m4 %xdefine m11 m5 %xdefine m15 m6 %define m3 [esp+3*16] %define m4 [esp+4*16] %define m5 [esp+5*16] %define m6 [esp+6*16] %define m9 %%flat8mem %define m13 %%p2mem %define m14 %%q2mem %endif punpcklbw m8, m13, m5 punpckhbw m11, m13, m5 pmaddubsw m0, m8, [PIC_sym(pb_3_1)] pmaddubsw m1, m11, [PIC_sym(pb_3_1)] punpcklbw m7, m4, m3 punpckhbw m10, m4, m3 pmaddubsw m2, m7, [PIC_sym(pb_2)] pmaddubsw m15, m10, [PIC_sym(pb_2)] paddw m0, m2 paddw m1, m15 pmulhrsw m2, m0, [PIC_sym(pw_4096)] pmulhrsw m15, m1, [PIC_sym(pw_4096)] packuswb m2, m15 pand m2, m9 pandn m15, m9, m3 por m2, m15 %ifidn %2, v mova [tmpq+strideq*2], m2 ; p1 %elif ARCH_X86_32 mova [esp+11*16], m2 %endif pmaddubsw m8, [PIC_sym(pb_m1_1)] pmaddubsw m11, [PIC_sym(pb_m1_1)] paddw m0, m8 paddw m1, m11 punpcklbw m8, m13, m6 punpckhbw m11, m13, m6 %if ARCH_X86_64 SWAP 2, 13 %endif pmaddubsw m8, [PIC_sym(pb_m1_1)] pmaddubsw m11, [PIC_sym(pb_m1_1)] paddw m0, m8 paddw m1, m11 pmulhrsw m2, m0, [PIC_sym(pw_4096)] pmulhrsw m15, m1, [PIC_sym(pw_4096)] packuswb m2, m15 pand m2, m9 pandn m15, m9, m4 por m2, m15 %ifidn %2, v mova [tmpq+stride3q], m2 ; p0 %elif ARCH_X86_32 mova [esp+8*16], m2 %endif paddw m0, m8 paddw m1, m11 punpcklbw m8, m3, m14 punpckhbw m11, m3, m14 %if ARCH_X86_64 SWAP 2, 14 %endif pmaddubsw m2, m8, [PIC_sym(pb_m1_1)] pmaddubsw m15, m11, [PIC_sym(pb_m1_1)] paddw m0, m2 paddw m1, m15 pmulhrsw m2, m0, [PIC_sym(pw_4096)] pmulhrsw m15, m1, [PIC_sym(pw_4096)] packuswb m2, m15 pand m2, m9 pandn m15, m9, m5 por m2, m15 %ifidn %2, v mova [dstq+strideq*0], m2 ; q0 %endif pmaddubsw m8, [PIC_sym(pb_m1_2)] pmaddubsw m11, [PIC_sym(pb_m1_2)] paddw m0, m8 paddw m1, m11 pmaddubsw m7, [PIC_sym(pb_m1_0)] pmaddubsw m10, [PIC_sym(pb_m1_0)] paddw m0, m7 paddw m1, m10 pmulhrsw m0, [PIC_sym(pw_4096)] pmulhrsw m1, [PIC_sym(pw_4096)] packuswb m0, m1 pand m0, m9 pandn m1, m9, m6 por m0, m1 %if ARCH_X86_32 %xdefine m3 m8 %xdefine m4 m10 %xdefine m5 m11 %xdefine m6 m15 %endif %ifidn %2, v mova [dstq+strideq*1], m0 ; q1 %else %if ARCH_X86_64 SWAP 3, 13 SWAP 4, 14 %else mova m3, [esp+11*16] mova m4, [esp+ 8*16] %endif SWAP 5, 2 SWAP 6, 0 TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 %endif %else ; if %1 == 4 %ifidn %2, v mova [tmpq+strideq*0], m3 ; p1 mova [tmpq+strideq*1], m4 ; p0 mova [tmpq+strideq*2], m5 ; q0 mova [tmpq+stride3q ], m6 ; q1 %else TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7 %endif %endif %if ARCH_X86_32 %define m12 m12reg %endif %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; 32-bit PIC helpers ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %if ARCH_X86_32 %define PIC_base_offset $$ %macro SETUP_PIC 0 ; PIC_reg %define PIC_reg r2 %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4) LEA PIC_reg, $$ %endmacro %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base %if %1 == 0 mov [esp+PIC_reg_stk_offset], PIC_reg mov PIC_reg, maskm %else mov PIC_reg, [esp+PIC_reg_stk_offset] %endif %endmacro %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) %else %macro XCHG_PIC_REG 1 %endmacro %define PIC_sym(sym) (sym) %endif 
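; Note: editorial summary of the helpers above; the two example lines are an
; illustrative expansion, not code taken from dav1d.
; 32-bit x86 has no RIP-relative addressing, so SETUP_PIC points PIC_reg (r2)
; at $$ (the section start) and PIC_sym(sym) turns every constant reference
; into a PIC_reg-relative one; on x86-64 PIC_sym(sym) is simply sym.
; XCHG_PIC_REG 0 parks PIC_reg on the stack and reuses the register for the
; mask pointer, XCHG_PIC_REG 1 restores the PIC base. Roughly:
;   LEA  PIC_reg, $$                     ; done once by SETUP_PIC
;   mova m0, [PIC_reg+pb_128-$$]         ; what PIC_sym(pb_128) expands to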
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %if ARCH_X86_32 %if STACK_ALIGNMENT < required_stack_alignment %assign copy_args 1 %else %assign copy_args 0 %endif %endif %macro RELOC_ARGS 1 %if copy_args %define maskm [esp+stack_size-gprsize*1] %define l_stridem [esp+stack_size-gprsize*2] %define lutm [esp+stack_size-gprsize*3] %define %1m [esp+stack_size-gprsize*4] mov r6d, r6m mov maskm, maskd mov lutm, lutd mov %1m, r6d %else %define %1m r6m %endif %endmacro %if ARCH_X86_32 %define tmpq r4 %define mstrideq r5 %define stride3q r6 %define l_stride3q r6 %endif INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_v_sb_y_8bpc, 7, 11, 16, 16 * 15, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else cglobal lpf_v_sb_y_8bpc, 6, 7, 8, -16 * (26 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC %define m12 m5 %endif shl l_strideq, 2 sub lq, l_strideq %if ARCH_X86_64 mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %else mov l_stridem, l_strided %endif mov mask_bitsd, 0xf mova m12, [PIC_sym(pd_mask)] XCHG_PIC_REG 0 movu m0, [maskq] pxor m4, m4 movd m3, [lutq+136] pshufb m3, m4 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 por m1, m2 por m0, m1 mova [rsp+11*16], m0 mova [rsp+12*16], m1 mova [rsp+13*16], m2 mova [rsp+14*16], m3 %define maskmem [esp+15*16] %define mask0 [rsp+11*16] %define mask1 [rsp+12*16] %define mask2 [rsp+13*16] %define minlvl [rsp+14*16] .loop: test [maskq+8], mask_bitsd ; vmask[2] je .no_flat16 %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+25*16], mask_bitsd mova maskmem, m12 %endif FILTER 16, v jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] je .no_flat %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+25*16], mask_bitsd mova maskmem, m12 %endif FILTER 8, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] XCHG_PIC_REG 1 je .no_filter %if ARCH_X86_32 mov [esp+25*16], mask_bitsd mova maskmem, m12 %endif FILTER 4, v .end: %if ARCH_X86_32 mova m12, maskmem mov mask_bitsd, [esp+25*16] %endif .no_filter: pslld m12, 4 shl mask_bitsd, 4 add lq, 16 add dstq, 16 %if ARCH_X86_64 sub wd, 4 %else sub dword wm, 4 %endif XCHG_PIC_REG 0 jg .loop RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_h_sb_y_8bpc, 7, 11, 16, 16 * 26, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else cglobal lpf_h_sb_y_8bpc, 6, 7, 8, -16 * (39 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC %define m12 m5 %endif sub lq, 4 shl l_strideq, 2 %if ARCH_X86_64 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] %else mov l_stridem, l_strided %endif mov mask_bitsd, 0xf mova m12, [PIC_sym(pd_mask)] XCHG_PIC_REG 0 movu m0, [maskq] pxor m4, m4 movd m3, [lutq+136] pshufb m3, m4 pshufd m2, m0, q2222 pshufd m1, m0, q1111 pshufd m0, m0, q0000 por m1, m2 por m0, m1 mova [rsp+22*16], m0 mova [rsp+23*16], m1 mova [rsp+24*16], m2 mova [rsp+25*16], m3 %define maskmem [esp+37*16] %define mask0 [rsp+22*16] %define mask1 [rsp+23*16] %define mask2 [rsp+24*16] %define minlvl [rsp+25*16] .loop: test [maskq+8], mask_bitsd ; vmask[2] je .no_flat16 %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+38*16], mask_bitsd mova maskmem, m12 %endif FILTER 16, h jmp .end .no_flat16: test [maskq+4], mask_bitsd ; vmask[1] je .no_flat %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+38*16], mask_bitsd mova maskmem, m12 %endif FILTER 8, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[0] XCHG_PIC_REG 1 je .no_filter %if ARCH_X86_32 mov [esp+38*16], mask_bitsd mova maskmem, m12 %endif FILTER 4, h jmp .end 
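; Note: editorial summary of the dispatch above, with hypothetical pseudocode;
; not dav1d reference code.
; Each .loop iteration consumes one 4-bit group of the vmask bitmasks
; (mask_bits and the pd_mask lanes in m12 are shifted left by 4 per group):
; any set bit in vmask[2] selects the wd=16 path, else vmask[1] the wd=8
; flat path, else vmask[0] the 4-tap path, else the group is skipped (the
; FILTER body then applies mask0/mask1/mask2 per lane). Roughly:
;   for (each 4-unit group, bits <<= 4) {
;       if      (vmask[2] & bits) FILTER(16);
;       else if (vmask[1] & bits) FILTER(8);
;       else if (vmask[0] & bits) FILTER(4);
;   }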
.no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] %if ARCH_X86_32 jmp .end_noload .end: mova m12, maskmem mov l_strideq, l_stridem mov mask_bitsd, [esp+38*16] .end_noload: %else .end: %endif lea lq, [lq+l_strideq*4] pslld m12, 4 shl mask_bitsd, 4 %if ARCH_X86_64 sub hd, 4 %else sub dword hm, 4 %endif XCHG_PIC_REG 0 jg .loop RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_v_sb_uv_8bpc, 7, 11, 16, 3 * 16, \ dst, stride, mask, l, l_stride, lut, \ w, stride3, mstride, tmp, mask_bits %else cglobal lpf_v_sb_uv_8bpc, 6, 7, 8, -16 * (12 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS w SETUP_PIC %define m12 m4 %endif shl l_strideq, 2 sub lq, l_strideq %if ARCH_X86_64 mov mstrideq, strideq neg mstrideq lea stride3q, [strideq*3] %else mov l_stridem, l_strided %endif mov mask_bitsd, 0xf mova m12, [PIC_sym(pd_mask)] XCHG_PIC_REG 0 movq m0, [maskq] pxor m3, m3 movd m2, [lutq+136] pshufb m2, m3 pshufd m1, m0, q1111 pshufd m0, m0, q0000 por m0, m1 mova [rsp+0*16], m0 mova [rsp+1*16], m1 mova [rsp+2*16], m2 %define maskmem [esp+7*16] %define mask0 [rsp+0*16] %define mask1 [rsp+1*16] %define minlvl [rsp+2*16] .loop: test [maskq+4], mask_bitsd ; vmask[1] je .no_flat %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+11*16], mask_bitsd mova maskmem, m12 %endif FILTER 6, v jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[1] XCHG_PIC_REG 1 je .no_filter %if ARCH_X86_32 mov [esp+11*16], mask_bitsd mova maskmem, m12 %endif FILTER 4, v .end: %if ARCH_X86_32 mova m12, maskmem mov mask_bitsd, [esp+11*16] %endif .no_filter: pslld m12, 4 shl mask_bitsd, 4 add lq, 16 add dstq, 16 %if ARCH_X86_64 sub wd, 4 %else sub dword wm, 4 %endif XCHG_PIC_REG 0 jg .loop RET INIT_XMM ssse3 %if ARCH_X86_64 cglobal lpf_h_sb_uv_8bpc, 7, 11, 16, 16 * 3, \ dst, stride, mask, l, l_stride, lut, \ h, stride3, l_stride3, tmp, mask_bits %else cglobal lpf_h_sb_uv_8bpc, 6, 7, 8, -16 * (13 + copy_args), \ dst, stride, mask, l, l_stride, lut, mask_bits RELOC_ARGS h SETUP_PIC %define m12 m4 %endif sub lq, 4 shl l_strideq, 2 %if ARCH_X86_64 lea stride3q, [strideq*3] lea l_stride3q, [l_strideq*3] %else mov l_stridem, l_strided %endif mov mask_bitsd, 0xf mova m12, [PIC_sym(pd_mask)] XCHG_PIC_REG 0 movq m0, [maskq] pxor m3, m3 movd m2, [lutq+136] pshufb m2, m3 pshufd m1, m0, q1111 pshufd m0, m0, q0000 por m0, m1 mova [rsp+0*16], m0 mova [rsp+1*16], m1 mova [rsp+2*16], m2 %define maskmem [esp+7*16] %define mask0 [rsp+0*16] %define mask1 [rsp+1*16] %define minlvl [rsp+2*16] .loop: test [maskq+4], mask_bitsd ; vmask[1] je .no_flat %if ARCH_X86_32 XCHG_PIC_REG 1 mov [esp+12*16], mask_bitsd mova maskmem, m12 %endif FILTER 6, h jmp .end .no_flat: test [maskq+0], mask_bitsd ; vmask[1] XCHG_PIC_REG 1 je .no_filter %if ARCH_X86_32 mov [esp+12*16], mask_bitsd mova maskmem, m12 %endif FILTER 4, h jmp .end .no_filter: lea dstq, [dstq+strideq*8] lea dstq, [dstq+strideq*8] %if ARCH_X86_32 jmp .end_noload .end: mova m12, maskmem mov l_strided, l_stridem mov mask_bitsd, [esp+12*16] .end_noload: %else .end: %endif lea lq, [lq+l_strideq*4] pslld m12, 4 shl mask_bitsd, 4 %if ARCH_X86_64 sub hd, 4 %else sub dword hm, 4 %endif XCHG_PIC_REG 0 jg .loop RET rav1e-0.7.1/src/x86/looprestoration16_avx2.asm000064400000000000000000002327301046102023000171460ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. 
Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15 pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 wiener_hshift: dw 4, 4, 1, 1 wiener_vshift: dw 1024, 1024, 4096, 4096 wiener_round: dd 1049600, 1048832 pb_m10_m9: times 2 db -10, -9 pb_m6_m5: times 2 db -6, -5 pb_m2_m1: times 2 db -2, -1 pb_2_3: times 2 db 2, 3 pb_6_7: times 2 db 6, 7 pw_1023: times 2 dw 1023 pd_8: dd 8 pd_25: dd 25 pd_4096: dd 4096 pd_34816: dd 34816 pd_m262128: dd -262128 pd_0xf00800a4: dd 0xf00800a4 pd_0xf00801c7: dd 0xf00801c7 %define pw_256 sgr_lshuf5 cextern sgr_x_by_x_avx2 SECTION .text DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers INIT_YMM avx2 cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt %define base t4-wiener_hshift mov fltq, r6mp movifnidn wd, wm movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max vbroadcasti128 m6, [wiener_shufA] vpbroadcastd m12, [fltq+ 0] ; x0 x1 lea t4, [wiener_hshift] vbroadcasti128 m7, [wiener_shufB] add wd, wd vpbroadcastd m13, [fltq+ 4] ; x2 x3 shr t3d, 11 vpbroadcastd m14, [fltq+16] ; y0 y1 add lpfq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 add dstq, wq vbroadcasti128 m8, [wiener_shufC] lea t1, [rsp+wq+16] vbroadcasti128 m9, [wiener_shufD] neg wq vpbroadcastd m0, [base+wiener_hshift+t3*4] vpbroadcastd m10, [base+wiener_round+t3*4] vpbroadcastd m11, [base+wiener_vshift+t3*4] pmullw m12, m0 ; upshift filter coefs to make the pmullw m13, m0 ; horizontal downshift constant test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below 
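; Note: editorial summary of the row pipeline; not dav1d reference code.
; t1..t6 (the "wiener ring buffer pointers" declared above) each address one
; 384*2-byte stack row of horizontally filtered, downshifted intermediates.
; This prologue primes the buffer from the rows above the block; afterwards
; every .hv iteration writes one new row and rotates the pointers
; (t6<-t5, t5<-t4, ..., t1<-t0) so the vertical pass always sees the most
; recent rows without copying any row data.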
call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v .v2: call .v jmp .v1 .extend_right: movd xm1, r10d vpbroadcastd m0, [pb_6_7] movu m2, [pb_0to31] vpbroadcastb m1, xm1 psubb m0, m1 pminub m0, m2 pshufb m3, m0 vpbroadcastd m0, [pb_m2_m1] psubb m0, m1 pminub m0, m2 pshufb m4, m0 vpbroadcastd m0, [pb_m10_m9] psubb m0, m1 pminub m0, m2 pshufb m5, m0 ret .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq xm3, [leftq] vpblendd m3, [lpfq+r10-8], 0xfc add leftq, 8 jmp .h_main .h_extend_left: vbroadcasti128 m3, [lpfq+r10] ; avoid accessing memory located mova m4, [lpfq+r10] ; before the start of the buffer shufpd m3, m4, 0x05 pshufb m3, [wiener_lshuf7] jmp .h_main2 .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+r10-8] .h_main: mova m4, [lpfq+r10+0] .h_main2: movu m5, [lpfq+r10+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -36 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 vpbroadcastd m2, [pd_m262128] ; (1 << 4) - (1 << 18) pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+r10], m0 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq xm3, [leftq] vpblendd m3, [lpfq+r10-8], 0xfc add leftq, 8 jmp .hv_main .hv_extend_left: movu m3, [lpfq+r10-8] pshufb m3, [wiener_lshuf7] jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+r10-8] .hv_main: mova m4, [lpfq+r10+0] movu m5, [lpfq+r10+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -36 jl .hv_have_right call .extend_right .hv_have_right: pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 vpbroadcastd m2, [pd_m262128] pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 mova m2, [t4+r10] paddw m2, [t2+r10] mova m5, [t3+r10] paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova m4, [t5+r10] paddw m4, [t1+r10] psraw m0, 1 paddw m3, m0, [t6+r10] mova [t0+r10], m0 punpcklwd m0, m2, m5 pmaddwd m0, m15 punpckhwd m2, m5 pmaddwd m2, m15 punpcklwd m1, m3, m4 pmaddwd m1, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m2, m10 paddd m0, m1 paddd m2, m3 psrad m0, 5 psrad m2, 5 packusdw m0, m2 pmulhuw m0, m11 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .v: mov r10, wq 
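; Note: the formula below is an editorial paraphrase of the vertical pass;
; rnd/shift stand for the per-bitdepth wiener_round/wiener_vshift handling
; and are not literal symbols from this file.
; .v_loop combines the buffered rows with the symmetric 7-tap coefficients
; y0..y3 (m14 = y0 y1, m15 = y2 y3), pairing rows equidistant from the
; centre row before the multiply:
;   out = clip((y0*(row[-3]+row[+3]) + y1*(row[-2]+row[+2])
;             + y2*(row[-1]+row[+1]) + y3*row[0] + rnd) >> shift)
; Roughly, .v handles the trailing rows where no new input row is read,
; reusing the newest buffered row in place of the missing ones.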
.v_loop: mova m1, [t4+r10] paddw m1, [t2+r10] mova m2, [t3+r10] mova m4, [t1+r10] paddw m3, m4, [t6+r10] paddw m4, [t5+r10] punpcklwd m0, m1, m2 pmaddwd m0, m15 punpckhwd m1, m2 pmaddwd m1, m15 punpcklwd m2, m3, m4 pmaddwd m2, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m1, m10 paddd m0, m2 paddd m1, m3 psrad m0, 5 psrad m1, 5 packusdw m0, m1 pmulhuw m0, m11 mova [dstq+r10], m0 add r10, 32 jl .v_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt %define base t4-wiener_hshift mov fltq, r6mp movifnidn wd, wm movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max vbroadcasti128 m5, [wiener_shufE] vpbroadcastw m11, [fltq+ 2] ; x1 vbroadcasti128 m6, [wiener_shufB] lea t4, [wiener_hshift] vbroadcasti128 m7, [wiener_shufD] add wd, wd vpbroadcastd m12, [fltq+ 4] ; x2 x3 shr t3d, 11 vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) add lpfq, wq vpbroadcastw m13, [fltq+18] ; y1 add dstq, wq vpbroadcastd m14, [fltq+20] ; y2 y3 lea t1, [rsp+wq+16] neg wq vpbroadcastd m0, [base+wiener_hshift+t3*4] vpbroadcastd m9, [base+wiener_round+t3*4] vpbroadcastd m10, [base+wiener_vshift+t3*4] movu xm15, [wiener_lshuf5] pmullw m11, m0 vinserti128 m15, [pb_0to31], 1 pmullw m12, m0 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq .v1: call .v jmp .end .extend_right: movd xm2, r10d vpbroadcastd m0, [pb_2_3] vpbroadcastd m1, [pb_m6_m5] vpbroadcastb m2, xm2 psubb m0, m2 psubb m1, m2 movu m2, [pb_0to31] pminub m0, m2 pminub m1, m2 pshufb m3, m0 pshufb m4, m1 ret .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm3, [leftq+4] vpblendd m3, [lpfq+r10-4], 0xfe add leftq, 8 jmp .h_main .h_extend_left: vbroadcasti128 m4, [lpfq+r10] ; avoid accessing memory located mova m3, [lpfq+r10] ; before the start of the buffer palignr m3, m4, 12 pshufb m3, m15 jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+r10-4] .h_main: movu m4, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 paddd m1, m3 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+r10], m0 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm3, [leftq+4] vpblendd m3, [lpfq+r10-4], 0xfe add leftq, 8 jmp .hv_main .hv_extend_left: movu m3, [lpfq+r10-4] pshufb m3, m15 jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 
; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+r10-4] .hv_main: movu m4, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -34 jl .hv_have_right call .extend_right .hv_have_right: pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 mova m2, [t3+r10] paddw m2, [t1+r10] paddd m1, m3 mova m4, [t2+r10] punpckhwd m3, m2, m4 pmaddwd m3, m14 punpcklwd m2, m4 mova m4, [t4+r10] psrad m0, 4 psrad m1, 4 packssdw m0, m1 pmaddwd m2, m14 psraw m0, 1 mova [t0+r10], m0 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 5 psrad m0, 5 packusdw m0, m1 pmulhuw m0, m10 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m0, [t1+r10] paddw m2, m0, [t3+r10] mova m1, [t2+r10] mova m4, [t4+r10] punpckhwd m3, m2, m1 pmaddwd m3, m14 punpcklwd m2, m1 pmaddwd m2, m14 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 5 psrad m0, 5 packusdw m0, m1 pmulhuw m0, m10 mova [dstq+r10], m0 add r10, 32 jl .v_loop ret cglobal sgr_filter_5x5_16bpc, 4, 14, 15, 400*24+16, dst, stride, left, lpf, \ w, h, edge, params movifnidn wd, wm mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] movifnidn hd, hm mov edged, r7m add wd, wd vpbroadcastw m7, [paramsq+8] ; w0 add lpfq, wq vpbroadcastd m8, [pd_8] add dstq, wq vpbroadcastd m9, [pd_25] lea t3, [rsp+wq*2+400*12+16] vpbroadcastd m10, [paramsq+0] ; s0 lea t4, [rsp+wq+400*20+16] vpbroadcastd m11, [pd_0xf00800a4] lea t1, [rsp+wq+20] mova xm12, [sgr_lshuf5] neg wq vpbroadcastd m13, [pd_34816] ; (1 << 11) + (1 << 15) pxor m6, m6 vpbroadcastd m14, [pw_1023] psllw m7, 4 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call .top_fixup add t1, 400*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq test hd, hd jz .odd_height call .h add lpfq, strideq call .hv call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .h_top add lpfq, strideq call .hv_bottom .end: call .n0 call .n1 .end2: RET .height1: call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv call .n0 call .n1 .odd_height_end: call .v call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*6] call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 jmp .main .no_top_height1: call .v call .prep_n jmp .odd_height_end .extend_right: vpbroadcastw m0, [lpfq-2] movu m1, [r13+r10+ 0] movu m2, [r13+r10+16] vpblendvb m4, m0, m1 vpblendvb m5, m0, m2 ret .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: mova xm4, [lpfq+wq] pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+10], 1 jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10- 2] .h_main: movu 
m5, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -36 jl .h_have_right call .extend_right .h_have_right: palignr m2, m5, m4, 2 paddw m0, m4, m2 palignr m3, m5, m4, 6 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 shufpd m5, m4, m5, 0x05 paddw m0, m5 punpcklwd m3, m4, m5 pmaddwd m3, m3 paddd m1, m3 punpckhwd m3, m4, m5 pmaddwd m3, m3 shufps m4, m5, q2121 paddw m0, m4 ; sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m2, m3 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+r10+400*0] paddd m1, [t1+r10+400*2] paddd m2, [t1+r10+400*4] .h_loop_end: paddd m1, m5 ; sumsq paddd m2, m4 mova [t1+r10+400*0], m0 mova [t1+r10+400*2], m1 mova [t1+r10+400*4], m2 add r10, 32 jl .h_loop ret .top_fixup: lea r10, [wq-4] .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+r10+400*0] mova m1, [t1+r10+400*2] mova m2, [t1+r10+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+r10+400*0], m0 mova [t2+r10+400*2], m1 mova [t2+r10+400*4], m2 add r10, 32 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 10 jmp .hv_main .hv_extend_left: mova xm4, [lpfq+wq] pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+10], 1 jmp .hv_main .hv_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+r10- 2] .hv_main: movu m5, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -36 jl .hv_have_right call .extend_right .hv_have_right: palignr m3, m5, m4, 2 paddw m0, m4, m3 palignr m1, m5, m4, 6 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 shufpd m5, m4, m5, 0x05 paddw m0, m5 punpcklwd m1, m4, m5 pmaddwd m1, m1 paddd m2, m1 punpckhwd m1, m4, m5 pmaddwd m1, m1 shufps m4, m5, q2121 paddw m0, m4 ; h sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m3, m1 paddd m2, m5 ; h sumsq paddd m3, m4 paddw m1, m0, [t1+r10+400*0] paddd m4, m2, [t1+r10+400*2] paddd m5, m3, [t1+r10+400*4] test hd, hd jz .hv_last_row .hv_main2: paddw m1, [t2+r10+400*0] ; hv sum paddd m4, [t2+r10+400*2] ; hv sumsq paddd m5, [t2+r10+400*4] mova [t0+r10+400*0], m0 mova [t0+r10+400*2], m2 mova [t0+r10+400*4], m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 psrld m4, 4 ; (a + 8) >> 4 punpcklwd m2, m3, m6 psrld m5, 4 punpckhwd m3, m6 pmulld m4, m9 ; a * 25 pmulld m5, m9 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 pmaxud m5, m3 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m10 ; p * s pmulld m5, m10 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+r10+4], m2 psrld m0, 12 ; b psrld m1, 12 mova [t3+r10*2+ 8], xm0 vextracti128 [t3+r10*2+40], m0, 1 mova [t3+r10*2+24], xm1 vextracti128 [t3+r10*2+56], m1, 1 add r10, 32 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+r10+400*0], m1 paddw m1, m0 mova [t1+r10+400*2], m4 paddd m4, m2 mova [t1+r10+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab lea r10, [wq-4] .v_loop: mova m0, [t1+r10+400*0] 
mova m2, [t1+r10+400*2] mova m3, [t1+r10+400*4] paddw m1, m0, [t2+r10+400*0] paddd m4, m2, [t2+r10+400*2] paddd m5, m3, [t2+r10+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq paddd m5, m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 psrld m4, 4 ; (a + 8) >> 4 punpcklwd m2, m3, m6 psrld m5, 4 punpckhwd m3, m6 pmulld m4, m9 ; a * 25 pmulld m5, m9 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 pmaxud m5, m3 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m10 ; p * s pmulld m5, m10 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+r10+4], m2 psrld m0, 12 ; b psrld m1, 12 mova [t3+r10*2+ 8], xm0 vextracti128 [t3+r10*2+40], m0, 1 mova [t3+r10*2+24], xm1 vextracti128 [t3+r10*2+56], m1, 1 add r10, 32 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t4+r10*1+ 2] movu m1, [t3+r10*2+ 4] movu m2, [t3+r10*2+36] paddw m3, m0, [t4+r10*1+ 0] paddd m4, m1, [t3+r10*2+ 0] paddd m5, m2, [t3+r10*2+32] paddw m3, [t4+r10*1+ 4] paddd m4, [t3+r10*2+ 8] paddd m5, [t3+r10*2+40] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 mova [t4+r10*1+400*2+ 0], m0 mova [t3+r10*2+400*4+ 0], m1 mova [t3+r10*2+400*4+32], m2 add r10, 32 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t4+r10*1+ 2] movu m1, [t3+r10*2+ 4] movu m2, [t3+r10*2+36] paddw m3, m0, [t4+r10*1+ 0] paddd m4, m1, [t3+r10*2+ 0] paddd m5, m2, [t3+r10*2+32] paddw m3, [t4+r10*1+ 4] paddd m4, [t3+r10*2+ 8] paddd m5, [t3+r10*2+40] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 paddw m3, m0, [t4+r10*1+400*2+ 0] paddd m4, m1, [t3+r10*2+400*4+ 0] paddd m5, m2, [t3+r10*2+400*4+32] mova [t4+r10*1+400*2+ 0], m0 mova [t3+r10*2+400*4+ 0], m1 mova [t3+r10*2+400*4+32], m2 mova m0, [dstq+r10] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 psubd m1, m2 ; b - a * src + (1 << 8) psubd m4, m3 psrad m1, 9 psrad m4, 9 packssdw m1, m4 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 32 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m0, [dstq+r10] mova m3, [t4+r10*1+400*2+ 0] mova m4, [t3+r10*2+400*4+ 0] mova m5, [t3+r10*2+400*4+32] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 psubd m1, m2 ; b - a * src + (1 << 7) psubd m4, m3 psrad m1, 8 psrad m4, 8 packssdw m1, m4 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 32 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_3x3_16bpc, 4, 14, 14, 400*42+8, dst, stride, left, lpf, \ w, h, edge, params movifnidn wd, wm mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] add wd, wd movifnidn hd, hm mov edged, r7m add lpfq, wq vpbroadcastw m7, [paramsq+10] ; w1 add dstq, wq vpbroadcastd m9, [paramsq+ 4] ; s1 lea t3, 
[rsp+wq*2+400*12+8] vpbroadcastd m8, [pd_8] lea t4, [rsp+wq+400*32+8] vpbroadcastd m10, [pd_0xf00801c7] lea t1, [rsp+wq+12] vpbroadcastd m11, [pd_34816] neg wq mova xm12, [sgr_lshuf3] pxor m6, m6 vpbroadcastd m13, [pw_1023] psllw m7, 4 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 add t1, 400*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] lea t2, [t1+400*6] .top_fixup_loop: mova m0, [t1+r10+400*0] mova m1, [t1+r10+400*2] mova m2, [t1+r10+400*4] mova [t2+r10+400*0], m0 mova [t2+r10+400*2], m1 mova [t2+r10+400*4], m2 add r10, 32 jl .top_fixup_loop call .v0 jmp .main .extend_right: vpbroadcastw m0, [lpfq-2] movu m1, [r13+r10+ 2] movu m2, [r13+r10+18] vpblendvb m4, m0, m1 vpblendvb m5, m0, m2 ret .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 12 jmp .h_main .h_extend_left: mova xm4, [lpfq+wq] pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10+ 0] .h_main: movu m5, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right call .extend_right .h_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 mova [t1+r10+400*0], m1 mova [t1+r10+400*2], m2 mova [t1+r10+400*4], m3 add r10, 32 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 12 jmp .hv0_main .hv0_extend_left: mova xm4, [lpfq+wq] pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .hv0_main .hv0_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu m4, [lpfq+r10+ 0] .hv0_main: movu m5, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -34 jl .hv0_have_right call .extend_right .hv0_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 paddw m0, m1, [t1+r10+400*0] paddd m4, m2, [t1+r10+400*2] paddd m5, m3, [t1+r10+400*4] mova [t1+r10+400*0], m1 mova [t1+r10+400*2], m2 mova [t1+r10+400*4], m3 paddw m1, m0, [t2+r10+400*0] paddd m2, m4, [t2+r10+400*2] paddd m3, m5, [t2+r10+400*4] mova [t2+r10+400*0], m0 mova [t2+r10+400*2], 
m4 mova [t2+r10+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 psubd m4, m2 ; p pmaxud m5, m3 psubd m5, m3 pmulld m4, m9 ; p * s pmulld m5, m9 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m2 mova [t3+r10*2+400*0+ 8], xm0 vextracti128 [t3+r10*2+400*0+40], m0, 1 mova [t3+r10*2+400*0+24], xm1 vextracti128 [t3+r10*2+400*0+56], m1, 1 add r10, 32 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 12 jmp .hv1_main .hv1_extend_left: mova xm4, [lpfq+wq] pshufb xm4, xm12 vinserti128 m4, [lpfq+wq+12], 1 jmp .hv1_main .hv1_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu m4, [lpfq+r10+ 0] .hv1_main: movu m5, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -34 jl .hv1_have_right call .extend_right .hv1_have_right: palignr m1, m5, m4, 2 paddw m0, m4, m1 punpcklwd m2, m4, m1 pmaddwd m2, m2 punpckhwd m3, m4, m1 pmaddwd m3, m3 palignr m5, m4, 4 paddw m0, m5 ; h sum punpcklwd m1, m5, m6 pmaddwd m1, m1 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m1 ; h sumsq paddd m3, m5 paddw m1, m0, [t2+r10+400*0] paddd m4, m2, [t2+r10+400*2] paddd m5, m3, [t2+r10+400*4] mova [t2+r10+400*0], m0 mova [t2+r10+400*2], m2 mova [t2+r10+400*4], m3 paddd m4, m8 paddd m5, m8 psrld m4, 4 ; (a + 8) >> 4 psrld m5, 4 pslld m2, m4, 3 pslld m3, m5, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 psubd m4, m2 ; p pmaxud m5, m3 psubd m5, m3 pmulld m4, m9 ; p * s pmulld m5, m9 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2 +4], m2 mova [t3+r10*2+400*4+ 8], xm0 vextracti128 [t3+r10*2+400*4+40], m0, 1 mova [t3+r10*2+400*4+24], xm1 vextracti128 [t3+r10*2+400*4+56], m1, 1 add r10, 32 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab (even rows) lea r10, [wq-4] .v0_loop: mova m0, [t1+r10+400*0] mova m4, [t1+r10+400*2] mova m5, [t1+r10+400*4] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+r10+400*0] paddd m2, m4, [t2+r10+400*2] paddd m3, m5, [t2+r10+400*4] mova [t2+r10+400*0], m0 mova [t2+r10+400*2], m4 mova [t2+r10+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, 
m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 psubd m4, m2 ; p pmaxud m5, m3 psubd m5, m3 pmulld m4, m9 ; p * s pmulld m5, m9 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m2 mova [t3+r10*2+400*0+ 8], xm0 vextracti128 [t3+r10*2+400*0+40], m0, 1 mova [t3+r10*2+400*0+24], xm1 vextracti128 [t3+r10*2+400*0+56], m1, 1 add r10, 32 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-4] .v1_loop: mova m0, [t1+r10+400*0] mova m4, [t1+r10+400*2] mova m5, [t1+r10+400*4] paddw m1, m0, [t2+r10+400*0] paddd m2, m4, [t2+r10+400*2] paddd m3, m5, [t2+r10+400*4] mova [t2+r10+400*0], m0 mova [t2+r10+400*2], m4 mova [t2+r10+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmaxud m4, m2 psubd m4, m2 ; p pmaxud m5, m3 psubd m5, m3 pmulld m4, m9 ; p * s pmulld m5, m9 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2+ 4], m2 mova [t3+r10*2+400*4+ 8], xm0 vextracti128 [t3+r10*2+400*4+40], m0, 1 mova [t3+r10*2+400*4+24], xm1 vextracti128 [t3+r10*2+400*4+56], m1, 1 add r10, 32 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: mova xm0, [t4+r10*1+400*0+0] paddw xm0, [t4+r10*1+400*0+4] paddw xm2, xm0, [t4+r10*1+400*0+2] mova m1, [t3+r10*2+400*0+0] paddd m1, [t3+r10*2+400*0+8] paddd m3, m1, [t3+r10*2+400*0+4] psllw xm2, 2 ; a[-1] 444 pslld m3, 2 ; b[-1] 444 psubw xm2, xm0 ; a[-1] 343 psubd m3, m1 ; b[-1] 343 mova [t4+r10*1+400* 4], xm2 mova [t3+r10*2+400* 8], m3 mova xm0, [t4+r10*1+400*2+0] paddw xm0, [t4+r10*1+400*2+4] paddw xm2, xm0, [t4+r10*1+400*2+2] mova m1, [t3+r10*2+400*4+0] paddd m1, [t3+r10*2+400*4+8] paddd m3, m1, [t3+r10*2+400*4+4] psllw xm2, 2 ; a[ 0] 444 pslld m3, 2 ; b[ 0] 444 mova [t4+r10*1+400* 6], xm2 mova [t3+r10*2+400*12], m3 psubw xm2, xm0 ; a[ 0] 343 psubd m3, m1 ; b[ 0] 343 mova [t4+r10*1+400* 8], xm2 mova [t3+r10*2+400*16], m3 add r10, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: mova m3, [t4+r10*1+400*0+0] paddw m3, [t4+r10*1+400*0+4] paddw m1, m3, [t4+r10*1+400*0+2] psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+r10*1+400*4] paddw m3, [t4+r10*1+400*6] mova [t4+r10*1+400*4], m2 mova [t4+r10*1+400*6], m1 mova m4, [t3+r10*2+400*0+0] paddd m4, [t3+r10*2+400*0+8] paddd m1, m4, [t3+r10*2+400*0+4] pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+r10*2+400* 8+ 0] paddd m4, [t3+r10*2+400*12+ 0] mova [t3+r10*2+400* 8+ 0], m2 mova [t3+r10*2+400*12+ 0], m1 mova m5, [t3+r10*2+400*0+32] paddd m5, [t3+r10*2+400*0+40] paddd m1, m5, [t3+r10*2+400*0+36] pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+r10*2+400* 8+32] paddd m5, [t3+r10*2+400*12+32] mova [t3+r10*2+400* 8+32], m2 mova 
[t3+r10*2+400*12+32], m1 mova m0, [dstq+r10] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 psubd m1, m2 ; b - a * src + (1 << 8) psubd m4, m3 psrad m1, 9 psrad m4, 9 packssdw m1, m4 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+r10], m0 add r10, 32 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m3, [t4+r10*1+400*2+0] paddw m3, [t4+r10*1+400*2+4] paddw m1, m3, [t4+r10*1+400*2+2] psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+r10*1+400*6] paddw m3, [t4+r10*1+400*8] mova [t4+r10*1+400*6], m1 mova [t4+r10*1+400*8], m2 mova m4, [t3+r10*2+400*4+0] paddd m4, [t3+r10*2+400*4+8] paddd m1, m4, [t3+r10*2+400*4+4] pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+r10*2+400*12+ 0] paddd m4, [t3+r10*2+400*16+ 0] mova [t3+r10*2+400*12+ 0], m1 mova [t3+r10*2+400*16+ 0], m2 mova m5, [t3+r10*2+400*4+32] paddd m5, [t3+r10*2+400*4+40] paddd m1, m5, [t3+r10*2+400*4+36] pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+r10*2+400*12+32] paddd m5, [t3+r10*2+400*16+32] mova [t3+r10*2+400*12+32], m1 mova [t3+r10*2+400*16+32], m2 mova m0, [dstq+r10] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vinserti128 m1, m4, xm5, 1 vperm2i128 m4, m5, 0x31 psubd m1, m2 ; b - a * src + (1 << 8) psubd m4, m3 psrad m1, 9 psrad m4, 9 packssdw m1, m4 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+r10], m0 add r10, 32 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_mix_16bpc, 4, 14, 16, 400*66+8, dst, stride, left, lpf, \ w, h, edge, params movifnidn wd, wm mov paramsq, r6mp lea r13, [sgr_x_by_x_avx2+256*4] add wd, wd movifnidn hd, hm mov edged, r7m add lpfq, wq vpbroadcastd m15, [paramsq+8] ; w0 w1 add dstq, wq vpbroadcastd m13, [paramsq+0] ; s0 lea t3, [rsp+wq*2+400*24+8] vpbroadcastd m14, [paramsq+4] ; s1 lea t4, [rsp+wq+400*52+8] vpbroadcastd m9, [pd_8] lea t1, [rsp+wq+12] vpbroadcastd m10, [pd_34816] neg wq vpbroadcastd m11, [pd_4096] pxor m7, m7 vpbroadcastd m12, [pd_0xf00801c7] psllw m15, 2 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).top_fixup add t1, 400*12 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] lea t2, [t1+400*12] .top_fixup_loop: mova m0, [t1+r10+400* 0] mova m1, [t1+r10+400* 2] mova m2, [t1+r10+400* 4] paddw m0, m0 mova m3, [t1+r10+400* 6] paddd m1, m1 mova m4, [t1+r10+400* 8] paddd m2, m2 mova m5, [t1+r10+400*10] mova [t2+r10+400* 0], m0 mova [t2+r10+400* 2], m1 mova [t2+r10+400* 4], m2 mova [t2+r10+400* 6], m3 mova [t2+r10+400* 8], m4 
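; note: this .top_fixup_loop seeds t2 from the first row when LR_HAVE_TOP is
; absent; the 5x5 sum/sumsq rows (400*0..400*4) are doubled while the 3x3
; rows (400*6..400*10) are copied unchanged, which appears to mirror the
; shared sgr_filter_5x5_16bpc .top_fixup helper called by the main path.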
mova [t2+r10+400*10], m5 add r10, 32 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: mova xm4, [lpfq+wq] pshufb xm4, [sgr_lshuf5] vinserti128 m4, [lpfq+wq+10], 1 jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10- 2] .h_main: movu m5, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -36 jl .h_have_right call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right .h_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; sum3 punpcklwd m6, m0, m7 pmaddwd m6, m6 punpckhwd m0, m7 pmaddwd m0, m0 paddd m2, m6 ; sumsq3 shufpd m6, m4, m5, 0x05 punpcklwd m5, m6, m4 paddw m8, m4, m6 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 paddd m3, m0 mova [t1+r10+400* 6], m1 mova [t1+r10+400* 8], m2 mova [t1+r10+400*10], m3 paddw m8, m1 ; sum5 paddd m5, m2 ; sumsq5 paddd m6, m3 mova [t1+r10+400* 0], m8 mova [t1+r10+400* 2], m5 mova [t1+r10+400* 4], m6 add r10, 32 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 10 jmp .hv0_main .hv0_extend_left: mova xm4, [lpfq+wq] pshufb xm4, [sgr_lshuf5] vinserti128 m4, [lpfq+wq+10], 1 jmp .hv0_main .hv0_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu m4, [lpfq+r10- 2] .hv0_main: movu m5, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -36 jl .hv0_have_right call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right .hv0_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; h sum3 punpcklwd m6, m0, m7 pmaddwd m6, m6 punpckhwd m0, m7 pmaddwd m0, m0 paddd m2, m6 ; h sumsq3 shufpd m6, m4, m5, 0x05 punpcklwd m5, m6, m4 paddw m8, m4, m6 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 paddd m3, m0 paddw m8, m1 ; h sum5 paddd m5, m2 ; h sumsq5 paddd m6, m3 mova [t3+r10*2+400*8+ 8], m8 ; we need a clean copy of the last row TODO: t4? 
mova [t3+r10*2+400*0+ 8], m5 ; in case height is odd mova [t3+r10*2+400*0+40], m6 paddw m8, [t1+r10+400* 0] paddd m5, [t1+r10+400* 2] paddd m6, [t1+r10+400* 4] mova [t1+r10+400* 0], m8 mova [t1+r10+400* 2], m5 mova [t1+r10+400* 4], m6 paddw m0, m1, [t1+r10+400* 6] paddd m4, m2, [t1+r10+400* 8] paddd m5, m3, [t1+r10+400*10] mova [t1+r10+400* 6], m1 mova [t1+r10+400* 8], m2 mova [t1+r10+400*10], m3 paddw m1, m0, [t2+r10+400* 6] paddd m2, m4, [t2+r10+400* 8] paddd m3, m5, [t2+r10+400*10] mova [t2+r10+400* 6], m0 mova [t2+r10+400* 8], m4 mova [t2+r10+400*10], m5 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pmaxud m4, m2 psubd m4, m2 ; p3 pmaxud m5, m3 psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m12 ; b3 * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*2+ 4], m2 mova [t3+r10*2+400*4+ 8], xm0 vextracti128 [t3+r10*2+400*4+40], m0, 1 mova [t3+r10*2+400*4+24], xm1 vextracti128 [t3+r10*2+400*4+56], m1, 1 add r10, 32 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left vpbroadcastq xm5, [leftq] vinserti128 m5, [lpfq+wq], 1 mova m4, [lpfq+wq] add leftq, 8 palignr m4, m5, 10 jmp .hv1_main .hv1_extend_left: mova xm4, [lpfq+wq] pshufb xm4, [sgr_lshuf5] vinserti128 m4, [lpfq+wq+10], 1 jmp .hv1_main .hv1_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu m4, [lpfq+r10- 2] .hv1_main: movu m5, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -36 jl .hv1_have_right call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx2).extend_right .hv1_have_right: palignr m6, m5, m4, 2 palignr m3, m5, m4, 4 paddw m2, m6, m3 punpcklwd m0, m6, m3 pmaddwd m0, m0 punpckhwd m6, m3 pmaddwd m6, m6 palignr m3, m5, m4, 6 paddw m2, m3 ; h sum3 punpcklwd m1, m3, m7 pmaddwd m1, m1 punpckhwd m3, m7 pmaddwd m3, m3 paddd m0, m1 ; h sumsq3 shufpd m1, m4, m5, 0x05 punpckhwd m5, m4, m1 paddw m8, m4, m1 pmaddwd m5, m5 punpcklwd m4, m1 pmaddwd m4, m4 paddd m6, m3 paddw m1, m2, [t2+r10+400* 6] mova [t2+r10+400* 6], m2 paddw m8, m2 ; h sum5 paddd m2, m0, [t2+r10+400* 8] paddd m3, m6, [t2+r10+400*10] mova [t2+r10+400* 8], m0 mova [t2+r10+400*10], m6 paddd m4, m0 ; h sumsq5 paddd m5, m6 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m0, m2, 3 pslld m6, m3, 3 paddd m2, m0 ; ((a3 + 8) >> 4) * 9 paddd m3, m6 psrlw m6, m1, 1 pavgw m6, m7 ; (b3 + 2) >> 2 punpcklwd m0, m6, m7 pmaddwd m0, m0 punpckhwd m6, m7 pmaddwd m6, m6 pmaxud m2, m0 psubd m2, m0 ; p3 pmaxud m3, m6 psubd m3, m6 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pmulld m2, m14 ; p3 * s1 pmulld m3, m14 pmaddwd m0, m12 ; b3 * 455 pmaddwd m1, m12 paddusw m2, m12 paddusw m3, m12 psrad m7, m2, 20 ; min(z3, 255) - 256 vpgatherdd m6, [r13+m7*4], m2 ; x3 psrad m2, m3, 20 vpgatherdd m7, [r13+m2*4], m3 pmulld m0, m6 packssdw m6, m7 pmulld m7, m1 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m7, m10 psrld m0, 12 psrld m7, 12 paddw m1, m8, 
[t2+r10+400*0] paddd m2, m4, [t2+r10+400*2] paddd m3, m5, [t2+r10+400*4] paddw m1, [t1+r10+400*0] paddd m2, [t1+r10+400*2] paddd m3, [t1+r10+400*4] mova [t2+r10+400*0], m8 mova [t2+r10+400*2], m4 mova [t2+r10+400*4], m5 mova [t4+r10*1+400*4 +4], m6 mova [t3+r10*2+400*8+ 8], xm0 vextracti128 [t3+r10*2+400*8+40], m0, 1 mova [t3+r10*2+400*8+24], xm7 vextracti128 [t3+r10*2+400*8+56], m7, 1 vpbroadcastd m4, [pd_25] pxor m7, m7 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 pmulld m3, m4 psrlw m5, m1, 1 pavgw m5, m7 ; (b5 + 2) >> 2 punpcklwd m4, m5, m7 pmaddwd m4, m4 punpckhwd m5, m7 pmaddwd m5, m5 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmaxud m2, m4 psubd m2, m4 ; p5 vpbroadcastd m4, [pd_0xf00800a4] pmaxud m3, m5 psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r13+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r13+m2*4], m3 pmulld m0, m4 pmulld m1, m5 packssdw m4, m5 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m4 mova [t3+r10*2+400*0+ 8], xm0 vextracti128 [t3+r10*2+400*0+40], m0, 1 mova [t3+r10*2+400*0+24], xm1 vextracti128 [t3+r10*2+400*0+56], m1, 1 add r10, 32 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) lea r10, [wq-4] .v0_loop: mova m0, [t1+r10+400* 6] mova m4, [t1+r10+400* 8] mova m5, [t1+r10+400*10] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+r10+400* 6] paddd m2, m4, [t2+r10+400* 8] paddd m3, m5, [t2+r10+400*10] mova [t2+r10+400* 6], m0 mova [t2+r10+400* 8], m4 mova [t2+r10+400*10], m5 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pmaxud m4, m2 psubd m4, m2 ; p3 pmaxud m5, m3 psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m12 ; b3 * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 psrld m0, 12 psrld m1, 12 mova m3, [t1+r10+400*0] mova m4, [t1+r10+400*2] mova m5, [t1+r10+400*4] mova [t3+r10*2+400*8+ 8], m3 mova [t3+r10*2+400*0+ 8], m4 mova [t3+r10*2+400*0+40], m5 paddw m3, m3 ; cc5 paddd m4, m4 paddd m5, m5 mova [t1+r10+400*0], m3 mova [t1+r10+400*2], m4 mova [t1+r10+400*4], m5 mova [t4+r10*1+400*2+ 4], m2 mova [t3+r10*2+400*4+ 8], xm0 vextracti128 [t3+r10*2+400*4+40], m0, 1 mova [t3+r10*2+400*4+24], xm1 vextracti128 [t3+r10*2+400*4+56], m1, 1 add r10, 32 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-4] .v1_loop: mova m4, [t1+r10+400* 6] mova m5, [t1+r10+400* 8] mova m6, [t1+r10+400*10] paddw m1, m4, [t2+r10+400* 6] paddd m2, m5, [t2+r10+400* 8] paddd m3, m6, [t2+r10+400*10] mova [t2+r10+400* 6], m4 mova [t2+r10+400* 8], m5 mova [t2+r10+400*10], m6 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 
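; scalar outline of the ab3 step below, pieced together from the inline
; comments (names follow those comments): with a3 = ((sumsq3 + 8) >> 4) * 9
; and bb = (sum3 + 2) >> 2, p3 = max(a3 - bb*bb, 0); x3 is then read from
; sgr_x_by_x at roughly z3 = (p3 * s1) >> 20 clamped to 255, and the stored
; b value is (x3 * sum3 * 455 + (1 << 11) + (1 << 15)) >> 12.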
pmaxud m4, m2 psubd m4, m2 ; p3 pmaxud m5, m3 psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m12 ; b3 * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r13+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r13+m4*4], m5 pmulld m0, m2 pmulld m1, m3 packssdw m2, m3 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 psrld m0, 12 psrld m8, m1, 12 mova [t4+r10*1+400*4+4], m2 mova m4, [t3+r10*2+400*8+ 8] mova m5, [t3+r10*2+400*0+ 8] mova m6, [t3+r10*2+400*0+40] paddw m1, m4, [t2+r10+400*0] paddd m2, m5, [t2+r10+400*2] paddd m3, m6, [t2+r10+400*4] paddw m1, [t1+r10+400*0] paddd m2, [t1+r10+400*2] paddd m3, [t1+r10+400*4] mova [t2+r10+400*0], m4 mova [t2+r10+400*2], m5 mova [t2+r10+400*4], m6 vpbroadcastd m4, [pd_25] mova [t3+r10*2+400*8+ 8], xm0 vextracti128 [t3+r10*2+400*8+40], m0, 1 mova [t3+r10*2+400*8+24], xm8 vextracti128 [t3+r10*2+400*8+56], m8, 1 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 pmulld m2, m4 ; ((a5 + 8) >> 4) * 25 pmulld m3, m4 psrlw m5, m1, 1 pavgw m5, m7 ; (b5 + 2) >> 2 punpcklwd m4, m5, m7 pmaddwd m4, m4 punpckhwd m5, m7 pmaddwd m5, m5 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmaxud m2, m4 psubd m2, m4 ; p5 vpbroadcastd m4, [pd_0xf00800a4] pmaxud m3, m5 psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r13+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r13+m2*4], m3 pmulld m0, m4 pmulld m1, m5 packssdw m4, m5 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 psrld m0, 12 psrld m1, 12 mova [t4+r10*1+400*0+ 4], m4 mova [t3+r10*2+400*0+ 8], xm0 vextracti128 [t3+r10*2+400*0+40], m0, 1 mova [t3+r10*2+400*0+24], xm1 vextracti128 [t3+r10*2+400*0+56], m1, 1 add r10, 32 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu xm0, [t4+r10*1+400*0+2] paddw xm2, xm0, [t4+r10*1+400*0+0] paddw xm2, [t4+r10*1+400*0+4] movu m1, [t3+r10*2+400*0+4] paddd m3, m1, [t3+r10*2+400*0+0] paddd m3, [t3+r10*2+400*0+8] paddw xm0, xm2 paddd m1, m3 psllw xm2, 2 pslld m3, 2 paddw xm0, xm2 ; a5 565 paddd m1, m3 ; b5 565 mova [t4+r10*1+400* 6], xm0 mova [t3+r10*2+400*12], m1 mova xm0, [t4+r10*1+400*2+0] paddw xm0, [t4+r10*1+400*2+4] paddw xm2, xm0, [t4+r10*1+400*2+2] mova m1, [t3+r10*2+400*4+0] paddd m1, [t3+r10*2+400*4+8] paddd m3, m1, [t3+r10*2+400*4+4] psllw xm2, 2 ; a3[-1] 444 pslld m3, 2 ; b3[-1] 444 psubw xm2, xm0 ; a3[-1] 343 psubd m3, m1 ; b3[-1] 343 mova [t4+r10*1+400* 8], xm2 mova [t3+r10*2+400*16], m3 mova xm0, [t4+r10*1+400*4+0] paddw xm0, [t4+r10*1+400*4+4] paddw xm2, xm0, [t4+r10*1+400*4+2] mova m1, [t3+r10*2+400*8+0] paddd m1, [t3+r10*2+400*8+8] paddd m3, m1, [t3+r10*2+400*8+4] psllw xm2, 2 ; a3[ 0] 444 pslld m3, 2 ; b3[ 0] 444 mova [t4+r10*1+400*10], xm2 mova [t3+r10*2+400*20], m3 psubw xm2, xm0 ; a3[ 0] 343 psubd m3, m1 ; b3[ 0] 343 mova [t4+r10*1+400*12], xm2 mova [t3+r10*2+400*24], m3 add r10, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu xm2, [t4+r10*1+2] paddw xm0, xm2, [t4+r10*1+0] paddw xm0, [t4+r10*1+4] paddw xm2, xm0 psllw xm0, 2 paddw xm0, xm2 ; a5 movu m1, [t3+r10*2+4] paddd m4, m1, [t3+r10*2+0] paddd m4, [t3+r10*2+8] paddd m1, m4 pslld m4, 2 paddd m4, m1 ; b5 paddw xm2, xm0, [t4+r10*1+400* 6] mova [t4+r10*1+400* 6], xm0 paddd m0, m4, [t3+r10*2+400*12] mova [t3+r10*2+400*12], m4 mova xm3, 
[t4+r10*1+400*2+0] paddw xm3, [t4+r10*1+400*2+4] paddw xm5, xm3, [t4+r10*1+400*2+2] psllw xm5, 2 ; a3[ 1] 444 psubw xm4, xm5, xm3 ; a3[ 1] 343 paddw xm3, xm4, [t4+r10*1+400* 8] paddw xm3, [t4+r10*1+400*10] mova [t4+r10*1+400* 8], xm4 mova [t4+r10*1+400*10], xm5 mova m1, [t3+r10*2+400*4+0] paddd m1, [t3+r10*2+400*4+8] paddd m5, m1, [t3+r10*2+400*4+4] pslld m5, 2 ; b3[ 1] 444 psubd m4, m5, m1 ; b3[ 1] 343 paddd m1, m4, [t3+r10*2+400*16] paddd m1, [t3+r10*2+400*20] mova [t3+r10*2+400*16], m4 mova [t3+r10*2+400*20], m5 pmovzxwd m4, [dstq+r10] pmovzxwd m2, xm2 ; a5 pmovzxwd m3, xm3 ; a3 pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src pslld m4, 13 psubd m0, m2 ; b5 - a5 * src + (1 << 8) psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 paddd m4, m11 paddd m0, m4 psrad m0, 7 vextracti128 xm1, m0, 1 packusdw xm0, xm1 ; clip psrlw xm0, 6 mova [dstq+r10], xm0 add r10, 16 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova xm3, [t4+r10*1+400*4+0] paddw xm3, [t4+r10*1+400*4+4] paddw xm5, xm3, [t4+r10*1+400*4+2] psllw xm5, 2 ; a3[ 1] 444 psubw xm4, xm5, xm3 ; a3[ 1] 343 paddw xm3, xm4, [t4+r10*1+400*12] paddw xm3, [t4+r10*1+400*10] mova [t4+r10*1+400*10], xm5 mova [t4+r10*1+400*12], xm4 mova m1, [t3+r10*2+400*8+0] paddd m1, [t3+r10*2+400*8+8] paddd m5, m1, [t3+r10*2+400*8+4] pslld m5, 2 ; b3[ 1] 444 psubd m4, m5, m1 ; b3[ 1] 343 paddd m1, m4, [t3+r10*2+400*24] paddd m1, [t3+r10*2+400*20] mova [t3+r10*2+400*20], m5 mova [t3+r10*2+400*24], m4 pmovzxwd m4, [dstq+r10] pmovzxwd m2, [t4+r10*1+400* 6] pmovzxwd m3, xm3 mova m0, [t3+r10*2+400*12] pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src pslld m4, 13 psubd m0, m2 ; b5 - a5 * src + (1 << 8) psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 8 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 paddd m4, m11 paddd m0, m4 psrad m0, 7 vextracti128 xm1, m0, 1 packusdw xm0, xm1 ; clip psrlw xm0, 6 mova [dstq+r10], xm0 add r10, 16 jl .n1_loop add dstq, strideq ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/looprestoration16_avx512.asm000064400000000000000000002367671046102023000173320ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 16 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 r_ext_mask: times 72 db -1 times 8 db 0 wiener_hshift: dw 4, 4, 1, 1 wiener_vshift: dw 1024, 1024, 4096, 4096 wiener_round: dd 1049600, 1048832 pw_164_455: dw 164, 455 pw_1023: times 2 dw 1023 pw_61448: times 2 dw 61448 pd_m262128: dd -262128 pd_m34816: dd -34816 pd_m25: dd -25 pd_m9: dd -9 pd_8: dd 8 pd_2147483648: dd 2147483648 cextern sgr_x_by_x SECTION .text DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers INIT_ZMM avx512icl cglobal wiener_filter7_16bpc, 4, 15, 17, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt %define base t4-wiener_hshift mov fltq, r6mp movifnidn wd, wm movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max vbroadcasti128 m6, [wiener_shufA] vpbroadcastd m12, [fltq+ 0] ; x0 x1 lea t4, [wiener_hshift] vbroadcasti128 m7, [wiener_shufB] add wd, wd vpbroadcastd m13, [fltq+ 4] ; x2 x3 shr t3d, 11 vpbroadcastd m14, [fltq+16] ; y0 y1 add lpfq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 add dstq, wq vbroadcasti128 m8, [wiener_shufC] lea t1, [rsp+wq+16] vbroadcasti128 m9, [wiener_shufD] neg wq vpbroadcastd m0, [base+wiener_hshift+t3*4] mov r10d, 0xfe vpbroadcastd m10, [base+wiener_round+t3*4] kmovb k1, r10d vpbroadcastd m11, [base+wiener_vshift+t3*4] pmullw m12, m0 ; upshift filter coefs to make the vpbroadcastd m16, [pd_m262128] pmullw m13, m0 ; horizontal downshift constant test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v .v2: call .v jmp .v1 .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq xm3, [leftq] vmovdqu64 m3{k1}, [lpfq+r10-8] add leftq, 8 jmp .h_main .h_extend_left: mova m4, [lpfq+r10+0] vpbroadcastw xm3, xm4 vmovdqu64 m3{k1}, [lpfq+r10-8] jmp .h_main2 .h_top: mov r10, wq test edgeb, 1 ; 
LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+r10-8] .h_main: mova m4, [lpfq+r10+0] .h_main2: movu m5, [lpfq+r10+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -68 jl .h_have_right push r0 lea r0, [r_ext_mask+66] vpbroadcastw m0, [lpfq-2] vpternlogd m3, m0, [r0+r10+ 0], 0xe4 ; c ? a : b vpternlogd m4, m0, [r0+r10+ 8], 0xe4 vpternlogd m5, m0, [r0+r10+16], 0xe4 pop r0 .h_have_right: pshufb m2, m3, m6 pshufb m1, m4, m7 paddw m2, m1 pshufb m3, m8 mova m0, m16 vpdpwssd m0, m2, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 vpdpwssd m0, m3, m13 pshufb m2, m5, m7 paddw m2, m1 mova m1, m16 pshufb m4, m8 vpdpwssd m1, m2, m12 pshufb m5, m9 paddw m4, m5 vpdpwssd m1, m4, m13 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+r10], m0 add r10, 64 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq xm3, [leftq] vmovdqu64 m3{k1}, [lpfq+r10-8] add leftq, 8 jmp .hv_main .hv_extend_left: mova m4, [lpfq+r10+0] vpbroadcastw xm3, xm4 vmovdqu64 m3{k1}, [lpfq+r10-8] jmp .hv_main2 .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+r10-8] .hv_main: mova m4, [lpfq+r10+0] .hv_main2: movu m5, [lpfq+r10+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -68 jl .hv_have_right push r0 lea r0, [r_ext_mask+66] vpbroadcastw m0, [lpfq-2] vpternlogd m3, m0, [r0+r10+ 0], 0xe4 vpternlogd m4, m0, [r0+r10+ 8], 0xe4 vpternlogd m5, m0, [r0+r10+16], 0xe4 pop r0 .hv_have_right: pshufb m2, m3, m6 pshufb m1, m4, m7 paddw m2, m1 pshufb m3, m8 mova m0, m16 vpdpwssd m0, m2, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 vpdpwssd m0, m3, m13 pshufb m2, m5, m7 paddw m2, m1 pshufb m4, m8 mova m1, m16 vpdpwssd m1, m2, m12 pshufb m5, m9 paddw m4, m5 vpdpwssd m1, m4, m13 mova m2, [t4+r10] paddw m2, [t2+r10] mova m5, [t3+r10] psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova m4, [t5+r10] paddw m4, [t1+r10] psraw m0, 1 paddw m3, m0, [t6+r10] mova [t0+r10], m0 punpcklwd m1, m2, m5 mova m0, m10 vpdpwssd m0, m1, m15 punpckhwd m2, m5 mova m1, m10 vpdpwssd m1, m2, m15 punpcklwd m2, m3, m4 vpdpwssd m0, m2, m14 punpckhwd m3, m4 vpdpwssd m1, m3, m14 psrad m0, 5 psrad m1, 5 packusdw m0, m1 pmulhuw m0, m11 mova [dstq+r10], m0 add r10, 64 jl .hv_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m2, [t4+r10] paddw m2, [t2+r10] mova m3, [t3+r10] punpcklwd m1, m2, m3 mova m0, m10 vpdpwssd m0, m1, m15 punpckhwd m2, m3 mova m1, m10 vpdpwssd m1, m2, m15 mova m4, [t1+r10] paddw m3, m4, [t6+r10] paddw m4, [t5+r10] punpcklwd m2, m3, m4 vpdpwssd m0, m2, m14 punpckhwd m3, m4 vpdpwssd m1, m3, m14 psrad m0, 5 psrad m1, 5 packusdw m0, m1 pmulhuw m0, m11 mova [dstq+r10], m0 add r10, 64 jl .v_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret cglobal wiener_filter5_16bpc, 4, 14, 15, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt %define base r13-r_ext_mask-70 mov fltq, r6mp movifnidn wd, wm movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max vbroadcasti128 m5, [wiener_shufE] vpbroadcastw m11, [fltq+ 2] ; x1 vbroadcasti128 m6, [wiener_shufB] lea r13, [r_ext_mask+70] vbroadcasti128 m7, [wiener_shufD] add wd, wd vpbroadcastd m12, [fltq+ 4] ; x2 x3 shr t3d, 11 vpbroadcastd m8, [pd_m262128] ; (1 << 4) - (1 << 18) add lpfq, wq vpbroadcastw m13, [fltq+18] ; y1 add dstq, wq vpbroadcastd m14, [fltq+20] ; y2 y3 lea t1, [rsp+wq+16] vpbroadcastd m0, [base+wiener_hshift+t3*4] neg wq vpbroadcastd 
m9, [base+wiener_round+t3*4] mov r10d, 0xfffe vpbroadcastd m10, [base+wiener_vshift+t3*4] kmovw k1, r10d pmullw m11, m0 pmullw m12, m0 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq .v1: call .v jmp .end .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm3, [leftq+4] vmovdqu32 m3{k1}, [lpfq+r10-4] add leftq, 8 jmp .h_main .h_extend_left: vpbroadcastw xm3, [lpfq+r10] vmovdqu32 m3{k1}, [lpfq+r10-4] jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+r10-4] .h_main: movu m4, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -66 jl .h_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m3, m0, [r13+r10+0], 0xe4 ; c ? a : b vpternlogd m4, m0, [r13+r10+8], 0xe4 .h_have_right: pshufb m1, m3, m5 mova m0, m8 vpdpwssd m0, m1, m11 pshufb m2, m4, m5 mova m1, m8 vpdpwssd m1, m2, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 vpdpwssd m0, m2, m12 pshufb m4, m7 paddw m3, m4 vpdpwssd m1, m3, m12 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+r10], m0 add r10, 64 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm3, [leftq+4] vmovdqu32 m3{k1}, [lpfq+r10-4] add leftq, 8 jmp .hv_main .hv_extend_left: vpbroadcastw xm3, [lpfq+r10] vmovdqu32 m3{k1}, [lpfq+r10-4] jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+r10-4] .hv_main: movu m4, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -66 jl .hv_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m3, m0, [r13+r10+0], 0xe4 vpternlogd m4, m0, [r13+r10+8], 0xe4 .hv_have_right: pshufb m1, m3, m5 mova m0, m8 vpdpwssd m0, m1, m11 pshufb m2, m4, m5 mova m1, m8 vpdpwssd m1, m2, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 vpdpwssd m0, m2, m12 pshufb m4, m7 paddw m4, m3 vpdpwssd m1, m4, m12 mova m2, [t3+r10] paddw m2, [t1+r10] mova m3, [t2+r10] punpcklwd m4, m2, m3 punpckhwd m2, m3 mova m3, m9 vpdpwssd m3, m2, m14 mova m2, m9 vpdpwssd m2, m4, m14 mova m4, [t4+r10] psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t0+r10], m0 punpcklwd m1, m0, m4 vpdpwssd m2, m1, m13 punpckhwd m0, m4 vpdpwssd m3, m0, m13 psrad m2, 5 psrad m3, 5 packusdw m2, m3 pmulhuw m2, m10 mova [dstq+r10], m2 add r10, 64 jl .hv_loop mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m0, [t1+r10] paddw m2, m0, [t3+r10] mova m1, [t2+r10] mova m4, [t4+r10] punpckhwd m3, m2, m1 pmaddwd m3, m14 punpcklwd m2, m1 pmaddwd m2, m14 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 5 psrad m0, 5 packusdw m0, m1 pmulhuw 
m0, m10 mova [dstq+r10], m0 add r10, 64 jl .v_loop ret cglobal sgr_filter_5x5_16bpc, 4, 14, 22, 416*24+8, dst, stride, left, lpf, \ w, h, edge, params %define base r13-r_ext_mask-72 movifnidn wd, wm mov paramsq, r6mp lea r13, [r_ext_mask+72] mov edged, r7m movifnidn hd, hm pxor m6, m6 vpbroadcastw m7, [paramsq+8] ; w0 add wd, wd vpbroadcastd m8, [base+pd_8] add lpfq, wq vpbroadcastd m9, [base+pd_m25] add dstq, wq vpsubd m10, m6, [paramsq+0] {1to16} ; -s0 lea t3, [rsp+wq*2+416*12+8] vpbroadcastd m11, [base+pw_164_455] lea t4, [rsp+wq+416*20+8] vpbroadcastd m12, [base+pw_61448] ; (15 << 12) + (1 << 3) lea t1, [rsp+wq+12] vpbroadcastd m13, [base+pd_m34816] ; -((1 << 11) + (1 << 15)) neg wq vpbroadcastd m14, [base+pw_1023] psllw m7, 4 mova m18, [sgr_x_by_x+64*0] mov r10d, 0xfffffff8 mova m19, [sgr_x_by_x+64*1] kmovd k1, r10d mova m20, [sgr_x_by_x+64*2] mov r10, 0x3333333333333333 mova m21, [sgr_x_by_x+64*3] kmovq k2, r10 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call .top_fixup add t1, 416*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq test hd, hd jz .odd_height call .h add lpfq, strideq call .hv call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .h_top add lpfq, strideq call .hv_bottom .end: call .n0 call .n1 .end2: RET .height1: call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv call .n0 call .n1 .odd_height_end: call .v call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+416*6] call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 jmp .main .no_top_height1: call .v call .prep_n jmp .odd_height_end .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq xm16, [leftq+2] vmovdqu16 m16{k1}, [lpfq+wq-6] add leftq, 8 jmp .h_main .h_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-6] jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m16, [lpfq+r10- 2] .h_main: movu m17, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -68 jl .h_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 ; c ? 
a : b vpternlogd m17, m0, [r13+r10+16], 0xe4 .h_have_right: palignr m2, m17, m16, 2 paddw m0, m16, m2 palignr m3, m17, m16, 6 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 shufpd m17, m16, m17, 0x55 paddw m0, m17 punpcklwd m3, m16, m17 vpdpwssd m1, m3, m3 punpckhwd m3, m16, m17 vpdpwssd m2, m3, m3 shufps m16, m17, q2121 paddw m0, m16 ; sum test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+r10+416*0] paddd m1, [t1+r10+416*2] paddd m2, [t1+r10+416*4] .h_loop_end: punpcklwd m17, m16, m6 vpdpwssd m1, m17, m17 ; sumsq punpckhwd m16, m6 vpdpwssd m2, m16, m16 mova [t1+r10+416*0], m0 mova [t1+r10+416*2], m1 mova [t1+r10+416*4], m2 add r10, 64 jl .h_loop ret .top_fixup: lea r10, [wq-4] .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+r10+416*0] mova m1, [t1+r10+416*2] mova m2, [t1+r10+416*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m1 mova [t2+r10+416*4], m2 add r10, 64 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq xm16, [leftq+2] vmovdqu16 m16{k1}, [lpfq+wq-6] add leftq, 8 jmp .hv_main .hv_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-6] jmp .hv_main .hv_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m16, [lpfq+r10- 2] .hv_main: movu m17, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -68 jl .hv_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .hv_have_right: palignr m3, m17, m16, 2 paddw m0, m16, m3 palignr m1, m17, m16, 6 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 shufpd m17, m16, m17, 0x55 paddw m0, m17 punpcklwd m1, m16, m17 vpdpwssd m2, m1, m1 punpckhwd m1, m16, m17 vpdpwssd m3, m1, m1 shufps m16, m17, q2121 paddw m0, m16 ; h sum punpcklwd m17, m16, m6 vpdpwssd m2, m17, m17 ; h sumsq punpckhwd m16, m6 vpdpwssd m3, m16, m16 paddw m1, m0, [t1+r10+416*0] paddd m16, m2, [t1+r10+416*2] paddd m17, m3, [t1+r10+416*4] test hd, hd jz .hv_last_row .hv_main2: paddw m1, [t2+r10+416*0] ; hv sum paddd m16, [t2+r10+416*2] ; hv sumsq paddd m17, [t2+r10+416*4] mova [t0+r10+416*0], m0 mova [t0+r10+416*2], m2 mova [t0+r10+416*4], m3 psrlw m3, m1, 1 paddd m16, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m17, m8 psrld m16, 4 ; (a + 8) >> 4 psrld m17, 4 pmulld m16, m9 ; -a * 25 pmulld m17, m9 punpcklwd m2, m3, m6 vpdpwssd m16, m2, m2 ; -p punpckhwd m3, m6 vpdpwssd m17, m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m16, m10 ; p * s pmulld m17, m10 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 pmaxsw m17, m6 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 packssdw m16, m17 psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) psubd m1, m13 mova [t4+r10+4], m16 psrld m16, m0, 12 ; b psrld m17, m1, 12 mova [t3+r10*2+ 8], xm16 mova [t3+r10*2+ 24], xm17 vextracti128 [t3+r10*2+ 40], ym16, 1 vextracti128 [t3+r10*2+ 56], ym17, 1 vextracti32x4 [t3+r10*2+ 72], m16, 2 vextracti32x4 [t3+r10*2+ 88], m17, 2 vextracti32x4 [t3+r10*2+104], m16, 3 vextracti32x4 [t3+r10*2+120], m17, 3 add r10, 64 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd 
heights mova [t1+r10+416*0], m1 paddw m1, m0 mova [t1+r10+416*2], m16 paddd m16, m2 mova [t1+r10+416*4], m17 paddd m17, m3 jmp .hv_main2 .v: ; vertical boxsum + ab lea r10, [wq-4] .v_loop: mova m2, [t1+r10+416*2] mova m3, [t1+r10+416*4] mova m0, [t1+r10+416*0] paddd m16, m2, [t2+r10+416*2] paddd m17, m3, [t2+r10+416*4] paddw m1, m0, [t2+r10+416*0] paddd m2, m2 paddd m3, m3 paddd m16, m2 ; hv sumsq paddd m17, m3 paddd m16, m8 paddd m17, m8 psrld m16, 4 ; (a + 8) >> 4 psrld m17, 4 pmulld m16, m9 ; -a * 25 pmulld m17, m9 paddw m0, m0 paddw m1, m0 ; hv sum psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 vpdpwssd m16, m2, m2 ; -p punpckhwd m3, m6 vpdpwssd m17, m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m16, m10 ; p * s pmulld m17, m10 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 pmaxsw m17, m6 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 packssdw m16, m17 psubd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) psubd m1, m13 mova [t4+r10+4], m16 psrld m16, m0, 12 ; b psrld m17, m1, 12 mova [t3+r10*2+ 8], xm16 mova [t3+r10*2+ 24], xm17 vextracti128 [t3+r10*2+ 40], ym16, 1 vextracti128 [t3+r10*2+ 56], ym17, 1 vextracti32x4 [t3+r10*2+ 72], m16, 2 vextracti32x4 [t3+r10*2+ 88], m17, 2 vextracti32x4 [t3+r10*2+104], m16, 3 vextracti32x4 [t3+r10*2+120], m17, 3 add r10, 64 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t4+r10*1+ 2] movu m1, [t3+r10*2+ 4] movu m2, [t3+r10*2+68] paddw m3, m0, [t4+r10*1+ 0] paddd m16, m1, [t3+r10*2+ 0] paddd m17, m2, [t3+r10*2+64] paddw m3, [t4+r10*1+ 4] paddd m16, [t3+r10*2+ 8] paddd m17, [t3+r10*2+72] paddw m0, m3 psllw m3, 2 paddd m1, m16 pslld m16, 2 paddd m2, m17 pslld m17, 2 paddw m0, m3 ; a 565 paddd m1, m16 ; b 565 paddd m2, m17 mova [t4+r10*1+416*2+ 0], m0 mova [t3+r10*2+416*4+ 0], m1 mova [t3+r10*2+416*4+64], m2 add r10, 64 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t4+r10*1+ 2] movu m1, [t3+r10*2+ 4] movu m2, [t3+r10*2+68] paddw m3, m0, [t4+r10*1+ 0] paddd m16, m1, [t3+r10*2+ 0] paddd m17, m2, [t3+r10*2+64] paddw m3, [t4+r10*1+ 4] paddd m16, [t3+r10*2+ 8] paddd m17, [t3+r10*2+72] paddw m0, m3 psllw m3, 2 paddd m1, m16 pslld m16, 2 paddd m2, m17 pslld m17, 2 paddw m0, m3 ; a 565 paddd m1, m16 ; b 565 paddd m2, m17 paddw m3, m0, [t4+r10*1+416*2+ 0] paddd m16, m1, [t3+r10*2+416*4+ 0] paddd m17, m2, [t3+r10*2+416*4+64] mova [t4+r10*1+416*2+ 0], m0 mova [t3+r10*2+416*4+ 0], m1 mova [t3+r10*2+416*4+64], m2 mova m0, [dstq+r10] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vshufi32x4 m1, m16, m17, q2020 vshufi32x4 m16, m17, q3131 psubd m1, m2 ; b - a * src + (1 << 8) psubd m16, m3 psrad m1, 9 psrad m16, 9 packssdw m1, m16 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 64 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m0, [dstq+r10] mova m3, [t4+r10*1+416*2+ 0] mova m16, [t3+r10*2+416*4+ 0] mova m17, [t3+r10*2+416*4+64] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vshufi32x4 m1, m16, m17, q2020 vshufi32x4 m16, m17, q3131 psubd m1, m2 ; b - 
a * src + (1 << 7) psubd m16, m3 psrad m1, 8 psrad m16, 8 packssdw m1, m16 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 64 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_3x3_16bpc, 4, 14, 22, 416*42+8, dst, stride, left, lpf, \ w, h, edge, params movifnidn wd, wm mov paramsq, r6mp lea r13, [r_ext_mask+72] mov edged, r7m movifnidn hd, hm pxor m6, m6 vpbroadcastw m7, [paramsq+10] ; w1 add wd, wd vpbroadcastd m8, [base+pd_8] add lpfq, wq vpbroadcastd m9, [base+pd_m9] add dstq, wq vpsubd m10, m6, [paramsq+4] {1to16} ; -s1 lea t3, [rsp+wq*2+416*12+8] vpbroadcastd m11, [base+pw_164_455] lea t4, [rsp+wq+416*32+8] vpbroadcastd m12, [base+pw_61448] lea t1, [rsp+wq+12] vpbroadcastd m13, [base+pd_m34816] neg wq vpbroadcastd m14, [base+pw_1023] psllw m7, 4 mova m18, [sgr_x_by_x+64*0] mov r10d, 0xfffffffc mova m19, [sgr_x_by_x+64*1] kmovd k1, r10d mova m20, [sgr_x_by_x+64*2] mov r10, 0x3333333333333333 mova m21, [sgr_x_by_x+64*3] kmovq k2, r10 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 add t1, 416*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] lea t2, [t1+416*6] .top_fixup_loop: mova m0, [t1+r10+416*0] mova m1, [t1+r10+416*2] mova m2, [t1+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m1 mova [t2+r10+416*4], m2 add r10, 64 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm16, [leftq+4] vmovdqu16 m16{k1}, [lpfq+wq-4] add leftq, 8 jmp .h_main .h_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-4] jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m16, [lpfq+r10+ 0] .h_main: movu m17, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -66 jl .h_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .h_have_right: palignr m0, m17, m16, 2 paddw m1, m16, m0 punpcklwd m2, m16, m0 pmaddwd m2, m2 punpckhwd m3, m16, m0 pmaddwd m3, m3 palignr m17, m16, 4 paddw m1, m17 ; sum punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; sumsq punpckhwd m17, m6 vpdpwssd m3, m17, m17 mova [t1+r10+416*0], m1 mova [t1+r10+416*2], m2 mova [t1+r10+416*4], m3 add r10, 64 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movd xm16, [leftq+4] vmovdqu16 m16{k1}, [lpfq+wq-4] add leftq, 8 jmp .hv0_main .hv0_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-4] jmp .hv0_main .hv0_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu m16, [lpfq+r10+ 0] .hv0_main: movu m17, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz 
.hv0_have_right cmp r10d, -66 jl .hv0_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .hv0_have_right: palignr m0, m17, m16, 2 paddw m1, m16, m0 punpcklwd m2, m16, m0 pmaddwd m2, m2 punpckhwd m3, m16, m0 pmaddwd m3, m3 palignr m17, m16, 4 paddw m1, m17 ; sum punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; sumsq punpckhwd m17, m6 vpdpwssd m3, m17, m17 paddw m0, m1, [t1+r10+416*0] paddd m16, m2, [t1+r10+416*2] paddd m17, m3, [t1+r10+416*4] mova [t1+r10+416*0], m1 mova [t1+r10+416*2], m2 mova [t1+r10+416*4], m3 paddw m1, m0, [t2+r10+416*0] paddd m2, m16, [t2+r10+416*2] paddd m3, m17, [t2+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m16 mova [t2+r10+416*4], m17 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pmulld m2, m9 ; -((a + 8) >> 4) * 9 pmulld m3, m9 psrlw m17, m1, 1 pavgw m17, m6 ; (b + 2) >> 2 punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; -p punpckhwd m17, m6 vpdpwssd m3, m17, m17 punpcklwd m16, m6, m1 ; b punpckhwd m17, m6, m1 pminsd m2, m6 pminsd m3, m6 pmulld m2, m10 ; p * s pmulld m3, m10 pmaddwd m16, m11 ; b * 455 pmaddwd m17, m11 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 paddusw m3, m12 psraw m3, 4 ; min(z, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x pandn m2, m13, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) psubd m17, m13 mova [t4+r10*1+416*0+4], m2 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*0+ 8], xm16 mova [t3+r10*2+416*0+ 24], xm17 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 add r10, 64 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movd xm16, [leftq+4] vmovdqu16 m16{k1}, [lpfq+wq-4] add leftq, 8 jmp .hv1_main .hv1_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-4] jmp .hv1_main .hv1_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu m16, [lpfq+r10+ 0] .hv1_main: movu m17, [lpfq+r10+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -66 jl .hv1_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .hv1_have_right: palignr m1, m17, m16, 2 paddw m0, m16, m1 punpcklwd m2, m16, m1 pmaddwd m2, m2 punpckhwd m3, m16, m1 pmaddwd m3, m3 palignr m17, m16, 4 paddw m0, m17 ; h sum punpcklwd m1, m17, m6 vpdpwssd m2, m1, m1 ; h sumsq punpckhwd m17, m6 vpdpwssd m3, m17, m17 paddw m1, m0, [t2+r10+416*0] paddd m16, m2, [t2+r10+416*2] paddd m17, m3, [t2+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m2 mova [t2+r10+416*4], m3 paddd m16, m8 paddd m17, m8 psrld m16, 4 ; (a + 8) >> 4 psrld m17, 4 pmulld m16, m9 ; -((a + 8) >> 4) * 9 pmulld m17, m9 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 vpdpwssd m16, m2, m2 ; -p punpckhwd m3, m6 vpdpwssd m17, m3, m3 punpcklwd m0, m6, m1 ; b punpckhwd m1, m6, m1 pminsd m16, m6 pminsd m17, m6 pmulld m16, m10 ; p * s pmulld m17, m10 pmaddwd m0, m11 ; b * 455 pmaddwd m1, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] 
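; the byte-permute sequence here stands in for the vpgatherdd lookup used by
; the AVX2 version of this filter: m18-m21 hold the 256-byte sgr_x_by_x
; table, vpermt2b serves entries 128..255 (m20/m21), vpermi2b entries
; 0..127 (m18/m19), and vpmovb2m keys off bit 7 of each index byte (the word
; index is min(z, 255) - 256, so its low byte is z and bit 7 is set when
; z >= 128), letting vmovdqu8 merge the two half-table results into x.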
vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 packssdw m16, m17 psubd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) psubd m1, m13 mova [t4+r10*1+416*2+4], m16 psrld m16, m0, 12 psrld m17, m1, 12 mova [t3+r10*2+416*4+ 8], xm16 mova [t3+r10*2+416*4+ 24], xm17 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 add r10, 64 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab (even rows) lea r10, [wq-4] .v0_loop: mova m0, [t1+r10+416*0] mova m16, [t1+r10+416*2] mova m17, [t1+r10+416*4] paddw m0, m0 paddd m16, m16 paddd m17, m17 paddw m1, m0, [t2+r10+416*0] paddd m2, m16, [t2+r10+416*2] paddd m3, m17, [t2+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m16 mova [t2+r10+416*4], m17 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pmulld m2, m9 ; -((a + 8) >> 4) * 9 pmulld m3, m9 psrlw m17, m1, 1 pavgw m17, m6 ; (b + 2) >> 2 punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; -p punpckhwd m17, m6 vpdpwssd m3, m17, m17 punpcklwd m16, m6, m1 ; b punpckhwd m17, m6, m1 pminsd m2, m6 pminsd m3, m6 pmulld m2, m10 ; p * s pmulld m3, m10 pmaddwd m16, m11 ; b * 455 pmaddwd m17, m11 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 paddusw m3, m12 psraw m3, 4 ; min(z, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x pandn m2, m13, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) psubd m17, m13 mova [t4+r10*1+416*0+4], m2 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*0+ 8], xm16 mova [t3+r10*2+416*0+ 24], xm17 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 add r10, 64 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-4] .v1_loop: mova m0, [t1+r10+416*0] mova m16, [t1+r10+416*2] mova m17, [t1+r10+416*4] paddw m1, m0, [t2+r10+416*0] paddd m2, m16, [t2+r10+416*2] paddd m3, m17, [t2+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m16 mova [t2+r10+416*4], m17 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pmulld m2, m9 ; -((a + 8) >> 4) * 9 pmulld m3, m9 psrlw m17, m1, 1 pavgw m17, m6 ; (b + 2) >> 2 punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; -p punpckhwd m17, m6 vpdpwssd m3, m17, m17 punpcklwd m16, m6, m1 ; b punpckhwd m17, m6, m1 pminsd m2, m6 pminsd m3, m6 pmulld m2, m10 ; p * s pmulld m3, m10 pmaddwd m16, m11 ; b * 455 pmaddwd m17, m11 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 paddusw m3, m12 psraw m3, 4 ; min(z, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x pandn m2, m13, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 psubd m16, m13 ; x * b * 455 + (1 << 11) + (1 << 15) psubd m17, m13 mova [t4+r10*1+416*2+4], m2 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*4+ 8], xm16 mova [t3+r10*2+416*4+ 24], xm17 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 vextracti32x4 
[t3+r10*2+416*4+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 add r10, 64 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: mova ym16, [t4+r10*1+416*0+0] paddw ym16, [t4+r10*1+416*0+4] paddw ym17, ym16, [t4+r10*1+416*0+2] mova m0, [t3+r10*2+416*0+0] paddd m0, [t3+r10*2+416*0+8] paddd m1, m0, [t3+r10*2+416*0+4] psllw ym17, 2 ; a[-1] 444 pslld m1, 2 ; b[-1] 444 psubw ym17, ym16 ; a[-1] 343 psubd m1, m0 ; b[-1] 343 vmovdqa32 [t4+r10*1+416* 4], ym17 vmovdqa32 [t3+r10*2+416* 8], m1 mova ym16, [t4+r10*1+416*2+0] paddw ym16, [t4+r10*1+416*2+4] paddw ym17, ym16, [t4+r10*1+416*2+2] mova m0, [t3+r10*2+416*4+0] paddd m0, [t3+r10*2+416*4+8] paddd m1, m0, [t3+r10*2+416*4+4] psllw ym17, 2 ; a[ 0] 444 pslld m1, 2 ; b[ 0] 444 vmovdqa32 [t4+r10*1+416* 6], ym17 vmovdqa32 [t3+r10*2+416*12], m1 psubw ym17, ym16 ; a[ 0] 343 psubd m1, m0 ; b[ 0] 343 vmovdqa32 [t4+r10*1+416* 8], ym17 vmovdqa32 [t3+r10*2+416*16], m1 add r10, 32 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: mova m3, [t4+r10*1+416*0+0] paddw m3, [t4+r10*1+416*0+4] paddw m1, m3, [t4+r10*1+416*0+2] psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+r10*1+416*4] paddw m3, [t4+r10*1+416*6] mova [t4+r10*1+416*4], m2 mova [t4+r10*1+416*6], m1 mova m16, [t3+r10*2+416*0+0] paddd m16, [t3+r10*2+416*0+8] paddd m1, m16, [t3+r10*2+416*0+4] pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m16 ; b[ 1] 343 paddd m16, m2, [t3+r10*2+416* 8+ 0] paddd m16, [t3+r10*2+416*12+ 0] mova [t3+r10*2+416* 8+ 0], m2 mova [t3+r10*2+416*12+ 0], m1 mova m17, [t3+r10*2+416*0+64] paddd m17, [t3+r10*2+416*0+72] paddd m1, m17, [t3+r10*2+416*0+68] pslld m1, 2 psubd m2, m1, m17 paddd m17, m2, [t3+r10*2+416* 8+64] paddd m17, [t3+r10*2+416*12+64] mova [t3+r10*2+416* 8+64], m2 mova [t3+r10*2+416*12+64], m1 mova m0, [dstq+r10] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vshufi32x4 m1, m16, m17, q2020 vshufi32x4 m16, m17, q3131 psubd m1, m2 ; b - a * src + (1 << 8) psubd m16, m3 psrad m1, 9 psrad m16, 9 packssdw m1, m16 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+r10], m0 add r10, 64 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m3, [t4+r10*1+416*2+0] paddw m3, [t4+r10*1+416*2+4] paddw m1, m3, [t4+r10*1+416*2+2] psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+r10*1+416*6] paddw m3, [t4+r10*1+416*8] mova [t4+r10*1+416*6], m1 mova [t4+r10*1+416*8], m2 mova m16, [t3+r10*2+416*4+0] paddd m16, [t3+r10*2+416*4+8] paddd m1, m16, [t3+r10*2+416*4+4] pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m16 ; b[ 1] 343 paddd m16, m2, [t3+r10*2+416*12+ 0] paddd m16, [t3+r10*2+416*16+ 0] mova [t3+r10*2+416*12+ 0], m1 mova [t3+r10*2+416*16+ 0], m2 mova m17, [t3+r10*2+416*4+64] paddd m17, [t3+r10*2+416*4+72] paddd m1, m17, [t3+r10*2+416*4+68] pslld m1, 2 psubd m2, m1, m17 paddd m17, m2, [t3+r10*2+416*12+64] paddd m17, [t3+r10*2+416*16+64] mova [t3+r10*2+416*12+64], m1 mova [t3+r10*2+416*16+64], m2 mova m0, [dstq+r10] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 vshufi32x4 m1, m16, m17, q2020 vshufi32x4 m16, m17, q3131 psubd m1, m2 ; b - a * src + (1 << 8) psubd m16, m3 psrad m1, 9 psrad m16, 9 packssdw m1, m16 pmulhrsw m1, m7 paddw m0, m1 pmaxsw m0, m6 pminsw m0, m14 mova 
[dstq+r10], m0 add r10, 64 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_mix_16bpc, 4, 14, 23, 416*66+8, dst, stride, left, lpf, \ w, h, edge, params movifnidn wd, wm mov paramsq, r6mp lea r13, [r_ext_mask+72] mov edged, r7m movifnidn hd, hm vpbroadcastd m7, [paramsq+8] ; w0 w1 pxor m6, m6 vpbroadcastd m8, [base+pd_8] add wd, wd vpbroadcastd m9, [base+pd_m9] add lpfq, wq vpbroadcastd m10, [base+pd_m25] add dstq, wq vpsubd m11, m6, [paramsq+0] {1to16} ; -s0 lea t3, [rsp+wq*2+416*24+8] vpsubd m12, m6, [paramsq+4] {1to16} ; -s1 lea t4, [rsp+wq+416*52+8] vpbroadcastd m13, [base+pw_164_455] lea t1, [rsp+wq+12] vpbroadcastd m14, [base+pw_61448] neg wq vpbroadcastd m15, [base+pd_m34816] psllw m7, 2 vpbroadcastd m22, [base+pd_2147483648] mov r10d, 0xfffffff8 mova m18, [sgr_x_by_x+64*0] kmovd k1, r10d mova m19, [sgr_x_by_x+64*1] mov r10, 0x3333333333333333 mova m20, [sgr_x_by_x+64*2] kmovq k2, r10 mova m21, [sgr_x_by_x+64*3] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_avx512icl).top_fixup add t1, 416*12 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea r10, [wq-4] lea t2, [t1+416*12] .top_fixup_loop: mova m0, [t1+r10+416* 0] mova m1, [t1+r10+416* 2] mova m2, [t1+r10+416* 4] paddw m0, m0 mova m3, [t1+r10+416* 6] paddd m1, m1 mova m4, [t1+r10+416* 8] paddd m2, m2 mova m5, [t1+r10+416*10] mova [t2+r10+416* 0], m0 mova [t2+r10+416* 2], m1 mova [t2+r10+416* 4], m2 mova [t2+r10+416* 6], m3 mova [t2+r10+416* 8], m4 mova [t2+r10+416*10], m5 add r10, 64 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsum lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq xm16, [leftq+2] vmovdqu16 m16{k1}, [lpfq+wq-6] add leftq, 8 jmp .h_main .h_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-6] jmp .h_main .h_top: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m16, [lpfq+r10- 2] .h_main: movu m17, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -68 jl .h_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .h_have_right: palignr m3, m17, m16, 2 palignr m0, m17, m16, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m17, m16, 6 paddw m1, m0 ; sum3 punpcklwd m4, m0, m6 vpdpwssd m2, m4, m4 ; sumsq3 punpckhwd m0, m6 vpdpwssd m3, m0, m0 shufpd m4, m16, m17, 0x55 punpcklwd m17, m4, m16 paddw m0, m16, m4 punpckhwd m4, m16 mova [t1+r10+416* 6], m1 mova [t1+r10+416* 8], m2 mova [t1+r10+416*10], m3 paddw m1, m0 ; sum5 vpdpwssd m2, m17, m17 ; sumsq5 vpdpwssd m3, m4, m4 mova [t1+r10+416* 0], m1 mova [t1+r10+416* 2], m2 mova [t1+r10+416* 4], m3 add r10, 64 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + 
vertical boxsum + ab3 (even rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movq xm16, [leftq+2] vmovdqu16 m16{k1}, [lpfq+wq-6] add leftq, 8 jmp .hv0_main .hv0_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-6] jmp .hv0_main .hv0_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu m16, [lpfq+r10- 2] .hv0_main: movu m17, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -68 jl .hv0_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .hv0_have_right: palignr m3, m17, m16, 2 palignr m0, m17, m16, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m17, m16, 6 paddw m1, m0 ; h sum3 punpcklwd m4, m0, m6 vpdpwssd m2, m4, m4 ; h sumsq3 punpckhwd m0, m6 vpdpwssd m3, m0, m0 shufpd m17, m16, m17, 0x55 paddw m4, m1, [t1+r10+416* 6] paddd m5, m2, [t1+r10+416* 8] mova [t1+r10+416* 6], m1 mova [t1+r10+416* 8], m2 paddw m1, m16 paddw m1, m17 ; h sum5 punpcklwd m0, m17, m16 vpdpwssd m2, m0, m0 ; h sumsq5 paddd m0, m3, [t1+r10+416*10] mova [t1+r10+416*10], m3 punpckhwd m17, m16 vpdpwssd m3, m17, m17 mova [t3+r10*2+416*8+ 8], m1 ; we need a clean copy of the last row mova [t3+r10*2+416*0+ 8], m2 ; in case height is odd mova [t3+r10*2+416*0+72], m3 paddw m1, [t1+r10+416* 0] paddd m2, [t1+r10+416* 2] paddd m3, [t1+r10+416* 4] mova [t1+r10+416* 0], m1 mova [t1+r10+416* 2], m2 mova [t1+r10+416* 4], m3 paddw m17, m4, [t2+r10+416* 6] paddd m2, m5, [t2+r10+416* 8] paddd m3, m0, [t2+r10+416*10] mova [t2+r10+416* 6], m4 mova [t2+r10+416* 8], m5 mova [t2+r10+416*10], m0 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 pmulld m3, m9 psrlw m5, m17, 1 pavgw m5, m6 ; (b3 + 2) >> 2 punpcklwd m4, m5, m6 vpdpwssd m2, m4, m4 ; -p3 punpckhwd m5, m6 vpdpwssd m3, m5, m5 punpcklwd m16, m6, m17 ; b3 punpckhwd m17, m6, m17 pminsd m2, m6 pminsd m3, m6 pmulld m2, m12 ; p3 * s1 pmulld m3, m12 pmaddwd m16, m13 ; b3 * 455 pmaddwd m17, m13 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 paddusw m3, m14 psraw m3, 4 ; min(z3, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x3 pandn m2, m15, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 mova [t4+r10*1+416*2+4], m2 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*4+ 8], xm16 mova [t3+r10*2+416*4+ 24], xm17 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 add r10, 64 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movq xm16, [leftq+2] vmovdqu16 m16{k1}, [lpfq+wq-6] add leftq, 8 jmp .hv1_main .hv1_extend_left: vpbroadcastw xm16, [lpfq+wq] vmovdqu16 m16{k1}, [lpfq+wq-6] jmp .hv1_main .hv1_bottom: lea r10, [wq-4] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu m16, [lpfq+r10- 2] .hv1_main: movu m17, [lpfq+r10+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -68 jl .hv1_have_right vpbroadcastw m0, [lpfq-2] vpternlogd m16, m0, [r13+r10+ 0], 0xe4 vpternlogd m17, m0, [r13+r10+16], 0xe4 .hv1_have_right: palignr m1, m17, 
m16, 2 palignr m3, m17, m16, 4 paddw m2, m1, m3 punpcklwd m0, m1, m3 pmaddwd m0, m0 punpckhwd m1, m3 pmaddwd m1, m1 palignr m3, m17, m16, 6 paddw m2, m3 ; h sum3 punpcklwd m5, m3, m6 vpdpwssd m0, m5, m5 ; h sumsq3 punpckhwd m3, m6 vpdpwssd m1, m3, m3 shufpd m3, m16, m17, 0x55 punpcklwd m5, m16, m3 paddw m4, m16, m3 punpckhwd m16, m3 paddw m17, m2, [t2+r10+416* 6] mova [t2+r10+416* 6], m2 paddw m4, m2 ; h sum5 paddd m2, m0, [t2+r10+416* 8] paddd m3, m1, [t2+r10+416*10] mova [t2+r10+416* 8], m0 mova [t2+r10+416*10], m1 vpdpwssd m0, m5, m5 ; h sumsq5 vpdpwssd m1, m16, m16 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pmulld m2, m9 ; -((a3 + 8) >> 4) * 9 pmulld m3, m9 psrlw m16, m17, 1 pavgw m16, m6 ; (b3 + 2) >> 2 punpcklwd m5, m16, m6 vpdpwssd m2, m5, m5 ; -p3 punpckhwd m16, m6 vpdpwssd m3, m16, m16 punpcklwd m16, m6, m17 ; b3 punpckhwd m17, m6, m17 pminsd m2, m6 pminsd m3, m6 pmulld m2, m12 ; p3 * s1 pmulld m3, m12 pmaddwd m16, m13 ; b3 * 455 pmaddwd m17, m13 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 paddusw m3, m14 psraw m3, 4 ; min(z3, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x3 pandn m2, m15, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 mova [t4+r10*1+416*4+4], m2 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 paddw m5, m4, [t2+r10+416*0] paddd m2, m0, [t2+r10+416*2] paddd m3, m1, [t2+r10+416*4] paddw m5, [t1+r10+416*0] paddd m2, [t1+r10+416*2] paddd m3, [t1+r10+416*4] mova [t2+r10+416*0], m4 mova [t2+r10+416*2], m0 mova [t2+r10+416*4], m1 mova [t3+r10*2+416*8+ 8], xm16 mova [t3+r10*2+416*8+ 24], xm17 vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*8+104], m16, 3 vextracti32x4 [t3+r10*2+416*8+120], m17, 3 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 pmulld m3, m10 psrlw m17, m5, 1 pavgw m17, m6 ; (b5 + 2) >> 2 punpcklwd m16, m17, m6 vpdpwssd m2, m16, m16 ; -p5 punpckhwd m17, m6 vpdpwssd m3, m17, m17 punpcklwd m16, m5, m6 ; b5 punpckhwd m17, m5, m6 pmulld m2, m11 ; p5 * s0 pmulld m3, m11 pmaddwd m16, m13 ; b5 * 164 pmaddwd m17, m13 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 pmaxsw m3, m6 paddusw m3, m14 psraw m3, 4 ; min(z5, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x5 pandn m2, m15, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 mova [t4+r10*1+416*0+4], m2 psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*0+ 8], xm16 mova [t3+r10*2+416*0+ 24], xm17 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 add r10, 64 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) lea r10, [wq-4] .v0_loop: mova m16, [t1+r10+416* 6] mova m2, [t1+r10+416* 8] mova m3, [t1+r10+416*10] paddw m16, m16 paddd m2, m2 paddd m3, m3 paddw m17, m16, [t2+r10+416* 6] paddd m4, m2, [t2+r10+416* 8] paddd m5, m3, [t2+r10+416*10] mova [t2+r10+416* 6], m16 mova [t2+r10+416* 8], m2 mova [t2+r10+416*10], m3 paddd m4, m8 paddd 
m5, m8 psrld m4, 4 ; (a3 + 8) >> 4 psrld m5, 4 pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 pmulld m5, m9 psrlw m3, m17, 1 pavgw m3, m6 ; (b3 + 2) >> 2 punpcklwd m2, m3, m6 vpdpwssd m4, m2, m2 ; -p3 punpckhwd m3, m6 vpdpwssd m5, m3, m3 punpcklwd m16, m6, m17 ; b3 punpckhwd m17, m6, m17 pminsd m4, m6 pminsd m5, m6 pmulld m4, m12 ; p3 * s1 pmulld m5, m12 pmaddwd m16, m13 ; b3 * 455 pmaddwd m17, m13 vpalignr m5{k2}, m4, m4, 2 mova m4, m20 paddusw m5, m14 psraw m5, 4 ; min(z3, 255) - 256 vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m5 vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m5{k3}, m4 ; x3 pandn m4, m15, m5 psrld m5, 16 pmulld m16, m4 pmulld m17, m5 packssdw m4, m5 mova [t4+r10*1+416*2+4], m4 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 mova m3, [t1+r10+416*0] mova m4, [t1+r10+416*2] mova m5, [t1+r10+416*4] mova [t3+r10*2+416*8+ 8], m3 mova [t3+r10*2+416*0+ 8], m4 mova [t3+r10*2+416*0+72], m5 paddw m3, m3 ; cc5 paddd m4, m4 paddd m5, m5 mova [t1+r10+416*0], m3 mova [t1+r10+416*2], m4 mova [t1+r10+416*4], m5 mova [t3+r10*2+416*4+ 8], xm16 mova [t3+r10*2+416*4+ 24], xm17 vextracti128 [t3+r10*2+416*4+ 40], ym16, 1 vextracti128 [t3+r10*2+416*4+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*4+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*4+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*4+104], m16, 3 vextracti32x4 [t3+r10*2+416*4+120], m17, 3 add r10, 64 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-4] .v1_loop: mova m16, [t1+r10+416* 6] mova m2, [t1+r10+416* 8] mova m3, [t1+r10+416*10] paddw m17, m16, [t2+r10+416* 6] paddd m4, m2, [t2+r10+416* 8] paddd m5, m3, [t2+r10+416*10] mova [t2+r10+416* 6], m16 mova [t2+r10+416* 8], m2 mova [t2+r10+416*10], m3 paddd m4, m8 paddd m5, m8 psrld m4, 4 ; (a3 + 8) >> 4 psrld m5, 4 pmulld m4, m9 ; -((a3 + 8) >> 4) * 9 pmulld m5, m9 psrlw m3, m17, 1 pavgw m3, m6 ; (b3 + 2) >> 2 punpcklwd m2, m3, m6 vpdpwssd m4, m2, m2 ; -p3 punpckhwd m3, m6 vpdpwssd m5, m3, m3 punpcklwd m16, m6, m17 ; b3 punpckhwd m17, m6, m17 pminsd m4, m6 pminsd m5, m6 pmulld m4, m12 ; p3 * s1 pmulld m5, m12 pmaddwd m16, m13 ; b3 * 455 pmaddwd m17, m13 vpalignr m5{k2}, m4, m4, 2 mova m4, m20 paddusw m5, m14 psraw m5, 4 ; min(z3, 255) - 256 vpermt2b m4, m5, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m5 vpermi2b m5, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m5{k3}, m4 ; x3 pandn m4, m15, m5 psrld m5, 16 pmulld m16, m4 pmulld m17, m5 packssdw m4, m5 mova [t4+r10*1+416*4+4], m4 psubd m16, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 mova m0, [t3+r10*2+416*8+ 8] mova m4, [t3+r10*2+416*0+ 8] mova m5, [t3+r10*2+416*0+72] paddw m1, m0, [t2+r10+416*0] paddd m2, m4, [t2+r10+416*2] paddd m3, m5, [t2+r10+416*4] paddw m1, [t1+r10+416*0] paddd m2, [t1+r10+416*2] paddd m3, [t1+r10+416*4] mova [t2+r10+416*0], m0 mova [t2+r10+416*2], m4 mova [t2+r10+416*4], m5 mova [t3+r10*2+416*8+ 8], xm16 mova [t3+r10*2+416*8+ 24], xm17 vextracti128 [t3+r10*2+416*8+ 40], ym16, 1 vextracti128 [t3+r10*2+416*8+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*8+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*8+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*8+104], m16, 3 vextracti32x4 [t3+r10*2+416*8+120], m17, 3 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 pmulld m2, m10 ; -((a5 + 8) >> 4) * 25 pmulld m3, m10 psrlw m5, m1, 1 pavgw m5, m6 ; (b5 + 2) >> 2 punpcklwd m4, m5, m6 vpdpwssd m2, m4, m4 ; -p5 punpckhwd m5, m6 vpdpwssd m3, m5, m5 punpcklwd m16, m1, m6 ; b5 punpckhwd m17, m1, m6 pmulld m2, m11 ; p5 * s0 pmulld 
m3, m11 pmaddwd m16, m13 ; b5 * 164 pmaddwd m17, m13 vpalignr m3{k2}, m2, m2, 2 mova m2, m20 pmaxsw m3, m6 paddusw m3, m14 psraw m3, 4 ; min(z5, 255) - 256 vpermt2b m2, m3, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m3 vpermi2b m3, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m3{k3}, m2 ; x5 pandn m2, m15, m3 psrld m3, 16 pmulld m16, m2 pmulld m17, m3 packssdw m2, m3 mova [t4+r10*1+416*0+4], m2 psubd m16, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) psubd m17, m15 psrld m16, 12 psrld m17, 12 mova [t3+r10*2+416*0+ 8], xm16 mova [t3+r10*2+416*0+ 24], xm17 vextracti128 [t3+r10*2+416*0+ 40], ym16, 1 vextracti128 [t3+r10*2+416*0+ 56], ym17, 1 vextracti32x4 [t3+r10*2+416*0+ 72], m16, 2 vextracti32x4 [t3+r10*2+416*0+ 88], m17, 2 vextracti32x4 [t3+r10*2+416*0+104], m16, 3 vextracti32x4 [t3+r10*2+416*0+120], m17, 3 add r10, 64 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu ym0, [t4+r10*1+416*0+2] paddw ym2, ym0, [t4+r10*1+416*0+0] paddw ym2, [t4+r10*1+416*0+4] movu m1, [t3+r10*2+416*0+4] paddd m3, m1, [t3+r10*2+416*0+0] paddd m3, [t3+r10*2+416*0+8] paddw ym0, ym2 paddd m1, m3 psllw ym2, 2 pslld m3, 2 paddw ym0, ym2 ; a5 565 paddd m1, m3 ; b5 565 mova [t4+r10*1+416* 6], ym0 mova [t3+r10*2+416*12], m1 mova ym0, [t4+r10*1+416*2+0] paddw ym0, [t4+r10*1+416*2+4] paddw ym2, ym0, [t4+r10*1+416*2+2] mova m1, [t3+r10*2+416*4+0] paddd m1, [t3+r10*2+416*4+8] paddd m3, m1, [t3+r10*2+416*4+4] psllw ym2, 2 ; a3[-1] 444 pslld m3, 2 ; b3[-1] 444 psubw ym2, ym0 ; a3[-1] 343 psubd m3, m1 ; b3[-1] 343 mova [t4+r10*1+416* 8], ym2 mova [t3+r10*2+416*16], m3 mova ym0, [t4+r10*1+416*4+0] paddw ym0, [t4+r10*1+416*4+4] paddw ym2, ym0, [t4+r10*1+416*4+2] mova m1, [t3+r10*2+416*8+0] paddd m1, [t3+r10*2+416*8+8] paddd m3, m1, [t3+r10*2+416*8+4] psllw ym2, 2 ; a3[ 0] 444 pslld m3, 2 ; b3[ 0] 444 mova [t4+r10*1+416*10], ym2 mova [t3+r10*2+416*20], m3 psubw ym2, ym0 ; a3[ 0] 343 psubd m3, m1 ; b3[ 0] 343 mova [t4+r10*1+416*12], ym2 mova [t3+r10*2+416*24], m3 add r10, 32 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu ym2, [t4+r10*1+2] paddw ym0, ym2, [t4+r10*1+0] paddw ym0, [t4+r10*1+4] paddw ym2, ym0 psllw ym0, 2 paddw ym0, ym2 ; a5 movu m1, [t3+r10*2+4] paddd m4, m1, [t3+r10*2+0] paddd m4, [t3+r10*2+8] paddd m1, m4 pslld m4, 2 paddd m4, m1 ; b5 paddw ym2, ym0, [t4+r10*1+416* 6] mova [t4+r10*1+416* 6], ym0 paddd m0, m4, [t3+r10*2+416*12] mova [t3+r10*2+416*12], m4 mova ym3, [t4+r10*1+416*2+0] paddw ym3, [t4+r10*1+416*2+4] paddw ym5, ym3, [t4+r10*1+416*2+2] psllw ym5, 2 ; a3[ 1] 444 psubw ym4, ym5, ym3 ; a3[ 1] 343 paddw ym3, ym4, [t4+r10*1+416* 8] paddw ym3, [t4+r10*1+416*10] mova [t4+r10*1+416* 8], ym4 mova [t4+r10*1+416*10], ym5 mova m1, [t3+r10*2+416*4+0] paddd m1, [t3+r10*2+416*4+8] paddd m5, m1, [t3+r10*2+416*4+4] pslld m5, 2 ; b3[ 1] 444 psubd m4, m5, m1 ; b3[ 1] 343 paddd m1, m4, [t3+r10*2+416*16] paddd m1, [t3+r10*2+416*20] mova [t3+r10*2+416*16], m4 mova [t3+r10*2+416*20], m5 pmovzxwd m4, [dstq+r10] pmovzxwd m2, ym2 ; a5 pmovzxwd m3, ym3 ; a3 pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src vpshldd m4, m22, 13 psubd m0, m2 ; b5 - a5 * src + (1 << 8) psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 vpblendmb m0{k2}, m1, m0 vpdpwssd m4, m0, m7 psrad m4, 7 pmaxsd m4, m6 vpmovusdw ym16, m4 ; clip psrlw ym16, 6 mova [dstq+r10], ym16 add r10, 32 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova ym3, [t4+r10*1+416*4+0] paddw 
ym3, [t4+r10*1+416*4+4] paddw ym5, ym3, [t4+r10*1+416*4+2] psllw ym5, 2 ; a3[ 1] 444 psubw ym4, ym5, ym3 ; a3[ 1] 343 paddw ym3, ym4, [t4+r10*1+416*12] paddw ym3, [t4+r10*1+416*10] mova [t4+r10*1+416*10], ym5 mova [t4+r10*1+416*12], ym4 mova m0, [t3+r10*2+416*8+0] paddd m0, [t3+r10*2+416*8+8] paddd m5, m0, [t3+r10*2+416*8+4] pslld m5, 2 ; b3[ 1] 444 psubd m4, m5, m0 ; b3[ 1] 343 paddd m0, m4, [t3+r10*2+416*24] paddd m0, [t3+r10*2+416*20] mova [t3+r10*2+416*20], m5 mova [t3+r10*2+416*24], m4 pmovzxwd m4, [dstq+r10] pmovzxwd m2, [t4+r10*1+416* 6] pmovzxwd m3, ym3 mova m1, [t3+r10*2+416*12] pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src vpshldd m4, m22, 13 psubd m1, m2 ; b5 - a5 * src + (1 << 8) psubd m0, m3 ; b3 - a3 * src + (1 << 8) pslld m0, 7 vpalignr m0{k2}, m1, m1, 1 vpdpwssd m4, m0, m7 psrad m4, 7 pmaxsd m4, m6 vpmovusdw ym16, m4 ; clip psrlw ym16, 6 mova [dstq+r10], ym16 add r10, 32 jl .n1_loop add dstq, strideq ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/looprestoration16_sse.asm000064400000000000000000003262131046102023000170600ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
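; ---------------------------------------------------------------------------
; Informal summary of the inline comments below (constants as in AV1 loop
; restoration): this file provides the SSSE3 16bpc kernels wiener_filter7/5
; (separable symmetric 7-/5-tap Wiener filter: horizontal pass into a ring
; buffer, then a vertical pass using the wiener_round/wiener_shifts constants
; selected by the pixel_max argument) and sgr_filter_5x5/3x3/mix. The SGR
; passes follow, roughly, the usual self-guided fixed-point recipe, with
; n = 25 for the 5x5 (radius-2) box and n = 9 for the 3x3 (radius-1) box:
;   p = max(n*sumsq - sum*sum, 0)        ; sum/sumsq are box sums
;   z = min(p*s >> 20, 255)              ; s = strength (s0/s1 in params)
;   x = sgr_x_by_x[z]
;   b = (x*sum*one_by_n + (1 << 11) + (1 << 15)) >> 12
;       ; one_by_n ~ 4096/n: 164 for n = 25, 455 for n = 9
; The x/b planes of neighbouring rows are then blended (the 343/444/565
; weights noted in the comments) and each output pixel becomes, roughly,
;   dst = clip(src + (w*(b - x*src) + rounding) >> shift)
; with w taken from the w0/w1 filter parameters.
; ---------------------------------------------------------------------------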
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wiener_lshuf5: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 wiener_lshuf7: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_m14_m13: times 8 db -14,-13 pb_m10_m9: times 8 db -10, -9 pb_m6_m5: times 8 db -6, -5 pb_m2_m1: times 8 db -2, -1 pb_2_3: times 8 db 2, 3 pb_6_7: times 8 db 6, 7 pw_256: times 8 dw 256 pw_1023: times 8 dw 1023 pd_8: times 4 dd 8 pd_4096: times 4 dd 4096 pd_34816: times 4 dd 34816 pd_m262128: times 4 dd -262128 pd_0xffff: times 4 dd 0xffff pd_0xf00800a4: times 4 dd 0xf00800a4 pd_0xf00801c7: times 4 dd 0xf00801c7 pd_0xfffffff0: times 4 dd 0xfffffff0 wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192 wiener_round: dd 1049600, 1048832 cextern sgr_x_by_x SECTION .text %macro movif64 2 ; dst, src %if ARCH_X86_64 mov %1, %2 %endif %endmacro %macro movif32 2 ; dst, src %if ARCH_X86_32 mov %1, %2 %endif %endmacro INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 5, 6 %if STACK_ALIGNMENT < 16 %assign extra_stack 13*16 %else %assign extra_stack 12*16 %endif cglobal wiener_filter7_16bpc, 4, 7, 8, -384*12-16-extra_stack, \ dst, stride, left, lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+16*12+ 0] %define wm dword [esp+calloff+16*12+ 4] %define hd dword [esp+calloff+16*12+ 8] %define edgeb byte [esp+calloff+16*12+12] %define edged dword [esp+calloff+16*12+12] %else %define hd dword r5m %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers %define t1m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define t5m dword [esp+calloff+4*6] %define t6m dword [esp+calloff+4*7] %define t2 t2m %define t3 t3m %define t4 t4m %define t5 t5m %define t6 t6m %define m8 [esp+calloff+16*2] %define m9 [esp+calloff+16*3] %define m10 [esp+calloff+16*4] %define m11 [esp+calloff+16*5] %define m12 [esp+calloff+16*6] %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] %define r10 r4 %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 mov wd, [rstk+stack_offset+20] mov wm, wd mov r5, [rstk+stack_offset+24] mov hd, r5 mov r5, [rstk+stack_offset+32] mov edged, r5 ; edge %endif %else DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; wiener ring buffer pointers cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max movq m13, [fltq] movq m15, [fltq+16] %else %if STACK_ALIGNMENT < 16 mov t0, [rstk+stack_offset+28] mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] LEA t0, wiener_shifts mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif mova m6, 
[base+wiener_shufA] mova m7, [base+wiener_shufB] %if ARCH_X86_64 lea t4, [wiener_shifts] add wd, wd pshufd m12, m13, q0000 ; x0 x1 pshufd m13, m13, q1111 ; x2 x3 pshufd m14, m15, q0000 ; y0 y1 pshufd m15, m15, q1111 ; y2 y3 mova m8, [wiener_shufC] mova m9, [wiener_shufD] add lpfq, wq lea t1, [rsp+wq+16] add dstq, wq neg wq shr t3d, 11 %define base t4-wiener_shifts movd m10, [base+wiener_round+t3*4] movq m11, [base+wiener_shifts+t3*8] pshufd m10, m10, q0000 pshufd m0, m11, q0000 pshufd m11, m11, q1111 pmullw m12, m0 ; upshift filter coefs to make the pmullw m13, m0 ; horizontal downshift constant DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %define base %define wiener_lshuf7_mem [wiener_lshuf7] %define pd_m262128_mem [pd_m262128] %else add wd, wd mova m4, [base+wiener_shufC] mova m5, [base+wiener_shufD] pshufd m0, m1, q0000 pshufd m1, m1, q1111 pshufd m2, m3, q0000 pshufd m3, m3, q1111 mova m8, m4 mova m9, m5 mova m14, m2 mova m15, m3 shr t1, 11 add lpfq, wq mova m3, [base+pd_m262128] movd m4, [base+wiener_round+t1*4] movq m5, [base+wiener_shifts+t1*8] lea t1, [esp+extra_stack+wq+16] add dstq, wq neg wq pshufd m4, m4, q0000 pshufd m2, m5, q0000 pshufd m5, m5, q1111 mov wm, wq pmullw m0, m2 pmullw m1, m2 mova m2, [base+wiener_lshuf7] %define pd_m262128_mem [esp+calloff+16*10] mova pd_m262128_mem, m3 mova m10, m4 mova m11, m5 mova m12, m0 mova m13, m1 %define wiener_lshuf7_mem [esp+calloff+16*11] mova wiener_lshuf7_mem, m2 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov lpfm, r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, lpfm call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v movif32 wq, wm .v2: call .v movif32 wq, wm jmp .v1 .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 movif32 t0, PICmem pxor m0, m0 movd m1, wd mova m2, [base+pb_0to15] pshufb m1, m0 mova m0, [base+pb_6_7] psubb m0, m1 pminub m0, m2 pshufb m3, m0 mova m0, [base+pb_m2_m1] psubb m0, m1 pminub m0, m2 pshufb m4, m0 mova m0, [base+pb_m10_m9] psubb m0, m1 pminub m0, m2 pshufb m5, m0 movif32 t0, t0m ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movq m3, [leftq] movhps m3, [lpfq+wq] add leftq, 8 jmp .h_main .h_extend_left: mova m3, [lpfq+wq] ; avoid accessing memory located pshufb m3, wiener_lshuf7_mem ; before the start of the buffer jmp .h_main .h_top: movif64 wq, r4 test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+wq-8] .h_main: mova m4, [lpfq+wq+0] movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, 
m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 mova m2, pd_m262128_mem ; (1 << 4) - (1 << 18) pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+wq], m0 add wq, 16 jl .h_loop movif32 wq, wm ret ALIGN function_align .hv: add lpfq, strideq movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movq m3, [leftq] movhps m3, [lpfq+wq] add leftq, 8 jmp .hv_main .hv_extend_left: mova m3, [lpfq+wq] pshufb m3, wiener_lshuf7_mem jmp .hv_main .hv_bottom: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+wq-8] .hv_main: mova m4, [lpfq+wq+0] movu m5, [lpfq+wq+8] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -20 jl .hv_have_right call .extend_right .hv_have_right: movif32 t1, t4m movif32 t0, t2m pshufb m0, m3, m6 pshufb m1, m4, m7 paddw m0, m1 pshufb m3, m8 pmaddwd m0, m12 pshufb m1, m4, m9 paddw m3, m1 pshufb m1, m4, m6 pmaddwd m3, m13 pshufb m2, m5, m7 paddw m1, m2 mova m2, pd_m262128_mem pshufb m4, m8 pmaddwd m1, m12 pshufb m5, m9 paddw m4, m5 pmaddwd m4, m13 paddd m0, m2 paddd m1, m2 %if ARCH_X86_64 mova m2, [t4+wq] paddw m2, [t2+wq] mova m5, [t3+wq] %else mova m2, [t1+wq] paddw m2, [t0+wq] mov t1, t3m mov t0, t5m mova m5, [t1+wq] mov t1, t1m %endif paddd m0, m3 paddd m1, m4 psrad m0, 4 psrad m1, 4 packssdw m0, m1 %if ARCH_X86_64 mova m4, [t5+wq] paddw m4, [t1+wq] psraw m0, 1 paddw m3, m0, [t6+wq] %else mova m4, [t0+wq] paddw m4, [t1+wq] mov t0, t0m mov t1, t6m psraw m0, 1 paddw m3, m0, [t1+wq] %endif mova [t0+wq], m0 punpcklwd m0, m2, m5 pmaddwd m0, m15 punpckhwd m2, m5 pmaddwd m2, m15 punpcklwd m1, m3, m4 pmaddwd m1, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m2, m10 paddd m0, m1 paddd m2, m3 psrad m0, 6 psrad m2, 6 packssdw m0, m2 pmulhw m0, m11 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .hv_loop %if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 %else mov r4, t5m mov t1, t4m mov t6m, r4 mov t5m, t1 mov r4, t3m mov t1, t2m mov t4m, r4 mov t3m, t1 mov r4, t1m mov t1, t0 mov t2m, r4 mov t0, t6m mov wq, wm %endif add dstq, strideq ret .v: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 .v_loop: %if ARCH_X86_64 mova m1, [t4+wq] paddw m1, [t2+wq] mova m2, [t3+wq] mova m4, [t1+wq] paddw m3, m4, [t6+wq] paddw m4, [t5+wq] %else mov t0, t4m mov t1, t2m mova m1, [t0+wq] paddw m1, [t1+wq] mov t0, t3m mov t1, t1m mova m2, [t0+wq] mova m4, [t1+wq] mov t0, t6m mov t1, t5m paddw m3, m4, [t0+wq] paddw m4, [t1+wq] %endif punpcklwd m0, m1, m2 pmaddwd m0, m15 punpckhwd m1, m2 pmaddwd m1, m15 punpcklwd m2, m3, m4 pmaddwd m2, m14 punpckhwd m3, m4 pmaddwd m3, m14 paddd m0, m10 paddd m1, m10 paddd m0, m2 paddd m1, m3 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pmulhw m0, m11 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .v_loop %if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 %else mov t0, t5m mov t1, t4m mov r4, t3m mov t6m, t0 mov t5m, t1 mov t4m, r4 mov r4, t2m mov t1, t1m mov t0, t0m mov t3m, r4 mov t2m, t1 %endif add dstq, strideq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign stack_size 12*16+384*8 %else %assign stack_size 11*16+384*8 %endif cglobal wiener_filter5_16bpc, 4, 7, 8, -stack_size, dst, stride, left, \ lpf, w, flt %if STACK_ALIGNMENT < 16 %define lpfm dword [esp+calloff+4*6] %define wm dword [esp+calloff+4*7] 
%define hd dword [esp+calloff+16*10+0] %define edgeb byte [esp+calloff+16*10+4] %define edged dword [esp+calloff+16*10+4] %else %define hd dword r5m %define edgeb byte r7m %endif %define PICmem dword [esp+calloff+4*0] %define t0m dword [esp+calloff+4*1] ; wiener ring buffer pointers %define t1m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define t2 t2m %define t3 t3m %define t4 t4m %define m8 [esp+calloff+16*2] %define m9 [esp+calloff+16*3] %define m10 [esp+calloff+16*4] %define m11 [esp+calloff+16*5] %define m12 [esp+calloff+16*6] %define m13 [esp+calloff+16*7] %define m14 [esp+calloff+16*8] %define m15 [esp+calloff+16*9] %define base t0-wiener_shifts %assign calloff 0 %if STACK_ALIGNMENT < 16 mov wd, [rstk+stack_offset+20] mov wm, wd mov r5, [rstk+stack_offset+24] mov hd, r5 mov r5, [rstk+stack_offset+32] mov edged, r5 ; edge %endif %else cglobal wiener_filter5_16bpc, 4, 14, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt %define base %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov t3d, r8m ; pixel_max movq m12, [fltq] movq m14, [fltq+16] %else %if STACK_ALIGNMENT < 16 mov t0, [rstk+stack_offset+28] mov t1, [rstk+stack_offset+36] ; pixel_max movq m1, [t0] ; fx movq m3, [t0+16] ; fy LEA t0, wiener_shifts %else mov fltq, r6m movq m1, [fltq] movq m3, [fltq+16] LEA t0, wiener_shifts mov t1, r8m ; pixel_max %endif mov PICmem, t0 %endif mova m5, [base+wiener_shufE] mova m6, [base+wiener_shufB] mova m7, [base+wiener_shufD] %if ARCH_X86_64 lea t4, [wiener_shifts] add wd, wd punpcklwd m11, m12, m12 pshufd m11, m11, q1111 ; x1 pshufd m12, m12, q1111 ; x2 x3 punpcklwd m13, m14, m14 pshufd m13, m13, q1111 ; y1 pshufd m14, m14, q1111 ; y2 y3 shr t3d, 11 mova m8, [pd_m262128] ; (1 << 4) - (1 << 18) add lpfq, wq lea t1, [rsp+wq+16] add dstq, wq neg wq %define base t4-wiener_shifts movd m9, [base+wiener_round+t3*4] movq m10, [base+wiener_shifts+t3*8] pshufd m9, m9, q0000 pshufd m0, m10, q0000 pshufd m10, m10, q1111 mova m15, [wiener_lshuf5] pmullw m11, m0 pmullw m12, m0 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %define base %else add wd, wd punpcklwd m0, m1, m1 pshufd m0, m0, q1111 ; x1 pshufd m1, m1, q1111 ; x2 x3 punpcklwd m2, m3, m3 pshufd m2, m2, q1111 ; y1 pshufd m3, m3, q1111 ; y2 y3 mova m4, [base+pd_m262128] ; (1 << 4) - (1 << 18) mova m13, m2 mova m14, m3 mova m8, m4 shr t1, 11 add lpfq, wq movd m2, [base+wiener_round+t1*4] movq m3, [base+wiener_shifts+t1*8] %if STACK_ALIGNMENT < 16 lea t1, [esp+16*11+wq+16] %else lea t1, [esp+16*10+wq+16] %endif add dstq, wq neg wq pshufd m2, m2, q0000 pshufd m4, m3, q0000 pshufd m3, m3, q1111 mov wm, wq pmullw m0, m4 pmullw m1, m4 mova m4, [base+wiener_lshuf5] mova m9, m2 mova m10, m3 mova m11, m0 mova m12, m1 mova m15, m4 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov lpfm, r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, lpfm call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add 
lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v %if ARCH_X86_64 mov t4, t3 mov t3, t2 mov t2, t1 %else mov t0, t3m mov r4, t2m mov t1, t1m mov t4m, t0 mov t3m, r4 mov t2m, t1 mov wq, wm %endif add dstq, strideq .v1: call .v jmp .end .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 movif32 t0, PICmem pxor m1, m1 movd m2, wd mova m0, [base+pb_2_3] pshufb m2, m1 mova m1, [base+pb_m6_m5] psubb m0, m2 psubb m1, m2 mova m2, [base+pb_0to15] pminub m0, m2 pminub m1, m2 pshufb m3, m0 pshufb m4, m1 ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left mova m4, [lpfq+wq] movd m3, [leftq+4] pslldq m4, 4 por m3, m4 add leftq, 8 jmp .h_main .h_extend_left: mova m3, [lpfq+wq] ; avoid accessing memory located pshufb m3, m15 ; before the start of the buffer jmp .h_main .h_top: movif64 wq, r4 movif32 wq, wm test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m3, [lpfq+wq-4] .h_main: movu m4, [lpfq+wq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -18 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 paddd m1, m3 psrad m0, 4 psrad m1, 4 packssdw m0, m1 psraw m0, 1 mova [t1+wq], m0 add wq, 16 jl .h_loop movif32 wq, wm ret ALIGN function_align .hv: add lpfq, strideq movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left mova m4, [lpfq+wq] movd m3, [leftq+4] pslldq m4, 4 por m3, m4 add leftq, 8 jmp .hv_main .hv_extend_left: mova m3, [lpfq+wq] pshufb m3, m15 jmp .hv_main .hv_bottom: movif64 wq, r4 movif32 t0m, t0 movif32 t1m, t1 test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m3, [lpfq+wq-4] .hv_main: movu m4, [lpfq+wq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -18 jl .hv_have_right call .extend_right .hv_have_right: movif32 t1, t1m movif32 t0, t3m pshufb m0, m3, m5 pmaddwd m0, m11 pshufb m1, m4, m5 pmaddwd m1, m11 pshufb m2, m3, m6 pshufb m3, m7 paddw m2, m3 pshufb m3, m4, m6 pmaddwd m2, m12 pshufb m4, m7 paddw m3, m4 pmaddwd m3, m12 paddd m0, m8 paddd m1, m8 paddd m0, m2 %if ARCH_X86_64 mova m2, [t3+wq] paddw m2, [t1+wq] paddd m1, m3 mova m4, [t2+wq] %else mova m2, [t0+wq] mov t0, t2m paddw m2, [t1+wq] mov t1, t4m paddd m1, m3 mova m4, [t0+wq] mov t0, t0m %endif punpckhwd m3, m2, m4 pmaddwd m3, m14 punpcklwd m2, m4 %if ARCH_X86_64 mova m4, [t4+wq] %else mova m4, [t1+wq] %endif psrad m0, 4 psrad m1, 4 packssdw m0, m1 pmaddwd m2, m14 psraw m0, 1 mova [t0+wq], m0 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 6 psrad m0, 6 packssdw m0, m1 pmulhw m0, m10 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 jl .hv_loop %if ARCH_X86_64 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 %else mov r4, t3m mov t1, t2m mov t4m, r4 mov t3m, t1 mov r4, t1m mov t1, t0 mov t2m, r4 mov t0, t4m mov wq, wm %endif add dstq, strideq ret .v: movif64 wq, r4 movif32 t1m, t1 .v_loop: %if ARCH_X86_64 mova m0, [t1+wq] paddw m2, m0, [t3+wq] mova m1, [t2+wq] mova m4, [t4+wq] %else mov t0, t3m mova m0, [t1+wq] mov t1, t2m paddw m2, m0, [t0+wq] mov t0, t4m mova m1, [t1+wq] mova m4, [t0+wq] %endif punpckhwd m3, m2, m1 pmaddwd m3, m14 
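; note: the symmetric 5-tap column filter is applied as two pmaddwd pairs:
; ([t1]+[t3], [t2]) against the (y2, y3) words in m14, and ([t1], [t4])
; against the duplicated y1 in m13, so in effect the newest ring row ([t1])
; also stands in for the rows below the frame in this vertical-only pass.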
punpcklwd m2, m1 pmaddwd m2, m14 punpckhwd m1, m0, m4 pmaddwd m1, m13 punpcklwd m0, m4 pmaddwd m0, m13 paddd m3, m9 paddd m2, m9 paddd m1, m3 paddd m0, m2 psrad m1, 6 psrad m0, 6 packssdw m0, m1 pmulhw m0, m10 pxor m1, m1 pmaxsw m0, m1 mova [dstq+wq], m0 add wq, 16 %if ARCH_X86_64 jl .v_loop %else jge .v_end mov t1, t1m jmp .v_loop .v_end: %endif ret %macro GATHERDD 3 ; dst, src, tmp movd %3d, %2 %if ARCH_X86_64 movd %1, [r13+%3] pextrw %3d, %2, 2 pinsrw %1, [r13+%3+2], 3 pextrw %3d, %2, 4 pinsrw %1, [r13+%3+2], 5 pextrw %3d, %2, 6 pinsrw %1, [r13+%3+2], 7 %else movd %1, [base+sgr_x_by_x-0xf03+%3] pextrw %3, %2, 2 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 pextrw %3, %2, 4 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 pextrw %3, %2, 6 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 %endif %endmacro %macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore %if ARCH_X86_64 %define tmp r14 %else %define tmp %4 %endif GATHERDD %1, %2, tmp GATHERDD %2, %3, tmp movif32 %4, %5 psrld %1, 24 psrld %2, 24 packssdw %1, %2 %endmacro %macro MAXSD 3-4 0 ; dst, src, restore_tmp pcmpgtd %3, %1, %2 pand %1, %3 pandn %3, %2 por %1, %3 %if %4 == 1 pxor %3, %3 %endif %endmacro %macro MULLD 3 ; dst, src, tmp pmulhuw %3, %1, %2 pmullw %1, %2 pslld %3, 16 paddd %1, %3 %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 0, 1, 2, 3, 5 %if STACK_ALIGNMENT < 16 %assign extra_stack 5*16 %else %assign extra_stack 3*16 %endif cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*0+4*6] %define stridemp dword [esp+calloff+16*0+4*7] %define leftm dword [esp+calloff+16*3+4*0] %define lpfm dword [esp+calloff+16*3+4*1] %define w0m dword [esp+calloff+16*3+4*2] %define hd dword [esp+calloff+16*3+4*3] %define edgeb byte [esp+calloff+16*3+4*4] %define edged dword [esp+calloff+16*3+4*4] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t0m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define m8 [base+pd_8] %define m9 [base+pd_0xfffffff0] %define m10 [esp+calloff+16*2] %define m11 [base+pd_0xf00800a4] %define m12 [base+sgr_lshuf5] %define m13 [base+pd_34816] %define m14 [base+pw_1023] %define r10 r4 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm add wd, wd mov edged, r7m movu m10, [paramsq] mova m12, [sgr_lshuf5] add lpfq, wq mova m8, [pd_8] lea t1, [rsp+wq+20] mova m9, [pd_0xfffffff0] add dstq, wq lea t3, [rsp+wq*2+400*12+16] mova m11, [pd_0xf00800a4] lea t4, [rsp+wq+400*20+16] pshufhw m7, m10, q0000 pshufb m10, [pw_256] ; s0 punpckhqdq m7, m7 ; w0 neg wq mova m13, [pd_34816] ; (1 << 11) + (1 << 15) pxor m6, m6 mova m14, [pw_1023] psllw m7, 4 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; 
params LEA r6, $$ add wd, wd movu m1, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq+20] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*20+16] mov t3m, t3 pshufhw m7, m1, q0000 mov t4m, t4 pshufb m1, [base+pw_256] ; s0 punpckhqdq m7, m7 ; w0 psllw m7, 4 neg wq mova m10, m1 pxor m6, m6 mov w1m, wd sub wd, 4 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp movif32 t2m, t1 mov t2, t1 call .top_fixup add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t0m, t2 mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, stridemp movif32 t4, t4m call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height call .h add lpfq, stridemp call .hv movif32 dstq, dstm call .n0 call .n1 sub hd, 2 movif32 t0, t0m jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .h_top add lpfq, stridemp call .hv_bottom .end: movif32 dstq, dstm call .n0 call .n1 .end2: RET .height1: movif32 t4, t4m call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv movif32 dstq, dstm call .n0 call .n1 .odd_height_end: call .v movif32 dstq, dstm call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h lea t2, [t1+400*6] movif32 t2m, t2 call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 movif32 t0m, t0 jmp .main .no_top_height1: movif32 t3, t3m movif32 t4, t4m call .v call .prep_n jmp .odd_height_end .extend_right: movd m0, wd movd m1, [lpfq-2] mova m2, [base+pw_256] mova m3, [base+pb_m14_m13] pshufb m0, m6 pshufb m1, m2 psubb m2, m0 psubb m3, m0 mova m0, [base+pb_0to15] pcmpgtb m2, m0 pcmpgtb m3, m0 pand m4, m2 pand m5, m3 pandn m2, m1 pandn m3, m1 por m4, m2 por m5, m3 ret %assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq- 2] .h_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right call .extend_right .h_have_right: palignr m2, m5, m4, 2 paddw m0, m4, m2 palignr m3, m5, m4, 6 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 palignr m5, m4, 8 paddw m0, m5 punpcklwd m3, m4, m5 pmaddwd m3, m3 paddd m1, m3 punpckhwd m3, m4, m5 pmaddwd m3, m3 shufps m4, m5, q2121 paddw m0, m4 ; sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m2, m3 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+wq+400*0] paddd m1, [t1+wq+400*2] paddd m2, [t1+wq+400*4] .h_loop_end: paddd m1, m5 ; sumsq paddd m2, m4 mova [t1+wq+400*0], m0 mova [t1+wq+400*2], m1 mova [t1+wq+400*4], m2 add wq, 16 jl .h_loop ret .top_fixup: %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+wq+400*0] mova m1, 
[t1+wq+400*2] mova m2, [t1+wq+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m1 mova [t2+wq+400*4], m2 add wq, 16 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv_main .hv_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv_main .hv_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv_loop_start %endif .hv_loop: movif32 lpfq, hvsrcm .hv_loop_start: movu m4, [lpfq+wq- 2] .hv_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -20 jl .hv_have_right call .extend_right .hv_have_right: movif32 t3, hd palignr m3, m5, m4, 2 paddw m0, m4, m3 palignr m1, m5, m4, 6 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 palignr m5, m4, 8 paddw m0, m5 punpcklwd m1, m4, m5 pmaddwd m1, m1 paddd m2, m1 punpckhwd m1, m4, m5 pmaddwd m1, m1 shufps m4, m5, q2121 paddw m0, m4 ; h sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m3, m1 paddd m2, m5 ; h sumsq paddd m3, m4 paddw m1, m0, [t1+wq+400*0] paddd m4, m2, [t1+wq+400*2] paddd m5, m3, [t1+wq+400*4] %if ARCH_X86_64 test hd, hd %else test t3, t3 %endif jz .hv_last_row .hv_main2: paddw m1, [t2+wq+400*0] ; hv sum paddd m4, [t2+wq+400*2] ; hv sumsq paddd m5, [t2+wq+400*4] mova [t0+wq+400*0], m0 mova [t0+wq+400*2], m2 mova [t0+wq+400*4], m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 pand m4, m9 ; ((a + 8) >> 4) << 4 pand m5, m9 psrld m2, m4, 4 psrld m0, m5, 4 paddd m2, m4 psrld m4, 1 paddd m0, m5 psrld m5, 1 paddd m4, m2 ; a * 25 paddd m5, m0 punpcklwd m2, m3, m6 punpckhwd m3, m6 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m6 MAXSD m5, m3, m6, 1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m10, m2 ; p * s MULLD m5, m10, m2 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 movif32 t2m, t2 movif32 t0m, t0 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+wq+400*0], m1 paddw m1, m0 mova [t1+wq+400*2], m4 paddd m4, m2 mova [t1+wq+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v_loop: mova m0, [t1+wq+400*0] mova m2, [t1+wq+400*2] mova m3, [t1+wq+400*4] paddw m1, m0, [t2+wq+400*0] paddd m4, m2, [t2+wq+400*2] paddd m5, m3, [t2+wq+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq paddd m5, m3 psrlw m3, m1, 1 paddd m4, m8 pavgw m3, m6 ; (b + 2) >> 2 paddd m5, m8 pand m4, m9 ; ((a + 8) >> 4) << 4 pand m5, m9 psrld m2, m4, 4 psrld m0, m5, 4 paddd m2, m4 psrld m4, 1 paddd m0, m5 psrld m5, 1 paddd m4, m2 ; a * 25 paddd m5, m0 punpcklwd m2, m3, m6 punpckhwd m3, m6 pmaddwd m2, m2 ; b * b pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m6 
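; note: MAXSD clamps a*25 against b*b before the psubd that follows, giving
; p = max(a*25 - b*b, 0) without saturating dword arithmetic; the ', 1' form
; of the second MAXSD also re-zeroes the m6 scratch register afterwards.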
MAXSD m5, m3, m6, 1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m10, m2 ; p * s MULLD m5, m10, m2 pmaddwd m0, m11 ; b * 164 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m13 mova [t4+wq+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .v_loop ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+ 2] movu m3, [t4+wq*1+ 4] movu m1, [t3+wq*2+ 4] movu m4, [t3+wq*2+ 8] movu m2, [t3+wq*2+20] movu m5, [t3+wq*2+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*1+ 0] paddd m4, [t3+wq*2+ 0] paddd m5, [t3+wq*2+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 mova [t4+wq*1+400*2+ 0], m0 mova [t3+wq*2+400*4+ 0], m1 mova [t3+wq*2+400*4+16], m2 add wq, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*1+ 2] movu m3, [t4+wq*1+ 4] movu m1, [t3+wq*2+ 4] movu m4, [t3+wq*2+ 8] movu m2, [t3+wq*2+20] movu m5, [t3+wq*2+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*1+ 0] paddd m4, [t3+wq*2+ 0] paddd m5, [t3+wq*2+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 paddw m3, m0, [t4+wq*1+400*2+ 0] paddd m4, m1, [t3+wq*2+400*4+ 0] paddd m5, m2, [t3+wq*2+400*4+16] mova [t4+wq*1+400*2+ 0], m0 mova [t3+wq*2+400*4+ 0], m1 mova [t3+wq*2+400*4+16], m2 mova m0, [dstq+wq] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: mova m0, [dstq+wq] mova m3, [t4+wq*1+400*2+ 0] mova m4, [t3+wq*2+400*4+ 0] mova m5, [t3+wq*2+400*4+16] punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 7) psubd m5, m3 psrad m4, 8 psrad m5, 8 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m14 mova [dstq+wq], m0 add wq, 16 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 4*16 %else %assign extra_stack 2*16 %endif cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*2+4*0] %define stridemp dword [esp+calloff+16*2+4*1] %define leftm dword [esp+calloff+16*2+4*2] %define lpfm dword [esp+calloff+16*2+4*3] %define w0m dword [esp+calloff+16*2+4*4] %define hd dword [esp+calloff+16*2+4*5] %define edgeb byte [esp+calloff+16*2+4*6] %define edged dword [esp+calloff+16*2+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %define m8 [base+pd_8] %define m9 [esp+calloff+16*1] %define m10 [base+pd_0xf00801c7] %define 
m11 [base+pd_34816] %define m12 [base+sgr_lshuf3] %define m13 [base+pw_1023] %define m14 m6 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm add wd, wd mov edged, r7m movq m9, [paramsq+4] add lpfq, wq lea t1, [rsp+wq+12] mova m8, [pd_8] add dstq, wq lea t3, [rsp+wq*2+400*12+8] mova m10, [pd_0xf00801c7] lea t4, [rsp+wq+400*32+8] mova m11, [pd_34816] pshuflw m7, m9, q3333 pshufb m9, [pw_256] ; s1 punpcklqdq m7, m7 ; w1 neg wq pxor m6, m6 mova m13, [pw_1023] psllw m7, 4 mova m12, [sgr_lshuf3] DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd movq m1, [r1+4] add lpfm, wq lea t1, [rsp+extra_stack+wq+20] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*32+16] mov t3m, t3 pshuflw m7, m1, q3333 mov t4m, t4 pshufb m1, [base+pw_256] ; s1 punpcklqdq m7, m7 ; w1 psllw m7, 4 neg wq mova m9, m1 pxor m6, m6 mov w1m, wd sub wd, 4 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*6] .top_fixup_loop: mova m0, [t1+wq+400*0] mova m1, [t1+wq+400*2] mova m2, [t1+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m1 mova [t2+wq+400*4], m2 add wq, 16 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .extend_right: movd m1, wd movd m5, [lpfq-2] mova m2, [base+pw_256] mova m3, [base+pb_0to15] pshufb m1, m6 pshufb m5, m2 psubb m2, m1 pcmpgtb m2, m3 pand m4, m2 pandn m2, m5 por m4, m2 ret %assign stack_offset stack_offset+4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb 
m4, m12 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq+ 0] .h_main: movu m5, [lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -18 jl .h_have_right call .extend_right .h_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 mova [t1+wq+400*0], m1 mova [t1+wq+400*2], m2 mova [t1+wq+400*4], m3 add wq, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m4, [lpfq+wq+ 0] .hv0_main: movu m5, [lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp wd, -18 jl .hv0_have_right call .extend_right .hv0_have_right: palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 paddw m0, m1, [t1+wq+400*0] paddd m4, m2, [t1+wq+400*2] paddd m5, m3, [t1+wq+400*4] mova [t1+wq+400*0], m1 mova [t1+wq+400*2], m2 mova [t1+wq+400*4], m3 paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+ 8], m0 mova [t3+wq*2+24], m1 add wq, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 12 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, m12 jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m4, [lpfq+wq+ 0] .hv1_main: movu m5, 
[lpfq+wq+16] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp wd, -18 jl .hv1_have_right call .extend_right .hv1_have_right: palignr m1, m5, m4, 2 paddw m0, m4, m1 punpcklwd m2, m4, m1 pmaddwd m2, m2 punpckhwd m3, m4, m1 pmaddwd m3, m3 palignr m5, m4, 4 paddw m0, m5 ; h sum punpcklwd m1, m5, m6 pmaddwd m1, m1 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m1 ; h sumsq paddd m3, m5 paddw m1, m0, [t2+wq+400*0] paddd m4, m2, [t2+wq+400*2] paddd m5, m3, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m2 mova [t2+wq+400*4], m3 paddd m4, m8 paddd m5, m8 psrld m4, 4 ; (a + 8) >> 4 psrld m5, 4 pslld m2, m4, 3 pslld m3, m5, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*2 +4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v0_loop: mova m0, [t1+wq+400*0] mova m4, [t1+wq+400*2] mova m5, [t1+wq+400*4] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*0+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*0+ 8], m0 mova [t3+wq*2+400*0+24], m1 add wq, 16 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v1_loop: mova m0, [t1+wq+400*0] mova m4, [t1+wq+400*2] mova m5, [t1+wq+400*4] paddw m1, m0, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m5, [t2+wq+400*4] mova [t2+wq+400*0], m0 mova [t2+wq+400*2], m4 mova [t2+wq+400*4], m5 paddd m2, m8 paddd m3, m8 psrld m2, 4 ; (a + 8) >> 4 psrld m3, 4 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m6 ; (b + 2) >> 2 punpcklwd m2, m3, m6 pmaddwd m2, m2 punpckhwd m3, m6 pmaddwd m3, m3 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 MAXSD m4, m2, m14 MAXSD m5, m3, m14 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m14 ; p * s MULLD m5, m9, m14 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 
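; note: GATHER_X_BY_X emulates a table gather from sgr_x_by_x with scalar
; movd/pextrw/pinsrw loads, since SSSE3 has no gather instruction; the
; AVX-512 version of this code instead keeps the table in zmm registers
; (m18-m21) and indexes it with vpermi2b/vpermt2b.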
GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m14 MULLD m1, m5, m14 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+400*0+ 4] movu m1, [t3+wq*2+400*0+ 8] movu m2, [t3+wq*2+400*0+24] movu m3, [t4+wq*1+400*0+ 2] movu m4, [t3+wq*2+400*0+ 4] movu m5, [t3+wq*2+400*0+20] paddw m0, [t4+wq*1+400*0+ 0] paddd m1, [t3+wq*2+400*0+ 0] paddd m2, [t3+wq*2+400*0+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[-1] 444 pslld m4, 2 ; b[-1] 444 pslld m5, 2 psubw m3, m0 ; a[-1] 343 psubd m4, m1 ; b[-1] 343 psubd m5, m2 mova [t4+wq*1+400*4], m3 mova [t3+wq*2+400*8+ 0], m4 mova [t3+wq*2+400*8+16], m5 movu m0, [t4+wq*1+400*2+ 4] movu m1, [t3+wq*2+400*4+ 8] movu m2, [t3+wq*2+400*4+24] movu m3, [t4+wq*1+400*2+ 2] movu m4, [t3+wq*2+400*4+ 4] movu m5, [t3+wq*2+400*4+20] paddw m0, [t4+wq*1+400*2+ 0] paddd m1, [t3+wq*2+400*4+ 0] paddd m2, [t3+wq*2+400*4+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[ 0] 444 pslld m4, 2 ; b[ 0] 444 pslld m5, 2 mova [t4+wq*1+400* 6], m3 mova [t3+wq*2+400*12+ 0], m4 mova [t3+wq*2+400*12+16], m5 psubw m3, m0 ; a[ 0] 343 psubd m4, m1 ; b[ 0] 343 psubd m5, m2 mova [t4+wq*1+400* 8], m3 mova [t3+wq*2+400*16+ 0], m4 mova [t3+wq*2+400*16+16], m5 add wq, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m3, [t4+wq*1+400*0+4] movu m1, [t4+wq*1+400*0+2] paddw m3, [t4+wq*1+400*0+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*1+400*4] paddw m3, [t4+wq*1+400*6] mova [t4+wq*1+400*4], m2 mova [t4+wq*1+400*6], m1 movu m4, [t3+wq*2+400*0+8] movu m1, [t3+wq*2+400*0+4] paddd m4, [t3+wq*2+400*0+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*2+400* 8+ 0] paddd m4, [t3+wq*2+400*12+ 0] mova [t3+wq*2+400* 8+ 0], m2 mova [t3+wq*2+400*12+ 0], m1 movu m5, [t3+wq*2+400*0+24] movu m1, [t3+wq*2+400*0+20] paddd m5, [t3+wq*2+400*0+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+wq*2+400* 8+16] paddd m5, [t3+wq*2+400*12+16] mova [t3+wq*2+400* 8+16], m2 mova [t3+wq*2+400*12+16], m1 mova m0, [dstq+wq] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*1+400*2+4] movu m1, [t4+wq*1+400*2+2] paddw m3, [t4+wq*1+400*2+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*1+400*6] paddw m3, [t4+wq*1+400*8] mova [t4+wq*1+400*6], m1 mova [t4+wq*1+400*8], m2 movu m4, [t3+wq*2+400*4+8] movu m1, [t3+wq*2+400*4+4] paddd m4, [t3+wq*2+400*4+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*2+400*12+ 0] paddd m4, [t3+wq*2+400*16+ 0] mova [t3+wq*2+400*12+ 0], m1 mova [t3+wq*2+400*16+ 0], m2 movu m5, [t3+wq*2+400*4+24] movu m1, [t3+wq*2+400*4+20] paddd m5, [t3+wq*2+400*4+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 
paddd m5, m2, [t3+wq*2+400*12+16] paddd m5, [t3+wq*2+400*16+16] mova [t3+wq*2+400*12+16], m1 mova [t3+wq*2+400*16+16], m2 mova m0, [dstq+wq] punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 pmaxsw m0, m6 pminsw m0, m13 mova [dstq+wq], m0 add wq, 16 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 10*16 %else %assign extra_stack 8*16 %endif cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*8+4*0] %define stridemp dword [esp+calloff+16*8+4*1] %define leftm dword [esp+calloff+16*8+4*2] %define lpfm dword [esp+calloff+16*8+4*3] %define w0m dword [esp+calloff+16*8+4*4] %define hd dword [esp+calloff+16*8+4*5] %define edgeb byte [esp+calloff+16*8+4*6] %define edged dword [esp+calloff+16*8+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %xdefine m8 m6 %define m9 [base+pd_8] %define m10 [base+pd_34816] %define m11 [base+pd_0xf00801c7] %define m12 [base+pd_0xf00800a4] %define m13 [esp+calloff+16*4] %define m14 [esp+calloff+16*5] %define m15 [esp+calloff+16*6] %define m6 [esp+calloff+16*7] %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 movifnidn wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm add wd, wd mov edged, r7m mova m14, [paramsq] add lpfq, wq mova m9, [pd_8] lea t1, [rsp+wq+44] mova m10, [pd_34816] add dstq, wq mova m11, [pd_0xf00801c7] lea t3, [rsp+wq*2+400*24+40] mova m12, [pd_0xf00800a4] lea t4, [rsp+wq+400*52+40] neg wq pshufd m15, m14, q2222 ; w0 w1 punpcklwd m14, m14 pshufd m13, m14, q0000 ; s0 pshufd m14, m14, q2222 ; s1 pxor m6, m6 psllw m15, 2 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ add wd, wd mova m2, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq+52] add dstq, wq lea t3, [rsp+extra_stack+wq*2+400*24+48] mov dstm, dstq lea t4, [rsp+extra_stack+wq+400*52+48] mov t3m, t3 mov t4m, t4 neg wq pshuflw m0, m2, q0000 pshuflw m1, m2, q2222 pshufhw m2, m2, q1010 punpcklqdq m0, m0 ; s0 punpcklqdq m1, m1 ; s1 punpckhqdq m2, m2 ; w0 w1 mov w1m, wd pxor m3, m3 psllw m2, 2 mova m13, m0 mova m14, m1 sub wd, 4 mova m15, m2 mova m6, m3 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 %if ARCH_X86_64 call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup %else mov wq, w0m call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).top_fixup_loop %endif add t1, 400*12 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq 
add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hd, hd %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-4] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*12] .top_fixup_loop: mova m0, [t1+wq+400* 0] mova m1, [t1+wq+400* 2] mova m2, [t1+wq+400* 4] paddw m0, m0 mova m3, [t1+wq+400* 6] paddd m1, m1 mova m4, [t1+wq+400* 8] paddd m2, m2 mova m5, [t1+wq+400*10] mova [t2+wq+400* 0], m0 mova [t2+wq+400* 2], m1 mova [t2+wq+400* 4], m2 mova [t2+wq+400* 6], m3 mova [t2+wq+400* 8], m4 mova [t2+wq+400*10], m5 add wq, 16 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .h: ; horizontal boxsum %assign stack_offset stack_offset+4 %assign calloff 4 %if ARCH_X86_64 lea wq, [r4-4] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .h_main .h_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-4] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m4, [lpfq+wq- 2] .h_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -20 jl .h_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right .h_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; sum3 punpcklwd m7, m0, m6 pmaddwd m7, m7 punpckhwd m0, m6 pmaddwd m0, m0 paddd m2, m7 ; sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 mova [t1+wq+400* 6], m1 mova [t1+wq+400* 8], m2 mova [t1+wq+400*10], m3 paddw m8, m1 ; sum5 paddd m7, m2 ; sumsq5 paddd m5, m3 mova [t1+wq+400* 0], m8 mova [t1+wq+400* 2], m7 mova [t1+wq+400* 4], m5 add wq, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m4, [lpfq+wq- 2] .hv0_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp wd, 
-20 jl .hv0_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right .hv0_have_right: palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 movif32 t3, t3m paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; h sum3 punpcklwd m7, m0, m6 pmaddwd m7, m7 punpckhwd m0, m6 pmaddwd m0, m0 paddd m2, m7 ; h sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 paddw m8, m1 ; h sum5 paddd m7, m2 ; h sumsq5 paddd m5, m3 mova [t3+wq*2+400*8+ 8], m8 mova [t3+wq*2+400*0+ 8], m7 mova [t3+wq*2+400*0+24], m5 paddw m8, [t1+wq+400* 0] paddd m7, [t1+wq+400* 2] paddd m5, [t1+wq+400* 4] mova [t1+wq+400* 0], m8 mova [t1+wq+400* 2], m7 mova [t1+wq+400* 4], m5 paddw m0, m1, [t1+wq+400* 6] paddd m4, m2, [t1+wq+400* 8] paddd m5, m3, [t1+wq+400*10] mova [t1+wq+400* 6], m1 mova [t1+wq+400* 8], m2 mova [t1+wq+400*10], m3 paddw m1, m0, [t2+wq+400* 6] paddd m2, m4, [t2+wq+400* 8] paddd m3, m5, [t2+wq+400*10] mova [t2+wq+400* 6], m0 mova [t2+wq+400* 8], m4 mova [t2+wq+400*10], m5 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m4, m2, m7 MAXSD m5, m3, m7 psubd m4, m2 ; p3 psubd m5, m3 MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*1+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m5, [leftq] movif32 wq, w0m mova m4, [lpfq+wq+4] add leftmp, 8 palignr m4, m5, 10 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m4, [lpfq+wq+4] pshufb m4, [base+sgr_lshuf5] jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-4] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m4, [lpfq+wq- 2] .hv1_main: movu m5, [lpfq+wq+14] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp wd, -20 jl .hv1_have_right %if ARCH_X86_32 pxor m8, m8 %endif call mangle(private_prefix %+ _sgr_filter_5x5_16bpc_ssse3).extend_right .hv1_have_right: palignr m7, m5, m4, 2 palignr m3, m5, m4, 4 paddw m2, m7, m3 punpcklwd m0, m7, m3 pmaddwd m0, m0 punpckhwd m7, m3 pmaddwd m7, m7 palignr m3, m5, m4, 6 paddw m2, m3 ; h sum3 punpcklwd m1, m3, m6 pmaddwd m1, m1 punpckhwd m3, m6 pmaddwd m3, m3 paddd m0, m1 ; h sumsq3 palignr m5, m4, 8 punpckhwd m1, m4, m5 paddw m8, m4, m5 pmaddwd m1, m1 punpcklwd m4, m5 pmaddwd m4, m4 paddd m7, m3 paddw m5, m2, [t2+wq+400* 6] mova [t2+wq+400* 6], m2 paddw m8, m2 ; h sum5 paddd m2, m0, [t2+wq+400* 8] paddd m3, m7, [t2+wq+400*10] mova [t2+wq+400* 8], m0 mova [t2+wq+400*10], m7 
paddd m4, m0 ; h sumsq5 paddd m1, m7 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 pslld m0, m2, 3 pslld m7, m3, 3 paddd m2, m0 ; ((a3 + 8) >> 4) * 9 paddd m3, m7 psrlw m7, m5, 1 pavgw m7, m6 ; (b3 + 2) >> 2 punpcklwd m0, m7, m6 pmaddwd m0, m0 punpckhwd m7, m6 pmaddwd m7, m7 %if ARCH_X86_32 mova [esp+20], m8 %else SWAP m8, m6 %endif MAXSD m2, m0, m8 MAXSD m3, m7, m8 pxor m8, m8 psubd m2, m0 ; p3 psubd m3, m7 punpcklwd m0, m5, m8 ; b3 punpckhwd m5, m8 MULLD m2, m14, m8 ; p3 * s1 MULLD m3, m14, m8 pmaddwd m0, m11 ; b3 * 455 pmaddwd m5, m11 paddusw m2, m11 paddusw m3, m11 psrld m2, 20 ; min(z3, 255) movif32 t3, t3m psrld m3, 20 GATHER_X_BY_X m8, m2, m3, r0, dstm punpcklwd m2, m8, m8 punpckhwd m3, m8, m8 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m5, m10 psrld m0, 12 psrld m5, 12 mova [t4+wq*1+400*4+4], m8 mova [t3+wq*2+400*8+ 8], m0 mova [t3+wq*2+400*8+24], m5 %if ARCH_X86_32 mova m8, [esp+20] %else SWAP m6, m8 pxor m6, m6 %endif paddw m5, m8, [t2+wq+400*0] paddd m2, m4, [t2+wq+400*2] paddd m3, m1, [t2+wq+400*4] paddw m5, [t1+wq+400*0] paddd m2, [t1+wq+400*2] paddd m3, [t1+wq+400*4] mova [t2+wq+400*0], m8 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 mova [t2+wq+400*2], m4 pslld m8, m2, 4 mova [t2+wq+400*4], m1 pslld m4, m3, 4 paddd m8, m2 pslld m2, 3 paddd m4, m3 pslld m3, 3 paddd m2, m8 ; ((a5 + 8) >> 4) * 25 paddd m3, m4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif psrlw m1, m5, 1 pavgw m1, m7 ; (b5 + 2) >> 2 punpcklwd m4, m1, m7 pmaddwd m4, m4 punpckhwd m1, m7 pmaddwd m1, m1 punpcklwd m0, m5, m7 ; b5 punpckhwd m5, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m2, m4, m7 psubd m2, m4 ; p5 MAXSD m3, m1, m7 psubd m3, m1 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 pmaddwd m0, m12 ; b5 * 164 pmaddwd m5, m12 paddusw m2, m12 paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m1, m2, m3, r0, dstm punpcklwd m2, m1, m1 punpckhwd m3, m1, m1 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m5, m10 mova [t4+wq*1+400*0+ 4], m1 psrld m0, 12 psrld m5, 12 mova [t3+wq*2+400*0+ 8], m0 mova [t3+wq*2+400*0+24], m5 add wq, 16 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v0_loop: mova m0, [t1+wq+400* 6] mova m4, [t1+wq+400* 8] mova m5, [t1+wq+400*10] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+wq+400* 6] paddd m2, m4, [t2+wq+400* 8] paddd m3, m5, [t2+wq+400*10] mova [t2+wq+400* 6], m0 mova [t2+wq+400* 8], m4 mova [t2+wq+400*10], m5 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m4, m2, m7 MAXSD m5, m3, m7 psubd m4, m2 ; p3 psubd m5, m3 MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*1+400*2+4], m3 psrld m0, 12 psrld m1, 12 mova m3, [t1+wq+400*0] mova m4, [t1+wq+400*2] mova m5, [t1+wq+400*4] 
mova [t3+wq*2+400*8+ 8], m3 mova [t3+wq*2+400*0+ 8], m4 mova [t3+wq*2+400*0+24], m5 paddw m3, m3 ; cc5 paddd m4, m4 paddd m5, m5 mova [t1+wq+400*0], m3 mova [t1+wq+400*2], m4 mova [t1+wq+400*4], m5 mova [t3+wq*2+400*4+ 8], m0 mova [t3+wq*2+400*4+24], m1 add wq, 16 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-4] %else mov wd, w0m %endif .v1_loop: mova m4, [t1+wq+400* 6] mova m5, [t1+wq+400* 8] mova m7, [t1+wq+400*10] paddw m1, m4, [t2+wq+400* 6] paddd m2, m5, [t2+wq+400* 8] paddd m3, m7, [t2+wq+400*10] mova [t2+wq+400* 6], m4 mova [t2+wq+400* 8], m5 mova [t2+wq+400*10], m7 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a3 + 8) >> 4 psrld m3, 4 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 psrlw m3, m1, 1 pavgw m3, m7 ; (b3 + 2) >> 2 punpcklwd m2, m3, m7 pmaddwd m2, m2 punpckhwd m3, m7 pmaddwd m3, m3 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m4, m2, m7 MAXSD m5, m3, m7 psubd m4, m2 ; p3 psubd m5, m3 MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*1+400*4+4], m3 psrld m0, 12 psrld m8, m1, 12 mova m4, [t3+wq*2+400*8+ 8] mova m5, [t3+wq*2+400*0+ 8] mova m7, [t3+wq*2+400*0+24] paddw m1, m4, [t2+wq+400*0] paddd m2, m5, [t2+wq+400*2] paddd m3, m7, [t2+wq+400*4] paddw m1, [t1+wq+400*0] paddd m2, [t1+wq+400*2] paddd m3, [t1+wq+400*4] mova [t2+wq+400*0], m4 mova [t2+wq+400*2], m5 mova [t2+wq+400*4], m7 paddd m2, m9 paddd m3, m9 psrld m2, 4 ; (a5 + 8) >> 4 psrld m3, 4 mova [t3+wq*2+400*8+ 8], m0 pslld m4, m2, 4 mova [t3+wq*2+400*8+24], m8 pslld m5, m3, 4 paddd m4, m2 pslld m2, 3 paddd m5, m3 pslld m3, 3 paddd m2, m4 paddd m3, m5 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif psrlw m5, m1, 1 pavgw m5, m7 ; (b5 + 2) >> 2 punpcklwd m4, m5, m7 pmaddwd m4, m4 punpckhwd m5, m7 pmaddwd m5, m5 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 %if ARCH_X86_64 SWAP m7, m6 %endif MAXSD m2, m4, m7 psubd m2, m4 ; p5 MAXSD m3, m5, m7 psubd m3, m5 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 pmaddwd m0, m12 ; b5 * 164 pmaddwd m1, m12 paddusw m2, m12 paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m4, m2, m3, r0, dstm punpcklwd m2, m4, m4 punpckhwd m3, m4, m4 MULLD m0, m2, m7 MULLD m1, m3, m7 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*1+400*0+ 4], m4 psrld m0, 12 psrld m1, 12 mova [t3+wq*2+400*0+ 8], m0 mova [t3+wq*2+400*0+24], m1 add wq, 16 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*1+400*0+ 2] movu m1, [t3+wq*2+400*0+ 4] movu m2, [t3+wq*2+400*0+20] movu m7, [t4+wq*1+400*0+ 4] movu m8, [t3+wq*2+400*0+ 8] paddw m3, m0, [t4+wq*1+400*0+ 0] paddd m4, m1, [t3+wq*2+400*0+ 0] paddd m5, m2, [t3+wq*2+400*0+16] paddw m3, m7 paddd m4, m8 movu m7, [t3+wq*2+400*0+24] paddw m0, m3 paddd m1, m4 psllw m3, 2 pslld m4, 2 paddd m5, m7 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a5 565 paddd m1, m4 ; b5 565 paddd m2, m5 mova [t4+wq*1+400* 6+ 0], m0 mova [t3+wq*2+400*12+ 0], m1 mova [t3+wq*2+400*12+16], m2 movu m0, [t4+wq*1+400*2+ 4] movu m1, [t3+wq*2+400*4+ 8] movu m2, [t3+wq*2+400*4+24] movu m3, [t4+wq*1+400*2+ 2] movu 
m4, [t3+wq*2+400*4+ 4] movu m5, [t3+wq*2+400*4+20] paddw m0, [t4+wq*1+400*2+ 0] paddd m1, [t3+wq*2+400*4+ 0] paddd m2, [t3+wq*2+400*4+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a3[-1] 444 pslld m4, 2 ; b3[-1] 444 pslld m5, 2 psubw m3, m0 ; a3[-1] 343 psubd m4, m1 ; b3[-1] 343 psubd m5, m2 mova [t4+wq*1+400* 8+ 0], m3 mova [t3+wq*2+400*16+ 0], m4 mova [t3+wq*2+400*16+16], m5 movu m0, [t4+wq*1+400*4+ 4] movu m1, [t3+wq*2+400*8+ 8] movu m2, [t3+wq*2+400*8+24] movu m3, [t4+wq*1+400*4+ 2] movu m4, [t3+wq*2+400*8+ 4] movu m5, [t3+wq*2+400*8+20] paddw m0, [t4+wq*1+400*4+ 0] paddd m1, [t3+wq*2+400*8+ 0] paddd m2, [t3+wq*2+400*8+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a3[ 0] 444 pslld m4, 2 ; b3[ 0] 444 pslld m5, 2 mova [t4+wq*1+400*10+ 0], m3 mova [t3+wq*2+400*20+ 0], m4 mova [t3+wq*2+400*20+16], m5 psubw m3, m0 ; a3[ 0] 343 psubd m4, m1 ; b3[ 0] 343 psubd m5, m2 mova [t4+wq*1+400*12+ 0], m3 mova [t3+wq*2+400*24+ 0], m4 mova [t3+wq*2+400*24+16], m5 add wq, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*1+ 4] movu m2, [t4+wq*1+ 2] paddw m0, [t4+wq*1+ 0] paddw m0, m2 paddw m2, m0 psllw m0, 2 paddw m0, m2 ; a5 movu m4, [t3+wq*2+ 8] movu m5, [t3+wq*2+24] movu m1, [t3+wq*2+ 4] movu m3, [t3+wq*2+20] paddd m4, [t3+wq*2+ 0] paddd m5, [t3+wq*2+16] paddd m4, m1 paddd m5, m3 paddd m1, m4 paddd m3, m5 pslld m4, 2 pslld m5, 2 paddd m4, m1 ; b5 paddd m5, m3 movu m2, [t4+wq*1+400* 6] paddw m2, m0 mova [t4+wq*1+400* 6], m0 paddd m0, m4, [t3+wq*2+400*12+ 0] paddd m1, m5, [t3+wq*2+400*12+16] mova [t3+wq*2+400*12+ 0], m4 mova [t3+wq*2+400*12+16], m5 mova [rsp+16+ARCH_X86_32*4], m1 movu m3, [t4+wq*1+400*2+4] movu m5, [t4+wq*1+400*2+2] paddw m3, [t4+wq*1+400*2+0] paddw m5, m3 psllw m5, 2 ; a3[ 1] 444 psubw m4, m5, m3 ; a3[ 1] 343 movu m3, [t4+wq*1+400* 8] paddw m3, [t4+wq*1+400*10] paddw m3, m4 mova [t4+wq*1+400* 8], m4 mova [t4+wq*1+400*10], m5 movu m1, [t3+wq*2+400*4+ 8] movu m5, [t3+wq*2+400*4+ 4] movu m7, [t3+wq*2+400*4+24] movu m8, [t3+wq*2+400*4+20] paddd m1, [t3+wq*2+400*4+ 0] paddd m7, [t3+wq*2+400*4+16] paddd m5, m1 paddd m8, m7 pslld m5, 2 ; b3[ 1] 444 pslld m8, 2 psubd m4, m5, m1 ; b3[ 1] 343 %if ARCH_X86_32 mova [esp+52], m8 psubd m8, m7 %else psubd m6, m8, m7 SWAP m8, m6 %endif paddd m1, m4, [t3+wq*2+400*16+ 0] paddd m7, m8, [t3+wq*2+400*16+16] paddd m1, [t3+wq*2+400*20+ 0] paddd m7, [t3+wq*2+400*20+16] mova [t3+wq*2+400*16+ 0], m4 mova [t3+wq*2+400*16+16], m8 mova [t3+wq*2+400*20+ 0], m5 %if ARCH_X86_32 mova m8, [esp+52] %else SWAP m8, m6 pxor m6, m6 %endif mova [t3+wq*2+400*20+16], m8 mova [rsp+32+ARCH_X86_32*4], m7 movu m5, [dstq+wq] punpcklwd m4, m5, m6 punpcklwd m7, m2, m6 pmaddwd m7, m4 ; a5 * src punpcklwd m8, m3, m6 pmaddwd m8, m4 ; a3 * src punpckhwd m5, m6 punpckhwd m2, m6 pmaddwd m2, m5 punpckhwd m3, m6 pmaddwd m3, m5 pslld m4, 13 pslld m5, 13 psubd m0, m7 ; b5 - a5 * src + (1 << 8) psubd m1, m8 ; b3 - a3 * src + (1 << 8) mova m7, [base+pd_0xffff] psrld m0, 9 pslld m1, 7 pand m0, m7 pandn m8, m7, m1 por m0, m8 mova m1, [rsp+16+ARCH_X86_32*4] mova m8, [rsp+32+ARCH_X86_32*4] psubd m1, m2 psubd m8, m3 mova m2, [base+pd_4096] psrld m1, 9 pslld m8, 7 pand m1, m7 pandn m7, m8 por m1, m7 pmaddwd m0, m15 pmaddwd m1, m15 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif paddd m4, m2 paddd m5, m2 paddd m0, m4 paddd m1, m5 psrad m0, 8 psrad m1, 8 packssdw m0, m1 ; clip pmaxsw m0, m7 psrlw m0, 5 mova [dstq+wq], m0 add wq, 16 jl .n0_loop add dstq, stridemp ret %if ARCH_X86_64 SWAP m6, 
m7 %endif ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*1+400*4+4] movu m5, [t4+wq*1+400*4+2] paddw m3, [t4+wq*1+400*4+0] paddw m5, m3 psllw m5, 2 ; a3[ 1] 444 psubw m4, m5, m3 ; a3[ 1] 343 paddw m3, m4, [t4+wq*1+400*12] paddw m3, [t4+wq*1+400*10] mova [t4+wq*1+400*10], m5 mova [t4+wq*1+400*12], m4 movu m1, [t3+wq*2+400*8+ 8] movu m5, [t3+wq*2+400*8+ 4] movu m7, [t3+wq*2+400*8+24] movu m8, [t3+wq*2+400*8+20] paddd m1, [t3+wq*2+400*8+ 0] paddd m7, [t3+wq*2+400*8+16] paddd m5, m1 paddd m8, m7 pslld m5, 2 ; b3[ 1] 444 pslld m8, 2 psubd m4, m5, m1 ; b3[ 1] 343 psubd m0, m8, m7 paddd m1, m4, [t3+wq*2+400*24+ 0] paddd m7, m0, [t3+wq*2+400*24+16] paddd m1, [t3+wq*2+400*20+ 0] paddd m7, [t3+wq*2+400*20+16] mova [t3+wq*2+400*20+ 0], m5 mova [t3+wq*2+400*20+16], m8 mova [t3+wq*2+400*24+ 0], m4 mova [t3+wq*2+400*24+16], m0 mova m5, [dstq+wq] mova m2, [t4+wq*1+400* 6] punpcklwd m4, m5, m6 punpcklwd m8, m2, m6 pmaddwd m8, m4 ; a5 * src punpcklwd m0, m3, m6 pmaddwd m0, m4 ; a3 * src punpckhwd m5, m6 punpckhwd m2, m6 pmaddwd m2, m5 punpckhwd m3, m6 pmaddwd m3, m5 psubd m1, m0 ; b3 - a3 * src + (1 << 8) pslld m4, 13 pslld m5, 13 mova m0, [t3+wq*2+400*12+ 0] psubd m0, m8 ; b5 - a5 * src + (1 << 8) mova m8, [t3+wq*2+400*12+16] psubd m8, m2 psubd m7, m3 mova m2, [base+pd_0xffff] pslld m1, 7 psrld m0, 8 psrld m8, 8 pslld m7, 7 pand m0, m2 pandn m3, m2, m1 por m0, m3 pand m8, m2 pandn m2, m7 por m2, m8 mova m1, [base+pd_4096] pmaddwd m0, m15 pmaddwd m2, m15 %if ARCH_X86_64 SWAP m7, m6 %endif pxor m7, m7 paddd m4, m1 paddd m5, m1 paddd m0, m4 paddd m2, m5 psrad m0, 8 psrad m2, 8 packssdw m0, m2 ; clip pmaxsw m0, m7 psrlw m0, 5 mova [dstq+wq], m0 add wq, 16 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret rav1e-0.7.1/src/x86/looprestoration_avx2.asm000064400000000000000000002116661046102023000170040ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 sgr_r_ext: times 16 db 1 times 16 db 9 ; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of ; cache but eliminates some shifts in the inner sgr loop which is overall a win const sgr_x_by_x_avx2 dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16 dd 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8 dd 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5 dd 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 dd 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3 dd 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 dd 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 dd 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 times 4 db -1 ; needed for 16-bit sgr pb_m5: times 4 db -5 pb_3: times 4 db 3 pw_5_6: dw 5, 6 sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 db 9, -1, 10, -1, 11, -1, 12, -1 pw_256: times 2 dw 256 pw_2056: times 2 dw 2056 pw_m16380: times 2 dw -16380 pd_25: dd 25 pd_34816: dd 34816 pd_m4096: dd -4096 pd_0xf00801c7: dd 0xf00801c7 pd_0xf00800a4: dd 0xf00800a4 SECTION .text DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers INIT_YMM avx2 cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov wd, wm vbroadcasti128 m6, [wiener_shufA] vpbroadcastb m11, [fltq+ 0] ; x0 x0 vbroadcasti128 m7, [wiener_shufB] vpbroadcastd m12, [fltq+ 2] vbroadcasti128 m8, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 vbroadcasti128 m9, [sgr_shuf+6] add lpfq, wq vpbroadcastd m10, [pw_m16380] vpbroadcastd m14, [fltq+16] ; y0 y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 lea t1, [rsp+wq*2+16] psllw m14, 5 neg wq psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call 
.hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v .v2: call .v jmp .v1 .extend_right: movd xm2, r10d vpbroadcastd m0, [pb_3] vpbroadcastd m1, [pb_m5] vpbroadcastb m2, xm2 movu m3, [pb_0to31] psubb m0, m2 psubb m1, m2 pminub m0, m3 pminub m1, m3 pshufb m4, m0 pshufb m5, m1 ret .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .h_main .h_extend_left: vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located mova m4, [lpfq+r10] ; before the start of the buffer palignr m4, m5, 12 pshufb m4, [wiener_l_shuf] jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right call .extend_right .h_have_right: pshufb m0, m4, m6 pmaddubsw m0, m11 pshufb m1, m5, m6 pmaddubsw m1, m11 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 paddw m0, m2 pshufb m2, m4, m8 pmaddubsw m2, m12 paddw m1, m3 pshufb m3, m5, m8 pmaddubsw m3, m12 pshufb m4, m9 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m9 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m10 paddw m5, m10 paddw m0, m2 vpbroadcastd m2, [pw_2056] paddw m1, m3 paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m2 paddw m1, m2 mova [t1+r10*2+ 0], m0 mova [t1+r10*2+32], m1 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .hv_main .hv_extend_left: movu m4, [lpfq+r10-4] pshufb m4, [wiener_l_shuf] jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+r10-4] .hv_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -34 jl .hv_have_right call .extend_right .hv_have_right: pshufb m0, m4, m6 pmaddubsw m0, m11 pshufb m1, m5, m6 pmaddubsw m1, m11 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 paddw m0, m2 pshufb m2, m4, m8 pmaddubsw m2, m12 paddw m1, m3 pshufb m3, m5, m8 pmaddubsw m3, m12 pshufb m4, m9 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m9 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m10 paddw m5, m10 paddw m0, m2 paddw m1, m3 mova m2, [t4+r10*2] paddw m2, [t2+r10*2] mova m3, [t3+r10*2] paddsw m0, m4 vpbroadcastd m4, [pw_2056] paddsw m1, m5 mova m5, [t5+r10*2] paddw m5, [t1+r10*2] psraw m0, 3 psraw m1, 3 paddw m0, m4 paddw m1, m4 paddw m4, m0, [t6+r10*2] mova [t0+r10*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m5 pmaddwd m3, m14 punpckhwd m4, m5 pmaddwd m4, m14 paddd m0, m3 paddd m4, m2 mova m2, [t4+r10*2+32] paddw m2, [t2+r10*2+32] mova m3, [t3+r10*2+32] mova m5, [t5+r10*2+32] paddw m5, [t1+r10*2+32] packuswb m0, m4 paddw m4, m1, [t6+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 pmaddwd m1, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m5 pmaddwd m3, m14 punpckhwd m4, m5 pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m2, [t4+r10*2+ 0] paddw m2, [t2+r10*2+ 0] mova m4, [t3+r10*2+ 0] mova m6, [t1+r10*2+ 0] paddw m8, m6, [t6+r10*2+ 0] paddw m6, [t5+r10*2+ 0] mova m3, [t4+r10*2+32] paddw m3, 
[t2+r10*2+32] mova m5, [t3+r10*2+32] mova m7, [t1+r10*2+32] paddw m9, m7, [t6+r10*2+32] paddw m7, [t5+r10*2+32] punpcklwd m0, m2, m4 pmaddwd m0, m15 punpckhwd m2, m4 pmaddwd m2, m15 punpcklwd m4, m8, m6 pmaddwd m4, m14 punpckhwd m6, m8, m6 pmaddwd m6, m14 punpcklwd m1, m3, m5 pmaddwd m1, m15 punpckhwd m3, m5 pmaddwd m3, m15 punpcklwd m5, m9, m7 pmaddwd m5, m14 punpckhwd m7, m9, m7 pmaddwd m7, m14 paddd m0, m4 paddd m2, m6 paddd m1, m5 paddd m3, m7 packuswb m0, m2 packuswb m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .v_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt mov fltq, r6mp movifnidn hd, hm mov edged, r7m mov wd, wm vbroadcasti128 m6, [wiener_shufB] vpbroadcastd m12, [fltq+ 2] vbroadcasti128 m7, [wiener_shufC] packsswb m12, m12 ; x1 x2 vpbroadcastw m13, [fltq+ 6] ; x3 vbroadcasti128 m8, [sgr_shuf+6] add lpfq, wq vpbroadcastd m9, [pw_m16380] vpbroadcastd m10, [pw_2056] mova m11, [wiener_l_shuf] vpbroadcastd m14, [fltq+16] ; __ y1 add dstq, wq vpbroadcastd m15, [fltq+20] ; y2 y3 lea t1, [rsp+wq*2+16] psllw m14, 5 neg wq psllw m15, 5 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t3, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call .v mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq .v1: call .v jmp .end .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .h_main .h_extend_left: vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located mova m4, [lpfq+r10] ; before the start of the buffer palignr m4, m5, 12 pshufb m4, m11 jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+r10-4] .h_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -33 jl .h_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .h_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 pshufb m1, m5, m6 pmaddubsw m1, m12 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 pshufb m4, m8 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m8 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m9 paddw m5, m9 paddw m0, m2 paddw m1, m3 paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m10 paddw m1, m10 mova [t1+r10*2+ 0], m0 mova [t1+r10*2+32], m1 add r10, 32 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm4, [leftq] vpblendd m4, [lpfq+r10-4], 0xfe add leftq, 4 jmp .hv_main .hv_extend_left: movu m4, [lpfq+r10-4] pshufb m4, m11 jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+r10-4] .hv_main: movu m5, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz 
.hv_have_right cmp r10d, -33 jl .hv_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m4, m6 pmaddubsw m0, m12 pshufb m1, m5, m6 pmaddubsw m1, m12 pshufb m2, m4, m7 pmaddubsw m2, m12 pshufb m3, m5, m7 pmaddubsw m3, m12 pshufb m4, m8 paddw m0, m2 pmullw m2, m4, m13 pshufb m5, m8 paddw m1, m3 pmullw m3, m5, m13 psllw m4, 7 psllw m5, 7 paddw m4, m9 paddw m5, m9 paddw m0, m2 paddw m1, m3 mova m2, [t3+r10*2] paddw m2, [t1+r10*2] mova m3, [t2+r10*2] paddsw m0, m4 paddsw m1, m5 psraw m0, 3 psraw m1, 3 paddw m0, m10 paddw m1, m10 paddw m4, m0, [t4+r10*2] mova [t0+r10*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m4 pmaddwd m3, m14 punpckhwd m4, m4 pmaddwd m4, m14 paddd m0, m3 paddd m4, m2 mova m2, [t3+r10*2+32] paddw m2, [t1+r10*2+32] mova m3, [t2+r10*2+32] packuswb m0, m4 paddw m4, m1, [t4+r10*2+32] mova [t0+r10*2+32], m1 punpcklwd m1, m2, m3 pmaddwd m1, m15 punpckhwd m2, m3 pmaddwd m2, m15 punpcklwd m3, m4, m4 pmaddwd m3, m14 punpckhwd m4, m4 pmaddwd m4, m14 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .hv_loop mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 add dstq, strideq ret .v: mov r10, wq psrld m13, m14, 16 ; y1 __ .v_loop: mova m6, [t1+r10*2+ 0] paddw m2, m6, [t3+r10*2+ 0] mova m4, [t2+r10*2+ 0] mova m7, [t1+r10*2+32] paddw m3, m7, [t3+r10*2+32] mova m5, [t2+r10*2+32] paddw m6, [t4+r10*2+ 0] paddw m7, [t4+r10*2+32] punpcklwd m0, m2, m4 pmaddwd m0, m15 punpckhwd m2, m4 pmaddwd m2, m15 punpcklwd m1, m3, m5 pmaddwd m1, m15 punpckhwd m3, m5 pmaddwd m3, m15 punpcklwd m5, m7, m6 pmaddwd m4, m5, m14 punpckhwd m7, m6 pmaddwd m6, m7, m14 pmaddwd m5, m13 pmaddwd m7, m13 paddd m0, m4 paddd m2, m6 paddd m1, m5 paddd m3, m7 packuswb m0, m2 packuswb m1, m3 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+r10], m0 add r10, 32 jl .v_loop ret cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti128 m8, [base+sgr_shuf+0] vbroadcasti128 m9, [base+sgr_shuf+8] add lpfq, wq vbroadcasti128 m10, [base+sgr_shuf+2] add dstq, wq vbroadcasti128 m11, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] vpbroadcastd m12, [paramsq+0] ; s0 pxor m6, m6 vpbroadcastw m7, [paramsq+8] ; w0 lea t1, [rsp+wq*2+20] vpbroadcastd m13, [base+pd_0xf00800a4] neg wq vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m15, [base+pd_m4096] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call .top_fixup add t1, 400*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq test hd, hd jz .odd_height call .h add lpfq, strideq call .hv call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .h_top add lpfq, strideq call .hv_bottom .end: call .n0 call .n1 .end2: RET .height1: call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv call .n0 call .n1 .odd_height_end: call .v call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*6] call .top_fixup dec hd jz 
.no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 jmp .main .no_top_height1: call .v call .prep_n jmp .odd_height_end .extend_right: movd xm2, r10d mova m0, [sgr_r_ext] vpbroadcastb m2, xm2 psubb m0, m2 pminub m0, [pb_0to31] pshufb m5, m0 ret .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -18 jl .h_have_right call .extend_right .h_have_right: pshufb m3, m5, m8 pmullw m4, m3, m3 pshufb m2, m5, m9 paddw m0, m3, m2 shufps m3, m2, q2121 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 punpcklwd m3, m4, m6 paddd m1, m3 punpckhwd m4, m6 paddd m2, m4 pshufb m4, m5, m10 paddw m0, m4 pshufb m5, m11 paddw m0, m5 ; sum punpcklwd m3, m4, m5 pmaddwd m3, m3 punpckhwd m4, m5 pmaddwd m4, m4 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+r10*2+400*0] paddd m1, [t1+r10*2+400*2] paddd m2, [t1+r10*2+400*4] .h_loop_end: paddd m1, m3 ; sumsq paddd m2, m4 mova [t1+r10*2+400*0], m0 mova [t1+r10*2+400*2], m1 mova [t1+r10*2+400*4], m2 add r10, 16 jl .h_loop ret .top_fixup: lea r10, [wq-2] .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+r10*2+400*0] mova m1, [t1+r10*2+400*2] mova m2, [t1+r10*2+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+r10*2+400*0], m0 mova [t2+r10*2+400*2], m1 mova [t2+r10*2+400*4], m2 add r10, 16 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv_main .hv_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu xm5, [lpfq+r10-2] .hv_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -18 jl .hv_have_right call .extend_right .hv_have_right: pshufb m1, m5, m8 pmullw m4, m1, m1 pshufb m3, m5, m9 paddw m0, m1, m3 shufps m1, m3, q2121 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 punpcklwd m1, m4, m6 paddd m2, m1 punpckhwd m4, m6 paddd m3, m4 pshufb m1, m5, m10 paddw m0, m1 pshufb m5, m11 paddw m0, m5 ; h sum punpcklwd m4, m5, m1 pmaddwd m4, m4 punpckhwd m5, m1 pmaddwd m5, m5 paddw m1, m0, [t1+r10*2+400*0] paddd m2, m4 ; h sumsq paddd m3, m5 paddd m4, m2, [t1+r10*2+400*2] paddd m5, m3, [t1+r10*2+400*4] test hd, hd jz .hv_last_row .hv_main2: paddw m1, [t2+r10*2+400*0] ; hv sum paddd m4, [t2+r10*2+400*2] ; hv sumsq paddd m5, [t2+r10*2+400*4] mova [t0+r10*2+400*0], m0 mova [t0+r10*2+400*2], m2 mova [t0+r10*2+400*4], m3 vpbroadcastd m2, [pd_25] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m4, m2 ; a * 25 pmulld m5, m2 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m12 ; p * s pmulld m5, m12 pmaddwd m0, m13 ; b * 164 pmaddwd m1, m13 paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 pand m0, m15 pand m1, m15 por m0, 
m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 ; The neighbor calculations requires vextracti128 [t3+r10*4+40], m0, 1 ; 13 bits for a and 21 bits for b. mova [t3+r10*4+24], xm1 ; Packing them allows for 12+20, but vextracti128 [t3+r10*4+56], m1, 1 ; that gets us most of the way. add r10, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+r10*2+400*0], m1 paddw m1, m0 mova [t1+r10*2+400*2], m4 paddd m4, m2 mova [t1+r10*2+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m0, [t1+r10*2+400*0] mova m2, [t1+r10*2+400*2] mova m3, [t1+r10*2+400*4] paddw m1, m0, [t2+r10*2+400*0] paddd m4, m2, [t2+r10*2+400*2] paddd m5, m3, [t2+r10*2+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq paddd m5, m3 vpbroadcastd m2, [pd_25] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pmulld m4, m2 ; a * 25 pmulld m5, m2 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m12 ; p * s pmulld m5, m12 pmaddwd m0, m13 ; b * 164 pmaddwd m1, m13 paddusw m4, m13 paddusw m5, m13 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 pand m0, m15 pand m1, m15 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+ 4] movu m1, [t3+r10*4+36] paddd m2, m0, [t3+r10*4+ 0] paddd m3, m1, [t3+r10*4+32] paddd m2, [t3+r10*4+ 8] paddd m3, [t3+r10*4+40] paddd m0, m2 pslld m2, 2 paddd m1, m3 pslld m3, 2 paddd m2, m0 ; ab 565 paddd m3, m1 pandn m0, m15, m2 ; a psrld m2, 12 ; b pandn m1, m15, m3 psrld m3, 12 mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*8+ 0], m2 mova [t3+r10*4+400*4+32], m1 mova [t3+r10*4+400*8+32], m3 add r10, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t3+r10*4+ 4] movu m1, [t3+r10*4+36] paddd m2, m0, [t3+r10*4+ 0] paddd m3, m1, [t3+r10*4+32] paddd m2, [t3+r10*4+ 8] paddd m3, [t3+r10*4+40] paddd m0, m2 pslld m2, 2 paddd m1, m3 pslld m3, 2 paddd m2, m0 paddd m3, m1 pandn m0, m15, m2 psrld m2, 12 pandn m1, m15, m3 psrld m3, 12 paddd m4, m0, [t3+r10*4+400*4+ 0] ; a paddd m5, m1, [t3+r10*4+400*4+32] mova [t3+r10*4+400*4+ 0], m0 mova [t3+r10*4+400*4+32], m1 paddd m0, m2, [t3+r10*4+400*8+ 0] ; b paddd m1, m3, [t3+r10*4+400*8+32] mova [t3+r10*4+400*8+ 0], m2 mova [t3+r10*4+400*8+32], m3 pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 psrad m0, 9 psrad m1, 9 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova [dstq+r10], xm0 add r10, 16 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: pmovzxbd m2, [dstq+r10+0] pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src pmaddwd m5, m3, [t3+r10*4+400*4+32] mova m0, [t3+r10*4+400*8+ 0] ; b mova m1, [t3+r10*4+400*8+32] packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 7) psubd m1, m5 psrad m0, 8 psrad m1, 8 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova 
[dstq+r10], xm0 add r10, 16 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ w, h, edge, params %define base r14-sgr_x_by_x_avx2-256*4 mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m lea r14, [sgr_x_by_x_avx2+256*4] vbroadcasti128 m8, [base+sgr_shuf+2] add lpfq, wq vbroadcasti128 m9, [base+sgr_shuf+4] add dstq, wq vbroadcasti128 m10, [base+sgr_shuf+6] lea t3, [rsp+wq*4+16+400*12] vpbroadcastd m11, [paramsq+ 4] ; s1 pxor m6, m6 vpbroadcastw m7, [paramsq+10] ; w1 lea t1, [rsp+wq*2+20] vpbroadcastd m12, [base+pd_0xf00801c7] neg wq vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) psllw m7, 4 vpbroadcastd m14, [base+pd_m4096] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 add t1, 400*6 call .h_top lea t4, [lpfq+strideq*4] mov lpfq, dstq add t4, strideq mov [rsp], t4 ; below mov t0, t2 call .hv .main: mov t5, t3 add t3, 400*4 dec hd jz .height1 add lpfq, strideq call .hv call .prep_n dec hd jz .extend_bottom .main_loop: add lpfq, strideq call .hv call .n dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv_bottom call .n add lpfq, strideq call .hv_bottom .end: call .n RET .height1: call .v call .prep_n mov t2, t1 call .v jmp .end .extend_bottom: call .v call .n mov t2, t1 call .v jmp .end .no_top: lea t4, [lpfq+strideq*4] mov lpfq, dstq lea t4, [t4+strideq*2] mov [rsp], t4 call .h lea t0, [t1+400*6] mov t2, t1 call .v jmp .main .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -17 jl .h_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m0, m5, m8 pmullw m2, m0, m0 pshufb m4, m5, m9 paddw m0, m4 pshufb m5, m10 paddw m0, m5 ; sum punpcklwd m3, m4, m5 pmaddwd m3, m3 punpckhwd m4, m5 pmaddwd m4, m4 punpcklwd m1, m2, m6 punpckhwd m2, m6 mova [t1+r10*2+400*0], m0 paddd m1, m3 ; sumsq paddd m2, m4 mova [t1+r10*2+400*2], m1 mova [t1+r10*2+400*4], m2 add r10, 16 jl .h_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv_main .hv_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu xm5, [lpfq+r10-2] .hv_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -17 jl .hv_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv_have_right: pshufb m0, m5, m8 pmullw m3, m0, m0 pshufb m1, m5, m9 paddw m0, m1 pshufb m5, m10 paddw m0, m5 ; h sum punpcklwd m4, m5, m1 pmaddwd m4, m4 punpckhwd m5, m1 pmaddwd m5, m5 paddw m1, m0, [t2+r10*2+400*0] paddw m1, [t1+r10*2+400*0] ; hv sum punpcklwd m2, m3, m6 punpckhwd m3, m6 paddd m4, m2 ; h sumsq paddd m5, m3 paddd m2, m4, [t2+r10*2+400*2] paddd m3, m5, [t2+r10*2+400*4] paddd m2, [t1+r10*2+400*2] ; hv sumsq paddd m3, [t1+r10*2+400*4] mova [t0+r10*2+400*0], m0 punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 mova 
[t0+r10*2+400*2], m4 pslld m4, m2, 3 mova [t0+r10*2+400*4], m5 pslld m5, m3, 3 paddd m4, m2 ; a * 9 pmaddwd m2, m0, m0 ; b * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m11 ; p * s pmulld m5, m11 pmaddwd m0, m12 ; b * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r14+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r14+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m13 pand m0, m14 pand m1, m14 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m1, [t1+r10*2+400*0] paddw m1, m1 paddw m1, [t2+r10*2+400*0] ; hv sum mova m2, [t1+r10*2+400*2] mova m3, [t1+r10*2+400*4] paddd m2, m2 paddd m3, m3 paddd m2, [t2+r10*2+400*2] ; hv sumsq paddd m3, [t2+r10*2+400*4] punpcklwd m0, m1, m6 ; b punpckhwd m1, m6 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a * 9 pmaddwd m2, m0, m0 ; b * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 pmulld m4, m11 ; p * s pmulld m5, m11 pmaddwd m0, m12 ; b * 455 pmaddwd m1, m12 paddusw m4, m12 paddusw m5, m12 psrad m3, m4, 20 ; min(z, 255) - 256 vpgatherdd m2, [r14+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r14+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m13 pand m0, m14 pand m1, m14 por m0, m2 ; a | (b << 12) por m1, m3 mova [t3+r10*4+ 8], xm0 vextracti128 [t3+r10*4+40], m0, 1 mova [t3+r10*4+24], xm1 vextracti128 [t3+r10*4+56], m1, 1 add r10, 16 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq mov t4, t3 add t3, 400*4 .prep_n_loop: mova m2, [t5+r10*4+0] mova m3, [t4+r10*4+0] paddd m2, [t5+r10*4+8] paddd m3, [t4+r10*4+8] paddd m0, m2, [t5+r10*4+4] paddd m1, m3, [t4+r10*4+4] pslld m0, 2 paddd m1, m1 ; ab[ 0] 222 psubd m0, m2 ; ab[-1] 343 mova [t3+r10*4+400*4], m1 paddd m1, m1 mova [t5+r10*4], m0 psubd m1, m3 ; ab[ 0] 343 mova [t4+r10*4], m1 add r10, 8 jl .prep_n_loop ret ; a+b are packed together in a single dword, but we can't do the ; full neighbor calculations before splitting them since we don't ; have sufficient precision. The solution is to do the calculations ; in two equal halves and split a and b before doing the final sum. 
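; A minimal scalar C sketch of the two-half approach described in the comment
; above (pack_ab and split_and_sum are illustrative names for this sketch, not
; dav1d/rav1e functions): each dword packs a into bits 0..11 and b into bits
; 12..31, so the weighted neighbor sum is accumulated as two partial sums that
; are each still small enough to keep a from spilling into the b field; a and
; b are then split out of each half before the halves are added.
;
;   #include <stdint.h>
;
;   static inline uint32_t pack_ab(uint32_t a, uint32_t b)
;   {
;       return (b << 12) | (a & 0xfff);             /* a | (b << 12) */
;   }
;
;   /* half0 and half1 are two partial weighted sums of packed neighbor
;    * values, each kept small enough that the a field has not overflowed. */
;   static void split_and_sum(uint32_t half0, uint32_t half1,
;                             uint32_t *a_sum, uint32_t *b_sum)
;   {
;       *a_sum = (half0 & 0xfff) + (half1 & 0xfff); /* a parts */
;       *b_sum = (half0 >> 12)   + (half1 >> 12);   /* b parts */
;   }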
ALIGN function_align .n: ; neighbor + output mov r10, wq .n_loop: mova m4, [t3+r10*4+ 0] paddd m4, [t3+r10*4+ 8] paddd m5, m4, [t3+r10*4+ 4] paddd m5, m5 ; ab[+1] 222 mova m2, [t3+r10*4+400*4+ 0] paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 mova m3, [t3+r10*4+400*4+32] paddd m1, m3, [t5+r10*4+32] mova [t3+r10*4+400*4+ 0], m5 paddd m5, m5 psubd m5, m4 ; ab[+1] 343 mova [t5+r10*4+ 0], m5 paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 mova m4, [t3+r10*4+32] paddd m4, [t3+r10*4+40] paddd m5, m4, [t3+r10*4+36] paddd m5, m5 mova [t3+r10*4+400*4+32], m5 paddd m5, m5 psubd m5, m4 mova [t5+r10*4+32], m5 pandn m4, m14, m0 psrld m0, 12 paddd m3, m5 pandn m5, m14, m2 psrld m2, 12 paddd m4, m5 ; a pandn m5, m14, m1 psrld m1, 12 paddd m0, m2 ; b + (1 << 8) pandn m2, m14, m3 psrld m3, 12 paddd m5, m2 pmovzxbd m2, [dstq+r10+0] paddd m1, m3 pmovzxbd m3, [dstq+r10+8] pmaddwd m4, m2 ; a * src pmaddwd m5, m3 packssdw m2, m3 psubd m0, m4 ; b - a * src + (1 << 8) psubd m1, m5 psrad m0, 9 psrad m1, 9 packssdw m0, m1 pmulhrsw m0, m7 paddw m0, m2 vextracti128 xm1, m0, 1 packuswb xm0, xm1 pshufd xm0, xm0, q3120 mova [dstq+r10], xm0 add r10, 16 jl .n_loop mov r10, t5 mov t5, t4 mov t4, r10 add dstq, strideq ret cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ w, h, edge, params %define base r12-sgr_x_by_x_avx2-256*4 lea r12, [sgr_x_by_x_avx2+256*4] mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti128 m9, [base+sgr_shuf+0] vbroadcasti128 m10, [base+sgr_shuf+8] add lpfq, wq vbroadcasti128 m11, [base+sgr_shuf+2] vbroadcasti128 m12, [base+sgr_shuf+6] add dstq, wq vpbroadcastd m15, [paramsq+8] ; w0 w1 lea t3, [rsp+wq*4+400*24+8] vpbroadcastd m13, [paramsq+0] ; s0 pxor m7, m7 vpbroadcastd m14, [paramsq+4] ; s1 lea t1, [rsp+wq*2+12] neg wq psllw m15, 2 ; to reuse existing pd_m4096 register for rounding test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup add t1, 400*12 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+400*12] lea r10, [wq-2] .top_fixup_loop: mova m0, [t1+r10*2+400* 0] mova m1, [t1+r10*2+400* 2] mova m2, [t1+r10*2+400* 4] paddw m0, m0 mova m3, [t1+r10*2+400* 6] paddd m1, m1 mova m4, [t1+r10*2+400* 8] paddd m2, m2 mova m5, [t1+r10*2+400*10] mova [t2+r10*2+400* 0], m0 mova [t2+r10*2+400* 2], m1 mova [t2+r10*2+400* 4], m2 mova [t2+r10*2+400* 6], m3 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 add r10, 16 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsums lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .h_main .h_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; 
LR_HAVE_LEFT jz .h_extend_left .h_loop: movu xm5, [lpfq+r10-2] .h_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -18 jl .h_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .h_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 paddw m8, m6, m4 shufps m0, m6, m4, q2121 pmullw m3, m0, m0 pshufb m2, m5, m11 paddw m0, m2 pshufb m5, m12 paddw m0, m5 ; sum3 punpcklwd m1, m2, m5 pmaddwd m1, m1 punpckhwd m2, m5 pmaddwd m2, m2 punpcklwd m5, m6, m4 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 punpcklwd m4, m3, m7 paddd m1, m4 ; sumsq3 punpckhwd m3, m7 paddd m2, m3 mova [t1+r10*2+400* 6], m0 mova [t1+r10*2+400* 8], m1 mova [t1+r10*2+400*10], m2 paddw m8, m0 ; sum5 paddd m5, m1 ; sumsq5 paddd m6, m2 mova [t1+r10*2+400* 0], m8 mova [t1+r10*2+400* 2], m5 mova [t1+r10*2+400* 4], m6 add r10, 16 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv0_main .hv0_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv0_main .hv0_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu xm5, [lpfq+r10-2] .hv0_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -18 jl .hv0_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv0_have_right: pshufb m6, m5, m9 pshufb m4, m5, m10 paddw m8, m6, m4 shufps m1, m6, m4, q2121 pmullw m0, m1, m1 pshufb m3, m5, m11 paddw m1, m3 pshufb m5, m12 paddw m1, m5 ; sum3 punpcklwd m2, m3, m5 pmaddwd m2, m2 punpckhwd m3, m5 pmaddwd m3, m3 punpcklwd m5, m6, m4 pmaddwd m5, m5 punpckhwd m6, m4 pmaddwd m6, m6 punpcklwd m4, m0, m7 paddd m2, m4 ; sumsq3 punpckhwd m0, m7 paddd m3, m0 paddw m8, m1 ; sum5 paddd m5, m2 ; sumsq5 paddd m6, m3 mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd mova [t3+r10*4+400*0+40], m6 paddw m8, [t1+r10*2+400* 0] paddd m5, [t1+r10*2+400* 2] paddd m6, [t1+r10*2+400* 4] mova [t1+r10*2+400* 0], m8 mova [t1+r10*2+400* 2], m5 mova [t1+r10*2+400* 4], m6 paddw m0, m1, [t1+r10*2+400* 6] paddd m4, m2, [t1+r10*2+400* 8] paddd m5, m3, [t1+r10*2+400*10] mova [t1+r10*2+400* 6], m1 mova [t1+r10*2+400* 8], m2 mova [t1+r10*2+400*10], m3 paddw m1, m0, [t2+r10*2+400* 6] paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m0 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 vpbroadcastd m4, [base+pd_34816] pmulld m0, m2 vpbroadcastd m5, [base+pd_m4096] pmulld m1, m3 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m4 pand m0, m5 pand m1, m5 por m0, m2 ; a3 | (b3 << 12) por m1, m3 mova [t3+r10*4+400*4+ 8], xm0 vextracti128 [t3+r10*4+400*4+40], m0, 1 mova [t3+r10*4+400*4+24], xm1 vextracti128 [t3+r10*4+400*4+56], m1, 1 add r10, 16 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd 
rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left vpbroadcastd xm0, [leftq] mova xm5, [lpfq+wq] palignr xm5, xm0, 12 add leftq, 4 jmp .hv1_main .hv1_extend_left: mova xm5, [lpfq+wq] pshufb xm5, [base+sgr_l_shuf] jmp .hv1_main .hv1_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu xm5, [lpfq+r10-2] .hv1_main: vinserti128 m5, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -18 jl .hv1_have_right call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right .hv1_have_right: pshufb m6, m5, m9 pshufb m3, m5, m10 paddw m8, m6, m3 shufps m2, m6, m3, q2121 pmullw m1, m2, m2 pshufb m0, m5, m11 paddw m2, m0 pshufb m5, m12 paddw m2, m5 ; sum3 punpcklwd m4, m5, m0 pmaddwd m4, m4 punpckhwd m5, m0 pmaddwd m5, m5 punpcklwd m0, m6, m3 pmaddwd m0, m0 punpckhwd m6, m3 pmaddwd m6, m6 punpcklwd m3, m1, m7 paddd m4, m3 ; sumsq3 punpckhwd m1, m7 paddd m5, m1 paddw m1, m2, [t2+r10*2+400* 6] mova [t2+r10*2+400* 6], m2 paddw m8, m2 ; sum5 paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 paddd m4, m0 ; sumsq5 paddd m5, m6 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m6, m2, 3 pslld m7, m3, 3 paddd m6, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m7, m3 pmaddwd m3, m1, m1 psubd m6, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m7, m3 pmulld m6, m14 ; p3 * s1 pmulld m7, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m6, m2 paddusw m7, m2 psrad m3, m6, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m6 psrad m6, m7, 20 vpgatherdd m3, [r12+m6*4], m7 vpbroadcastd m6, [base+pd_34816] ; x3 pmulld m0, m2 vpbroadcastd m7, [base+pd_m4096] pmulld m1, m3 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m6 pand m0, m7 pand m7, m1 por m0, m2 ; a3 | (b3 << 12) por m7, m3 paddw m1, m8, [t2+r10*2+400*0] paddd m2, m4, [t2+r10*2+400*2] paddd m3, m5, [t2+r10*2+400*4] paddw m1, [t1+r10*2+400*0] paddd m2, [t1+r10*2+400*2] paddd m3, [t1+r10*2+400*4] mova [t2+r10*2+400*0], m8 mova [t2+r10*2+400*2], m4 mova [t2+r10*2+400*4], m5 mova [t3+r10*4+400*8+ 8], xm0 vextracti128 [t3+r10*4+400*8+40], m0, 1 mova [t3+r10*4+400*8+24], xm7 vextracti128 [t3+r10*4+400*8+56], m7, 1 vpbroadcastd m4, [base+pd_25] pxor m7, m7 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmulld m2, m4 ; a5 * 25 pmulld m3, m4 pmaddwd m4, m0, m0 ; b5 * b5 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 vpbroadcastd m4, [base+pd_0xf00800a4] psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 pmulld m1, m5 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m6 vpbroadcastd m6, [base+pd_m4096] pand m0, m6 pand m1, m6 por m0, m4 ; a5 | (b5 << 12) por m1, m5 mova [t3+r10*4+400*0+ 8], xm0 vextracti128 [t3+r10*4+400*0+40], m0, 1 mova [t3+r10*4+400*0+24], xm1 vextracti128 [t3+r10*4+400*0+56], m1, 1 add r10, 16 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) lea r10, [wq-2] vpbroadcastd m6, [base+pd_34816] vpbroadcastd m8, [base+pd_m4096] .v0_loop: mova m0, [t1+r10*2+400* 6] mova m4, [t1+r10*2+400* 8] mova m5, [t1+r10*2+400*10] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+r10*2+400* 6] paddd m2, m4, [t2+r10*2+400* 8] paddd m3, m5, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m0 mova [t2+r10*2+400* 8], m4 mova [t2+r10*2+400*10], m5 
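; The ab3 block below roughly corresponds to this scalar model (reference
; sketch only; b3 is the 3x3 box sum, a3 the box sum of squares, and the
; saturation/rounding biases are as given in the inline comments):
;   p3  = a3*9 - b3*b3                            ; 9^2 * variance of the box
;   z3  = (p3 * s1) >> 20
;   x3  = x_by_x_table[min(z3, 255)]              ; vpgatherdd from the table at r12
;   b3' = x3 * b3 * 455 + (1 << 11) + (1 << 15)   ; 455 ~= (1 << 12) / 9
;   out dword = x3 | (b3' & ~0xfff)               ; i.e. the "a3 | (b3 << 12)" packing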
punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 pmulld m0, m2 pmulld m1, m3 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m6 pand m0, m8 pand m1, m8 por m0, m2 ; a3 | (b3 << 12) por m1, m3 mova m2, [t1+r10*2+400*0] mova m3, [t1+r10*2+400*2] mova m4, [t1+r10*2+400*4] mova [t3+r10*4+400*8+ 8], m2 mova [t3+r10*4+400*0+ 8], m3 mova [t3+r10*4+400*0+40], m4 paddw m2, m2 ; cc5 paddd m3, m3 paddd m4, m4 mova [t1+r10*2+400*0], m2 mova [t1+r10*2+400*2], m3 mova [t1+r10*2+400*4], m4 mova [t3+r10*4+400*4+ 8], xm0 vextracti128 [t3+r10*4+400*4+40], m0, 1 mova [t3+r10*4+400*4+24], xm1 vextracti128 [t3+r10*4+400*4+56], m1, 1 add r10, 16 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-2] .v1_loop: mova m4, [t1+r10*2+400* 6] mova m5, [t1+r10*2+400* 8] mova m6, [t1+r10*2+400*10] paddw m1, m4, [t2+r10*2+400* 6] paddd m2, m5, [t2+r10*2+400* 8] paddd m3, m6, [t2+r10*2+400*10] mova [t2+r10*2+400* 6], m4 mova [t2+r10*2+400* 8], m5 mova [t2+r10*2+400*10], m6 punpcklwd m0, m1, m7 ; b3 punpckhwd m1, m7 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 pmaddwd m2, m0, m0 ; b3 * b3 paddd m5, m3 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 vpbroadcastd m2, [base+pd_0xf00801c7] psubd m5, m3 pmulld m4, m14 ; p3 * s1 pmulld m5, m14 pmaddwd m0, m2 ; b3 * 455 pmaddwd m1, m2 paddusw m4, m2 paddusw m5, m2 psrad m3, m4, 20 ; min(z3, 255) - 256 vpgatherdd m2, [r12+m3*4], m4 ; x3 psrad m4, m5, 20 vpgatherdd m3, [r12+m4*4], m5 vpbroadcastd m4, [base+pd_34816] pmulld m0, m2 vpbroadcastd m8, [base+pd_m4096] pmulld m1, m3 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m4 pand m0, m8 pand m8, m1 por m0, m2 ; a3 | (b3 << 12) por m8, m3 mova m4, [t3+r10*4+400*8+ 8] mova m5, [t3+r10*4+400*0+ 8] mova m6, [t3+r10*4+400*0+40] paddw m1, m4, [t2+r10*2+400*0] paddd m2, m5, [t2+r10*2+400*2] paddd m3, m6, [t2+r10*2+400*4] paddw m1, [t1+r10*2+400*0] paddd m2, [t1+r10*2+400*2] paddd m3, [t1+r10*2+400*4] mova [t2+r10*2+400*0], m4 mova [t2+r10*2+400*2], m5 mova [t2+r10*2+400*4], m6 vpbroadcastd m4, [base+pd_25] mova [t3+r10*4+400*8+ 8], xm0 vextracti128 [t3+r10*4+400*8+40], m0, 1 mova [t3+r10*4+400*8+24], xm8 vextracti128 [t3+r10*4+400*8+56], m8, 1 punpcklwd m0, m1, m7 ; b5 punpckhwd m1, m7 pmulld m2, m4 ; a5 * 25 pmulld m3, m4 pmaddwd m4, m0, m0 ; b5 * b5 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 vpbroadcastd m4, [base+pd_0xf00800a4] psubd m3, m5 pmulld m2, m13 ; p5 * s0 pmulld m3, m13 pmaddwd m0, m4 ; b5 * 164 pmaddwd m1, m4 paddusw m2, m4 paddusw m3, m4 psrad m5, m2, 20 ; min(z5, 255) - 256 vpgatherdd m4, [r12+m5*4], m2 ; x5 psrad m2, m3, 20 vpgatherdd m5, [r12+m2*4], m3 pmulld m0, m4 vpbroadcastd m6, [base+pd_34816] pmulld m1, m5 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m6 vpbroadcastd m6, [base+pd_m4096] pand m0, m6 pand m1, m6 por m0, m4 ; a5 | (b5 << 12) por m1, m5 mova [t3+r10*4+400*0+ 8], xm0 vextracti128 [t3+r10*4+400*0+40], m0, 1 mova [t3+r10*4+400*0+24], xm1 vextracti128 [t3+r10*4+400*0+56], m1, 1 add r10, 16 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+400*0+4] paddd m1, m0, 
[t3+r10*4+400*0+0] mova m4, [t3+r10*4+400*4+0] paddd m1, [t3+r10*4+400*0+8] mova m5, [t3+r10*4+400*8+0] paddd m4, [t3+r10*4+400*4+8] paddd m5, [t3+r10*4+400*8+8] paddd m2, m4, [t3+r10*4+400*4+4] paddd m3, m5, [t3+r10*4+400*8+4] paddd m0, m1 pslld m1, 2 pslld m2, 2 paddd m1, m0 ; ab5 565 paddd m3, m3 ; ab3[ 0] 222 psubd m2, m4 ; ab3[-1] 343 mova [t3+r10*4+400*20], m3 pandn m0, m6, m1 ; a5 565 mova [t3+r10*4+400*24], m2 psrld m1, 12 ; b5 565 mova [t3+r10*4+400*12], m0 paddd m3, m3 mova [t3+r10*4+400*16], m1 psubd m3, m5 ; ab3[ 0] 343 mova [t3+r10*4+400*28], m3 add r10, 8 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m0, [t3+r10*4+4] paddd m4, m0, [t3+r10*4+0] paddd m4, [t3+r10*4+8] paddd m0, m4 pslld m4, 2 paddd m4, m0 pandn m0, m6, m4 psrld m4, 12 paddd m2, m0, [t3+r10*4+400*12] ; a5 mova [t3+r10*4+400*12], m0 paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) mova [t3+r10*4+400*16], m4 mova m3, [t3+r10*4+400*4+0] paddd m3, [t3+r10*4+400*4+8] paddd m5, m3, [t3+r10*4+400*4+4] paddd m5, m5 ; ab3[ 1] 222 mova m4, [t3+r10*4+400*20] paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 mova [t3+r10*4+400*20], m5 paddd m5, m5 psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*24], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m3, m6, m1 psrld m1, 12 pandn m5, m6, m4 psrld m4, 12 paddd m3, m5 ; a3 paddd m1, m4 ; b3 + (1 << 8) pmovzxbd m4, [dstq+r10] pmaddwd m2, m4 ; a5 * src pmaddwd m3, m4 ; a3 * src psubd m0, m2 ; b5 - a5 * src + (1 << 8) psubd m1, m3 ; b3 - a3 * src + (1 << 8) psrld m0, 9 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 psubd m0, m6 psrad m0, 13 paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m3, [t3+r10*4+400*8+0] paddd m3, [t3+r10*4+400*8+8] paddd m5, m3, [t3+r10*4+400*8+4] paddd m5, m5 ; ab3[ 1] 222 mova m4, [t3+r10*4+400*20] paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 mova [t3+r10*4+400*20], m5 paddd m5, m5 psubd m5, m3 ; ab3[ 1] 343 mova [t3+r10*4+400*28], m5 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m3, m6, m1 psrld m1, 12 pandn m5, m6, m4 psrld m4, 12 paddd m3, m5 ; -a3 paddd m1, m4 ; b3 + (1 << 8) pmovzxbd m4, [dstq+r10] pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) pmaddwd m3, m4 ; -a3 * src psubd m0, m2 ; a5 * src + b5 + (1 << 7) psubd m1, m3 ; a3 * src + b3 + (1 << 8) psrld m0, 8 pslld m1, 7 pblendw m0, m1, 0xaa pmaddwd m0, m15 psubd m0, m6 psrad m0, 13 paddd m0, m4 vextracti128 xm1, m0, 1 packssdw xm0, xm1 packuswb xm0, xm0 movq [dstq+r10], xm0 add r10, 8 jl .n1_loop add dstq, strideq ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/looprestoration_avx512.asm000064400000000000000000002061251046102023000171440ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 wiener_shufA: db 1, 2, 7, 6, 3, 4, 9, 8, 5, 6, 11, 10, 7, 8, 13, 12 wiener_shufB: db 2, 3, 8, 7, 4, 5, 10, 9, 6, 7, 12, 11, 8, 9, 14, 13 wiener_shufC: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 wiener_shufD: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 wiener_perm32: db 1, 9, 3, 11, 5, 13, 7, 15, 33, 41, 35, 43, 37, 45, 39, 47 db 17, 25, 19, 27, 21, 29, 23, 31, 49, 57, 51, 59, 53, 61, 55, 63 sgr_shuf: db 128, 1, -1, 2,132, 3, -1, 4,136, 5, -1, 6,140, 7, -1, 8 db 129, 9, -1, 10,133, 11, -1, 12,137, -1, -1, -1,141, -1, 0,128 sgr_mix_perm: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 r_ext_mask: times 68 db -1 times 4 db 0 wiener_x_shuf: db 0, 2, -1, 0 wiener_x_add: db 0, 1,127, 0 pw_61448: times 2 dw 61448 pw_164_455: dw 164, 455 pd_m16380: dd -16380 pd_m4096: dd -4096 pd_m25 dd -25 pd_m9: dd -9 pd_34816: dd 34816 pd_8421376: dd 8421376 cextern sgr_x_by_x SECTION .text DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers INIT_ZMM avx512icl cglobal wiener_filter7_8bpc, 4, 15, 20, -384*12-16, dst, stride, left, lpf, \ w, h, edge, flt mov fltq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti32x4 m6, [wiener_shufA] vbroadcasti32x4 m7, [wiener_shufB] mov r10d, 0xfffe vbroadcasti32x4 m8, [wiener_shufC] vbroadcasti32x4 m9, [wiener_shufD] kmovw k1, r10d vpbroadcastd m0, [wiener_x_shuf] vpbroadcastd m1, [wiener_x_add] mov r10, 0xaaaaaaaaaaaaaaaa vpbroadcastd m11, [fltq+ 0] vpbroadcastd m12, [fltq+ 4] kmovq k2, r10 vpbroadcastd m10, [pd_m16380] packsswb m11, m11 ; x0 x1 x0 x1 vpbroadcastd m14, [fltq+16] pshufb m12, m0 vpbroadcastd m15, [fltq+20] paddb m12, m1 ; x2 x3+1 x2 127 vpbroadcastd m13, [pd_8421376] psllw m14, 5 ; y0 y1 psllw m15, 5 ; y2 y3 cmp wd, 32 ; the minimum lr unit size for chroma in 4:2:0 is 32 jle .w32 ; pixels, so we need a special case for small widths lea t1, [rsp+wq*2+16] add lpfq, wq add dstq, wq neg wq test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 384*2 add r10, strideq mov [rsp], r10 ; below call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call .v RET .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov 
t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call .v .v2: call .v jmp .v1 .h: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm16, [leftq] vmovdqu32 m16{k1}, [lpfq+r10-4] add leftq, 4 jmp .h_main .h_extend_left: vpbroadcastb xm16, [lpfq+r10] ; the masked load ensures that no exception vmovdqu32 m16{k1}, [lpfq+r10-4] ; gets raised from accessing invalid memory jmp .h_main .h_top: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m16, [lpfq+r10-4] .h_main: movu m17, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -66 jl .h_have_right push r0 lea r0, [r_ext_mask+65] vpbroadcastb m0, [lpfq-1] vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b vpternlogd m17, m0, [r0+r10+8], 0xe4 pop r0 .h_have_right: pshufb m4, m16, m6 mova m0, m10 vpdpbusd m0, m4, m11 pshufb m4, m16, m7 mova m2, m10 vpdpbusd m2, m4, m11 pshufb m4, m17, m6 mova m1, m10 vpdpbusd m1, m4, m11 pshufb m4, m17, m7 mova m3, m10 vpdpbusd m3, m4, m11 pshufb m4, m16, m8 vpdpbusd m0, m4, m12 pshufb m16, m9 vpdpbusd m2, m16, m12 pshufb m4, m17, m8 vpdpbusd m1, m4, m12 pshufb m17, m9 vpdpbusd m3, m17, m12 packssdw m0, m2 packssdw m1, m3 psraw m0, 3 psraw m1, 3 mova [t1+r10*2+ 0], m0 mova [t1+r10*2+64], m1 add r10, 64 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm16, [leftq] vmovdqu32 m16{k1}, [lpfq+r10-4] add leftq, 4 jmp .hv_main .hv_extend_left: vpbroadcastb xm16, [lpfq+r10] vmovdqu32 m16{k1}, [lpfq+r10-4] jmp .hv_main .hv_bottom: mov r10, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m16, [lpfq+r10-4] .hv_main: movu m17, [lpfq+r10+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -66 jl .hv_have_right push r0 lea r0, [r_ext_mask+65] vpbroadcastb m0, [lpfq-1] vpternlogd m16, m0, [r0+r10+0], 0xe4 ; c ? a : b vpternlogd m17, m0, [r0+r10+8], 0xe4 pop r0 .hv_have_right: pshufb m4, m16, m6 mova m0, m10 vpdpbusd m0, m4, m11 pshufb m4, m16, m7 mova m2, m10 vpdpbusd m2, m4, m11 pshufb m4, m17, m6 mova m1, m10 vpdpbusd m1, m4, m11 pshufb m4, m17, m7 mova m3, m10 vpdpbusd m3, m4, m11 pshufb m4, m16, m8 vpdpbusd m0, m4, m12 pshufb m16, m9 vpdpbusd m2, m16, m12 pshufb m4, m17, m8 vpdpbusd m1, m4, m12 pshufb m17, m9 vpdpbusd m3, m17, m12 packssdw m0, m2 packssdw m1, m3 psraw m0, 3 psraw m1, 3 mova m16, [t4+r10*2] paddw m16, [t2+r10*2] mova m3, [t3+r10*2] mova m17, [t4+r10*2+64] paddw m17, [t2+r10*2+64] mova m5, [t3+r10*2+64] punpcklwd m4, m16, m3 mova m2, m13 vpdpwssd m2, m4, m15 punpcklwd m18, m17, m5 mova m4, m13 vpdpwssd m4, m18, m15 punpckhwd m16, m3 mova m3, m13 vpdpwssd m3, m16, m15 punpckhwd m17, m5 mova m5, m13 vpdpwssd m5, m17, m15 mova m17, [t5+r10*2] paddw m17, [t1+r10*2] paddw m16, m0, [t6+r10*2] mova m19, [t5+r10*2+64] paddw m19, [t1+r10*2+64] paddw m18, m1, [t6+r10*2+64] mova [t0+r10*2+ 0], m0 mova [t0+r10*2+64], m1 punpcklwd m0, m16, m17 vpdpwssd m2, m0, m14 punpcklwd m1, m18, m19 vpdpwssd m4, m1, m14 punpckhwd m16, m17 vpdpwssd m3, m16, m14 punpckhwd m18, m19 vpdpwssd m5, m18, m14 packuswb m2, m4 psrlw m2, 8 vpackuswb m2{k2}, m3, m5 movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap add r10, 64 ; function is used for chroma as well, and in some jl .hv_loop ; esoteric edge cases chroma dst pointers may only mov t6, t5 ; have a 32-byte alignment despite having a width mov t5, t4 ; larger than 32, so use an unaligned store here. 
mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .v: mov r10, wq .v_loop: mova m4, [t4+r10*2+ 0] paddw m4, [t2+r10*2+ 0] mova m1, [t3+r10*2+ 0] mova m5, [t4+r10*2+64] paddw m5, [t2+r10*2+64] mova m3, [t3+r10*2+64] punpcklwd m6, m4, m1 mova m0, m13 vpdpwssd m0, m6, m15 punpcklwd m6, m5, m3 mova m2, m13 vpdpwssd m2, m6, m15 punpckhwd m4, m1 mova m1, m13 vpdpwssd m1, m4, m15 punpckhwd m5, m3 mova m3, m13 vpdpwssd m3, m5, m15 mova m5, [t1+r10*2+ 0] paddw m4, m5, [t6+r10*2+ 0] paddw m5, [t5+r10*2+ 0] mova m7, [t1+r10*2+64] paddw m6, m7, [t6+r10*2+64] paddw m7, [t5+r10*2+64] punpcklwd m8, m4, m5 vpdpwssd m0, m8, m14 punpcklwd m8, m6, m7 vpdpwssd m2, m8, m14 punpckhwd m4, m5 vpdpwssd m1, m4, m14 punpckhwd m6, m7 vpdpwssd m3, m6, m14 packuswb m0, m2 psrlw m0, 8 vpackuswb m0{k2}, m1, m3 movu [dstq+r10], m0 add r10, 64 jl .v_loop mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret .w32: lea r10, [r_ext_mask+73] mova ym18, [wiener_perm32] lea t1, [rsp+16] sub r10, wq test edgeb, 4 ; LR_HAVE_TOP jz .w32_no_top call .w32_h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 32*2 call .w32_h_top lea r9, [lpfq+strideq*4] mov lpfq, dstq mov t4, t1 add t1, 32*2 add r9, strideq mov [rsp], r9 ; below call .w32_h mov t3, t1 mov t2, t1 dec hd jz .w32_v1 add lpfq, strideq add t1, 32*2 call .w32_h mov t2, t1 dec hd jz .w32_v2 add lpfq, strideq add t1, 32*2 call .w32_h dec hd jz .w32_v3 .w32_main: lea t0, [t1+32*2] .w32_main_loop: call .w32_hv dec hd jnz .w32_main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .w32_v3 mov lpfq, [rsp] call .w32_hv_bottom add lpfq, strideq call .w32_hv_bottom .w32_v1: call .w32_v RET .w32_no_top: lea r9, [lpfq+strideq*4] mov lpfq, dstq lea r9, [r9+strideq*2] mov [rsp], r9 call .w32_h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .w32_v1 add lpfq, strideq add t1, 32*2 call .w32_h mov t2, t1 dec hd jz .w32_v2 add lpfq, strideq add t1, 32*2 call .w32_h dec hd jz .w32_v3 lea t0, [t1+32*2] call .w32_hv dec hd jz .w32_v3 add t0, 32*8 call .w32_hv dec hd jnz .w32_main .w32_v3: call .w32_v .w32_v2: call .w32_v jmp .w32_v1 .w32_h: test edgeb, 1 ; LR_HAVE_LEFT jz .w32_h_extend_left movd xm16, [leftq] vmovdqu32 ym16{k1}, [lpfq-4] add leftq, 4 jmp .w32_h_main .w32_h_extend_left: vpbroadcastb xm16, [lpfq] ; the masked load ensures that no exception vmovdqu32 ym16{k1}, [lpfq-4] ; gets raised from accessing invalid memory jmp .w32_h_main .w32_h_top: test edgeb, 1 ; LR_HAVE_LEFT jz .w32_h_extend_left movu ym16, [lpfq-4] .w32_h_main: vinserti32x8 m16, [lpfq+4], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .w32_h_have_right vpbroadcastb m0, [lpfq+wq-1] movu ym17, [r10-8] vinserti32x8 m17, [r10+0], 1 vpternlogd m16, m0, m17, 0xe4 ; c ? 
a : b .w32_h_have_right: pshufb m2, m16, m6 mova m0, m10 vpdpbusd m0, m2, m11 pshufb m2, m16, m7 mova m1, m10 vpdpbusd m1, m2, m11 pshufb m2, m16, m8 vpdpbusd m0, m2, m12 pshufb m16, m9 vpdpbusd m1, m16, m12 packssdw m0, m1 psraw m0, 3 mova [t1], m0 ret .w32_hv: add lpfq, strideq test edgeb, 1 ; LR_HAVE_LEFT jz .w32_hv_extend_left movd xm16, [leftq] vmovdqu32 ym16{k1}, [lpfq-4] add leftq, 4 jmp .w32_hv_main .w32_hv_extend_left: vpbroadcastb xm16, [lpfq] vmovdqu32 ym16{k1}, [lpfq-4] jmp .w32_hv_main .w32_hv_bottom: test edgeb, 1 ; LR_HAVE_LEFT jz .w32_hv_extend_left movu ym16, [lpfq-4] .w32_hv_main: vinserti32x8 m16, [lpfq+4], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .w32_hv_have_right vpbroadcastb m0, [lpfq+wq-1] movu ym17, [r10-8] vinserti32x8 m17, [r10+0], 1 vpternlogd m16, m0, m17, 0xe4 .w32_hv_have_right: mova m3, [t4] paddw m3, [t2] mova m2, [t3] pshufb m4, m16, m6 mova m0, m10 vpdpbusd m0, m4, m11 pshufb m4, m16, m7 mova m5, m10 vpdpbusd m5, m4, m11 punpcklwd m4, m3, m2 mova m1, m13 vpdpwssd m1, m4, m15 punpckhwd m3, m2 mova m2, m13 vpdpwssd m2, m3, m15 pshufb m4, m16, m8 vpdpbusd m0, m4, m12 pshufb m16, m9 vpdpbusd m5, m16, m12 packssdw m0, m5 psraw m0, 3 mova m4, [t5] paddw m4, [t1] paddw m3, m0, [t6] mova [t0], m0 punpcklwd m0, m3, m4 vpdpwssd m1, m0, m14 punpckhwd m3, m4 vpdpwssd m2, m3, m14 packuswb m1, m2 vpermb m16, m18, m1 mova [dstq], ym16 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 add dstq, strideq ret .w32_v: mova m2, [t4] paddw m2, [t2] mova m1, [t3] mova m4, [t1] paddw m3, m4, [t6] paddw m4, [t5] punpcklwd m5, m2, m1 mova m0, m13 vpdpwssd m0, m5, m15 punpckhwd m2, m1 mova m1, m13 vpdpwssd m1, m2, m15 punpcklwd m2, m3, m4 vpdpwssd m0, m2, m14 punpckhwd m3, m4 vpdpwssd m1, m3, m14 packuswb m0, m1 vpermb m16, m18, m0 mova [dstq], ym16 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 add dstq, strideq ret cglobal sgr_filter_5x5_8bpc, 4, 13, 23, 416*24+16, dst, stride, left, lpf, \ w, h, edge, params mov paramsq, r6mp mov wd, wm mov hd, hm mov edged, r7m vbroadcasti32x4 m5, [sgr_shuf+1] add lpfq, wq vbroadcasti32x4 m6, [sgr_shuf+9] add dstq, wq vbroadcasti32x4 m7, [sgr_shuf+3] lea t3, [rsp+wq*4+16+416*12] vbroadcasti32x4 m8, [sgr_shuf+7] pxor m4, m4 vpbroadcastd m9, [pd_m25] vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 vpbroadcastw m15, [paramsq+8] ; w0 lea t1, [rsp+wq*2+20] vpbroadcastd m10, [pw_164_455] neg wq vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) mov r10d, 0xfe vpbroadcastd m13, [pd_m4096] kmovb k1, r10d vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) mov r10, 0x3333333333333333 mova m18, [sgr_x_by_x+64*0] kmovq k2, r10 mova m19, [sgr_x_by_x+64*1] lea r12, [r_ext_mask+75] mova m20, [sgr_x_by_x+64*2] psllw m15, 4 mova m21, [sgr_x_by_x+64*3] lea r10, [lpfq+strideq*4] mova ym22, [sgr_shuf] add r10, strideq mov [rsp], r10 ; below test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call .top_fixup add t1, 416*6 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, strideq call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq test hd, hd jz .odd_height call .h add lpfq, strideq call .hv call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .h_top add lpfq, strideq call .hv_bottom .end: call .n0 call .n1 .end2: RET .height1: call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv call .n0 call .n1 
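; Row cadence of this function, as a rough sketch (reference only; see the
; per-label comments below for what each helper does):
;   while (h >= 2) {    ; .main_loop
;       h(row);         ; horizontal boxsum of the next input row
;       hv(row + 1);    ; boxsum of the following row + vertical sums + ab
;       n0(); n1();     ; two output rows from the shared ab ("565" weighting)
;       h -= 2;
;   }
;   ; odd heights (and h == 1) finish through .v / .n0 below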
.odd_height_end: call .v call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+416*6] call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 jmp .main .no_top_height1: call .v call .prep_n jmp .odd_height_end .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .h_main .h_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu ym17, [lpfq+r10-2] .h_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r12+r10-8] vinserti32x8 m16, [r12+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .h_have_right: pshufb m3, m17, m5 pmullw m2, m3, m3 pshufb m1, m17, m6 paddw m0, m3, m1 shufps m3, m1, q2121 paddw m0, m3 punpcklwd m16, m3, m1 punpckhwd m3, m1 punpcklwd m1, m2, m4 vpdpwssd m1, m16, m16 punpckhwd m2, m4 vpdpwssd m2, m3, m3 pshufb m16, m17, m7 paddw m0, m16 pshufb m17, m8 paddw m0, m17 ; sum punpcklwd m3, m16, m17 vpdpwssd m1, m3, m3 ; sumsq punpckhwd m16, m17 vpdpwssd m2, m16, m16 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+r10*2+416*0] paddd m1, [t1+r10*2+416*2] paddd m2, [t1+r10*2+416*4] .h_loop_end: mova [t1+r10*2+416*0], m0 mova [t1+r10*2+416*2], m1 mova [t1+r10*2+416*4], m2 add r10, 32 jl .h_loop ret .top_fixup: lea r10, [wq-2] .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+r10*2+416*0] mova m1, [t1+r10*2+416*2] mova m2, [t1+r10*2+416*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+r10*2+416*0], m0 mova [t2+r10*2+416*2], m1 mova [t2+r10*2+416*4], m2 add r10, 32 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .hv_main .hv_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu ym17, [lpfq+r10-2] .hv_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -34 jl .hv_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r12+r10-8] vinserti32x8 m16, [r12+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .hv_have_right: pshufb m1, m17, m5 pmullw m3, m1, m1 pshufb m2, m17, m6 paddw m0, m1, m2 shufps m1, m2, q2121 paddw m0, m1 punpcklwd m16, m1, m2 punpckhwd m1, m2 punpcklwd m2, m3, m4 vpdpwssd m2, m16, m16 punpckhwd m3, m4 vpdpwssd m3, m1, m1 pshufb m16, m17, m7 paddw m0, m16 pshufb m17, m8 paddw m0, m17 ; h sum punpcklwd m1, m16, m17 vpdpwssd m2, m1, m1 ; h sumsq punpckhwd m16, m17 vpdpwssd m3, m16, m16 paddw m1, m0, [t1+r10*2+416*0] paddd m16, m2, [t1+r10*2+416*2] paddd m17, m3, [t1+r10*2+416*4] test hd, hd jz .hv_last_row .hv_main2: paddd m16, [t2+r10*2+416*2] ; hv sumsq paddd m17, [t2+r10*2+416*4] paddw m1, [t2+r10*2+416*0] ; hv sum mova [t0+r10*2+416*2], m2 mova [t0+r10*2+416*4], m3 mova [t0+r10*2+416*0], m0 pmulld m16, m9 ; -a * 25 pmulld m17, m9 punpcklwd m0, m1, m4 ; b vpdpwssd m16, m0, m0 ; -p punpckhwd m1, m4 vpdpwssd m17, m1, m1 pmaddwd m0, m10 ; b * 164 pmaddwd m1, m10 pmulld m16, m11 ; p * s pmulld m17, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 paddusw m17, m12 psraw m17, 4 ; min(z, 
255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) vpternlogd m17, m1, m13, 0xd8 mova [t3+r10*4+ 8], m16 ; The neighbor calculations requires mova [t3+r10*4+ 24], xm17 ; 13 bits for a and 21 bits for b. vextracti32x4 [t3+r10*4+ 56], m17, 2 ; Packing them allows for 12+20, but mova [t3+r10*4+ 72], m17 ; that gets us most of the way. vextracti128 [t3+r10*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+104], m16, 3 add r10, 32 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+r10*2+416*0], m1 paddw m1, m0 mova [t1+r10*2+416*2], m16 paddd m16, m2 mova [t1+r10*2+416*4], m17 paddd m17, m3 jmp .hv_main2 .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m2, [t1+r10*2+416*2] paddd m16, m2, [t2+r10*2+416*2] mova m3, [t1+r10*2+416*4] paddd m17, m3, [t2+r10*2+416*4] paddd m2, m2 paddd m3, m3 paddd m16, m2 ; hv sumsq paddd m17, m3 pmulld m16, m9 ; -a * 25 pmulld m17, m9 mova m0, [t1+r10*2+416*0] paddw m1, m0, [t2+r10*2+416*0] paddw m0, m0 paddw m1, m0 ; hv sum punpcklwd m0, m1, m4 ; b vpdpwssd m16, m0, m0 ; -p punpckhwd m1, m4 vpdpwssd m17, m1, m1 pmaddwd m0, m10 ; b * 164 pmaddwd m1, m10 pmulld m16, m11 ; p * s pmulld m17, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m14 vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) vpternlogd m17, m1, m13, 0xd8 mova [t3+r10*4+ 8], m16 mova [t3+r10*4+ 24], xm17 vextracti32x4 [t3+r10*4+ 56], m17, 2 mova [t3+r10*4+ 72], m17 vextracti128 [t3+r10*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+104], m16, 3 add r10, 32 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+ 4] movu m1, [t3+r10*4+68] paddd m2, m0, [t3+r10*4+ 0] paddd m3, m1, [t3+r10*4+64] paddd m2, [t3+r10*4+ 8] paddd m3, [t3+r10*4+72] paddd m0, m2 pslld m2, 2 paddd m1, m3 pslld m3, 2 paddd m2, m0 ; ab 565 paddd m3, m1 pandn m0, m13, m2 ; a psrld m2, 12 ; b pandn m1, m13, m3 psrld m3, 12 mova [t3+r10*4+416*4+ 0], m0 mova [t3+r10*4+416*8+ 0], m2 mova [t3+r10*4+416*4+64], m1 mova [t3+r10*4+416*8+64], m3 add r10, 32 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m16, [t3+r10*4+ 4] movu m17, [t3+r10*4+68] paddd m0, m16, [t3+r10*4+ 0] paddd m1, m17, [t3+r10*4+64] paddd m0, [t3+r10*4+ 8] paddd m1, [t3+r10*4+72] paddd m16, m0 pslld m0, 2 paddd m17, m1 pslld m1, 2 paddd m0, m16 paddd m1, m17 pandn m16, m13, m0 psrld m0, 12 pandn m17, m13, m1 psrld m1, 12 paddd m2, m16, [t3+r10*4+416*4+ 0] ; a paddd m3, m17, [t3+r10*4+416*4+64] mova [t3+r10*4+416*4+ 0], m16 mova [t3+r10*4+416*4+64], m17 paddd m16, m0, [t3+r10*4+416*8+ 0] ; b + (1 << 8) paddd m17, m1, [t3+r10*4+416*8+64] mova [t3+r10*4+416*8+ 0], m0 mova [t3+r10*4+416*8+64], m1 pmovzxbd m0, [dstq+r10+ 0] pmovzxbd m1, [dstq+r10+16] pmaddwd m2, m0 ; a * src pmaddwd m3, m1 packssdw m0, m1 psubd m16, m2 ; b - a * src + (1 << 8) psubd m17, m3 psrad m16, 9 psrad m17, 9 packssdw m16, m17 pmulhrsw m16, m15 paddw m16, m0 packuswb m16, m16 vpermd m16, m22, m16 
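; What was just computed for this vector, as a scalar sketch (reference only,
; using the pmulhrsw rounding semantics; m15 = w0 << 4 from the prologue and
; clip8 denotes the packuswb saturation):
;   t   = (b - a*src + (1 << 8)) >> 9
;   out = clip8(src + ((t * (w0 << 4) + (1 << 14)) >> 15))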
mova [dstq+r10], ym16 add r10, 32 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: pmovzxbd m0, [dstq+r10+ 0] pmovzxbd m1, [dstq+r10+16] pmaddwd m2, m0, [t3+r10*4+416*4+ 0] ; a * src pmaddwd m3, m1, [t3+r10*4+416*4+64] mova m16, [t3+r10*4+416*8+ 0] ; b + (1 << 7) mova m17, [t3+r10*4+416*8+64] packssdw m0, m1 psubd m16, m2 ; b - a * src + (1 << 7) psubd m17, m3 psrad m16, 8 psrad m17, 8 packssdw m16, m17 pmulhrsw m16, m15 paddw m16, m0 packuswb m16, m16 vpermd m16, m22, m16 mova [dstq+r10], ym16 add r10, 32 jl .n1_loop add dstq, strideq ret cglobal sgr_filter_3x3_8bpc, 4, 15, 22, -416*28-16, dst, stride, left, lpf, \ w, h, edge, params mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti32x4 m5, [sgr_shuf+3] add lpfq, wq vbroadcasti32x4 m6, [sgr_shuf+5] add dstq, wq vbroadcasti32x4 m7, [sgr_shuf+7] pxor m4, m4 vpbroadcastd m8, [pd_m9] vpsubd m11, m4, [paramsq+4] {1to16} ; -s1 vpbroadcastw m15, [paramsq+10] ; w1 lea t1, [rsp+wq*2+20] vpbroadcastd m10, [pw_164_455] lea t3, [rsp+wq*4+16+416*12] vpbroadcastd m12, [pw_61448] ; (15 << 12) + (1 << 3) neg wq vpbroadcastd m13, [pd_m4096] mov r10d, 0xfe vpbroadcastd m14, [pd_34816] ; (1 << 11) + (1 << 15) kmovb k1, r10d mova m18, [sgr_x_by_x+64*0] mov r10, 0x3333333333333333 mova m19, [sgr_x_by_x+64*1] kmovq k2, r10 mova m20, [sgr_x_by_x+64*2] psllw m15, 4 mova m21, [sgr_x_by_x+64*3] lea r14, [r_ext_mask+75] mova ym9, [sgr_shuf] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 add t1, 416*6 call .h_top lea t4, [lpfq+strideq*4] mov lpfq, dstq add t4, strideq mov [rsp], t4 ; below mov t0, t2 call .hv .main: mov t5, t3 add t3, 416*4 dec hd jz .height1 add lpfq, strideq call .hv call .prep_n dec hd jz .extend_bottom .main_loop: add lpfq, strideq call .hv call .n dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv_bottom call .n add lpfq, strideq call .hv_bottom .end: call .n RET .height1: call .v call .prep_n mov t2, t1 call .v jmp .end .extend_bottom: call .v call .n mov t2, t1 call .v jmp .end .no_top: lea t4, [lpfq+strideq*4] mov lpfq, dstq lea t4, [t4+strideq*2] mov [rsp], t4 call .h lea t0, [t1+416*6] mov t2, t1 call .v jmp .main .h: ; horizontal boxsum lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .h_main .h_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu ym17, [lpfq+r10-2] .h_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -33 jl .h_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r14+r10-8] vinserti32x8 m16, [r14+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .h_have_right: pshufb m0, m17, m5 pmullw m2, m0, m0 pshufb m16, m17, m6 paddw m0, m16 pshufb m17, m7 paddw m0, m17 ; sum punpcklwd m3, m16, m17 punpcklwd m1, m2, m4 vpdpwssd m1, m3, m3 ; sumsq punpckhwd m16, m17 punpckhwd m2, m4 vpdpwssd m2, m16, m16 mova [t1+r10*2+416*0], m0 mova [t1+r10*2+416*2], m1 mova [t1+r10*2+416*4], m2 add r10, 32 jl .h_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .hv_main .hv_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .hv_main .hv_bottom: lea r10, [wq-2] test edgeb, 1 
; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu ym17, [lpfq+r10-2] .hv_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp r10d, -33 jl .hv_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r14+r10-8] vinserti32x8 m16, [r14+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .hv_have_right: pshufb m0, m17, m5 pmullw m3, m0, m0 pshufb m1, m17, m6 paddw m0, m1 pshufb m17, m7 paddw m0, m17 ; h sum punpcklwd m16, m17, m1 punpcklwd m2, m3, m4 vpdpwssd m2, m16, m16 ; h sumsq punpckhwd m17, m1 punpckhwd m3, m4 vpdpwssd m3, m17, m17 paddw m1, m0, [t2+r10*2+416*0] paddw m1, [t1+r10*2+416*0] ; hv sum paddd m16, m2, [t2+r10*2+416*2] paddd m17, m3, [t2+r10*2+416*4] paddd m16, [t1+r10*2+416*2] ; hv sumsq paddd m17, [t1+r10*2+416*4] mova [t0+r10*2+416*0], m0 mova [t0+r10*2+416*2], m2 mova [t0+r10*2+416*4], m3 pmulld m16, m8 ; -a * 9 pmulld m17, m8 punpcklwd m0, m4, m1 ; b vpdpwssd m16, m0, m0 ; -p punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 pmulld m16, m11 ; p * s pmulld m17, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m14 vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) vpternlogd m17, m1, m13, 0xd8 mova [t3+r10*4+ 8], m16 mova [t3+r10*4+ 24], xm17 vextracti32x4 [t3+r10*4+ 56], m17, 2 mova [t3+r10*4+ 72], m17 vextracti128 [t3+r10*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+104], m16, 3 add r10, 32 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 ret .v: ; vertical boxsum + ab lea r10, [wq-2] .v_loop: mova m16, [t1+r10*2+416*2] mova m17, [t1+r10*2+416*4] paddd m16, m16 paddd m17, m17 paddd m16, [t2+r10*2+416*2] ; hv sumsq paddd m17, [t2+r10*2+416*4] pmulld m16, m8 ; -a * 9 pmulld m17, m8 mova m1, [t1+r10*2+416*0] paddw m1, m1 paddw m1, [t2+r10*2+416*0] ; hv sum punpcklwd m0, m4, m1 ; b vpdpwssd m16, m0, m0 ; -p punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 pmulld m16, m11 ; p * s pmulld m17, m11 vpalignr m17{k2}, m16, m16, 2 mova m16, m20 paddusw m17, m12 psraw m17, 4 ; min(z, 255) - 256 vpermt2b m16, m17, m21 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m18, m19 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x pandn m16, m13, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddd m0, m14 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m14 vpternlogd m16, m0, m13, 0xd8 ; a | (b << 12) vpternlogd m17, m1, m13, 0xd8 mova [t3+r10*4+ 8], m16 mova [t3+r10*4+ 24], xm17 vextracti32x4 [t3+r10*4+ 56], m17, 2 mova [t3+r10*4+ 72], m17 vextracti128 [t3+r10*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+104], m16, 3 add r10, 32 jl .v_loop ret .prep_n: ; initial neighbor setup mov r10, wq mov t4, t3 add t3, 416*4 .prep_n_loop: mova m2, [t5+r10*4+0] mova m3, [t4+r10*4+0] paddd m2, [t5+r10*4+8] paddd m3, [t4+r10*4+8] paddd m0, m2, [t5+r10*4+4] paddd m1, m3, [t4+r10*4+4] pslld m0, 2 paddd m1, m1 ; ab[ 0] 222 psubd m0, m2 ; ab[-1] 343 mova [t3+r10*4+416*4], m1 paddd m1, m1 mova [t5+r10*4], m0 psubd m1, m3 ; ab[ 0] 343 mova [t4+r10*4], m1 add r10, 16 jl .prep_n_loop ret ; a+b are packed together in a single dword, but we can't do the ; full neighbor calculations before splitting them since we don't ; have sufficient precision. 
The solution is to do the calculations ; in two equal halves and split a and b before doing the final sum. ALIGN function_align .n: ; neighbor + output mov r10, wq .n_loop: mova m16, [t3+r10*4+ 0] paddd m16, [t3+r10*4+ 8] paddd m17, m16, [t3+r10*4+ 4] paddd m17, m17 ; ab[+1] 222 mova m2, [t3+r10*4+416*4+ 0] paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 mova m3, [t3+r10*4+416*4+64] paddd m1, m3, [t5+r10*4+64] mova [t3+r10*4+416*4+ 0], m17 paddd m17, m17 psubd m17, m16 ; ab[+1] 343 mova [t5+r10*4+ 0], m17 paddd m2, m17 ; ab[ 0] 222 + ab[+1] 343 mova m16, [t3+r10*4+64] paddd m16, [t3+r10*4+72] paddd m17, m16, [t3+r10*4+68] paddd m17, m17 mova [t3+r10*4+416*4+64], m17 paddd m17, m17 psubd m17, m16 mova [t5+r10*4+64], m17 pandn m16, m13, m0 psrld m0, 12 paddd m3, m17 pandn m17, m13, m2 psrld m2, 12 paddd m16, m17 ; a pandn m17, m13, m1 psrld m1, 12 paddd m0, m2 ; b + (1 << 8) pandn m2, m13, m3 psrld m3, 12 paddd m17, m2 pmovzxbd m2, [dstq+r10+ 0] paddd m1, m3 pmovzxbd m3, [dstq+r10+16] pmaddwd m16, m2 ; a * src pmaddwd m17, m3 packssdw m2, m3 psubd m0, m16 ; b - a * src + (1 << 8) psubd m1, m17 psrad m0, 9 psrad m1, 9 packssdw m0, m1 pmulhrsw m0, m15 paddw m0, m2 packuswb m0, m0 vpermd m16, m9, m0 mova [dstq+r10], ym16 add r10, 32 jl .n_loop mov r10, t5 mov t5, t4 mov t4, r10 add dstq, strideq ret cglobal sgr_filter_mix_8bpc, 4, 13, 28, 416*56+8, dst, stride, left, lpf, \ w, h, edge, params mov paramsq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m vbroadcasti128 m5, [sgr_shuf+1] add lpfq, wq vbroadcasti128 m6, [sgr_shuf+9] add dstq, wq vbroadcasti128 m7, [sgr_shuf+3] lea t3, [rsp+wq*4+416*24+8] vbroadcasti128 m8, [sgr_shuf+7] pxor m4, m4 vpbroadcastd m9, [pd_m9] vpsubd m11, m4, [paramsq+0] {1to16} ; -s0 vpbroadcastd m14, [pw_61448] vpsubd m12, m4, [paramsq+4] {1to16} ; -s1 vpbroadcastd m26, [paramsq+8] ; w0 w1 lea t1, [rsp+wq*2+12] vpbroadcastd m10, [pd_m25] neg wq vpbroadcastd m13, [pw_164_455] mov r10d, 0xfe vpbroadcastd m15, [pd_34816] kmovb k1, r10d mova m20, [sgr_x_by_x+64*0] mov r10, 0x3333333333333333 mova m21, [sgr_x_by_x+64*1] kmovq k2, r10 mova m22, [sgr_x_by_x+64*2] lea r12, [r_ext_mask+75] mova m23, [sgr_x_by_x+64*3] vpbroadcastd m24, [pd_m4096] vpbroadcastd m25, [sgr_shuf+28] ; 0x8000____ psllw m26, 5 mova xm27, [sgr_mix_perm] test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t2, t1 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx512icl).top_fixup add t1, 416*12 call .h_top lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov [rsp], r10 ; below call .hv0 .main: dec hd jz .height1 add lpfq, strideq call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: add lpfq, strideq call .hv0 test hd, hd jz .odd_height add lpfq, strideq call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, [rsp] call .hv0_bottom add lpfq, strideq call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov [rsp], r10 call .h lea t2, [t1+416*12] lea r10, [wq-2] .top_fixup_loop: mova m0, [t1+r10*2+416* 0] mova m1, [t1+r10*2+416* 2] mova m2, [t1+r10*2+416* 4] paddw m0, m0 mova m3, [t1+r10*2+416* 6] paddd m1, m1 mova m16, [t1+r10*2+416* 8] paddd m2, m2 mova m17, [t1+r10*2+416*10] mova [t2+r10*2+416* 0], m0 mova [t2+r10*2+416* 2], m1 mova 
[t2+r10*2+416* 4], m2 mova [t2+r10*2+416* 6], m3 mova [t2+r10*2+416* 8], m16 mova [t2+r10*2+416*10], m17 add r10, 32 jl .top_fixup_loop call .v0 jmp .main .h: ; horizontal boxsums lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .h_main .h_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .h_main .h_top: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu ym17, [lpfq+r10-2] .h_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp r10d, -34 jl .h_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r12+r10-8] vinserti32x8 m16, [r12+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .h_have_right: pshufb m3, m17, m5 pshufb m18, m17, m6 shufps m0, m3, m18, q2121 pmullw m2, m0, m0 pshufb m19, m17, m7 paddw m0, m19 pshufb m17, m8 paddw m0, m17 ; sum3 punpcklwd m16, m19, m17 punpcklwd m1, m2, m4 vpdpwssd m1, m16, m16 ; sumsq3 punpckhwd m19, m17 punpckhwd m2, m4 vpdpwssd m2, m19, m19 mova [t1+r10*2+416* 6], m0 mova [t1+r10*2+416* 8], m1 mova [t1+r10*2+416*10], m2 punpcklwd m19, m3, m18 paddw m0, m3 vpdpwssd m1, m19, m19 ; sumsq5 punpckhwd m3, m18 paddw m0, m18 ; sum5 vpdpwssd m2, m3, m3 mova [t1+r10*2+416* 0], m0 mova [t1+r10*2+416* 2], m1 mova [t1+r10*2+416* 4], m2 add r10, 32 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .hv0_main .hv0_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .hv0_main .hv0_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left .hv0_loop: movu ym17, [lpfq+r10-2] .hv0_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp r10d, -34 jl .hv0_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r12+r10-8] vinserti32x8 m16, [r12+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .hv0_have_right: pshufb m18, m17, m5 pshufb m19, m17, m6 shufps m1, m18, m19, q2121 pmullw m3, m1, m1 pshufb m0, m17, m7 paddw m1, m0 pshufb m17, m8 paddw m1, m17 ; sum3 punpcklwd m16, m0, m17 punpcklwd m2, m3, m4 vpdpwssd m2, m16, m16 ; sumsq3 punpckhwd m0, m17 punpckhwd m3, m4 vpdpwssd m3, m0, m0 paddw m0, m1, [t1+r10*2+416* 6] paddd m16, m2, [t1+r10*2+416* 8] paddd m17, m3, [t1+r10*2+416*10] mova [t1+r10*2+416* 6], m1 mova [t1+r10*2+416* 8], m2 mova [t1+r10*2+416*10], m3 paddw m1, m18 paddw m1, m19 ; sum5 mova [t3+r10*4+416*8+ 8], m1 paddw m1, [t1+r10*2+416* 0] mova [t1+r10*2+416* 0], m1 punpcklwd m1, m18, m19 vpdpwssd m2, m1, m1 ; sumsq5 punpckhwd m18, m19 vpdpwssd m3, m18, m18 mova [t3+r10*4+416*0+ 8], m2 ; we need a clean copy of the last row mova [t3+r10*4+416*0+72], m3 ; in case height is odd paddd m2, [t1+r10*2+416* 2] paddd m3, [t1+r10*2+416* 4] mova [t1+r10*2+416* 2], m2 mova [t1+r10*2+416* 4], m3 paddw m1, m0, [t2+r10*2+416* 6] paddd m2, m16, [t2+r10*2+416* 8] paddd m3, m17, [t2+r10*2+416*10] mova [t2+r10*2+416* 6], m0 mova [t2+r10*2+416* 8], m16 mova [t2+r10*2+416*10], m17 pmulld m16, m2, m9 ; -a3 * 9 pmulld m17, m3, m9 punpcklwd m0, m4, m1 ; b3 vpdpwssd m16, m0, m0 ; -p3 punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 pmulld m16, m12 ; p3 * s1 pmulld m17, m12 pmaddwd m0, m13 ; b3 * 455 pmaddwd m1, m13 vpalignr m17{k2}, m16, m16, 2 mova m16, m22 paddusw m17, m14 psraw m17, 4 ; min(z3, 255) - 256 vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m20, m21 ; 
sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x3 pandn m16, m24, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m15 vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) vpternlogd m17, m1, m24, 0xd8 mova [t3+r10*4+416*4+ 8], m16 mova [t3+r10*4+416*4+ 24], xm17 vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 mova [t3+r10*4+416*4+ 72], m17 vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+416*4+104], m16, 3 add r10, 32 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movd xm17, [leftq] vmovdqu32 ym17{k1}, [lpfq+wq-4] add leftq, 4 jmp .hv1_main .hv1_extend_left: vpbroadcastb xm17, [lpfq+wq] vmovdqu32 ym17{k1}, [lpfq+wq-4] jmp .hv1_main .hv1_bottom: lea r10, [wq-2] test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left .hv1_loop: movu ym17, [lpfq+r10-2] .hv1_main: vinserti32x8 m17, [lpfq+r10+6], 1 test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp r10d, -34 jl .hv1_have_right vpbroadcastb m0, [lpfq-1] movu ym16, [r12+r10-8] vinserti32x8 m16, [r12+r10+0], 1 vpternlogd m17, m0, m16, 0xe4 .hv1_have_right: pshufb m3, m17, m5 pshufb m19, m17, m6 shufps m2, m3, m19, q2121 pmullw m1, m2, m2 pshufb m18, m17, m7 paddw m2, m18 pshufb m17, m8 paddw m2, m17 ; sum3 punpcklwd m16, m17, m18 punpcklwd m0, m1, m4 vpdpwssd m0, m16, m16 ; sumsq3 punpckhwd m17, m18 punpckhwd m1, m4 vpdpwssd m1, m17, m17 paddd m16, m0, [t2+r10*2+416* 8] paddd m17, m1, [t2+r10*2+416*10] mova [t2+r10*2+416* 8], m0 mova [t2+r10*2+416*10], m1 punpcklwd m18, m3, m19 vpdpwssd m0, m18, m18 ; sumsq5 punpckhwd m18, m3, m19 vpdpwssd m1, m18, m18 paddw m3, m19 pmulld m16, m9 ; -a3 * 9 pmulld m17, m9 paddd m18, m0, [t2+r10*2+416*2] paddd m19, m1, [t2+r10*2+416*4] paddd m18, [t1+r10*2+416*2] paddd m19, [t1+r10*2+416*4] mova [t2+r10*2+416*2], m0 mova [t2+r10*2+416*4], m1 pmulld m18, m10 ; -a5 * 25 pmulld m19, m10 paddw m1, m2, [t2+r10*2+416* 6] mova [t2+r10*2+416* 6], m2 paddw m2, m3 ; sum5 paddw m3, m2, [t2+r10*2+416*0] paddw m3, [t1+r10*2+416*0] mova [t2+r10*2+416*0], m2 punpcklwd m0, m4, m1 ; b3 vpdpwssd m16, m0, m0 ; -p3 punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 punpcklwd m2, m3, m4 ; b5 vpdpwssd m18, m2, m2 ; -p5 punpckhwd m3, m4 vpdpwssd m19, m3, m3 pmulld m16, m12 ; p3 * s1 pmulld m17, m12 pmulld m18, m11 ; p5 * s0 pmulld m19, m11 pmaddwd m0, m13 ; b3 * 455 pmaddwd m1, m13 pmaddwd m2, m13 ; b5 * 164 pmaddwd m3, m13 vpalignr m17{k2}, m16, m16, 2 vpalignr m19{k2}, m18, m18, 2 paddusw m17, m14 mova m16, m22 psraw m17, 4 ; min(z3, 255) - 256 vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] paddusw m19, m14 mova m18, m22 psraw m19, 4 ; min(z5, 255) - 256 vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] vpmovb2m k4, m19 vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x3 vmovdqu8 m19{k4}, m18 ; x5 pandn m16, m24, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 pandn m18, m24, m19 psrld m19, 16 pmulld m2, m18 pmulld m3, m19 paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m15 vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) vpternlogd m17, m1, m24, 0xd8 mova [t3+r10*4+416*8+ 8], m16 mova [t3+r10*4+416*8+ 24], xm17 vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m3, m15 mova [t3+r10*4+416*8+ 72], m17 vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 vextracti32x4 [t3+r10*4+416*8+104], m16, 3 vpternlogd m18, m2, m24, 0xd8 ; a5 | 
(b5 << 12) vpternlogd m19, m3, m24, 0xd8 mova [t3+r10*4+416*0+ 8], m18 mova [t3+r10*4+416*0+ 24], xm19 vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 mova [t3+r10*4+416*0+ 72], m19 vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 vextracti32x4 [t3+r10*4+416*0+104], m18, 3 add r10, 32 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) lea r10, [wq-2] .v0_loop: mova m2, [t1+r10*2+416* 8] mova m3, [t1+r10*2+416*10] paddd m2, m2 paddd m3, m3 paddd m16, m2, [t2+r10*2+416* 8] paddd m17, m3, [t2+r10*2+416*10] mova m0, [t1+r10*2+416* 6] paddw m0, m0 paddw m1, m0, [t2+r10*2+416* 6] pmulld m16, m9 ; -a3 * 9 pmulld m17, m9 mova [t2+r10*2+416* 6], m0 mova [t2+r10*2+416* 8], m2 mova [t2+r10*2+416*10], m3 mova m2, [t1+r10*2+416*0] mova m3, [t1+r10*2+416*2] mova m18, [t1+r10*2+416*4] punpcklwd m0, m4, m1 ; b3 vpdpwssd m16, m0, m0 ; -p3 punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 pmulld m16, m12 ; p3 * s1 pmulld m17, m12 pmaddwd m0, m13 ; b3 * 455 pmaddwd m1, m13 mova [t3+r10*4+416*8+ 8], m2 mova [t3+r10*4+416*0+ 8], m3 mova [t3+r10*4+416*0+72], m18 vpalignr m17{k2}, m16, m16, 2 mova m16, m22 paddusw m17, m14 psraw m17, 4 ; min(z3, 255) - 256 vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x3 pandn m16, m24, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 paddw m2, m2 ; cc5 paddd m3, m3 paddd m18, m18 mova [t1+r10*2+416*0], m2 mova [t1+r10*2+416*2], m3 mova [t1+r10*2+416*4], m18 paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m15 vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) vpternlogd m17, m1, m24, 0xd8 mova [t3+r10*4+416*4+ 8], m16 mova [t3+r10*4+416*4+ 24], xm17 vextracti32x4 [t3+r10*4+416*4+ 56], m17, 2 mova [t3+r10*4+416*4+ 72], m17 vextracti128 [t3+r10*4+416*4+ 72], ym16, 1 vextracti32x4 [t3+r10*4+416*4+104], m16, 3 add r10, 32 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) lea r10, [wq-2] .v1_loop: mova m0, [t1+r10*2+416* 8] paddd m16, m0, [t2+r10*2+416* 8] mova m1, [t1+r10*2+416*10] paddd m17, m1, [t2+r10*2+416*10] mova m2, [t3+r10*4+416*0+ 8] paddd m18, m2, [t2+r10*2+416* 2] mova m3, [t3+r10*4+416*0+72] paddd m19, m3, [t2+r10*2+416* 4] paddd m18, [t1+r10*2+416* 2] paddd m19, [t1+r10*2+416* 4] mova [t2+r10*2+416* 8], m0 mova [t2+r10*2+416*10], m1 mova [t2+r10*2+416* 2], m2 mova [t2+r10*2+416* 4], m3 pmulld m16, m9 ; -a3 * 9 pmulld m17, m9 pmulld m18, m10 ; -a5 * 25 pmulld m19, m10 mova m0, [t1+r10*2+416* 6] paddw m1, m0, [t2+r10*2+416* 6] mova m2, [t3+r10*4+416*8+ 8] paddw m3, m2, [t2+r10*2+416*0] paddw m3, [t1+r10*2+416*0] mova [t2+r10*2+416* 6], m0 mova [t2+r10*2+416*0], m2 punpcklwd m0, m4, m1 ; b3 vpdpwssd m16, m0, m0 ; -p3 punpckhwd m1, m4, m1 vpdpwssd m17, m1, m1 punpcklwd m2, m3, m4 ; b5 vpdpwssd m18, m2, m2 ; -p5 punpckhwd m3, m4 vpdpwssd m19, m3, m3 pmulld m16, m12 ; p3 * s1 pmulld m17, m12 pmulld m18, m11 ; p5 * s0 pmulld m19, m11 pmaddwd m0, m13 ; b3 * 455 pmaddwd m1, m13 pmaddwd m2, m13 ; b5 * 164 pmaddwd m3, m13 vpalignr m17{k2}, m16, m16, 2 vpalignr m19{k2}, m18, m18, 2 paddusw m17, m14 mova m16, m22 psraw m17, 4 ; min(z3, 255) - 256 vpermt2b m16, m17, m23 ; sgr_x_by_x[128..255] vpmovb2m k3, m17 vpermi2b m17, m20, m21 ; sgr_x_by_x[ 0..127] paddusw m19, m14 mova m18, m22 psraw m19, 4 ; min(z5, 255) - 256 vpermt2b m18, m19, m23 ; sgr_x_by_x[128..255] vpmovb2m k4, m19 vpermi2b m19, m20, m21 ; sgr_x_by_x[ 0..127] vmovdqu8 m17{k3}, m16 ; x3 vmovdqu8 m19{k4}, m18 ; x5 pandn m16, m24, m17 psrld m17, 16 pmulld m0, m16 pmulld m1, m17 pandn m18, m24, m19 
psrld m19, m19, 16 pmulld m2, m18 pmulld m3, m19 paddd m0, m15 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m15 vpternlogd m16, m0, m24, 0xd8 ; a3 | (b3 << 12) vpternlogd m17, m1, m24, 0xd8 mova [t3+r10*4+416*8+ 8], m16 mova [t3+r10*4+416*8+ 24], xm17 vextracti32x4 [t3+r10*4+416*8+ 56], m17, 2 paddd m2, m15 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m3, m15 mova [t3+r10*4+416*8+ 72], m17 vextracti128 [t3+r10*4+416*8+ 72], ym16, 1 vextracti32x4 [t3+r10*4+416*8+104], m16, 3 vpternlogd m18, m2, m24, 0xd8 ; a5 | (b5 << 12) vpternlogd m19, m3, m24, 0xd8 mova [t3+r10*4+416*0+ 8], m18 mova [t3+r10*4+416*0+ 24], xm19 vextracti32x4 [t3+r10*4+416*0+ 56], m19, 2 mova [t3+r10*4+416*0+ 72], m19 vextracti128 [t3+r10*4+416*0+ 72], ym18, 1 vextracti32x4 [t3+r10*4+416*0+104], m18, 3 add r10, 32 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup mov r10, wq .prep_n_loop: movu m0, [t3+r10*4+416*0+4] paddd m1, m0, [t3+r10*4+416*0+0] mova m16, [t3+r10*4+416*4+0] paddd m1, [t3+r10*4+416*0+8] mova m17, [t3+r10*4+416*8+0] paddd m16, [t3+r10*4+416*4+8] paddd m17, [t3+r10*4+416*8+8] paddd m2, m16, [t3+r10*4+416*4+4] paddd m3, m17, [t3+r10*4+416*8+4] paddd m0, m1 pslld m1, 2 pslld m2, 2 paddd m1, m0 ; ab5 565 paddd m3, m3 ; ab3[ 0] 222 psubd m2, m16 ; ab3[-1] 343 mova [t3+r10*4+416*20], m3 pandn m0, m24, m1 ; a5 565 mova [t3+r10*4+416*24], m2 psrld m1, 12 ; b5 565 mova [t3+r10*4+416*12], m0 paddd m3, m3 mova [t3+r10*4+416*16], m1 psubd m3, m17 ; ab3[ 0] 343 mova [t3+r10*4+416*28], m3 add r10, 16 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) mov r10, wq .n0_loop: movu m2, [t3+r10*4+4] paddd m3, m2, [t3+r10*4+0] paddd m3, [t3+r10*4+8] mova m1, [t3+r10*4+416*4+0] paddd m2, m3 pslld m3, 2 paddd m1, [t3+r10*4+416*4+8] paddd m3, m2 pandn m2, m24, m3 psrld m3, 12 paddd m0, m2, [t3+r10*4+416*12] ; a5 paddd m16, m3, [t3+r10*4+416*16] ; b5 + (1 << 8) mova [t3+r10*4+416*12], m2 mova [t3+r10*4+416*16], m3 paddd m2, m1, [t3+r10*4+416*4+4] paddd m2, m2 ; ab3[ 1] 222 mova m3, [t3+r10*4+416*20] paddd m17, m3, [t3+r10*4+416*24] ; ab3[ 0] 222 + ab3[-1] 343 mova [t3+r10*4+416*20], m2 paddd m2, m2 psubd m2, m1 ; ab3[ 1] 343 mova [t3+r10*4+416*24], m2 paddd m2, m3 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m1, m24, m17 psrld m17, 12 pandn m3, m24, m2 psrld m2, 12 paddd m1, m3 ; a3 pmovzxbd m3, [dstq+r10] paddd m17, m2 ; b3 + (1 << 8) pmaddwd m0, m3 ; a5 * src pmaddwd m1, m3 ; a3 * src vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) psubd m16, m0 ; b5 - a5 * src + (1 << 8) psubd m17, m1 ; b3 - a3 * src + (1 << 8) psrld m16, 9 pslld m17, 7 vmovdqu8 m17{k2}, m16 vpdpwssd m3, m17, m26 packuswb m3, m2 vpermb m16, m27, m3 mova [dstq+r10], xm16 add r10, 16 jl .n0_loop add dstq, strideq ret ALIGN function_align .n1: ; neighbor + output (odd rows) mov r10, wq .n1_loop: mova m1, [t3+r10*4+416*8+0] paddd m1, [t3+r10*4+416*8+8] paddd m2, m1, [t3+r10*4+416*8+4] paddd m2, m2 ; ab3[ 1] 222 mova m0, [t3+r10*4+416*20] paddd m17, m0, [t3+r10*4+416*28] ; ab3[ 0] 222 + ab3[-1] 343 pmovzxbd m3, [dstq+r10] mova [t3+r10*4+416*20], m2 paddd m2, m2 psubd m2, m1 ; ab3[ 1] 343 mova [t3+r10*4+416*28], m2 paddd m0, m2 ; ab3[ 0] 222 + ab3[ 1] 343 pandn m1, m24, m17 psrld m17, 12 pandn m2, m24, m0 psrld m0, 12 paddd m1, m2 ; a3 paddd m17, m0 ; b3 + (1 << 8) mova m16, [t3+r10*4+416*16] ; b5 + (1 << 7) pmaddwd m1, m3 ; a3 * src pmaddwd m0, m3, [t3+r10*4+416*12] ; a5 * src vpshldd m3, m25, 16 ; (dst << 16) + (1 << 15) psubd m17, m1 ; b3 - a3 * src + (1 << 8) psubd m16, m0 ; b5 - a5 * src + (1 << 7) pslld 
m17, 7 palignr m17{k2}, m16, m16, 1 vpdpwssd m3, m17, m26 packuswb m3, m3 vpermb m16, m27, m3 mova [dstq+r10], xm16 add r10, 16 jl .n1_loop add dstq, strideq ret %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/looprestoration_sse.asm000064400000000000000000003202321046102023000167040ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2018, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 wiener_init: db 6, 7, 6, 7, 6, 7, 6, 7, 0, 0, 0, 0, 2, 4, 2, 4 wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14 wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12 wiener_shufD: db 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1, 10, -1, 11, -1 wiener_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 sgr_lshuf3: db 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 sgr_lshuf5: db 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pb_right_ext_mask: times 24 db 0xff times 8 db 0 pb_1: times 16 db 1 pb_3: times 16 db 3 pw_256: times 8 dw 256 pw_2056: times 8 dw 2056 pw_m16380: times 8 dw -16380 pd_4096: times 4 dd 4096 pd_34816: times 4 dd 34816 pd_0xffff: times 4 dd 0xffff pd_0xf00800a4: times 4 dd 0xf00800a4 pd_0xf00801c7: times 4 dd 0xf00801c7 cextern sgr_x_by_x SECTION .text %macro movif64 2 ; dst, src %if ARCH_X86_64 mov %1, %2 %endif %endmacro %macro movif32 2 ; dst, src %if ARCH_X86_32 mov %1, %2 %endif %endmacro %if ARCH_X86_32 %define PIC_base_offset $$ %macro SETUP_PIC 1-3 1,0 ; PIC_reg, save_PIC_reg, restore_PIC_reg %assign pic_reg_stk_off 4 %xdefine PIC_reg %1 %if %2 == 1 mov [esp], %1 %endif LEA PIC_reg, PIC_base_offset %if %3 == 1 XCHG_PIC_REG %endif %endmacro %macro XCHG_PIC_REG 0 mov [esp+pic_reg_stk_off], PIC_reg %assign pic_reg_stk_off (pic_reg_stk_off+4) % 8 mov PIC_reg, [esp+pic_reg_stk_off] %endmacro %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) %else %macro XCHG_PIC_REG 0 %endmacro %define PIC_sym(sym) (sym) %endif %macro WIENER 0 %if ARCH_X86_64 DECLARE_REG_TMP 9, 7, 10, 11, 12, 13, 14 ; ring buffer pointers cglobal wiener_filter7_8bpc, 4, 15, 16, 
-384*12-16, dst, stride, left, lpf, \ w, h, edge, flt, x %define tmpstrideq strideq %define base 0 mov fltq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m movq m14, [fltq] add lpfq, wq movq m7, [fltq+16] add dstq, wq lea t1, [rsp+wq*2+16] mova m15, [pw_2056] neg wq %if cpuflag(ssse3) pshufb m14, [wiener_init] mova m8, [wiener_shufA] pshufd m12, m14, q2222 ; x0 x0 mova m9, [wiener_shufB] pshufd m13, m14, q3333 ; x1 x2 mova m10, [wiener_shufC] punpcklqdq m14, m14 ; x3 mova m11, [wiener_shufD] %else mova m10, [pw_m16380] punpcklwd m14, m14 pshufd m11, m14, q0000 ; x0 pshufd m12, m14, q1111 ; x1 pshufd m13, m14, q2222 ; x2 pshufd m14, m14, q3333 ; x3 %endif %else DECLARE_REG_TMP 4, 0, _, 5 %if cpuflag(ssse3) %define m10 [base+wiener_shufC] %define m11 [base+wiener_shufD] %define stk_off 96 %else %define m10 [base+pw_m16380] %define m11 [stk+96] %define stk_off 112 %endif cglobal wiener_filter7_8bpc, 0, 7, 8, -384*12-stk_off, _, x, left, lpf, tmpstride %define base r6-pb_right_ext_mask-21 %define stk esp %define dstq leftq %define edgeb byte edged %define edged [stk+ 8] %define dstmp [stk+12] %define hd dword [stk+16] %define wq [stk+20] %define strideq [stk+24] %define leftmp [stk+28] %define t2 [stk+32] %define t4 [stk+36] %define t5 [stk+40] %define t6 [stk+44] %define m8 [base+wiener_shufA] %define m9 [base+wiener_shufB] %define m12 [stk+48] %define m13 [stk+64] %define m14 [stk+80] %define m15 [base+pw_2056] mov r1, r6m ; flt mov r0, r0m ; dst mov r4, r4m ; w mov lpfq, lpfm mov r2, r7m ; edge mov r5, r5m ; h movq m3, [r1+ 0] movq m7, [r1+16] add r0, r4 mov r1, r1m ; stride add lpfq, r4 mov edged, r2 mov r2, r2m ; left mov dstmp, r0 lea t1, [rsp+r4*2+stk_off] mov hd, r5 neg r4 LEA r6, pb_right_ext_mask+21 mov wq, r4 mov strideq, r1 mov leftmp, r2 mov r4, r1 %if cpuflag(ssse3) pshufb m3, [base+wiener_init] pshufd m1, m3, q2222 pshufd m2, m3, q3333 punpcklqdq m3, m3 %else punpcklwd m3, m3 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m11, m0 %endif mova m12, m1 mova m13, m2 mova m14, m3 %endif psllw m7, 5 pshufd m6, m7, q0000 ; y0 y1 pshufd m7, m7, q1111 ; y2 y3 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t6, t1 mov t5, t1 add t1, 384*2 call .h_top lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp add t3, tmpstrideq mov [rsp], t3 ; below mov t4, t1 add t1, 384*2 call .h mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 .main: lea t0, [t1+384*2] .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v3 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .v1: call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v RET .no_top: lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp lea t3, [t3+tmpstrideq*2] mov [rsp], t3 call .h mov t6, t1 mov t5, t1 mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h mov t2, t1 dec hd jz .v2 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v3 lea t0, [t1+384*2] call .hv dec hd jz .v3 add t0, 384*8 call .hv dec hd jnz .main .v3: call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v .v2: call mangle(private_prefix %+ _wiener_filter7_8bpc_ssse3).v jmp .v1 .extend_right: movd m2, [lpfq-4] %if ARCH_X86_64 push r0 lea r0, [pb_right_ext_mask+21] movu m0, [r0+xq+0] movu m1, [r0+xq+8] pop r0 %else movu m0, [r6+xq+0] movu m1, [r6+xq+8] %endif %if cpuflag(ssse3) pshufb m2, [base+pb_3] %else punpcklbw m2, m2 pshuflw m2, m2, q3333 
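; (The remaining punpcklqdq finishes broadcasting the last valid pixel to all
;  byte lanes; the pand/pandn/por sequence below then uses the
;  pb_right_ext_mask bytes loaded above to select between in-bounds source
;  pixels and that replicated edge pixel.)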
punpcklqdq m2, m2 %endif pand m4, m0 pand m5, m1 pandn m0, m2 pandn m1, m2 por m4, m0 por m5, m1 ret .h: %define stk esp+4 ; offset due to call mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movifnidn leftq, leftmp mova m4, [lpfq+xq] movd m5, [leftq] add leftq, 4 pslldq m4, 4 por m4, m5 movifnidn leftmp, leftq jmp .h_main .h_extend_left: %if cpuflag(ssse3) mova m4, [lpfq+xq] pshufb m4, [base+wiener_l_shuf] %else mova m5, [lpfq+xq] pshufd m4, m5, q2103 punpcklbw m5, m5 punpcklwd m5, m5 movss m4, m5 %endif jmp .h_main .h_top: mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+xq-4] .h_main: movu m5, [lpfq+xq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp xd, -18 jl .h_have_right call .extend_right .h_have_right: %macro %%h7 0 %if cpuflag(ssse3) pshufb m0, m4, m8 pmaddubsw m0, m12 pshufb m1, m5, m8 pmaddubsw m1, m12 pshufb m2, m4, m9 pmaddubsw m2, m13 pshufb m3, m5, m9 pmaddubsw m3, m13 paddw m0, m2 pshufb m2, m4, m10 pmaddubsw m2, m13 paddw m1, m3 pshufb m3, m5, m10 pmaddubsw m3, m13 pshufb m4, m11 paddw m0, m2 pmullw m2, m14, m4 pshufb m5, m11 paddw m1, m3 pmullw m3, m14, m5 psllw m4, 7 psllw m5, 7 paddw m0, m2 mova m2, [base+pw_m16380] paddw m1, m3 paddw m4, m2 paddw m5, m2 paddsw m0, m4 paddsw m1, m5 %else psrldq m0, m4, 1 pslldq m1, m4, 1 pxor m3, m3 punpcklbw m0, m3 punpckhbw m1, m3 paddw m0, m1 pmullw m0, m11 psrldq m1, m4, 2 pslldq m2, m4, 2 punpcklbw m1, m3 punpckhbw m2, m3 paddw m1, m2 pmullw m1, m12 paddw m0, m1 pshufd m2, m4, q0321 punpcklbw m2, m3 pmullw m1, m14, m2 paddw m0, m1 psrldq m1, m4, 3 pslldq m4, 3 punpcklbw m1, m3 punpckhbw m4, m3 paddw m1, m4 pmullw m1, m13 paddw m0, m1 psllw m2, 7 paddw m2, m10 paddsw m0, m2 psrldq m1, m5, 1 pslldq m2, m5, 1 punpcklbw m1, m3 punpckhbw m2, m3 paddw m1, m2 pmullw m1, m11 psrldq m2, m5, 2 pslldq m4, m5, 2 punpcklbw m2, m3 punpckhbw m4, m3 paddw m2, m4 pmullw m2, m12 paddw m1, m2 pshufd m4, m5, q0321 punpcklbw m4, m3 pmullw m2, m14, m4 paddw m1, m2 psrldq m2, m5, 3 pslldq m5, 3 punpcklbw m2, m3 punpckhbw m5, m3 paddw m2, m5 pmullw m2, m13 paddw m1, m2 psllw m4, 7 paddw m4, m10 paddsw m1, m4 %endif %endmacro %%h7 psraw m0, 3 psraw m1, 3 paddw m0, m15 paddw m1, m15 mova [t1+xq*2+ 0], m0 mova [t1+xq*2+16], m1 add xq, 16 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movifnidn leftq, leftmp mova m4, [lpfq+xq] movd m5, [leftq] add leftq, 4 pslldq m4, 4 por m4, m5 movifnidn leftmp, leftq jmp .hv_main .hv_extend_left: %if cpuflag(ssse3) mova m4, [lpfq+xq] pshufb m4, [base+wiener_l_shuf] %else mova m5, [lpfq+xq] pshufd m4, m5, q2103 punpcklbw m5, m5 punpcklwd m5, m5 movss m4, m5 %endif jmp .hv_main .hv_bottom: mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left .hv_loop: movu m4, [lpfq+xq-4] .hv_main: movu m5, [lpfq+xq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp xd, -18 jl .hv_have_right call .extend_right .hv_have_right: %%h7 %if ARCH_X86_64 mova m2, [t4+xq*2] paddw m2, [t2+xq*2] %else mov r2, t4 mova m2, [r2+xq*2] mov r2, t2 paddw m2, [r2+xq*2] mov r2, t5 %endif mova m3, [t3+xq*2] %if ARCH_X86_64 mova m5, [t5+xq*2] %else mova m5, [r2+xq*2] mov r2, t6 %endif paddw m5, [t1+xq*2] psraw m0, 3 psraw m1, 3 paddw m0, m15 paddw m1, m15 %if ARCH_X86_64 paddw m4, m0, [t6+xq*2] %else paddw m4, m0, [r2+xq*2] mov r2, t4 %endif mova [t0+xq*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4, m5 pmaddwd m3, m6 punpckhwd m4, m5 pmaddwd m4, m6 paddd m0, m3 mova m3, [t3+xq*2+16] paddd 
m4, m2 %if ARCH_X86_64 mova m2, [t4+xq*2+16] paddw m2, [t2+xq*2+16] mova m5, [t5+xq*2+16] %else mova m2, [r2+xq*2+16] mov r2, t2 paddw m2, [r2+xq*2+16] mov r2, t5 mova m5, [r2+xq*2+16] mov r2, t6 %endif paddw m5, [t1+xq*2+16] packuswb m0, m4 %if ARCH_X86_64 paddw m4, m1, [t6+xq*2+16] %else paddw m4, m1, [r2+xq*2+16] mov dstq, dstmp %endif mova [t0+xq*2+16], m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4, m5 pmaddwd m3, m6 punpckhwd m4, m5 pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .hv_loop add dstq, strideq %if ARCH_X86_64 mov t6, t5 mov t5, t4 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t6 %else mov dstmp, dstq mov r1, t5 mov r2, t4 mov t6, r1 mov t5, r2 mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, r1 %endif ret %if cpuflag(ssse3) ; identical in sse2 and ssse3, so share code .v: mov xq, wq .v_loop: %if ARCH_X86_64 mova m1, [t4+xq*2] paddw m1, [t2+xq*2] %else mov r2, t4 mova m1, [r2+xq*2] mov r2, t2 paddw m1, [r2+xq*2] mov r2, t6 %endif mova m2, [t3+xq*2] mova m4, [t1+xq*2] %if ARCH_X86_64 paddw m3, m4, [t6+xq*2] paddw m4, [t5+xq*2] %else paddw m3, m4, [r2+xq*2] mov r2, t5 paddw m4, [r2+xq*2] mov r2, t4 %endif punpcklwd m0, m1, m2 pmaddwd m0, m7 punpckhwd m1, m2 pmaddwd m1, m7 punpcklwd m2, m3, m4 pmaddwd m2, m6 punpckhwd m3, m4 pmaddwd m3, m6 paddd m0, m2 paddd m1, m3 %if ARCH_X86_64 mova m2, [t4+xq*2+16] paddw m2, [t2+xq*2+16] %else mova m2, [r2+xq*2+16] mov r2, t2 paddw m2, [r2+xq*2+16] mov r2, t6 %endif mova m3, [t3+xq*2+16] mova m5, [t1+xq*2+16] %if ARCH_X86_64 paddw m4, m5, [t6+xq*2+16] paddw m5, [t5+xq*2+16] %else paddw m4, m5, [r2+xq*2+16] mov r2, t5 paddw m5, [r2+xq*2+16] movifnidn dstq, dstmp %endif packuswb m0, m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4, m5 pmaddwd m3, m6 punpckhwd m4, m5 pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .v_loop add dstq, strideq %if ARCH_X86_64 mov t6, t5 mov t5, t4 %else mov dstmp, dstq mov r1, t5 mov r2, t4 mov t6, r1 mov t5, r2 %endif mov t4, t3 mov t3, t2 mov t2, t1 ret %endif %if ARCH_X86_64 cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \ w, h, edge, flt, x mov fltq, r6mp mov wd, wm movifnidn hd, hm mov edged, r7m movq m14, [fltq] add lpfq, wq movq m7, [fltq+16] add dstq, wq mova m8, [pw_m16380] lea t1, [rsp+wq*2+16] mova m15, [pw_2056] neg wq %if cpuflag(ssse3) pshufb m14, [wiener_init] mova m9, [wiener_shufB] pshufd m13, m14, q3333 ; x1 x2 mova m10, [wiener_shufC] punpcklqdq m14, m14 ; x3 mova m11, [wiener_shufD] mova m12, [wiener_l_shuf] %else punpcklwd m14, m14 pshufd m11, m14, q1111 ; x1 pshufd m13, m14, q2222 ; x2 pshufd m14, m14, q3333 ; x3 %endif %else %if cpuflag(ssse3) %define stk_off 80 %else %define m11 [stk+80] %define stk_off 96 %endif cglobal wiener_filter5_8bpc, 0, 7, 8, -384*8-stk_off, _, x, left, lpf, tmpstride %define stk esp %define leftmp [stk+28] %define m8 [base+pw_m16380] %define m12 [base+wiener_l_shuf] %define m14 [stk+48] mov r1, r6m ; flt mov r0, r0m ; dst mov r4, r4m ; w mov lpfq, lpfm mov r2, r7m ; edge mov r5, r5m ; h movq m2, [r1+ 0] movq m7, [r1+16] add r0, r4 mov r1, r1m ; stride add lpfq, r4 mov edged, r2 mov r2, r2m ; left mov dstmp, r0 lea t1, [rsp+r4*2+stk_off] mov hd, r5 neg r4 LEA r6, pb_right_ext_mask+21 mov wq, r4 mov strideq, r1 mov leftmp, r2 mov r4, r1 %if cpuflag(ssse3) pshufb m2, [base+wiener_init] 
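; (32-bit coefficient setup for this SSSE3 branch, mirroring the 64-bit path
;  above: m1 ends up holding the x1 x2 taps and m2 the broadcast x3 tap,
;  which are then copied into m13/m14.)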
pshufd m1, m2, q3333 punpcklqdq m2, m2 %else punpcklwd m2, m2 pshufd m0, m2, q1111 pshufd m1, m2, q2222 pshufd m2, m2, q3333 mova m11, m0 %endif mova m13, m1 mova m14, m2 %endif psllw m7, 5 pshufd m6, m7, q0000 ; __ y1 pshufd m7, m7, q1111 ; y2 y3 test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, strideq mov t4, t1 add t1, 384*2 call .h_top lea xq, [lpfq+tmpstrideq*4] mov lpfq, dstmp mov t3, t1 add t1, 384*2 add xq, tmpstrideq mov [rsp], xq ; below call .h mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 .main: mov t0, t4 .main_loop: call .hv dec hd jnz .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .v2 mov lpfq, [rsp] call .hv_bottom add lpfq, strideq call .hv_bottom .end: RET .no_top: lea t3, [lpfq+tmpstrideq*4] mov lpfq, dstmp lea t3, [t3+tmpstrideq*2] mov [rsp], t3 call .h mov t4, t1 mov t3, t1 mov t2, t1 dec hd jz .v1 add lpfq, strideq add t1, 384*2 call .h dec hd jz .v2 lea t0, [t1+384*2] call .hv dec hd jz .v2 add t0, 384*6 call .hv dec hd jnz .main .v2: call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v add dstq, strideq mov t4, t3 mov t3, t2 mov t2, t1 movifnidn dstmp, dstq .v1: call mangle(private_prefix %+ _wiener_filter5_8bpc_ssse3).v jmp .end .h: %define stk esp+4 mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movifnidn leftq, leftmp mova m4, [lpfq+xq] movd m5, [leftq] add leftq, 4 pslldq m4, 4 por m4, m5 movifnidn leftmp, leftq jmp .h_main .h_extend_left: %if cpuflag(ssse3) mova m4, [lpfq+xq] pshufb m4, m12 %else mova m5, [lpfq+xq] pshufd m4, m5, q2103 punpcklbw m5, m5 punpcklwd m5, m5 movss m4, m5 %endif jmp .h_main .h_top: mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left .h_loop: movu m4, [lpfq+xq-4] .h_main: movu m5, [lpfq+xq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp xd, -17 jl .h_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right .h_have_right: %macro %%h5 0 %if cpuflag(ssse3) pshufb m0, m4, m9 pmaddubsw m0, m13 pshufb m1, m5, m9 pmaddubsw m1, m13 pshufb m2, m4, m10 pmaddubsw m2, m13 pshufb m3, m5, m10 pmaddubsw m3, m13 pshufb m4, m11 paddw m0, m2 pmullw m2, m14, m4 pshufb m5, m11 paddw m1, m3 pmullw m3, m14, m5 psllw m4, 7 psllw m5, 7 paddw m4, m8 paddw m5, m8 paddw m0, m2 paddw m1, m3 paddsw m0, m4 paddsw m1, m5 %else psrldq m0, m4, 2 pslldq m1, m4, 2 pxor m3, m3 punpcklbw m0, m3 punpckhbw m1, m3 paddw m0, m1 pmullw m0, m11 pshufd m2, m4, q0321 punpcklbw m2, m3 pmullw m1, m14, m2 paddw m0, m1 psrldq m1, m4, 3 pslldq m4, 3 punpcklbw m1, m3 punpckhbw m4, m3 paddw m1, m4 pmullw m1, m13 paddw m0, m1 psllw m2, 7 paddw m2, m8 paddsw m0, m2 psrldq m1, m5, 2 pslldq m4, m5, 2 punpcklbw m1, m3 punpckhbw m4, m3 paddw m1, m4 pmullw m1, m11 pshufd m4, m5, q0321 punpcklbw m4, m3 pmullw m2, m14, m4 paddw m1, m2 psrldq m2, m5, 3 pslldq m5, 3 punpcklbw m2, m3 punpckhbw m5, m3 paddw m2, m5 pmullw m2, m13 paddw m1, m2 psllw m4, 7 paddw m4, m8 paddsw m1, m4 %endif %endmacro %%h5 psraw m0, 3 psraw m1, 3 paddw m0, m15 paddw m1, m15 mova [t1+xq*2+ 0], m0 mova [t1+xq*2+16], m1 add xq, 16 jl .h_loop ret ALIGN function_align .hv: add lpfq, strideq mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movifnidn leftq, leftmp mova m4, [lpfq+xq] movd m5, [leftq] add leftq, 4 pslldq m4, 4 por m4, m5 movifnidn leftmp, leftq jmp .hv_main .hv_extend_left: %if cpuflag(ssse3) mova m4, [lpfq+xq] pshufb m4, m12 %else mova m5, [lpfq+xq] pshufd m4, m5, q2103 punpcklbw m5, m5 punpcklwd m5, m5 movss m4, m5 %endif jmp .hv_main .hv_bottom: mov xq, wq test edgeb, 1 ; LR_HAVE_LEFT jz 
.hv_extend_left .hv_loop: movu m4, [lpfq+xq-4] .hv_main: movu m5, [lpfq+xq+4] test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp xd, -17 jl .hv_have_right call mangle(private_prefix %+ _wiener_filter7_8bpc %+ SUFFIX).extend_right .hv_have_right: %%h5 mova m2, [t3+xq*2] paddw m2, [t1+xq*2] psraw m0, 3 psraw m1, 3 paddw m0, m15 paddw m1, m15 %if ARCH_X86_64 mova m3, [t2+xq*2] paddw m4, m0, [t4+xq*2] %else mov r2, t2 mova m3, [r2+xq*2] mov r2, t4 paddw m4, m0, [r2+xq*2] %endif mova [t0+xq*2], m0 punpcklwd m0, m2, m3 pmaddwd m0, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4, m4 pmaddwd m3, m6 punpckhwd m4, m4 pmaddwd m4, m6 paddd m0, m3 paddd m4, m2 mova m2, [t3+xq*2+16] paddw m2, [t1+xq*2+16] packuswb m0, m4 %if ARCH_X86_64 mova m3, [t2+xq*2+16] paddw m4, m1, [t4+xq*2+16] %else paddw m4, m1, [r2+xq*2+16] mov r2, t2 mova m3, [r2+xq*2+16] mov dstq, dstmp %endif mova [t0+xq*2+16], m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4, m4 pmaddwd m3, m6 punpckhwd m4, m4 pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .hv_loop add dstq, strideq mov t4, t3 mov t3, t2 mov t2, t1 mov t1, t0 mov t0, t4 movifnidn dstmp, dstq ret %if cpuflag(ssse3) .v: mov xq, wq .v_loop: mova m3, [t1+xq*2] paddw m1, m3, [t3+xq*2] %if ARCH_X86_64 mova m2, [t2+xq*2] paddw m3, [t4+xq*2] %else mov r2, t2 mova m2, [r2+xq*2] mov r2, t4 paddw m3, [r2+xq*2] %endif punpcklwd m0, m1, m2 pmaddwd m0, m7 punpckhwd m1, m2 pmaddwd m1, m7 punpcklwd m2, m3 pmaddwd m2, m6 punpckhwd m3, m3 pmaddwd m3, m6 paddd m0, m2 paddd m1, m3 mova m4, [t1+xq*2+16] paddw m2, m4, [t3+xq*2+16] %if ARCH_X86_64 mova m3, [t2+xq*2+16] paddw m4, [t4+xq*2+16] %else paddw m4, [r2+xq*2+16] mov r2, t2 mova m3, [r2+xq*2+16] mov dstq, dstmp %endif packuswb m0, m1 punpcklwd m1, m2, m3 pmaddwd m1, m7 punpckhwd m2, m3 pmaddwd m2, m7 punpcklwd m3, m4 pmaddwd m3, m6 punpckhwd m4, m4 pmaddwd m4, m6 paddd m1, m3 paddd m2, m4 packuswb m1, m2 psrlw m0, 8 psrlw m1, 8 packuswb m0, m1 mova [dstq+xq], m0 add xq, 16 jl .v_loop ret %endif %endmacro INIT_XMM sse2 WIENER INIT_XMM ssse3 WIENER ;;;;;;;;;;;;;;;;;;;;;;;;;; ;; self-guided ;; ;;;;;;;;;;;;;;;;;;;;;;;;;; %macro GATHERDD 3 ; dst, src, tmp movd %3d, %2 %if ARCH_X86_64 movd %1, [r13+%3] pextrw %3d, %2, 2 pinsrw %1, [r13+%3+2], 3 pextrw %3d, %2, 4 pinsrw %1, [r13+%3+2], 5 pextrw %3d, %2, 6 pinsrw %1, [r13+%3+2], 7 %else movd %1, [base+sgr_x_by_x-0xf03+%3] pextrw %3, %2, 2 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3 pextrw %3, %2, 4 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5 pextrw %3, %2, 6 pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7 %endif %endmacro %macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore %if ARCH_X86_64 %define tmp r14 %else %define tmp %4 %endif GATHERDD %1, %2, tmp GATHERDD %2, %3, tmp movif32 %4, %5 psrld %1, 24 psrld %2, 24 packssdw %1, %2 %endmacro %macro MULLD 3 ; dst, src, tmp pmulhuw %3, %1, %2 pmullw %1, %2 pslld %3, 16 paddd %1, %3 %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 0, 1, 2, 3, 5 %if STACK_ALIGNMENT < 16 %assign extra_stack 5*16 %else %assign extra_stack 3*16 %endif cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*0+4*6] %define stridemp dword [esp+calloff+16*0+4*7] %define leftm dword [esp+calloff+16*3+4*0] %define lpfm dword [esp+calloff+16*3+4*1] %define w0m dword [esp+calloff+16*3+4*2] %define hd dword [esp+calloff+16*3+4*3] %define edgeb byte 
[esp+calloff+16*3+4*4] %define edged dword [esp+calloff+16*3+4*4] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t0m dword [esp+calloff+4*2] %define t2m dword [esp+calloff+4*3] %define t3m dword [esp+calloff+4*4] %define t4m dword [esp+calloff+4*5] %define m8 [base+pb_1] %define m9 [esp+calloff+16*2] %define m10 [base+pd_0xf00800a4] %define m11 [base+sgr_lshuf5] %define m12 [base+pd_34816] %define m13 [base+pb_0to15] %define r10 r4 %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else DECLARE_REG_TMP 8, 7, 9, 11, 12 cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm mov edged, r7m movu m9, [paramsq] add lpfq, wq mova m8, [pb_1] lea t1, [rsp+wq*2+20] mova m10, [pd_0xf00800a4] add dstq, wq lea t3, [rsp+wq*4+400*12+16] mova m12, [pd_34816] ; (1 << 11) + (1 << 15) lea t4, [rsp+wq*2+400*20+16] pshufhw m7, m9, q0000 pshufb m9, [pw_256] ; s0 punpckhqdq m7, m7 ; w0 neg wq mova m13, [pb_0to15] pxor m6, m6 mova m11, [sgr_lshuf5] psllw m7, 4 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ movu m1, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq*2+20] add dstq, wq lea t3, [rsp+extra_stack+wq*4+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq*2+400*20+16] mov t3m, t3 pshufhw m7, m1, q0000 mov t4m, t4 pshufb m1, [base+pw_256] ; s0 punpckhqdq m7, m7 ; w0 psllw m7, 4 neg wq mova m9, m1 pxor m6, m6 mov w1m, wd sub wd, 2 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp movif32 t2m, t1 mov t2, t1 call .top_fixup add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t0m, t2 mov t0, t2 dec hd jz .height1 or edged, 16 call .h .main: add lpfq, stridemp movif32 t4, t4m call .hv call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height call .h add lpfq, stridemp call .hv movif32 dstq, dstm call .n0 call .n1 sub hd, 2 movif32 t0, t0m jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .h_top add lpfq, stridemp call .hv_bottom .end: movif32 dstq, dstm call .n0 call .n1 .end2: RET .height1: movif32 t4, t4m call .hv call .prep_n jmp .odd_height_end .odd_height: call .hv movif32 dstq, dstm call .n0 call .n1 .odd_height_end: call .v movif32 dstq, dstm call .n0 jmp .end2 .extend_bottom: call .v jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h lea t2, [t1+400*6] movif32 t2m, t2 call .top_fixup dec hd jz .no_top_height1 or edged, 16 mov t0, t1 mov t1, t2 movif32 t0m, t0 jmp .main .no_top_height1: movif32 t3, t3m movif32 t4, t4m call .v call .prep_n jmp .odd_height_end .extend_right: %assign stack_offset 
stack_offset+8 %assign calloff 8 movd m1, wd movd m3, [lpfq-1] pshufb m1, m6 pshufb m3, m6 psubb m2, m8, m1 pcmpgtb m2, m13 pand m5, m2 pandn m2, m3 por m5, m2 ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-2] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 13 jmp .h_main .h_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, m11 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m5, [lpfq+wq-1] .h_main: test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -10 jl .h_have_right call .extend_right .h_have_right: punpcklbw m4, m5, m6 punpckhbw m5, m6 palignr m2, m5, m4, 2 paddw m0, m4, m2 palignr m3, m5, m4, 6 paddw m0, m3 punpcklwd m1, m2, m3 pmaddwd m1, m1 punpckhwd m2, m3 pmaddwd m2, m2 palignr m5, m4, 8 paddw m0, m5 punpcklwd m3, m4, m5 pmaddwd m3, m3 paddd m1, m3 punpckhwd m3, m4, m5 pmaddwd m3, m3 shufps m4, m5, q2121 paddw m0, m4 ; sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m2, m3 test edgeb, 16 ; y > 0 jz .h_loop_end paddw m0, [t1+wq*2+400*0] paddd m1, [t1+wq*2+400*2] paddd m2, [t1+wq*2+400*4] .h_loop_end: paddd m1, m5 ; sumsq paddd m2, m4 mova [t1+wq*2+400*0], m0 mova [t1+wq*2+400*2], m1 mova [t1+wq*2+400*4], m2 add wq, 8 jl .h_loop ret .top_fixup: %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .top_fixup_loop: ; the sums of the first row needs to be doubled mova m0, [t1+wq*2+400*0] mova m1, [t1+wq*2+400*2] mova m2, [t1+wq*2+400*4] paddw m0, m0 paddd m1, m1 paddd m2, m2 mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m1 mova [t2+wq*2+400*4], m2 add wq, 8 jl .top_fixup_loop ret ALIGN function_align .hv: ; horizontal boxsum + vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 13 jmp .hv_main .hv_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, m11 jmp .hv_main .hv_bottom: %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv_loop_start %endif .hv_loop: movif32 lpfq, hvsrcm .hv_loop_start: movu m5, [lpfq+wq-1] .hv_main: test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv_have_right cmp wd, -10 jl .hv_have_right call .extend_right .hv_have_right: movif32 t3, hd punpcklbw m4, m5, m6 punpckhbw m5, m6 palignr m3, m5, m4, 2 paddw m0, m4, m3 palignr m1, m5, m4, 6 paddw m0, m1 punpcklwd m2, m3, m1 pmaddwd m2, m2 punpckhwd m3, m1 pmaddwd m3, m3 palignr m5, m4, 8 paddw m0, m5 punpcklwd m1, m4, m5 pmaddwd m1, m1 paddd m2, m1 punpckhwd m1, m4, m5 pmaddwd m1, m1 shufps m4, m5, q2121 paddw m0, m4 ; h sum punpcklwd m5, m4, m6 pmaddwd m5, m5 punpckhwd m4, m6 pmaddwd m4, m4 paddd m3, m1 paddd m2, m5 ; h sumsq paddd m3, m4 paddw m1, m0, [t1+wq*2+400*0] paddd m4, m2, [t1+wq*2+400*2] paddd m5, m3, [t1+wq*2+400*4] %if ARCH_X86_64 test hd, hd %else test t3, t3 %endif jz .hv_last_row .hv_main2: paddw m1, [t2+wq*2+400*0] ; hv sum paddd m4, [t2+wq*2+400*2] ; hv sumsq paddd m5, [t2+wq*2+400*4] mova [t0+wq*2+400*0], m0 pslld m0, m4, 4 mova [t0+wq*2+400*2], m2 mova [t0+wq*2+400*4], m3 pslld m2, m4, 3 paddd m4, m0 pslld m0, m5, 4 paddd m4, m2 ; a * 25 pslld m2, m5, 3 paddd m5, m0 paddd m5, m2 punpcklwd 
m0, m1, m6 ; b punpckhwd m1, m6 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m2 ; p * s MULLD m5, m9, m2 pmaddwd m0, m10 ; b * 164 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m12 mova [t4+wq*2+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*4+ 8], m0 mova [t3+wq*4+24], m1 add wq, 8 jl .hv_loop mov t2, t1 mov t1, t0 mov t0, t2 movif32 t2m, t2 movif32 t0m, t0 ret .hv_last_row: ; esoteric edge case for odd heights mova [t1+wq*2+400*0], m1 paddw m1, m0 mova [t1+wq*2+400*2], m4 paddd m4, m2 mova [t1+wq*2+400*4], m5 paddd m5, m3 jmp .hv_main2 .v: ; vertical boxsum + ab %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .v_loop: mova m0, [t1+wq*2+400*0] mova m2, [t1+wq*2+400*2] mova m3, [t1+wq*2+400*4] paddw m1, m0, [t2+wq*2+400*0] paddd m4, m2, [t2+wq*2+400*2] paddd m5, m3, [t2+wq*2+400*4] paddw m0, m0 paddd m2, m2 paddd m3, m3 paddw m1, m0 ; hv sum paddd m4, m2 ; hv sumsq pslld m0, m4, 4 paddd m5, m3 pslld m2, m4, 3 paddd m4, m0 pslld m0, m5, 4 paddd m4, m2 ; a * 25 pslld m2, m5, 3 paddd m5, m0 paddd m5, m2 punpcklwd m0, m1, m6 punpckhwd m1, m6 pmaddwd m2, m0, m0 ; b * b pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m2 ; p * s MULLD m5, m9, m2 pmaddwd m0, m10 ; b * 164 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, t2, t2m punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m2 MULLD m1, m5, m2 paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15) paddd m1, m12 mova [t4+wq*2+4], m3 psrld m0, 12 ; b psrld m1, 12 mova [t3+wq*4+ 8], m0 mova [t3+wq*4+24], m1 add wq, 8 jl .v_loop ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+ 2] movu m3, [t4+wq*2+ 4] movu m1, [t3+wq*4+ 4] movu m4, [t3+wq*4+ 8] movu m2, [t3+wq*4+20] movu m5, [t3+wq*4+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*2+ 0] paddd m4, [t3+wq*4+ 0] paddd m5, [t3+wq*4+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 mova [t4+wq*2+400*2+ 0], m0 mova [t3+wq*4+400*4+ 0], m1 mova [t3+wq*4+400*4+16], m2 add wq, 8 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*2+ 2] movu m3, [t4+wq*2+ 4] movu m1, [t3+wq*4+ 4] movu m4, [t3+wq*4+ 8] movu m2, [t3+wq*4+20] movu m5, [t3+wq*4+24] paddw m3, m0 paddd m4, m1 paddd m5, m2 paddw m3, [t4+wq*2+ 0] paddd m4, [t3+wq*4+ 0] paddd m5, [t3+wq*4+16] paddw m0, m3 psllw m3, 2 paddd m1, m4 pslld m4, 2 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a 565 paddd m1, m4 ; b 565 paddd m2, m5 paddw m3, m0, [t4+wq*2+400*2+ 0] paddd m4, m1, [t3+wq*4+400*4+ 0] paddd m5, m2, [t3+wq*4+400*4+16] mova [t4+wq*2+400*2+ 0], m0 mova [t3+wq*4+400*4+ 0], m1 mova [t3+wq*4+400*4+16], m2 movq m0, [dstq+wq] punpcklbw m0, m6 punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movq m0, 
[dstq+wq] mova m3, [t4+wq*2+400*2+ 0] mova m4, [t3+wq*4+400*4+ 0] mova m5, [t3+wq*4+400*4+16] punpcklbw m0, m6 punpcklwd m1, m0, m6 ; src punpcklwd m2, m3, m6 ; a pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 7) psubd m5, m3 psrad m4, 8 psrad m5, 8 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 4*16 %else %assign extra_stack 2*16 %endif cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*2+4*0] %define stridemp dword [esp+calloff+16*2+4*1] %define leftm dword [esp+calloff+16*2+4*2] %define lpfm dword [esp+calloff+16*2+4*3] %define w0m dword [esp+calloff+16*2+4*4] %define hd dword [esp+calloff+16*2+4*5] %define edgeb byte [esp+calloff+16*2+4*6] %define edged dword [esp+calloff+16*2+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %define m8 [base+pb_0to15] %define m9 [esp+calloff+16*1] %define m10 [base+pd_0xf00801c7] %define m11 [base+pd_34816] %define m12 m6 %define m13 [base+sgr_lshuf3] %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] mov hd, hm mov edged, r7m movq m9, [paramsq+4] add lpfq, wq lea t1, [rsp+wq*2+12] mova m8, [pb_0to15] add dstq, wq lea t3, [rsp+wq*4+400*12+8] mova m10, [pd_0xf00801c7] lea t4, [rsp+wq*2+400*32+8] mova m11, [pd_34816] pshuflw m7, m9, q3333 pshufb m9, [pw_256] ; s1 punpcklqdq m7, m7 ; w1 neg wq pxor m6, m6 mova m13, [sgr_lshuf3] psllw m7, 4 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ movq m1, [r1+4] add lpfm, wq lea t1, [rsp+extra_stack+wq*2+20] add dstq, wq lea t3, [rsp+extra_stack+wq*4+400*12+16] mov dstm, dstq lea t4, [rsp+extra_stack+wq*2+400*32+16] mov t3m, t3 pshuflw m7, m1, q3333 mov t4m, t4 pshufb m1, [base+pw_256] ; s1 punpcklqdq m7, m7 ; w1 psllw m7, 4 neg wq mova m9, m1 pxor m6, m6 mov w1m, wd sub wd, 2 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 add t1, 400*6 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hb, hb %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm 
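; (With LR_HAVE_BOTTOM set, the two rows below the block, reached through the
;  lpfm pointer saved earlier, are run through the same even/odd-row box-sum
;  + ab passes via .hv0_bottom/.hv1_bottom, after which .n0/.n1 blend the
;  343/444-weighted neighbour sums into the final output rows.)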
call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-2] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*6] .top_fixup_loop: mova m0, [t1+wq*2+400*0] mova m1, [t1+wq*2+400*2] mova m2, [t1+wq*2+400*4] mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m1 mova [t2+wq*2+400*4], m2 add wq, 8 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 movd m0, [lpfq-1] movd m1, wd mova m3, m8 pshufb m0, m6 pshufb m1, m6 mova m2, m6 psubb m2, m1 pcmpgtb m2, m3 pand m5, m2 pandn m2, m0 por m5, m2 ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-2] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 14 jmp .h_main .h_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, m13 jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m5, [lpfq+wq] .h_main: test edgeb, 2 ; LR_HAVE_RIGHT jnz .h_have_right cmp wd, -9 jl .h_have_right call .extend_right .h_have_right: punpcklbw m4, m5, m6 punpckhbw m5, m6 palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 mova [t1+wq*2+400*0], m1 mova [t1+wq*2+400*2], m2 mova [t1+wq*2+400*4], m3 add wq, 8 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab (even rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 14 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, m13 jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m5, [lpfq+wq] .hv0_main: test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv0_have_right cmp wd, -9 jl .hv0_have_right call .extend_right .hv0_have_right: punpcklbw m4, m5, m6 punpckhbw m5, m6 palignr m0, m5, m4, 2 paddw m1, m4, m0 punpcklwd m2, m4, m0 pmaddwd m2, m2 punpckhwd m3, m4, m0 pmaddwd m3, m3 palignr m5, m4, 4 paddw m1, m5 ; sum punpcklwd m4, m5, m6 pmaddwd m4, m4 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m4 ; sumsq paddd m3, m5 paddw m0, m1, [t1+wq*2+400*0] paddd m4, m2, [t1+wq*2+400*2] paddd m5, m3, [t1+wq*2+400*4] mova [t1+wq*2+400*0], m1 mova [t1+wq*2+400*2], m2 mova [t1+wq*2+400*4], m3 paddw m1, m0, [t2+wq*2+400*0] paddd m2, m4, [t2+wq*2+400*2] paddd m3, m5, [t2+wq*2+400*4] mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m4 mova [t2+wq*2+400*4], m5 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a * 9 paddd m5, m3 punpcklwd m0, m1, m6 ; b pmaddwd m2, m0, m0 ; b * b punpckhwd m1, m6 pmaddwd 
m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m12 ; p * s MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m12 MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*2+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 mova [t3+wq*4+24], m1 add wq, 8 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 14 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, m13 jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m5, [lpfq+wq] .hv1_main: test edgeb, 2 ; LR_HAVE_RIGHT jnz .hv1_have_right cmp wd, -9 jl .hv1_have_right call .extend_right .hv1_have_right: punpcklbw m4, m5, m6 punpckhbw m5, m6 palignr m1, m5, m4, 2 paddw m0, m4, m1 punpcklwd m2, m4, m1 pmaddwd m2, m2 punpckhwd m3, m4, m1 pmaddwd m3, m3 palignr m5, m4, 4 paddw m0, m5 ; h sum punpcklwd m1, m5, m6 pmaddwd m1, m1 punpckhwd m5, m6 pmaddwd m5, m5 paddd m2, m1 ; h sumsq paddd m3, m5 paddw m1, m0, [t2+wq*2+400*0] paddd m4, m2, [t2+wq*2+400*2] paddd m5, m3, [t2+wq*2+400*4] mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m2 mova [t2+wq*2+400*4], m3 pslld m2, m4, 3 pslld m3, m5, 3 paddd m4, m2 ; a * 9 paddd m5, m3 punpcklwd m0, m1, m6 ; b pmaddwd m2, m0, m0 ; b * b punpckhwd m1, m6 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m12 ; p * s MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) movif32 t3, t3m psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m12 MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*2+400*2 +4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 mova [t3+wq*4+400*4+24], m1 add wq, 8 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab (even rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .v0_loop: mova m0, [t1+wq*2+400*0] mova m4, [t1+wq*2+400*2] mova m5, [t1+wq*2+400*4] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+wq*2+400*0] paddd m2, m4, [t2+wq*2+400*2] paddd m3, m5, [t2+wq*2+400*4] mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m4 mova [t2+wq*2+400*4], m5 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a * 9 paddd m5, m3 punpcklwd m0, m1, m6 ; b pmaddwd m2, m0, m0 ; b * b punpckhwd m1, m6 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m12 ; p * s MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m12 MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*2+4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 mova 
[t3+wq*4+24], m1 add wq, 8 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .v1_loop: mova m0, [t1+wq*2+400*0] mova m4, [t1+wq*2+400*2] mova m5, [t1+wq*2+400*4] paddw m1, m0, [t2+wq*2+400*0] paddd m2, m4, [t2+wq*2+400*2] paddd m3, m5, [t2+wq*2+400*4] mova [t2+wq*2+400*0], m0 mova [t2+wq*2+400*2], m4 mova [t2+wq*2+400*4], m5 pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a * 9 paddd m5, m3 punpcklwd m0, m1, m6 ; b pmaddwd m2, m0, m0 ; b * b punpckhwd m1, m6 pmaddwd m3, m1, m1 psubd m4, m2 ; p psubd m5, m3 MULLD m4, m9, m12 ; p * s MULLD m5, m9, m12 pmaddwd m0, m10 ; b * 455 pmaddwd m1, m10 paddusw m4, m10 paddusw m5, m10 psrld m4, 20 ; min(z, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m12 MULLD m1, m5, m12 %if ARCH_X86_32 pxor m6, m6 %endif paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15) paddd m1, m11 mova [t4+wq*2+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 mova [t3+wq*4+400*4+24], m1 add wq, 8 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+400*0+ 4] movu m1, [t3+wq*4+400*0+ 8] movu m2, [t3+wq*4+400*0+24] movu m3, [t4+wq*2+400*0+ 2] movu m4, [t3+wq*4+400*0+ 4] movu m5, [t3+wq*4+400*0+20] paddw m0, [t4+wq*2+400*0+ 0] paddd m1, [t3+wq*4+400*0+ 0] paddd m2, [t3+wq*4+400*0+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[-1] 444 pslld m4, 2 ; b[-1] 444 pslld m5, 2 psubw m3, m0 ; a[-1] 343 psubd m4, m1 ; b[-1] 343 psubd m5, m2 mova [t4+wq*2+400*4], m3 mova [t3+wq*4+400*8+ 0], m4 mova [t3+wq*4+400*8+16], m5 movu m0, [t4+wq*2+400*2+ 4] movu m1, [t3+wq*4+400*4+ 8] movu m2, [t3+wq*4+400*4+24] movu m3, [t4+wq*2+400*2+ 2] movu m4, [t3+wq*4+400*4+ 4] movu m5, [t3+wq*4+400*4+20] paddw m0, [t4+wq*2+400*2+ 0] paddd m1, [t3+wq*4+400*4+ 0] paddd m2, [t3+wq*4+400*4+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a[ 0] 444 pslld m4, 2 ; b[ 0] 444 pslld m5, 2 mova [t4+wq*2+400* 6], m3 mova [t3+wq*4+400*12+ 0], m4 mova [t3+wq*4+400*12+16], m5 psubw m3, m0 ; a[ 0] 343 psubd m4, m1 ; b[ 0] 343 psubd m5, m2 mova [t4+wq*2+400* 8], m3 mova [t3+wq*4+400*16+ 0], m4 mova [t3+wq*4+400*16+16], m5 add wq, 8 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m3, [t4+wq*2+400*0+4] movu m1, [t4+wq*2+400*0+2] paddw m3, [t4+wq*2+400*0+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*2+400*4] paddw m3, [t4+wq*2+400*6] mova [t4+wq*2+400*4], m2 mova [t4+wq*2+400*6], m1 movu m4, [t3+wq*4+400*0+8] movu m1, [t3+wq*4+400*0+4] paddd m4, [t3+wq*4+400*0+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*4+400* 8+ 0] paddd m4, [t3+wq*4+400*12+ 0] mova [t3+wq*4+400* 8+ 0], m2 mova [t3+wq*4+400*12+ 0], m1 movu m5, [t3+wq*4+400*0+24] movu m1, [t3+wq*4+400*0+20] paddd m5, [t3+wq*4+400*0+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+wq*4+400* 8+16] paddd m5, [t3+wq*4+400*12+16] mova [t3+wq*4+400* 8+16], m2 mova [t3+wq*4+400*12+16], m1 movq m0, [dstq+wq] punpcklbw m0, m6 punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n0_loop add dstq, stridemp ret ALIGN 
function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*2+400*2+4] movu m1, [t4+wq*2+400*2+2] paddw m3, [t4+wq*2+400*2+0] paddw m1, m3 psllw m1, 2 ; a[ 1] 444 psubw m2, m1, m3 ; a[ 1] 343 paddw m3, m2, [t4+wq*2+400*6] paddw m3, [t4+wq*2+400*8] mova [t4+wq*2+400*6], m1 mova [t4+wq*2+400*8], m2 movu m4, [t3+wq*4+400*4+8] movu m1, [t3+wq*4+400*4+4] paddd m4, [t3+wq*4+400*4+0] paddd m1, m4 pslld m1, 2 ; b[ 1] 444 psubd m2, m1, m4 ; b[ 1] 343 paddd m4, m2, [t3+wq*4+400*12+ 0] paddd m4, [t3+wq*4+400*16+ 0] mova [t3+wq*4+400*12+ 0], m1 mova [t3+wq*4+400*16+ 0], m2 movu m5, [t3+wq*4+400*4+24] movu m1, [t3+wq*4+400*4+20] paddd m5, [t3+wq*4+400*4+16] paddd m1, m5 pslld m1, 2 psubd m2, m1, m5 paddd m5, m2, [t3+wq*4+400*12+16] paddd m5, [t3+wq*4+400*16+16] mova [t3+wq*4+400*12+16], m1 mova [t3+wq*4+400*16+16], m2 movq m0, [dstq+wq] punpcklbw m0, m6 punpcklwd m1, m0, m6 punpcklwd m2, m3, m6 pmaddwd m2, m1 ; a * src punpckhwd m1, m0, m6 punpckhwd m3, m6 pmaddwd m3, m1 psubd m4, m2 ; b - a * src + (1 << 8) psubd m5, m3 psrad m4, 9 psrad m5, 9 packssdw m4, m5 pmulhrsw m4, m7 paddw m0, m4 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %assign extra_stack 10*16 %else %assign extra_stack 8*16 %endif cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \ dst, stride, left, lpf, w %if STACK_ALIGNMENT < 16 %define dstm dword [esp+calloff+16*8+4*0] %define stridemp dword [esp+calloff+16*8+4*1] %define leftm dword [esp+calloff+16*8+4*2] %define lpfm dword [esp+calloff+16*8+4*3] %define w0m dword [esp+calloff+16*8+4*4] %define hd dword [esp+calloff+16*8+4*5] %define edgeb byte [esp+calloff+16*8+4*6] %define edged dword [esp+calloff+16*8+4*6] %define leftmp leftm %else %define w0m wm %define hd dword r5m %define edgeb byte r7m %define edged dword r7m %endif %define hvsrcm dword [esp+calloff+4*0] %define w1m dword [esp+calloff+4*1] %define t3m dword [esp+calloff+4*2] %define t4m dword [esp+calloff+4*3] %xdefine m8 m6 %define m9 [base+pd_0xffff] %define m10 [base+pd_34816] %define m11 [base+pd_0xf00801c7] %define m12 [base+pd_0xf00800a4] %define m13 [esp+calloff+16*4] %define m14 [esp+calloff+16*5] %define m15 [esp+calloff+16*6] %define m6 [esp+calloff+16*7] %define base r6-$$ %assign calloff 0 %if STACK_ALIGNMENT < 16 mov strideq, [rstk+stack_offset+ 8] mov leftq, [rstk+stack_offset+12] mov lpfq, [rstk+stack_offset+16] mov wd, [rstk+stack_offset+20] mov dstm, dstq mov stridemp, strideq mov leftm, leftq mov r1, [rstk+stack_offset+24] mov r2, [rstk+stack_offset+32] mov lpfm, lpfq mov hd, r1 mov edged, r2 %endif %else cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \ w, h, edge, params %endif %if ARCH_X86_64 || STACK_ALIGNMENT >= 16 mov wd, wm %endif %if ARCH_X86_64 mov paramsq, r6mp lea r13, [sgr_x_by_x-0xf03] movifnidn hd, hm mov edged, r7m mova m15, [paramsq] add lpfq, wq mova m9, [pd_0xffff] lea t1, [rsp+wq*2+44] mova m10, [pd_34816] add dstq, wq lea t3, [rsp+wq*4+400*24+40] mova m11, [pd_0xf00801c7] lea t4, [rsp+wq*2+400*52+40] mova m12, [base+pd_0xf00800a4] neg wq pshuflw m13, m15, q0000 pshuflw m14, m15, q2222 pshufhw m15, m15, q1010 punpcklqdq m13, m13 ; s0 punpcklqdq m14, m14 ; s1 punpckhqdq m15, m15 ; w0 w1 pxor m6, m6 psllw m15, 2 DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w %define lpfm [rsp] %else mov r1, [rstk+stack_offset+28] ; params LEA r6, $$ mova m2, [r1] add lpfm, wq lea t1, [rsp+extra_stack+wq*2+52] add 
dstq, wq lea t3, [rsp+extra_stack+wq*4+400*24+48] mov dstm, dstq lea t4, [rsp+extra_stack+wq*2+400*52+48] mov t3m, t3 mov t4m, t4 neg wq pshuflw m0, m2, q0000 pshuflw m1, m2, q2222 pshufhw m2, m2, q1010 punpcklqdq m0, m0 ; s0 punpcklqdq m1, m1 ; s1 punpckhqdq m2, m2 ; w0 w1 mov w1m, wd pxor m3, m3 psllw m2, 2 mova m13, m0 mova m14, m1 sub wd, 2 mova m15, m2 mova m6, m3 mov lpfq, lpfm mov w0m, wd %define strideq r5 %endif test edgeb, 4 ; LR_HAVE_TOP jz .no_top call .h_top add lpfq, stridemp mov t2, t1 %if ARCH_X86_64 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup %else mov wq, w0m call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_ssse3).top_fixup_loop %endif add t1, 400*12 call .h_top movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq add r10, strideq mov lpfm, r10 ; below movif32 t4, t4m call .hv0 .main: dec hd jz .height1 movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .prep_n sub hd, 2 jl .extend_bottom .main_loop: movif32 lpfq, hvsrcm add lpfq, stridemp call .hv0 %if ARCH_X86_64 test hd, hd %else mov r4, hd test r4, r4 %endif jz .odd_height movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1 call .n0 call .n1 sub hd, 2 jge .main_loop test edgeb, 8 ; LR_HAVE_BOTTOM jz .extend_bottom mov lpfq, lpfm call .hv0_bottom movif32 lpfq, hvsrcm add lpfq, stridemp call .hv1_bottom .end: call .n0 call .n1 .end2: RET .height1: call .v1 call .prep_n jmp .odd_height_end .odd_height: call .v1 call .n0 call .n1 .odd_height_end: call .v0 call .v1 call .n0 jmp .end2 .extend_bottom: call .v0 call .v1 jmp .end .no_top: movif32 strideq, stridemp lea r10, [lpfq+strideq*4] mov lpfq, dstq lea r10, [r10+strideq*2] mov lpfm, r10 call .h %if ARCH_X86_64 lea wq, [r4-2] %else mov wq, w0m mov hvsrcm, lpfq %endif lea t2, [t1+400*12] .top_fixup_loop: mova m0, [t1+wq*2+400* 0] mova m1, [t1+wq*2+400* 2] mova m2, [t1+wq*2+400* 4] paddw m0, m0 mova m3, [t1+wq*2+400* 6] paddd m1, m1 mova m4, [t1+wq*2+400* 8] paddd m2, m2 mova m5, [t1+wq*2+400*10] mova [t2+wq*2+400* 0], m0 mova [t2+wq*2+400* 2], m1 mova [t2+wq*2+400* 4], m2 mova [t2+wq*2+400* 6], m3 mova [t2+wq*2+400* 8], m4 mova [t2+wq*2+400*10], m5 add wq, 8 jl .top_fixup_loop movif32 t3, t3m movif32 t4, t4m call .v0 jmp .main .extend_right: %assign stack_offset stack_offset+8 %assign calloff 8 %if ARCH_X86_64 SWAP m8, m6 %endif movd m1, wd movd m3, [lpfq-1] pshufb m1, m8 pshufb m3, m8 psubb m2, [base+pb_1], m1 pcmpgtb m2, [base+pb_0to15] pand m5, m2 pandn m2, m3 por m5, m2 %if ARCH_X86_64 SWAP m6, m8 %endif ret %assign stack_offset stack_offset-4 %assign calloff 4 .h: ; horizontal boxsum %if ARCH_X86_64 lea wq, [r4-2] %else %define leftq r4 %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 13 jmp .h_main .h_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, [base+sgr_lshuf5] jmp .h_main .h_top: %if ARCH_X86_64 lea wq, [r4-2] %endif test edgeb, 1 ; LR_HAVE_LEFT jz .h_extend_left movif32 wq, w0m .h_loop: movu m5, [lpfq+wq-1] .h_main: test edgeb, 2 ; LR_HAVE_RIGHT %if ARCH_X86_32 pxor m8, m8 %else SWAP m8, m6 %endif jnz .h_have_right cmp wd, -10 jl .h_have_right call .extend_right .h_have_right: punpcklbw m4, m5, m8 punpckhbw m5, m8 palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; sum3 punpcklwd m7, m0, m8 pmaddwd m7, m7 punpckhwd m0, m8 pmaddwd m0, m0 %if ARCH_X86_64 SWAP m6, m8 %endif paddd m2, 
m7 ; sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 mova [t1+wq*2+400* 6], m1 mova [t1+wq*2+400* 8], m2 mova [t1+wq*2+400*10], m3 paddw m8, m1 ; sum5 paddd m7, m2 ; sumsq5 paddd m5, m3 mova [t1+wq*2+400* 0], m8 mova [t1+wq*2+400* 2], m7 mova [t1+wq*2+400* 4], m5 add wq, 8 jl .h_loop ret ALIGN function_align .hv0: ; horizontal boxsum + vertical boxsum + ab3 (even rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 13 jmp .hv0_main .hv0_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, [base+sgr_lshuf5] jmp .hv0_main .hv0_bottom: %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv0_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv0_loop_start %endif .hv0_loop: movif32 lpfq, hvsrcm .hv0_loop_start: movu m5, [lpfq+wq-1] .hv0_main: test edgeb, 2 ; LR_HAVE_RIGHT %if ARCH_X86_32 pxor m8, m8 %else SWAP m8, m6 %endif jnz .hv0_have_right cmp wd, -10 jl .hv0_have_right call .extend_right .hv0_have_right: punpcklbw m4, m5, m8 punpckhbw m5, m8 palignr m3, m5, m4, 2 palignr m0, m5, m4, 4 movif32 t3, t3m paddw m1, m3, m0 punpcklwd m2, m3, m0 pmaddwd m2, m2 punpckhwd m3, m0 pmaddwd m3, m3 palignr m0, m5, m4, 6 paddw m1, m0 ; h sum3 punpcklwd m7, m0, m8 pmaddwd m7, m7 punpckhwd m0, m8 %if ARCH_X86_64 SWAP m6, m8 %endif pmaddwd m0, m0 paddd m2, m7 ; h sumsq3 palignr m5, m4, 8 punpcklwd m7, m5, m4 paddw m8, m4, m5 pmaddwd m7, m7 punpckhwd m5, m4 pmaddwd m5, m5 paddd m3, m0 paddw m8, m1 ; h sum5 paddd m7, m2 ; h sumsq5 paddd m5, m3 mova [t3+wq*4+400*8+ 8], m8 mova [t3+wq*4+400*0+ 8], m7 mova [t3+wq*4+400*0+24], m5 paddw m8, [t1+wq*2+400* 0] paddd m7, [t1+wq*2+400* 2] paddd m5, [t1+wq*2+400* 4] mova [t1+wq*2+400* 0], m8 mova [t1+wq*2+400* 2], m7 mova [t1+wq*2+400* 4], m5 paddw m0, m1, [t1+wq*2+400* 6] paddd m4, m2, [t1+wq*2+400* 8] paddd m5, m3, [t1+wq*2+400*10] mova [t1+wq*2+400* 6], m1 mova [t1+wq*2+400* 8], m2 mova [t1+wq*2+400*10], m3 paddw m1, m0, [t2+wq*2+400* 6] paddd m2, m4, [t2+wq*2+400* 8] paddd m3, m5, [t2+wq*2+400*10] mova [t2+wq*2+400* 6], m0 mova [t2+wq*2+400* 8], m4 mova [t2+wq*2+400*10], m5 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 paddd m5, m3 punpcklwd m0, m1, m7 ; b3 pmaddwd m2, m0, m0 punpckhwd m1, m7 pmaddwd m3, m1, m1 %if ARCH_X86_64 SWAP m7, m6 %endif psubd m4, m2 ; p3 psubd m5, m3 MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*2+400*2+ 4], m3 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+400*4+ 8], m0 mova [t3+wq*4+400*4+24], m1 add wq, 8 jl .hv0_loop ret ALIGN function_align .hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 leftq, leftm movddup m4, [leftq-4] movif32 wq, w0m mova m5, [lpfq+wq+2] add leftmp, 4 palignr m5, m4, 13 jmp .hv1_main .hv1_extend_left: movif32 wq, w0m mova m5, [lpfq+wq+2] pshufb m5, [base+sgr_lshuf5] jmp .hv1_main .hv1_bottom: %if ARCH_X86_64 lea wq, [r4-2] %else mov hvsrcm, 
lpfq %endif test edgeb, 1 ; LR_HAVE_LEFT jz .hv1_extend_left movif32 wq, w0m %if ARCH_X86_32 jmp .hv1_loop_start %endif .hv1_loop: movif32 lpfq, hvsrcm .hv1_loop_start: movu m5, [lpfq+wq-1] .hv1_main: test edgeb, 2 ; LR_HAVE_RIGHT %if ARCH_X86_32 pxor m8, m8 %else SWAP m8, m6 %endif jnz .hv1_have_right cmp wd, -10 jl .hv1_have_right call .extend_right .hv1_have_right: punpcklbw m4, m5, m8 punpckhbw m5, m8 palignr m7, m5, m4, 2 palignr m3, m5, m4, 4 paddw m2, m7, m3 punpcklwd m0, m7, m3 pmaddwd m0, m0 punpckhwd m7, m3 pmaddwd m7, m7 palignr m3, m5, m4, 6 paddw m2, m3 ; h sum3 punpcklwd m1, m3, m8 pmaddwd m1, m1 punpckhwd m3, m8 %if ARCH_X86_64 SWAP m6, m8 %endif pmaddwd m3, m3 paddd m0, m1 ; h sumsq3 palignr m5, m4, 8 punpckhwd m1, m4, m5 paddw m8, m4, m5 pmaddwd m1, m1 punpcklwd m4, m5 pmaddwd m4, m4 paddd m7, m3 paddw m5, m2, [t2+wq*2+400* 6] mova [t2+wq*2+400* 6], m2 paddw m8, m2 ; h sum5 paddd m2, m0, [t2+wq*2+400* 8] paddd m3, m7, [t2+wq*2+400*10] mova [t2+wq*2+400* 8], m0 mova [t2+wq*2+400*10], m7 paddd m4, m0 ; h sumsq5 paddd m1, m7 pslld m0, m2, 3 pslld m7, m3, 3 paddd m2, m0 ; a3 * 9 paddd m3, m7 %if ARCH_X86_32 mova [esp+20], m8 pxor m8, m8 %else SWAP m8, m6 %endif punpcklwd m0, m5, m8 ; b3 pmaddwd m7, m0, m0 punpckhwd m5, m8 pmaddwd m8, m5, m5 psubd m2, m7 ; p3 psubd m3, m8 MULLD m2, m14, m8 ; p3 * s1 MULLD m3, m14, m8 pmaddwd m0, m11 ; b3 * 455 pmaddwd m5, m11 paddusw m2, m11 paddusw m3, m11 psrld m2, 20 ; min(z3, 255) movif32 t3, t3m psrld m3, 20 GATHER_X_BY_X m8, m2, m3, r0, dstm punpcklwd m2, m8, m8 punpckhwd m3, m8, m8 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m5, m10 psrld m0, 12 psrld m5, 12 mova [t4+wq*2+400*4+ 4], m8 mova [t3+wq*4+400*8+ 8], m0 mova [t3+wq*4+400*8+24], m5 %if ARCH_X86_32 mova m8, [esp+20] %else SWAP m6, m8 pxor m6, m6 %endif paddw m5, m8, [t2+wq*2+400*0] paddd m2, m4, [t2+wq*2+400*2] paddd m3, m1, [t2+wq*2+400*4] paddw m5, [t1+wq*2+400*0] paddd m2, [t1+wq*2+400*2] paddd m3, [t1+wq*2+400*4] mova [t2+wq*2+400*0], m8 pslld m0, m2, 4 mova [t2+wq*2+400*2], m4 pslld m8, m3, 4 mova [t2+wq*2+400*4], m1 pslld m4, m2, 3 paddd m2, m0 pslld m7, m3, 3 paddd m3, m8 paddd m2, m4 ; a5 * 25 paddd m3, m7 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif punpcklwd m0, m5, m7 ; b5 pmaddwd m4, m0, m0 punpckhwd m5, m7 pmaddwd m1, m5, m5 %if ARCH_X86_64 SWAP m7, m6 %endif psubd m2, m4 ; p5 psubd m3, m1 MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 pmaddwd m0, m12 ; b5 * 164 pmaddwd m5, m12 paddusw m2, m12 paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m1, m2, m3, r0, dstm punpcklwd m2, m1, m1 punpckhwd m3, m1, m1 MULLD m0, m2, m7 MULLD m5, m3, m7 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m5, m10 mova [t4+wq*2+4], m1 psrld m0, 12 psrld m5, 12 mova [t3+wq*4+ 8], m0 mova [t3+wq*4+24], m5 add wq, 8 jl .hv1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .v0: ; vertical boxsums + ab3 (even rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .v0_loop: mova m0, [t1+wq*2+400* 6] mova m4, [t1+wq*2+400* 8] mova m5, [t1+wq*2+400*10] paddw m0, m0 paddd m4, m4 paddd m5, m5 paddw m1, m0, [t2+wq*2+400* 6] paddd m2, m4, [t2+wq*2+400* 8] paddd m3, m5, [t2+wq*2+400*10] mova [t2+wq*2+400* 6], m0 mova [t2+wq*2+400* 8], m4 mova [t2+wq*2+400*10], m5 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; a3 * 9 paddd m5, m3 punpcklwd m0, m1, m7 ; b3 pmaddwd m2, m0, m0 punpckhwd m1, m7 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 psubd m5, m3 %if ARCH_X86_64 SWAP 
m7, m6 %endif MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*2+400*2+4], m3 psrld m0, 12 psrld m1, 12 mova m3, [t1+wq*2+400*0] mova m4, [t1+wq*2+400*2] mova m5, [t1+wq*2+400*4] mova [t3+wq*4+400*8+ 8], m3 mova [t3+wq*4+400*0+ 8], m4 mova [t3+wq*4+400*0+24], m5 paddw m3, m3 ; cc5 paddd m4, m4 paddd m5, m5 mova [t1+wq*2+400*0], m3 mova [t1+wq*2+400*2], m4 mova [t1+wq*2+400*4], m5 mova [t3+wq*4+400*4+ 8], m0 mova [t3+wq*4+400*4+24], m1 add wq, 8 jl .v0_loop ret .v1: ; vertical boxsums + ab (odd rows) %if ARCH_X86_64 lea wq, [r4-2] %else mov wd, w0m %endif .v1_loop: mova m4, [t1+wq*2+400* 6] mova m5, [t1+wq*2+400* 8] mova m7, [t1+wq*2+400*10] paddw m1, m4, [t2+wq*2+400* 6] paddd m2, m5, [t2+wq*2+400* 8] paddd m3, m7, [t2+wq*2+400*10] mova [t2+wq*2+400* 6], m4 mova [t2+wq*2+400* 8], m5 mova [t2+wq*2+400*10], m7 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif pslld m4, m2, 3 pslld m5, m3, 3 paddd m4, m2 ; ((a3 + 8) >> 4) * 9 paddd m5, m3 punpcklwd m0, m1, m7 ; b3 pmaddwd m2, m0, m0 punpckhwd m1, m7 pmaddwd m3, m1, m1 psubd m4, m2 ; p3 psubd m5, m3 %if ARCH_X86_64 SWAP m7, m6 %endif MULLD m4, m14, m7 ; p3 * s1 MULLD m5, m14, m7 pmaddwd m0, m11 ; b3 * 455 pmaddwd m1, m11 paddusw m4, m11 paddusw m5, m11 psrld m4, 20 ; min(z3, 255) psrld m5, 20 GATHER_X_BY_X m3, m4, m5, r0, dstm punpcklwd m4, m3, m3 punpckhwd m5, m3, m3 MULLD m0, m4, m7 MULLD m1, m5, m7 paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*2+400*4+4], m3 psrld m0, 12 psrld m8, m1, 12 mova m4, [t3+wq*4+400*8+ 8] mova m5, [t3+wq*4+400*0+ 8] mova m7, [t3+wq*4+400*0+24] paddw m1, m4, [t2+wq*2+400*0] paddd m2, m5, [t2+wq*2+400*2] paddd m3, m7, [t2+wq*2+400*4] paddw m1, [t1+wq*2+400*0] paddd m2, [t1+wq*2+400*2] paddd m3, [t1+wq*2+400*4] mova [t2+wq*2+400*0], m4 mova [t2+wq*2+400*2], m5 mova [t2+wq*2+400*4], m7 pslld m4, m2, 4 mova [t3+wq*4+400*8+ 8], m0 pslld m5, m3, 4 mova [t3+wq*4+400*8+24], m8 pslld m7, m2, 3 paddd m2, m4 pslld m8, m3, 3 paddd m3, m5 paddd m2, m7 ; a5 * 25 paddd m3, m8 %if ARCH_X86_32 pxor m7, m7 %else SWAP m7, m6 %endif punpcklwd m0, m1, m7 ; b5 pmaddwd m4, m0, m0 punpckhwd m1, m7 pmaddwd m5, m1, m1 psubd m2, m4 ; p5 psubd m3, m5 %if ARCH_X86_64 SWAP m7, m6 %endif MULLD m2, m13, m7 ; p5 * s0 MULLD m3, m13, m7 pmaddwd m0, m12 ; b5 * 164 pmaddwd m1, m12 paddusw m2, m12 paddusw m3, m12 psrld m2, 20 ; min(z5, 255) psrld m3, 20 GATHER_X_BY_X m4, m2, m3, r0, dstm punpcklwd m2, m4, m4 punpckhwd m3, m4, m4 MULLD m0, m2, m7 MULLD m1, m3, m7 paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) paddd m1, m10 mova [t4+wq*2+4], m4 psrld m0, 12 psrld m1, 12 mova [t3+wq*4+ 8], m0 mova [t3+wq*4+24], m1 add wq, 8 jl .v1_loop mov r10, t2 mov t2, t1 mov t1, r10 ret .prep_n: ; initial neighbor setup movif64 wq, r4 movif32 wd, w1m .prep_n_loop: movu m0, [t4+wq*2+400*0+ 2] movu m1, [t3+wq*4+400*0+ 4] movu m2, [t3+wq*4+400*0+20] movu m7, [t4+wq*2+400*0+ 4] movu m8, [t3+wq*4+400*0+ 8] paddw m3, m0, [t4+wq*2+400*0+ 0] paddd m4, m1, [t3+wq*4+400*0+ 0] paddd m5, m2, [t3+wq*4+400*0+16] paddw m3, m7 paddd m4, m8 movu m7, [t3+wq*4+400*0+24] paddw m0, m3 paddd m1, m4 psllw m3, 2 pslld m4, 2 paddd m5, m7 paddd m2, m5 pslld m5, 2 paddw m0, m3 ; a5 565 paddd m1, m4 ; b5 565 paddd m2, m5 mova [t4+wq*2+400* 6+ 0], m0 mova 
[t3+wq*4+400*12+ 0], m1 mova [t3+wq*4+400*12+16], m2 movu m0, [t4+wq*2+400*2+ 4] movu m1, [t3+wq*4+400*4+ 8] movu m2, [t3+wq*4+400*4+24] movu m3, [t4+wq*2+400*2+ 2] movu m4, [t3+wq*4+400*4+ 4] movu m5, [t3+wq*4+400*4+20] paddw m0, [t4+wq*2+400*2+ 0] paddd m1, [t3+wq*4+400*4+ 0] paddd m2, [t3+wq*4+400*4+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a3[-1] 444 pslld m4, 2 ; b3[-1] 444 pslld m5, 2 psubw m3, m0 ; a3[-1] 343 psubd m4, m1 ; b3[-1] 343 psubd m5, m2 mova [t4+wq*2+400* 8+ 0], m3 mova [t3+wq*4+400*16+ 0], m4 mova [t3+wq*4+400*16+16], m5 movu m0, [t4+wq*2+400*4+ 4] movu m1, [t3+wq*4+400*8+ 8] movu m2, [t3+wq*4+400*8+24] movu m3, [t4+wq*2+400*4+ 2] movu m4, [t3+wq*4+400*8+ 4] movu m5, [t3+wq*4+400*8+20] paddw m0, [t4+wq*2+400*4+ 0] paddd m1, [t3+wq*4+400*8+ 0] paddd m2, [t3+wq*4+400*8+16] paddw m3, m0 paddd m4, m1 paddd m5, m2 psllw m3, 2 ; a3[ 0] 444 pslld m4, 2 ; b3[ 0] 444 pslld m5, 2 mova [t4+wq*2+400*10+ 0], m3 mova [t3+wq*4+400*20+ 0], m4 mova [t3+wq*4+400*20+16], m5 psubw m3, m0 ; a3[ 0] 343 psubd m4, m1 ; b3[ 0] 343 psubd m5, m2 mova [t4+wq*2+400*12+ 0], m3 mova [t3+wq*4+400*24+ 0], m4 mova [t3+wq*4+400*24+16], m5 add wq, 8 jl .prep_n_loop ret ALIGN function_align .n0: ; neighbor + output (even rows) movif64 wq, r4 movif32 wd, w1m .n0_loop: movu m0, [t4+wq*2+ 4] movu m2, [t4+wq*2+ 2] paddw m0, [t4+wq*2+ 0] paddw m0, m2 paddw m2, m0 psllw m0, 2 paddw m0, m2 ; a5 movu m4, [t3+wq*4+ 8] movu m5, [t3+wq*4+24] movu m1, [t3+wq*4+ 4] movu m3, [t3+wq*4+20] paddd m4, [t3+wq*4+ 0] paddd m5, [t3+wq*4+16] paddd m4, m1 paddd m5, m3 paddd m1, m4 paddd m3, m5 pslld m4, 2 pslld m5, 2 paddd m4, m1 ; b5 paddd m5, m3 movu m2, [t4+wq*2+400* 6] paddw m2, m0 mova [t4+wq*2+400* 6], m0 paddd m0, m4, [t3+wq*4+400*12+ 0] paddd m1, m5, [t3+wq*4+400*12+16] mova [t3+wq*4+400*12+ 0], m4 mova [t3+wq*4+400*12+16], m5 mova [rsp+16+ARCH_X86_32*4], m1 movu m3, [t4+wq*2+400*2+4] movu m5, [t4+wq*2+400*2+2] paddw m3, [t4+wq*2+400*2+0] paddw m5, m3 psllw m5, 2 ; a3[ 1] 444 psubw m4, m5, m3 ; a3[ 1] 343 movu m3, [t4+wq*2+400* 8] paddw m3, [t4+wq*2+400*10] paddw m3, m4 mova [t4+wq*2+400* 8], m4 mova [t4+wq*2+400*10], m5 movu m1, [t3+wq*4+400*4+ 8] movu m5, [t3+wq*4+400*4+ 4] movu m7, [t3+wq*4+400*4+24] movu m8, [t3+wq*4+400*4+20] paddd m1, [t3+wq*4+400*4+ 0] paddd m7, [t3+wq*4+400*4+16] paddd m5, m1 paddd m8, m7 pslld m5, 2 ; b3[ 1] 444 pslld m8, 2 psubd m4, m5, m1 ; b3[ 1] 343 %if ARCH_X86_32 mova [esp+52], m8 psubd m8, m7 %else psubd m6, m8, m7 SWAP m8, m6 %endif paddd m1, m4, [t3+wq*4+400*16+ 0] paddd m7, m8, [t3+wq*4+400*16+16] paddd m1, [t3+wq*4+400*20+ 0] paddd m7, [t3+wq*4+400*20+16] mova [t3+wq*4+400*16+ 0], m4 mova [t3+wq*4+400*16+16], m8 mova [t3+wq*4+400*20+ 0], m5 %if ARCH_X86_32 mova m8, [esp+52] %else SWAP m8, m6 pxor m6, m6 %endif mova [t3+wq*4+400*20+16], m8 mova [rsp+32+ARCH_X86_32*4], m7 movq m4, [dstq+wq] punpcklbw m4, m6 punpcklwd m5, m4, m6 punpcklwd m7, m2, m6 pmaddwd m7, m5 ; a5 * src punpcklwd m8, m3, m6 pmaddwd m8, m5 ; a3 * src punpckhwd m5, m4, m6 punpckhwd m2, m6 pmaddwd m2, m5 punpckhwd m3, m6 pmaddwd m3, m5 psubd m0, m7 ; b5 - a5 * src + (1 << 8) - (src << 13) psubd m1, m8 ; b3 - a3 * src + (1 << 8) - (src << 13) psrld m0, 9 pslld m1, 7 pand m0, m9 pandn m8, m9, m1 por m0, m8 mova m1, [rsp+16+ARCH_X86_32*4] psubd m1, m2 mova m2, [rsp+32+ARCH_X86_32*4] psubd m2, m3 mova m3, [base+pd_4096] psrld m1, 9 pslld m2, 7 pand m1, m9 pandn m5, m9, m2 por m1, m5 pmaddwd m0, m15 pmaddwd m1, m15 paddd m0, m3 paddd m1, m3 psrad m0, 13 psrad m1, 13 packssdw m0, m1 paddw m0, m4 packuswb m0, m0 
movq [dstq+wq], m0 add wq, 8 jl .n0_loop add dstq, stridemp ret ALIGN function_align .n1: ; neighbor + output (odd rows) movif64 wq, r4 movif32 wd, w1m .n1_loop: movu m3, [t4+wq*2+400*4+4] movu m5, [t4+wq*2+400*4+2] paddw m3, [t4+wq*2+400*4+0] paddw m5, m3 psllw m5, 2 ; a3[ 1] 444 psubw m4, m5, m3 ; a3[ 1] 343 paddw m3, m4, [t4+wq*2+400*12] paddw m3, [t4+wq*2+400*10] mova [t4+wq*2+400*10], m5 mova [t4+wq*2+400*12], m4 movu m1, [t3+wq*4+400*8+ 8] movu m5, [t3+wq*4+400*8+ 4] movu m7, [t3+wq*4+400*8+24] movu m8, [t3+wq*4+400*8+20] paddd m1, [t3+wq*4+400*8+ 0] paddd m7, [t3+wq*4+400*8+16] paddd m5, m1 paddd m8, m7 pslld m5, 2 ; b3[ 1] 444 pslld m8, 2 psubd m4, m5, m1 ; b3[ 1] 343 psubd m0, m8, m7 paddd m1, m4, [t3+wq*4+400*24+ 0] paddd m7, m0, [t3+wq*4+400*24+16] paddd m1, [t3+wq*4+400*20+ 0] paddd m7, [t3+wq*4+400*20+16] mova [t3+wq*4+400*20+ 0], m5 mova [t3+wq*4+400*20+16], m8 mova [t3+wq*4+400*24+ 0], m4 mova [t3+wq*4+400*24+16], m0 movq m5, [dstq+wq] mova m2, [t4+wq*2+400* 6] punpcklbw m5, m6 punpcklwd m4, m5, m6 punpcklwd m8, m2, m6 pmaddwd m8, m4 ; a5 * src punpcklwd m0, m3, m6 pmaddwd m0, m4 ; a3 * src punpckhwd m4, m5, m6 punpckhwd m2, m6 pmaddwd m2, m4 punpckhwd m3, m6 pmaddwd m3, m4 psubd m1, m0 ; b3 - a3 * src + (1 << 8) - (src << 13) mova m0, [t3+wq*4+400*12+ 0] psubd m0, m8 ; b5 - a5 * src + (1 << 8) - (src << 13) mova m4, [t3+wq*4+400*12+16] psubd m4, m2 psubd m7, m3 pslld m1, 7 psrld m0, 8 psrld m4, 8 pslld m7, 7 pandn m3, m9, m1 pand m0, m9 por m0, m3 pand m4, m9 pandn m2, m9, m7 por m2, m4 mova m1, [base+pd_4096] pmaddwd m0, m15 pmaddwd m2, m15 paddd m0, m1 paddd m2, m1 psrad m0, 13 psrad m2, 13 packssdw m0, m2 paddw m0, m5 packuswb m0, m0 movq [dstq+wq], m0 add wq, 8 jl .n1_loop add dstq, stridemp movif32 dstm, dstq ret rav1e-0.7.1/src/x86/mc16_avx2.asm000064400000000000000000006007401046102023000143020ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 ; dav1d_obmc_masks[] * -512 const obmc_masks_avx2 dw 0, 0, -9728, 0, -12800, -7168, -2560, 0 dw -14336, -11264, -8192, -5632, -3584, -1536, 0, 0 dw -15360, -13824, -12288, -10752, -9216, -7680, -6144, -5120 dw -4096, -3072, -2048, -1536, 0, 0, 0, 0 dw -15872, -14848, -14336, -13312, -12288, -11776, -10752, -10240 dw -9728, -8704, -8192, -7168, -6656, -6144, -5632, -4608 dw -4096, -3584, -3072, -2560, -2048, -2048, -1536, -1024 dw 0, 0, 0, 0, 0, 0, 0, 0 deint_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 subpel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 subpel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 subpel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 subpel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 rescale_mul2: dd 0, 1, 4, 5, 2, 3, 6, 7 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 bdct_lb_q: times 8 db 0 times 8 db 4 times 8 db 8 times 8 db 12 prep_mul: dw 16, 16, 4, 4 put_bilin_h_rnd: dw 8, 8, 10, 10 put_8tap_h_rnd: dd 34, 40 s_8tap_h_rnd: dd 2, 8 s_8tap_h_sh: dd 2, 4 put_s_8tap_v_rnd: dd 512, 128 put_s_8tap_v_sh: dd 10, 8 prep_8tap_1d_rnd: dd 8 - (8192 << 4) prep_8tap_2d_rnd: dd 32 - (8192 << 5) warp8x8t_rnd: dd 16384 - (8192 << 15) warp8x8_shift: dd 5, 3 warp8x8_rnd: dw 4096, 4096, 16384, 16384 bidir_rnd: dw -16400, -16400, -16388, -16388 bidir_mul: dw 2048, 2048, 8192, 8192 %define pw_16 prep_mul %define pd_512 put_s_8tap_v_rnd pw_2: times 2 dw 2 pw_64: times 2 dw 64 pw_2048: times 2 dw 2048 pw_8192: times 2 dw 8192 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 pw_m512: times 2 dw -512 pd_32: dd 32 pd_63: dd 63 pd_64: dd 64 pd_32768: dd 32768 pd_65538: dd 65538 pd_m524256: dd -524256 ; -8192 << 6 + 32 pd_0x3ff: dd 0x3ff pq_0x40000000: dq 0x40000000 dd 0 %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_16bpc_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_16bpc_avx2.prep) BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table 
(%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter SECTION .text INIT_XMM avx2 cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx2] %if UNIX64 DECLARE_REG_TMP 8 %define org_w r8d mov r8d, wd %else DECLARE_REG_TMP 7 %define org_w wm %endif tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET INIT_YMM avx2 .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 movu m0, [srcq+32*4] movu m1, [srcq+32*5] movu m2, [srcq+32*6] movu m3, [srcq+32*7] add srcq, ssq mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 add dstq, dsq dec hd jg .put_w128 RET .h: movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add wq, r7 shr r6d, 11 vpbroadcastd m3, [r7-put_avx2+put_bilin_h_rnd+r6*4] jmp wq .h_w2: movq xm1, [srcq+ssq*0] movhps xm1, 
[srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] movq xm1, [srcq+ssq*0+2] movhps xm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xm0, xm4 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 4 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 movu xm1, [srcq+ssq*0+2] vinserti128 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+32*0] pmullw m1, m5, [srcq+32*0+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+32*1] pmullw m2, m5, [srcq+32*1+2] add srcq, ssq paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: .h_w128: movifnidn t0d, org_w .h_w64_loop0: mov r6d, t0d .h_w64_loop: pmullw m0, m4, [srcq+r6*2-32*1] pmullw m1, m5, [srcq+r6*2-32*1+2] paddw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r6*2-32*2] pmullw m2, m5, [srcq+r6*2-32*2+2] paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2-32*1], m0 mova [dstq+r6*2-32*2], m1 sub r6d, 32 jg .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64_loop0 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] shl mxyd, 11 movd xm5, mxyd add wq, r7 vpbroadcastw m5, xm5 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xm2, xm0, xm1 movd xm0, [srcq+ssq*0] punpckldq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm0, [srcq+ssq*0] .v_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xm2, xm0, xm1 movq xm0, [srcq+ssq*0] punpcklqdq xm1, xm0 psubw xm1, xm2 pmulhrsw xm1, xm5 paddw xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m0, m1, 0xf0 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m1, m0, 0xf0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w32: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] .v_w32_loop: movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m5 paddw m4, m0 movu m0, [srcq+ssq*0+32*0] mova [dstq+dsq*0+32*0], m4 psubw m4, m3, m1 pmulhrsw m4, m5 paddw m4, m1 movu m1, [srcq+ssq*0+32*1] mova [dstq+dsq*0+32*1], m4 psubw m4, m0, m2 pmulhrsw m4, m5 paddw m4, m2 mova [dstq+dsq*1+32*0], m4 psubw m4, m1, m3 pmulhrsw m4, m5 paddw m4, m3 mova [dstq+dsq*1+32*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w16: .v_w64: .v_w128: movifnidn t0d, org_w add t0d, t0d mov r4, srcq lea r6d, 
[hq+t0*8-256] mov r7, dstq .v_w16_loop0: movu m0, [srcq+ssq*0] .v_w16_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 vpbroadcastd m3, [pw_2] movd xm6, mxyd vpbroadcastd m7, [pw_8192] add wq, r7 vpbroadcastw m6, xm6 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m7, [pw_2048] .hv_12bpc: jmp wq .hv_w2: vpbroadcastq xm1, [srcq+ssq*0] pmullw xm0, xm4, xm1 psrlq xm1, 16 pmullw xm1, xm5 paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w2_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm2, [srcq+ssq*0] pmullw xm1, xm4, xm2 psrlq xm2, 16 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 _ 2 _ shufpd xm2, xm0, xm1, 0x01 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xm0, xm4, [srcq+ssq*0-8] pmullw xm1, xm5, [srcq+ssq*0-6] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 .hv_w4_loop: movq xm1, [srcq+ssq*1] movq xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] movhps xm2, [srcq+ssq*0+2] pmullw xm1, xm4 pmullw xm2, xm5 paddw xm1, xm3 paddw xm1, xm2 psrlw xm1, 2 ; 1 2 shufpd xm2, xm0, xm1, 0x01 ; 0 1 mova xm0, xm1 psubw xm1, xm2 paddw xm1, xm1 pmulhw xm1, xm6 paddw xm1, xm2 pmulhrsw xm1, xm7 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+ssq*0] pmullw xm1, xm5, [srcq+ssq*0+2] paddw xm0, xm3 paddw xm0, xm1 psrlw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 vinserti128 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if UNIX64 lea r6d, [r8*2-32] %else mov r6d, wm lea r6d, [r6*2-32] %endif mov r4, srcq lea r6d, [hq+r6*8] mov r7, dstq .hv_w16_loop0: pmullw m0, m4, [srcq+ssq*0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+ssq*1] pmullw m2, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 pmullw m0, m4, [srcq+ssq*0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 RET cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx2] %if UNIX64 DECLARE_REG_TMP 7 %define org_w r7d %else DECLARE_REG_TMP 6 %define org_w r5m %endif mov org_w, wd tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, 
mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx2+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx2+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] vpbroadcastq m1, [srcq+strideq*2] vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*2] vinserti128 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0+32*0] pmullw m1, m4, [srcq+strideq*0+32*1] pmullw m2, m4, [srcq+strideq*1+32*0] pmullw m3, m4, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+32*0] pmullw m1, m4, [srcq+32*1] pmullw m2, m4, [srcq+32*2] pmullw m3, m4, [srcq+32*3] psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 pmullw m0, m4, [srcq+32*4] pmullw m1, m4, [srcq+32*5] pmullw m2, m4, [srcq+32*6] pmullw m3, m4, [srcq+32*7] add tmpq, 32*8 add srcq, strideq psubw m0, m5 psubw m1, m5 psubw m2, m5 psubw m3, m5 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .prep_w128 RET .h: movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti128 m2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq m0, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m0, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*0+2] vinserti128 m1, [srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m3 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m1, m3 
paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .h_w16 RET .h_w32: .h_w64: .h_w128: movifnidn t0d, org_w .h_w32_loop0: mov r3d, t0d .h_w32_loop: pmullw m0, m4, [srcq+r3*2-32*1] pmullw m1, m5, [srcq+r3*2-32*1+2] psubw m0, m3 paddw m0, m1 pmullw m1, m4, [srcq+r3*2-32*2] pmullw m2, m5, [srcq+r3*2-32*2+2] psubw m1, m3 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+r3*2-32*1], m0 mova [tmpq+r3*2-32*2], m1 sub r3d, 32 jg .h_w32_loop add srcq, strideq lea tmpq, [tmpq+t0*2] dec hd jg .h_w32_loop0 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] movd xm5, mxyd vpbroadcastd m4, [pw_16] vpbroadcastw m5, xm5 vpbroadcastd m3, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m4, m5 test dword r7m, 0x800 jnz .v_12bpc psllw m4, 2 psllw m5, 2 .v_12bpc: jmp wq .v_w4: movq xm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq xm1, [srcq+strideq*1] vpblendd m2, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0xf0 ; 1 1 3 3 vpbroadcastq m0, [srcq+strideq*0] vpblendd m1, m2, 0x33 ; 0 1 2 3 vpblendd m0, m2, 0x0c ; 4 2 4 4 punpckhqdq m2, m1, m0 ; 1 2 3 4 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vbroadcasti128 m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m1, m0, m2, 0xf0 ; 0 1 vbroadcasti128 m0, [srcq+strideq*0] vpblendd m2, m0, 0xf0 ; 1 2 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+32*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .v_w32_loop0: movu m0, [srcq+strideq*0] .v_w32_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5, m2 psubw m0, m3 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m4 mova [tmpq+r7*0], m1 pmullw m1, m5, m0 psubw m2, m3 paddw m1, m2 psraw m1, 2 mova [tmpq+r7*1], m1 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .v_w32_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .v_w32_loop0 %if WIN64 POP r7 %endif RET .hv: WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd add wq, r6 lea stride3q, [strideq*3] vpbroadcastw m6, xm6 jmp wq .hv_w4: movu xm1, [srcq+strideq*0] %if WIN64 movaps [rsp+24], xmm7 %endif pmullw xm0, xm4, xm1 psrldq xm1, 2 pmullw xm1, xm5 psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vpbroadcastq m0, xm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 m1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 m2, [srcq+strideq*0], 1 punpcklqdq m7, m1, m2 psrldq m1, 2 pslldq m2, 6 pmullw m7, m4 vpblendd m1, m2, 0xcc pmullw m1, m5 psubw m7, m3 paddw m1, m7 psraw m1, 2 ; 1 2 3 4 vpblendd m0, m1, 0x3f vpermq m2, m0, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop %if WIN64 movaps xmm7, [rsp+24] %endif RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0] pmullw xm1, xm5, 
[srcq+strideq*0+2] psubw xm0, xm3 paddw xm0, xm1 psraw xm0, 2 vinserti128 m0, xm0, 1 .hv_w8_loop: movu xm1, [srcq+strideq*1] movu xm2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti128 m1, [srcq+strideq*0], 1 vinserti128 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m3 paddw m1, m2 psraw m1, 2 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: .hv_w32: .hv_w64: .hv_w128: %if WIN64 PUSH r7 %endif movifnidn r7d, org_w add r7d, r7d mov r3, srcq lea r6d, [hq+r7*8-256] mov r5, tmpq .hv_w16_loop0: pmullw m0, m4, [srcq] pmullw m1, m5, [srcq+2] psubw m0, m3 paddw m0, m1 psraw m0, 2 .hv_w16_loop: pmullw m1, m4, [srcq+strideq*1] pmullw m2, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m1, m3 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+r7*0], m2 pmullw m0, m4, [srcq+strideq*0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m3 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+r7*1], m2 lea tmpq, [tmpq+r7*2] sub hd, 2 jg .hv_w16_loop add r3, 32 add r5, 32 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .hv_w16_loop0 %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; prefix, type, type_h, type_v cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx2] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova xm2, [subpel_h_shuf2] vpbroadcastd xm3, [base+subpel_filters+mxq*8+2] pmovsxbw xm3, xm3 .h_w2_loop: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm2 pshufb xm1, xm2 pmaddwd xm0, xm3 pmaddwd xm1, xm3 phaddd xm0, xm1 paddd xm0, xm4 psrad xm0, 6 packusdw xm0, xm0 pminsw xm0, xm5 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm3, [base+subpel_filters+mxq*8] WIN64_SPILL_XMM 8 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] pshufd xm3, xm3, q2211 vpbroadcastq m2, xm3 vpermq m3, m3, q1111 .h_w4_loop: movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 pshufb m1, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m2 pmaddwd m1, m3 paddd m0, m4 paddd m0, m1 psrad m0, 6 vextracti128 xm1, m0, 1 packusdw xm0, xm1 pminsw xm0, xm5 movq [dstq+dsq*0], xm0 movhps 
[dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m5, r8m shr r7d, 11 vpbroadcastd m4, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 13 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m4 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m4 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 6 psrad m%2, 6 packusdw m%1, m%2 pminsw m%1, m5 %endmacro movu xm0, [srcq+ssq*0+ 0] vinserti128 m0, [srcq+ssq*1+ 0], 1 movu xm2, [srcq+ssq*0+16] vinserti128 m2, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] shufpd m1, m0, m2, 0x05 PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6*2-32] movu m1, [srcq+r6*2-24] movu m2, [srcq+r6*2-16] PUT_8TAP_H 0, 1, 2, 3, 12 mova [dstq+r6*2-32], m0 sub r6d, 16 jg .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m6, [pd_32] vpbroadcastw m7, r8m lea r6, [ssq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 pinsrd xm2, [srcq+r6 ], 3 ; 0 1 2 3 lea srcq, [srcq+ssq*4] movd xm3, [srcq+ssq*0] vpbroadcastd xm1, [srcq+ssq*1] vpbroadcastd xm0, [srcq+ssq*2] add srcq, r6 vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklwd xm3, xm1 ; 45 56 punpcklwd xm1, xm2, xm4 ; 01 12 punpckhwd xm2, xm4 ; 23 34 .v_w2_loop: vpbroadcastd xm4, [srcq+ssq*0] pmaddwd xm5, xm8, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm9 ; a1 b1 paddd xm5, xm6 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm10 ; a2 b2 paddd xm5, xm3 vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklwd xm3, xm4 ; 67 78 pmaddwd xm4, xm11, xm3 ; a3 b3 paddd xm5, xm4 psrad xm5, 6 packusdw xm5, xm5 pminsw xm5, xm7 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xm1, [srcq+ssq*0] vpbroadcastq m0, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+ssq*4] vpbroadcastq m3, [srcq+ssq*0] vpbroadcastq m5, [srcq+ssq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+ssq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 vpblendd m3, 
m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+ssq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 6 vextracti128 xm4, m5, 1 packusdw xm5, xm4 pminsw xm5, xm7 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*1], xm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: shl wd, 5 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+ssq*2] lea srcq, [srcq+ssq*4] vbroadcasti128 m1, [srcq+ssq*0] vbroadcasti128 m2, [srcq+ssq*1] vbroadcasti128 m3, [srcq+ssq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+ssq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packusdw m12, m13 pxor m13, m13 pavgw m12, m13 pminsw m12, m7 vpermq m12, m12, q3120 mova [dstq+dsq*0], xm12 vextracti128 [dstq+dsq*1], m12, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w8_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastw m15, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] vpbroadcastd m6, [pd_512] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m1, 8 ; sign-extend test dword r8m, 0x800 jz .hv_10bit psraw m7, 2 psllw m1, 2 .hv_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 m9, [subpel_h_shuf2] vbroadcasti128 m1, [srcq+r6 ] ; 3 3 movu xm3, [srcq+ssq*2] movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*4] vinserti128 m3, [srcq+ssq*0], 1 ; 2 4 vinserti128 m0, [srcq+ssq*1], 1 ; 0 5 vinserti128 m2, [srcq+ssq*2], 1 ; 1 6 add srcq, r6 pshufb m1, m9 pshufb m3, m9 pshufb m0, m9 pshufb m2, m9 pmaddwd m1, m7 pmaddwd m3, m7 pmaddwd m0, m7 pmaddwd m2, m7 phaddd m1, m3 phaddd m0, m2 paddd m1, m6 paddd m0, m6 psrad m1, 10 psrad m0, 10 packssdw m1, m0 ; 3 2 0 1 vextracti128 xm0, m1, 1 ; 3 4 5 6 pshufd xm2, xm1, q1301 ; 2 3 1 2 pshufd xm3, xm0, q2121 ; 4 5 4 5 punpckhwd xm1, xm2 ; 01 12 punpcklwd xm2, xm0 ; 23 34 punpckhwd xm3, xm0 ; 45 56 .hv_w2_loop: movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm4, xm9 pshufb xm5, xm9 pmaddwd xm4, xm7 pmaddwd xm5, xm7 phaddd xm4, xm5 pmaddwd xm5, xm11, xm1 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm12 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm13 ; a2 b2 paddd xm5, xm3 
paddd xm4, xm6 psrad xm4, 10 packssdw xm4, xm4 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 pmaddwd xm4, xm14, xm3 ; a3 b3 paddd xm5, xm6 paddd xm5, xm4 psrad xm5, 10 packusdw xm5, xm5 pminsw xm5, xm15 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+ssq*0] vinserti128 m1, [srcq+ssq*1], 1 ; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+ssq*2], 0 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m0, [srcq+ssq*0], 1 ; 3 4 movu xm3, [srcq+ssq*1] vinserti128 m3, [srcq+ssq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m6 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m6 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m6 paddd m4, m0 paddd m5, m6 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 10 psrld m2, 10 vperm2i128 m3, m4, m5, 0x21 pslld m4, 6 pslld m5, 6 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 6 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 10 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+ssq*0] vinserti128 m4, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m6 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m6 paddd m4, m3 psrad m4, 10 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 10 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, xm15 movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] shl wd, 5 lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 pxor m0, m0 punpcklbw m0, m2 mov r7, srcq mov r8, dstq lea wd, [hq+wq-256] test dword r8m, 0x800 jz .hv_w8_10bit psraw m0, 2 psllw xm1, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 %if WIN64 %define v_mul (rsp+stack_offset+40) ; r4m %else %define v_mul (rsp-24) ; red zone %endif mova [v_mul], xm1 .hv_w8_loop0: %macro PUT_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m10 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m10 paddd m2, m3 paddd m%3, m2 paddd m%2, m%3 psrad m%1, 10 psrad m%2, 10 packssdw m%1, m%2 %endmacro movu xm4, [srcq+r6 *1+ 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 *1+ 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 *1+16] vpbroadcastd m10, [pd_512] movu xm5, [srcq+ssq*0+ 0] vinserti128 m5, [srcq+ssq*4+ 0], 1 movu xm1, [srcq+ssq*0+16] vinserti128 m1, [srcq+ssq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PUT_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PUT_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+ssq*2+ 0] vinserti128 
m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+ssq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PUT_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+ssq*1+ 0] movu xm1, [srcq+ssq*1+16] lea srcq, [srcq+ssq*4] vinserti128 m6, [srcq+ssq*1+ 0], 1 vinserti128 m1, [srcq+ssq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PUT_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 .hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+ssq*0] vinserti128 m5, [srcq+ssq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+ssq*0+16] vinserti128 m6, [srcq+ssq*1+16], 1 vextracti128 [dstq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m6, m5 movu xm5, [srcq+ssq*0+8] vinserti128 m5, [srcq+ssq*1+8], 1 lea srcq, [srcq+ssq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 vpbroadcastd m10, [pd_512] paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [dstq] paddd m8, m10 paddd m9, m10 paddd m0, m10 paddd m5, m10 vpbroadcastd m10, [v_mul+4*3] psrad m0, 10 psrad m5, 10 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 10 psrad m9, 10 packusdw m7, m9 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop add r7, 16 add r8, 16 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w8_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w, h, mx, my %define base r7-prep_avx2 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx2] movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r6d, r7m ; bitdepth_max movzx wd, word [r7+wq*2+table_offset(prep,)] vpbroadcastd m5, [r7-prep_avx2+pw_8192] shr r6d, 11 add wq, r7 vpbroadcastd m4, [base+prep_mul+r6*4] lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xm0, [base+subpel_filters+mxq*8] vbroadcasti128 m3, [subpel_h_shufA] vbroadcasti128 m4, [subpel_h_shufB] WIN64_SPILL_XMM 8 pshufd xm0, xm0, q2211 test dword r7m, 0x800 jnz .h_w4_12bpc psllw xm0, 2 .h_w4_12bpc: vpbroadcastq m6, xm0 vpermq m7, m0, q1111 .h_w4_loop: movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, 
[srcq+strideq*1] vinserti128 m2, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4) lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 sub srcq, 6 vpbroadcastq m0, [base+subpel_filters+mxq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 vbroadcasti128 m6, [subpel_h_shufA] vbroadcasti128 m7, [subpel_h_shufB] punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .h_12bpc psllw m0, 2 .h_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 8 jg .h_w16 .h_w8: %macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2] pshufb m%4, m%1, m7 ; 2 3 3 4 4 5 5 6 pshufb m%1, m6 ; 0 1 1 2 2 3 3 4 pmaddwd m%5, m9, m%4 ; abcd1 pmaddwd m%1, m8 ; abcd0 pshufb m%2, m7 ; 6 7 7 8 8 9 9 a shufpd m%4, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m%5, m5 paddd m%1, m%5 pmaddwd m%5, m11, m%2 ; abcd3 paddd m%1, m%5 pmaddwd m%5, m10, m%4 ; abcd2 pshufb m%3, m7 ; a b b c c d d e pmaddwd m%4, m8 ; efgh0 paddd m%1, m%5 pmaddwd m%5, m9, m%2 ; efgh1 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m11 ; efgh3 pmaddwd m%2, m10 ; efgh2 paddd m%4, m5 paddd m%4, m%5 paddd m%3, m%4 paddd m%2, m%3 psrad m%1, 4 psrad m%2, 4 packssdw m%1, m%2 %endmacro movu xm0, [srcq+strideq*0+ 0] vinserti128 m0, [srcq+strideq*1+ 0], 1 movu xm2, [srcq+strideq*0+16] vinserti128 m2, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] shufpd m1, m0, m2, 0x05 PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: add wd, wd .h_w16_loop0: mov r6d, wd .h_w16_loop: movu m0, [srcq+r6-32] movu m1, [srcq+r6-24] movu m2, [srcq+r6-16] PREP_8TAP_H 0, 1, 2, 3, 4 mova [tmpq+r6-32], m0 sub r6d, 32 jg .h_w16_loop add srcq, strideq add tmpq, wq dec hd jg .h_w16_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m0, [base+subpel_filters+myq*8] %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 15 vpbroadcastd m7, [prep_8tap_1d_rnd] lea r6, [strideq*3] sub srcq, r6 punpcklbw m0, m0 psraw m0, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m0, 2 .v_12bpc: pshufd m8, m0, q0000 pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 cmp wd, 4 jg .v_w8 .v_w4: movq xm1, [srcq+strideq*0] vpbroadcastq m0, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m4, [srcq+r6 ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] vpblendd m1, m0, 0x30 vpblendd m0, m2, 0x30 punpcklwd m1, m0 ; 01 12 vpbroadcastq m0, [srcq+strideq*2] add srcq, r6 vpblendd m2, m4, 0x30 vpblendd m4, m3, 0x30 punpcklwd m2, m4 ; 23 34 vpblendd m3, m5, 0x30 vpblendd m5, m0, 0x30 punpcklwd m3, m5 ; 45 56 .v_w4_loop: vpbroadcastq m4, [srcq+strideq*0] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m7 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m4, m0, 0x30 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 4 vextracti128 xm4, m5, 1 packssdw xm5, xm4 mova [tmpq], xm5 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: %if WIN64 push r8 %endif mov 
r8d, wd shl wd, 5 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] .v_w8_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] vbroadcasti128 m0, [srcq+r6 ] vbroadcasti128 m6, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vbroadcasti128 m1, [srcq+strideq*0] vbroadcasti128 m2, [srcq+strideq*1] vbroadcasti128 m3, [srcq+strideq*2] add srcq, r6 shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklwd m3, m6, m0 ; 23 punpckhwd m6, m0 ; 56 .v_w8_loop: vbroadcasti128 m14, [srcq+strideq*0] pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m7 paddd m13, m7 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 vbroadcasti128 m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddd m13, m6 shufpd m6, m0, m14, 0x0d shufpd m0, m14, m5, 0x0c punpcklwd m5, m6, m0 ; 67 punpckhwd m6, m0 ; 78 pmaddwd m14, m11, m5 ; a3 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 4 psrad m13, 4 packssdw m12, m13 vpermq m12, m12, q3120 mova [tmpq+r8*0], xm12 vextracti128 [tmpq+r8*2], m12, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .v_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .v_w8_loop0 %if WIN64 pop r8 %endif RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 vpbroadcastd m15, [prep_8tap_2d_rnd] cmp wd, 4 jg .hv_w8 movzx mxd, mxb vpbroadcastd m0, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd vpbroadcastq m1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 sub srcq, r6 pxor m7, m7 punpcklbw m7, m0 punpcklbw m1, m1 psraw m7, 4 psraw m1, 8 test dword r7m, 0x800 jz .hv_w4_10bit psraw m7, 2 .hv_w4_10bit: pshufd m11, m1, q0000 pshufd m12, m1, q1111 pshufd m13, m1, q2222 pshufd m14, m1, q3333 .hv_w4: vbroadcasti128 m9, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] pshufd m8, m7, q1111 pshufd m7, m7, q0000 movu xm1, [srcq+strideq*0] vinserti128 m1, [srcq+strideq*1], 1 ; 0 1 vbroadcasti128 m0, [srcq+r6 ] vinserti128 m2, m0, [srcq+strideq*2], 0 ; 2 3 lea srcq, [srcq+strideq*4] vinserti128 m0, [srcq+strideq*0], 1 ; 3 4 movu xm3, [srcq+strideq*1] vinserti128 m3, [srcq+strideq*2], 1 ; 5 6 add srcq, r6 pshufb m4, m1, m9 pshufb m1, m10 pmaddwd m4, m7 pmaddwd m1, m8 pshufb m5, m2, m9 pshufb m2, m10 pmaddwd m5, m7 pmaddwd m2, m8 paddd m4, m15 paddd m1, m4 pshufb m4, m0, m9 pshufb m0, m10 pmaddwd m4, m7 pmaddwd m0, m8 paddd m5, m15 paddd m2, m5 pshufb m5, m3, m9 pshufb m3, m10 pmaddwd m5, m7 pmaddwd m3, m8 paddd m4, m15 paddd m4, m0 paddd m5, m15 paddd m5, m3 vperm2i128 m0, m1, m2, 0x21 psrld m1, 6 psrld m2, 6 vperm2i128 m3, m4, m5, 0x21 pslld m4, 10 pslld m5, 10 pblendw m2, m4, 0xaa ; 23 34 pslld m0, 10 pblendw m1, m0, 0xaa ; 01 12 psrld m3, 6 pblendw m3, m5, 0xaa ; 45 56 psrad m0, m5, 16 .hv_w4_loop: movu xm4, [srcq+strideq*0] vinserti128 m4, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m15 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 pshufb m3, m4, m9 pshufb m4, m10 pmaddwd m3, m7 pmaddwd m4, m8 paddd m3, m15 paddd m4, m3 psrad m4, 6 packssdw m0, m4 ; _ 7 6 8 vpermq m3, m0, q1122 ; _ 6 _ 7 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m4, m5 psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], 
xm4 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 vpbroadcastq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmovle myd, mxd pmovsxbw xm1, [base+subpel_filters+myq*8] %if WIN64 PUSH r8 %endif mov r8d, wd shl wd, 5 lea r6, [strideq*3] sub srcq, 6 sub srcq, r6 mov r5, srcq mov r7, tmpq lea wd, [hq+wq-256] pxor m0, m0 punpcklbw m0, m2 mova [v_mul], xm1 psraw m0, 4 test dword r7m, 0x800 jz .hv_w8_10bit psraw m0, 2 .hv_w8_10bit: pshufd m11, m0, q0000 pshufd m12, m0, q1111 pshufd m13, m0, q2222 pshufd m14, m0, q3333 .hv_w8_loop0: %macro PREP_8TAP_HV_H 3 ; dst/src+0, src+8, src+16 pshufb m2, m%1, m9 ; 2 3 3 4 4 5 5 6 pshufb m%1, m8 ; 0 1 1 2 2 3 3 4 pmaddwd m3, m12, m2 pmaddwd m%1, m11 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a shufpd m2, m%2, 0x05 ; 4 5 5 6 6 7 7 8 paddd m3, m15 paddd m%1, m3 pmaddwd m3, m14, m%2 paddd m%1, m3 pmaddwd m3, m13, m2 pshufb m%3, m9 ; a b b c c d d e pmaddwd m2, m11 paddd m%1, m3 pmaddwd m3, m12, m%2 shufpd m%2, m%3, 0x05 ; 8 9 9 a a b b c pmaddwd m%3, m14 pmaddwd m%2, m13 paddd m2, m15 paddd m2, m3 paddd m2, m%3 paddd m2, m%2 psrad m%1, 6 psrad m2, 6 packssdw m%1, m2 %endmacro movu xm4, [srcq+r6 + 0] vbroadcasti128 m8, [subpel_h_shufA] movu xm6, [srcq+r6 + 8] vbroadcasti128 m9, [subpel_h_shufB] movu xm0, [srcq+r6 +16] movu xm5, [srcq+strideq*0+ 0] vinserti128 m5, [srcq+strideq*4+ 0], 1 movu xm1, [srcq+strideq*0+16] vinserti128 m1, [srcq+strideq*4+16], 1 shufpd m7, m5, m1, 0x05 INIT_XMM avx2 PREP_8TAP_HV_H 4, 6, 0 ; 3 INIT_YMM avx2 PREP_8TAP_HV_H 5, 7, 1 ; 0 4 movu xm0, [srcq+strideq*2+ 0] vinserti128 m0, [srcq+r6 *2+ 0], 1 movu xm1, [srcq+strideq*2+16] vinserti128 m1, [srcq+r6 *2+16], 1 shufpd m7, m0, m1, 0x05 PREP_8TAP_HV_H 0, 7, 1 ; 2 6 movu xm6, [srcq+strideq*1+ 0] movu xm1, [srcq+strideq*1+16] lea srcq, [srcq+strideq*4] vinserti128 m6, [srcq+strideq*1+ 0], 1 vinserti128 m1, [srcq+strideq*1+16], 1 add srcq, r6 shufpd m7, m6, m1, 0x05 PREP_8TAP_HV_H 6, 7, 1 ; 1 5 vpermq m4, m4, q1100 vpermq m5, m5, q3120 vpermq m6, m6, q3120 vpermq m7, m0, q3120 punpcklwd m3, m7, m4 ; 23 punpckhwd m4, m5 ; 34 punpcklwd m1, m5, m6 ; 01 punpckhwd m5, m6 ; 45 punpcklwd m2, m6, m7 ; 12 punpckhwd m6, m7 ; 56 .hv_w8_loop: vpbroadcastd m9, [v_mul+4*0] vpbroadcastd m7, [v_mul+4*1] vpbroadcastd m10, [v_mul+4*2] pmaddwd m8, m9, m1 ; a0 pmaddwd m9, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m7 ; a1 pmaddwd m4, m7 ; b1 paddd m8, m15 paddd m9, m15 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m8, m5 paddd m9, m6 movu xm5, [srcq+strideq*0] vinserti128 m5, [srcq+strideq*1], 1 vbroadcasti128 m7, [subpel_h_shufA] vbroadcasti128 m10, [subpel_h_shufB] movu xm6, [srcq+strideq*0+16] vinserti128 m6, [srcq+strideq*1+16], 1 vextracti128 [tmpq], m0, 1 pshufb m0, m5, m7 ; 01 pshufb m5, m10 ; 23 pmaddwd m0, m11 pmaddwd m5, m12 paddd m0, m15 paddd m0, m5 pshufb m5, m6, m7 ; 89 pshufb m6, m10 ; ab pmaddwd m5, m13 pmaddwd m6, m14 paddd m5, m15 paddd m6, m5 movu xm5, [srcq+strideq*0+8] vinserti128 m5, [srcq+strideq*1+8], 1 lea srcq, [srcq+strideq*2] pshufb m7, m5, m7 pshufb m5, m10 pmaddwd m10, m13, m7 pmaddwd m7, m11 paddd m0, m10 paddd m6, m7 pmaddwd m7, m14, m5 pmaddwd m5, m12 paddd m0, m7 paddd m5, m6 vbroadcasti128 m6, [tmpq] vpbroadcastd m10, [v_mul+4*3] psrad m0, 6 psrad m5, 6 packssdw m0, m5 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m10, m5 ; a3 pmaddwd m10, m6 ; b3 paddd m7, m8 paddd m9, m10 psrad m7, 6 psrad m9, 6 packssdw m7, m9 vpermq m7, m7, q3120 
mova [tmpq+r8*0], xm7 vextracti128 [tmpq+r8*2], m7, 1 lea tmpq, [tmpq+r8*4] sub hd, 2 jg .hv_w8_loop add r5, 16 add r7, 16 movzx hd, wb mov srcq, r5 mov tmpq, r7 sub wd, 1<<8 jg .hv_w8_loop0 %if WIN64 POP r8 %endif RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8-9 0 ; dst, tmp[0-6], load_hrnd movu xm%1, [srcq+ r4*2] movu xm%2, [srcq+ r6*2] movu xm%3, [srcq+ r7*2] movu xm%4, [srcq+ r9*2] vinserti128 m%1, [srcq+r10*2], 1 vinserti128 m%2, [srcq+r11*2], 1 vinserti128 m%3, [srcq+r13*2], 1 vinserti128 m%4, [srcq+ rX*2], 1 add srcq, ssq movu xm%5, [srcq+ r4*2] movu xm%6, [srcq+ r6*2] movu xm%7, [srcq+ r7*2] movu xm%8, [srcq+ r9*2] vinserti128 m%5, [srcq+r10*2], 1 vinserti128 m%6, [srcq+r11*2], 1 vinserti128 m%7, [srcq+r13*2], 1 vinserti128 m%8, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m%1, m12 pmaddwd m%2, m13 pmaddwd m%3, m14 pmaddwd m%4, m15 pmaddwd m%5, m12 pmaddwd m%6, m13 pmaddwd m%7, m14 pmaddwd m%8, m15 phaddd m%1, m%2 %if %9 mova m10, [rsp+0x00] %endif phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, m10 paddd m%5, m10 psrad m%1, xm11 psrad m%5, xm11 packssdw m%1, m%5 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 cglobal put_8tap_scaled_16bpc, 4, 14, 16, 0xe0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine base_reg r12 mov r7d, pxmaxm %else %assign isput 0 %assign isprep 1 cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [rsp+0xd0] %xdefine base_reg r11 %endif lea base_reg, [%1_8tap_scaled_16bpc_avx2] %define base base_reg-%1_8tap_scaled_16bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm10, mxd vpbroadcastd m10, xm10 mov r5d, t0d DECLARE_REG_TMP 5, 7 mov r6d, pxmaxm %else vpbroadcastd m10, mxm %if isput vpbroadcastw m11, pxmaxm %else mov r6d, pxmaxm %endif %endif mov dyd, dym %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+0x98] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x98] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif shr r7d, 11 vpbroadcastd m6, [base+pd_0x3ff] vpbroadcastd m12, [base+s_8tap_h_rnd+r7*4] movd xm7, [base+s_8tap_h_sh+r7*4] %if isput vpbroadcastd m13, [base+put_s_8tap_v_rnd+r7*4] pinsrd xm7, [base+put_s_8tap_v_sh+r7*4], 2 %else vpbroadcastd m13, [base+pd_m524256] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg 
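; Generic (arbitrary dy) scaled path: wq was tzcnt'd from the block width and
; now indexes the per-width jump table, so the jmp below dispatches to a loop
; specialized for that width; dy == 1024 / 2048 (exactly one or two source
; rows per output row, positions being in 1/1024-pel units here) were peeled
; off above into the cheaper .dy1 / .dy2 variants.
; Hedged C-style sketch of the addressing these loops implement (names are
; illustrative; the fraction/index split is inferred from the "& 0x3ff",
; ">> 6" and ">> 10" operations used throughout):
;   for (int y = 0; y < h; y++) {
;       const int8_t *fv = subpel_filters[type_v + ((my & 0x3ff) >> 6)];
;       for (int x = 0; x < w; x++) {
;           int pos = mx + x * dx;
;           const int8_t *fh = subpel_filters[type_h + ((pos & 0x3ff) >> 6)];
;           dst[y][x] = filter8_v(filter8_h(src, my >> 10, pos >> 10, fh), fv);
;       }
;       my += dy;
;   }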
jmp wq %if isput .w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0,1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 ; 0 4 vinserti128 m1, [srcq+ssq*1], 1 ; 1 5 vinserti128 m2, [srcq+ssq*2], 1 ; 2 6 vinserti128 m3, [srcq+ss3q ], 1 ; 3 7 lea srcq, [srcq+ssq*4] REPX {pshufb x, m10}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm14, r6q pmovsxbw xm14, xm14 pshufd xm8, xm14, q0000 pshufd xm9, xm14, q1111 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pshufd xm8, xm14, q2222 pshufd xm14, xm14, q3333 paddd xm5, xm6 pmaddwd xm6, xm2, xm8 pmaddwd xm8, xm4, xm14 psrldq xm9, xm7, 8 paddd xm5, xm6 paddd xm5, xm13 paddd xm5, xm8 psrad xm5, xm9 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq], xm5 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movu xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm10 pmaddwd xm5, xm15 phaddd xm5, xm5 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movu xm6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm10 pshufb xm6, xm10 pmaddwd xm5, xm15 pmaddwd xm6, xm15 phaddd xm5, xm6 paddd xm5, xm12 psrad xm5, xm7 packssdw xm5, xm5 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym mova [rsp+0x00], m12 %if isput mova [rsp+0x20], xm13 %else SWAP m11, m13 %endif mova [rsp+0x30], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m0, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 
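; pblendvb substitutes the pq_0x40000000-derived constant -- effectively a
; lone "64" tap, i.e. copy the centre sample -- for every column whose 4-bit
; subpel phase ((pos & 0x3ff) >> 6) is zero, using the pcmpeqd masks built
; above, so phase-0 columns are passed through rather than filtered.
; Roughly, in C terms:  coef = phase ? subpel_filters[type + phase] : unit;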
pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] movu xm1, [srcq+r4 ] movu xm3, [srcq+r6 ] movu xm2, [srcq+r11 ] movu xm4, [srcq+r13 ] lea srcq, [srcq+ssq*4] vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 vinserti128 m1, [srcq+r4 ], 1 vinserti128 m3, [srcq+r6 ], 1 vinserti128 m2, [srcq+r11 ], 1 vinserti128 m4, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m0 paddb m13, m0 REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m14}, m7, m9, m8, m10 REPX {pshufb x, m13}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x30] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 4 5 packssdw m8, m10 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm10, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm10 ; 67 mova [rsp+0x40], xm7 mova [rsp+0x50], xm8 mova [rsp+0x60], xm9 .w4_loop: and myd, 0x3ff mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq xm9, r11q pmovsxbw xm9, xm9 pshufd xm7, xm9, q0000 pshufd xm8, xm9, q1111 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pshufd xm7, xm9, q2222 pshufd xm9, xm9, q3333 pmaddwd xm6, xm2, xm7 pmaddwd xm8, xm3, xm9 %if isput mova xm7, [rsp+0x20] movd xm9, [rsp+0x38] %else SWAP m7, m11 %endif paddd xm4, xm5 paddd xm6, xm8 paddd xm4, xm6 paddd xm4, xm7 %if isput psrad xm4, xm9 packusdw xm4, xm4 pminuw xm4, xm11 movq [dstq], xm4 add dstq, dsq %else SWAP m11, m7 psrad xm4, 6 packssdw xm4, xm4 movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop mova xm8, [rsp+0x00] movd xm9, [rsp+0x30] movu xm4, [srcq] movu xm5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x40] mova [rsp+0x40], xm1 mova xm1, [rsp+0x50] mova [rsp+0x50], xm2 mova xm2, [rsp+0x60] mova [rsp+0x60], xm3 pshufb xm4, xm12 pshufb xm5, xm13 pmaddwd xm4, xm14 pmaddwd xm5, xm15 phaddd xm4, xm5 paddd xm4, xm8 psrad xm4, xm9 packssdw xm4, xm4 punpcklwd xm3, xm10, xm4 mova xm10, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm6, [srcq+ssq*1] movu xm7, [srcq+r6] movu m0, [rsp+0x50] pshufb xm4, xm12 pshufb xm6, xm12 pshufb xm5, xm13 pshufb xm7, xm13 pmaddwd xm4, xm14 pmaddwd xm6, xm14 pmaddwd xm5, xm15 pmaddwd xm7, xm15 mova [rsp+0x40], m0 phaddd xm4, xm5 phaddd xm6, xm7 paddd xm4, xm8 paddd xm6, xm8 psrad xm4, xm9 psrad xm6, xm9 packssdw xm4, xm6 punpcklwd xm9, xm10, xm4 mova [rsp+0x60], xm9 psrldq xm10, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm10 lea srcq, [srcq+ssq*2] jmp .w4_loop SWAP m10, m13 %if isprep SWAP m13, m11 %endif .w8: mov dword [rsp+0x80], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x80], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x80], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x80], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x80], 16 movifprep tmp_stridem, 256 .w_start: SWAP 
m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free %if isput movifnidn dsm, dsq mova [rsp+0xb0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 shr t0d, 16 sub srcq, 6 pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0x84], t0d mov [rsp+0x88], srcq mov [rsp+0x90], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+0x80] jz .ret add qword [rsp+0x90], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x40] vpbroadcastd m15, [rsp+0x84] pxor m9, m9 mov srcq, [rsp+0x88] mov r0q, [rsp+0x90] ; dstq / tmpq .hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x40], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq [rsp+0xa0], xm1 movq [rsp+0xa8], xm7 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x60], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x60] vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, mym mov dyd, dym pshufb m0, m9 ; 01a 01b pshufb m1, m9 ; 23a 23b pshufb m2, m9 ; 45a 45b pshufb m3, m9 ; 67a 67b .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm9, r6q punpcklqdq xm9, xm9 pmovsxbw m9, xm9 pshufd m8, m9, q0000 pshufd m7, m9, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m7 pshufd m8, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m9 %if isput psrldq xm8, xm11, 8 %endif paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, xm8 vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xb0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x60], myd mov r4d, [rsp+0xa0] mov r6d, [rsp+0xa4] mov r7d, [rsp+0xa8] mov r9d, [rsp+0xac] jz .skip_line vbroadcasti128 m9, [base+wswap] movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq mov myd, [rsp+0x60] mov dyd, dym pshufb m0, m9 pshufb m1, m9 pshufb m2, m9 pshufb m3, m9 pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 
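; The phaddd/paddd/psrad steps below finish the horizontal 8-tap reduction for
; the source row fetched just above; pslld m4,16 plus the pblendw chain then
; rotate that row into the m0..m3 line-pair ring (01/23/45/67), so the next
; .vloop iteration sees everything shifted down by one source row.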
phaddd m6, m7 phaddd m4, m6 paddd m4, m10 psrad m4, xm11 pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 MC_8TAP_SCALED_H 3, 10, 4, 5, 6, 7, 8, 9, 1 vbroadcasti128 m9, [base+subpel_s_shuf8] mov myd, [rsp+0x60] mov dyd, dym pshufb m3, m9 jmp .vloop SWAP m1, m12, m10 SWAP m7, m11 .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] movu xm2, [srcq+ssq*2] movu xm3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*1], 1 vinserti128 m2, [srcq+ssq*2], 1 add srcq, ss3q movq xm6, r4q pmovsxbw xm6, xm6 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 REPX {pshufb x, m10}, m0, m1, m2 pshufb xm3, xm10 REPX {pmaddwd x, m15}, m0, m1, m2 pmaddwd xm3, xm15 phaddd m0, m1 phaddd m2, m3 paddd m0, m12 paddd m2, m12 psrad m0, xm7 psrad m2, xm7 packssdw m0, m2 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movu xm1, [srcq+ssq*0] movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm1, xm10 pshufb xm5, xm10 pmaddwd xm1, xm15 pmaddwd xm5, xm15 phaddd xm1, xm5 pmaddwd xm5, xm3, xm8 mova xm3, xm0 pmaddwd xm0, xm9 paddd xm1, xm12 psrad xm1, xm7 packssdw xm1, xm1 paddd xm5, xm0 mova xm0, xm2 pmaddwd xm2, xm14 paddd xm5, xm2 palignr xm2, xm1, xm4, 12 punpcklwd xm2, xm1 ; 67 78 pmaddwd xm4, xm2, xm6 paddd xm5, xm13 paddd xm5, xm4 mova xm4, xm1 psrldq xm1, xm7, 8 psrad xm5, xm1 packusdw xm5, xm5 pminsw xm5, xm11 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] pcmpeqd m6, m9 punpckldq m10, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 pblendvb m14, m2, m10 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*2] lea r11, [r4+ssq*1] 
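; r4 (pextrd from xm13 above) appears to hold the byte offset of the upper
; column pair's integer source position; r6/r11/r13 add two, one and three
; source rows to it, mirroring the srcq+ssq*N addresses, so the loads that
; follow fetch the same set of rows for both halves of this 4-wide block.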
lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*2] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*1], 1 ; 0 1 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] movu xm3, [srcq+ssq*2] ; 6 _ movu xm10, [srcq+r6 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r11 ], 1 lea srcq, [srcq+ss3q ] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pmaddwd m0, m14 pmaddwd m1, m14 pshufb m7, m13 pshufb m8, m13 pmaddwd m7, m15 pmaddwd m8, m15 pshufb m2, m12 pshufb xm3, xm12 pmaddwd m2, m14 pmaddwd xm3, xm14 pshufb m9, m13 pshufb xm10, xm13 pmaddwd m9, m15 pmaddwd xm10, xm15 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 phaddd xm3, xm10 paddd m0, m5 paddd m1, m5 paddd m2, m5 paddd xm3, xm5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 psrad xm3, xm6 vperm2i128 m4, m0, m1, 0x21 ; 1 2 vperm2i128 m5, m1, m2, 0x21 ; 3 4 vperm2i128 m6, m2, m3, 0x21 ; 5 6 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pslld m4, 16 pslld m5, 16 pslld m6, 16 pblendw m0, m4, 0xaa ; 01 12 pblendw m1, m5, 0xaa ; 23 34 pblendw m2, m6, 0xaa ; 45 56 movq xm10, r13q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 .dy1_w4_loop: movu xm11, [srcq+ssq*0] movu xm6, [srcq+r4 ] vinserti128 m11, [srcq+ssq*1], 1 vinserti128 m6, [srcq+r11 ], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufb m11, m12 pshufb m6, m13 pmaddwd m11, m14 pmaddwd m6, m15 paddd m4, [rsp+0x20] phaddd m11, m6 pmaddwd m6, m2, m9 paddd m11, [rsp+0x00] psrad m11, [rsp+0x40] mova m0, m1 mova m1, m2 paddd m5, m6 paddd m4, m5 vinserti128 m2, m3, xm11, 1 pslld m3, m11, 16 pblendw m2, m3, 0xaa ; 67 78 pmaddwd m5, m2, m10 vextracti128 xm3, m11, 1 paddd m4, m5 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy1_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy1_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput %define dsm [rsp+0xb8] movifnidn dsm, dsq mova [rsp+0xc0], xm7 %else %if UNIX64 %define hm [rsp+0xb8] %endif %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd 
m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy1_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep vbroadcasti128 m7, [base+wswap] pshufb m0, m7 pshufb m1, m7 pshufb m2, m7 pshufb m3, m7 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 phaddd m4, m6 paddd m4, [rsp+0x00] psrad m4, [rsp+0x40] pslld m4, 16 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop SWAP m1, m12, m10 SWAP m7, m11 .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: mov myd, mym movzx t0d, t0b sub srcq, 2 movd xm15, t0d punpckldq m8, m9, m8 paddd m10, m8 ; mx+dx*[0-1] vpbroadcastd xm14, [base+pq_0x40000000+2] vpbroadcastd xm15, xm15 pand xm8, xm10, xm6 psrld xm8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_q] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd xm4, [base+subpel_filters+r6*8+2] pcmpeqd xm8, xm9 psrld m10, 10 paddd m10, m10 movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*2] movu xm2, [srcq+ssq*4] pshufb m10, m5 paddb m10, m6 vpblendd xm15, xm4, 0xa pblendvb xm15, xm14, xm8 pmovsxbw m15, xm15 vinserti128 m0, [srcq+ssq*1], 
1 ; 0 1 vinserti128 m1, [srcq+ss3q ], 1 ; 2 3 lea srcq, [srcq+ssq*4] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m0, m10 pshufb m1, m10 pshufb m2, m10 pmaddwd m0, m15 pmaddwd m1, m15 pmaddwd m2, m15 movq xm6, r4q pmovsxbw xm6, xm6 phaddd m0, m1 phaddd m1, m2 paddd m0, m12 paddd m1, m12 psrad m0, xm7 psrad m1, xm7 packssdw m0, m1 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m0, 1 pshufd xm8, xm6, q0000 pshufd xm9, xm6, q1111 pshufd xm14, xm6, q2222 pshufd xm6, xm6, q3333 punpcklwd xm2, xm0, xm1 ; 01 23 punpckhwd xm1, xm0, xm1 ; 23 45 .dy2_w2_loop: movu xm3, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m3, [srcq+ssq*1], 1 ; 6 7 vinserti128 m5, [srcq+ss3q ], 1 ; 8 9 lea srcq, [srcq+ssq*4] pmaddwd xm4, xm2, xm8 pmaddwd xm1, xm9 pshufb m3, m10 pshufb m5, m10 pmaddwd m3, m15 pmaddwd m5, m15 phaddd m3, m5 paddd xm4, xm1 paddd m3, m12 psrad m3, xm7 packssdw m3, m3 pshufd m3, m3, q2100 palignr m0, m3, m0, 12 ; 4 6 6 8 5 7 7 9 vextracti128 xm1, m0, 1 punpcklwd xm2, xm0, xm1 ; 45 67 punpckhwd xm1, xm0, xm1 ; 67 89 pmaddwd xm3, xm2, xm14 pmaddwd xm5, xm1, xm6 paddd xm4, xm13 paddd xm4, xm3 psrldq xm3, xm7, 8 paddd xm4, xm5 psrad xm4, xm3 packusdw xm4, xm4 pminsw xm4, xm11 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym %if isput mova [rsp+0x50], xm11 %endif mova [rsp+0x00], m12 mova [rsp+0x20], m13 mova [rsp+0x40], xm7 vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b sub srcq, 2 movd xm15, t0d pmaddwd m8, m7 vpbroadcastq m2, [base+pq_0x40000000+1] vpbroadcastd xm15, xm15 SWAP m13, m10 paddd m13, m8 ; mx+dx*[0-3] pand m6, m13 psrld m6, 6 paddd xm15, xm6 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 vbroadcasti128 m5, [base+bdct_lb_q+ 0] vbroadcasti128 m1, [base+bdct_lb_q+16] vbroadcasti128 m4, [base+subpel_s_shuf2] vpbroadcastd xm14, [base+subpel_filters+r4*8+2] vpbroadcastd xm7, [base+subpel_filters+r6*8+2] vpbroadcastd xm15, [base+subpel_filters+r11*8+2] vpbroadcastd xm8, [base+subpel_filters+r13*8+2] shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] pcmpeqd m6, m9 punpckldq m11, m6, m6 punpckhdq m6, m6 psrld m13, 10 paddd m13, m13 vpblendd xm14, xm7, 0xa vpblendd xm15, xm8, 0xa pmovsxbw m14, xm14 pmovsxbw m15, xm15 movq xm10, r13q pblendvb m14, m2, m11 pblendvb m15, m2, m6 pextrd r4, xm13, 2 pshufb m12, m13, m5 pshufb m13, m1 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu xm0, [srcq+ssq*0] movu xm7, [srcq+r4 ] movu xm1, [srcq+ssq*1] movu xm8, [srcq+r6 ] vinserti128 m0, [srcq+ssq*2], 1 ; 0 2 vinserti128 m7, [srcq+r11 ], 1 vinserti128 m1, [srcq+ss3q ], 1 ; 1 3 vinserti128 m8, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] movu xm2, [srcq+ssq*0] movu xm9, [srcq+r4 ] vinserti128 m2, [srcq+ssq*1], 1 ; 4 5 vinserti128 m9, [srcq+r6 ], 1 lea srcq, [srcq+ssq*2] vpbroadcastb m5, xm13 psubb m13, m5 paddb m12, m4 paddb m13, m4 mova m5, [rsp+0x00] movd xm6, [rsp+0x40] pshufb m0, m12 pshufb m1, m12 pshufb m2, m12 pmaddwd m0, m14 pmaddwd m1, m14 pmaddwd m2, m14 pshufb m7, m13 pshufb m8, m13 pshufb m9, m13 pmaddwd m7, m15 pmaddwd m8, m15 pmaddwd m9, m15 punpcklqdq xm10, xm10 pmovsxbw m10, xm10 phaddd m0, m7 phaddd m1, m8 phaddd m2, m9 paddd m0, m5 paddd m1, m5 paddd m2, m5 psrad m0, xm6 psrad m1, xm6 psrad m2, xm6 vperm2i128 m3, m0, m2, 0x21 ; 2 4 vperm2i128 m2, m1, 0x13 ; 3 5 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd 
m10, m10, q3333 packssdw m0, m3 ; 0 2 2 4 packssdw m1, m2 ; 1 3 3 5 punpckhwd m2, m0, m1 ; 23 45 punpcklwd m0, m1 ; 01 23 .dy2_w4_loop: movu xm1, [srcq+ssq*0] movu xm6, [srcq+r4 ] movu xm3, [srcq+ssq*1] movu xm11, [srcq+r6 ] vinserti128 m1, [srcq+ssq*2], 1 ; 6 8 vinserti128 m6, [srcq+r11 ], 1 vinserti128 m3, [srcq+ss3q ], 1 ; 7 9 vinserti128 m11, [srcq+r13 ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m7 pmaddwd m5, m2, m8 pshufb m1, m12 pshufb m3, m12 pmaddwd m1, m14 pmaddwd m3, m14 mova m0, [rsp+0x00] pshufb m6, m13 pshufb m11, m13 pmaddwd m6, m15 pmaddwd m11, m15 paddd m4, m5 movd xm5, [rsp+0x40] phaddd m1, m6 phaddd m3, m11 paddd m1, m0 paddd m3, m0 psrad m1, xm5 psrad m3, xm5 pslld m3, 16 pblendw m1, m3, 0xaa ; 67 89 vperm2i128 m0, m2, m1, 0x21 ; 45 67 paddd m4, [rsp+0x20] mova m2, m1 pmaddwd m5, m0, m9 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0x50] movq [dstq+dsq*0], xm4 movhps [dstq+dsq*1], xm4 lea dstq, [dstq+dsq*2] %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET SWAP m10, m13 .dy2_w8: mov dword [rsp+0xa0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0xa0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0xa0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0xa0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0xa0], 16 movifprep tmp_stridem, 256 .dy2_w_start: SWAP m10, m12, m1 SWAP m11, m7 ; m1=mx, m7=pxmax, m10=h_rnd, m11=h_sh, m12=free mov myd, mym %if isput movifnidn dsm, dsq mova [rsp+0xc0], xm7 %endif mova [rsp+0x00], m10 mova [rsp+0x20], m13 mova [rsp+0x40], xm11 shr t0d, 16 sub srcq, 6 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul2] movd xm15, t0d mov [rsp+0xa4], t0d mov [rsp+0xa8], srcq mov [rsp+0xb0], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m1, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0xa0] jz .ret add qword [rsp+0xb0], 16 mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m6, [base+pd_0x3ff] paddd m1, m8, [rsp+0x60] vpbroadcastd m15, [rsp+0xa4] pxor m9, m9 mov srcq, [rsp+0xa8] mov r0q, [rsp+0xb0] ; dstq / tmpq mova m10, [rsp+0x00] mova xm11, [rsp+0x40] .dy2_hloop: vpbroadcastq xm2, [base+pq_0x40000000] pand m5, m1, m6 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 vextracti128 xm7, m15, 1 movq r6, xm15 pextrq r9, xm15, 1 movq r11, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r7d, r9d shr r9, 32 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mova [rsp+0x60], m1 movq xm12, [base+subpel_filters+ r4*8] movq xm13, [base+subpel_filters+ r6*8] movhps xm12, [base+subpel_filters+ r7*8] movhps xm13, [base+subpel_filters+ r9*8] movq xm14, [base+subpel_filters+r10*8] movq xm15, [base+subpel_filters+r11*8] movhps xm14, [base+subpel_filters+r13*8] movhps xm15, [base+subpel_filters+ rX*8] psrld m1, 10 vextracti128 xm7, m1, 1 vextracti128 xm6, m5, 1 movq r6, xm1 pextrq r11, xm1, 1 movq r9, xm7 pextrq rX, xm7, 1 mov r4d, r6d shr r6, 32 mov r10d, r11d shr r11, 32 mov r7d, r9d shr r9, 32 mov r13d, rXd shr rX, 32 pshufd xm4, xm5, q2200 pshufd xm5, xm5, q3311 pshufd xm7, xm6, q2200 pshufd xm6, xm6, q3311 pblendvb xm12, xm2, xm4 pblendvb xm13, xm2, xm5 pblendvb xm14, xm2, xm7 pblendvb xm15, xm2, xm6 pmovsxbw m12, 
xm12 pmovsxbw m13, xm13 pmovsxbw m14, xm14 pmovsxbw m15, xm15 MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b mova [rsp+0x80], m0 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 0 ; 6a 7a 6b 7b mova m0, [rsp+0x80] vbroadcasti128 m7, [base+subpel_s_shuf8] vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m10, [rsp+0x58] vpbroadcastd m11, [rsp+0x5c] pshufb m0, m7 ; 01a 01b pshufb m1, m7 ; 23a 23b pshufb m2, m7 ; 45a 45b pshufb m3, m7 ; 67a 67b .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, [rsp+0x20] paddd m6, m7 paddd m4, m5 paddd m4, m6 %if isput psrad m4, [rsp+0x48] vextracti128 xm5, m4, 1 packusdw xm4, xm5 pminsw xm4, [rsp+0xc0] mova [dstq], xm4 add dstq, dsm %else psrad m4, 6 vextracti128 xm5, m4, 1 packssdw xm4, xm5 mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movu xm3, [srcq+ r4*2] movu xm4, [srcq+ r6*2] movu xm5, [srcq+ r7*2] movu xm6, [srcq+ r9*2] vinserti128 m3, [srcq+r10*2], 1 vinserti128 m4, [srcq+r11*2], 1 vinserti128 m5, [srcq+r13*2], 1 vinserti128 m6, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m3, m12 pmaddwd m4, m13 pmaddwd m5, m14 pmaddwd m6, m15 phaddd m3, m4 phaddd m5, m6 phaddd m3, m5 movu xm4, [srcq+ r4*2] movu xm5, [srcq+ r6*2] movu xm6, [srcq+ r7*2] movu xm7, [srcq+ r9*2] vinserti128 m4, [srcq+r10*2], 1 vinserti128 m5, [srcq+r11*2], 1 vinserti128 m6, [srcq+r13*2], 1 vinserti128 m7, [srcq+ rX*2], 1 add srcq, ssq pmaddwd m4, m12 pmaddwd m5, m13 pmaddwd m6, m14 pmaddwd m7, m15 phaddd m4, m5 phaddd m6, m7 mova m5, [rsp+0x00] movd xm7, [rsp+0x40] phaddd m4, m6 paddd m3, m5 paddd m4, m5 psrad m3, xm7 psrad m4, xm7 pslld m4, 16 pblendw m3, m4, 0xaa jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, SHARP PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, SHARP PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 01, 23, 45, 67 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, 
[filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma punpcklwd m8, m0 shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 punpckhbw m9, m11, m9 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 punpckhbw m0, m11, m0 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m9, m%2 mova m%2, m%3 paddd m0, m8 mova m%3, m%4 mova m%4, m%5 paddd m%1, m0, m9 %endmacro cglobal warp_affine_8x8t_16bpc, 4, 14, 16, tmp, ts mov r6d, r7m lea r9, [$$] shr r6d, 11 vpbroadcastd m13, [r9-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [warp8x8t_rnd] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m7, m14 paddd m0, m14 psrad m7, 15 psrad m0, 15 packssdw m7, m0 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jg .loop .end: RET cglobal warp_affine_8x8_16bpc, 4, 14, 16, dst, ds, src, ss, abcd, mx, tmp2, \ alpha, beta, filter, tmp1, delta, \ my, gamma mov r6d, r7m lea filterq, [$$] shr r6d, 11 vpbroadcastd m13, [filterq-$$+warp8x8_shift+r6*4] vpbroadcastd m14, [filterq-$$+warp8x8_rnd +r6*4] vpbroadcastw m15, r7m ; pixel_max call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m7, 16 psrad m0, 16 packusdw m7, m0 pmulhrsw m7, m14 pminsw m7, m15 vpermq m7, m7, q3120 mova [dstq+dsq*0], xm7 vextracti128 [dstq+dsq*1], m7, 1 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov abcdq, r5m mov mxd, r6m %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] vpbroadcastd m12, [pd_32768] pxor m11, m11 add filterq, mc_warp_filter-$$ lea tmp1q, [ssq*3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h pblendw m1, m0, 0xaa ; 01 psrld m2, m0, 16 call .h pblendw m2, m0, 0xaa ; 12 psrld m3, m0, 16 call .h pblendw m3, m0, 0xaa ; 23 psrld m4, m0, 16 call .h pblendw m4, m0, 0xaa ; 34 psrld m5, m0, 16 call .h pblendw m5, m0, 0xaa ; 45 psrld m6, m0, 16 call .h pblendw m6, m0, 0xaa ; 56 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h psrld m7, m6, 16 pblendw m7, m0, 0xaa ; 67 WARP_V 7, 1, 3, 5, 7 call .h psrld m10, m5, 16 pblendw m10, m0, 0xaa ; 78 WARP_V 0, 2, 4, 6, 10 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] movu xm10, [srcq-6] vinserti128 m10, [srcq+2], 1 shr mxd, 10 ; 0 shr tmp1d, 10 ; 4 movq xm0, [filterq+mxq *8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] movu xm8, [srcq-4] vinserti128 m8, [srcq+4], 1 shr tmp2d, 10 ; 1 shr tmp1d, 10 ; 5 movq xm9, [filterq+tmp2q*8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 ; 2 shr tmp1d, 10 ; 6 punpcklbw m0, m11, m0 pmaddwd m0, m10 movu xm10, [srcq-2] vinserti128 m10, [srcq+6], 1 punpcklbw m9, m11, m9 pmaddwd m9, m8 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta phaddd m0, m9 ; 0 1 4 5 movu xm9, [srcq+0] vinserti128 m9, [srcq+8], 1 shr tmp2d, 10 ; 3 shr tmp1d, 10 ; 7 punpcklbw m8, m11, m8 pmaddwd m8, m10 movq xm10, [filterq+tmp2q*8] vinserti128 m10, [filterq+tmp1q*8], 1 punpcklbw m10, m11, m10 pmaddwd m9, m10 add srcq, ssq phaddd m8, m9 ; 2 3 6 7 phaddd m0, m8 ; 0 1 2 3 4 5 6 7 vpsllvd m0, m13 paddd m0, m12 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 cmp hd, 8 je .ret lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq ], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .ret: RET .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 cmp hd, 4 jne .w8_loop_start RET .w8_loop: call .main lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 .w8_loop_start: lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 sub hd, 8 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg 
.w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx2_table lea r6, [avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m4, [base+bidir_rnd+t0*4] vpbroadcastd m5, [base+bidir_mul+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+32*0] paddsw m0, [tmp2q+32*0] mova m1, [tmp1q+32*1] paddsw m1, [tmp2q+32*1] mova m2, [tmp1q+32*2] paddsw m2, [tmp2q+32*2] mova m3, [tmp1q+32*3] paddsw m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaxsw m0, m4 pmaxsw m1, m4 pmaxsw m2, m4 pmaxsw m3, m4 psubsw m0, m4 psubsw m1, m4 psubsw m2, m4 psubsw m3, m4 pmulhw m0, m5 pmulhw m1, m5 pmulhw m2, m5 pmulhw m3, m5 ret cglobal w_avg_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, stride3 lea r6, [w_avg_avx2_table] tzcnt wd, wm mov t0d, r6m ; weight vpbroadcastw m8, r7m ; pixel_max vpbroadcastd m7, [r6-w_avg_avx2_table+pd_65538] movsxd wq, [r6+wq*4] paddw m7, m8 add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0d, r6d ; 16-weight, weight pslld m7, 7 rorx r6d, t0d, 30 ; << 2 test dword r7m, 0x800 cmovz r6d, t0d movifnidn hd, hm movd xm6, r6d vpbroadcastd m6, xm6 BIDIR_FN ALIGN function_align .main: mova m4, [tmp1q+32*0] mova m0, [tmp2q+32*0] punpckhwd m5, m0, m4 punpcklwd m0, m4 mova m4, [tmp1q+32*1] mova m1, [tmp2q+32*1] pmaddwd m5, m6 pmaddwd m0, m6 paddd m5, m7 paddd m0, m7 psrad m5, 8 psrad m0, 8 packusdw m0, m5 punpckhwd m5, m1, m4 punpcklwd m1, m4 mova m4, [tmp1q+32*2] mova m2, [tmp2q+32*2] pmaddwd m5, m6 pmaddwd m1, m6 paddd m5, m7 paddd m1, m7 psrad m5, 8 psrad m1, 8 packusdw m1, m5 punpckhwd m5, m2, m4 punpcklwd m2, m4 mova m4, [tmp1q+32*3] mova m3, [tmp2q+32*3] add tmp1q, 32*4 add tmp2q, 32*4 pmaddwd m5, m6 pmaddwd m2, m6 paddd m5, m7 paddd m2, m7 psrad m5, 8 psrad m2, 8 packusdw m2, m5 punpckhwd m5, m3, m4 punpcklwd m3, m4 pmaddwd m5, m6 pmaddwd m3, m6 paddd m5, m7 paddd m3, m7 psrad m5, 8 psrad m3, 8 packusdw m3, m5 pminsw m0, m8 pminsw m1, m8 pminsw m2, m8 pminsw m3, m8 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx2_table lea r7, [mask_avx2_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+bidir_rnd+r6*4] vpbroadcastd m10, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: %macro MASK 1 pmovzxbw m5, [maskq+16*%1] mova m%1, [tmp1q+32*%1] mova m6, [tmp2q+32*%1] punpckhwd m4, m%1, m6 punpcklwd m%1, m6 psubw m7, m8, m5 punpckhwd m6, m5, m7 ; m, 64-m punpcklwd m5, m7 pmaddwd m4, m6 ; tmp1 * m + tmp2 * (64-m) pmaddwd m%1, m5 psrad m4, 5 psrad m%1, 5 packssdw m%1, m4 pmaxsw m%1, m9 psubsw m%1, m9 pmulhw m%1, m10 %endmacro MASK 0 MASK 1 MASK 2 MASK 3 add maskq, 16*4 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal 
w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movd xm0, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] movd xm14, [base+pw_2] mov maskq, maskmp psubw xm14, xm0 vpbroadcastw m14, xm14 add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: phaddd m4, m5 paddw m4, m14 psrlw m4, 2 packuswb m4, m4 vextracti128 xm5, m4, 1 punpcklwd xm4, xm5 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 mova [maskq], xm4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vperm2i128 m6, m4, m5, 0x21 vpblendd m4, m5, 0xf0 paddw m4, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 mova [maskq], xm4 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16: punpcklqdq m6, m4, m5 punpckhqdq m4, m5 paddw m6, m14 paddw m4, m6 psrlw m4, 2 vextracti128 xm5, m4, 1 packuswb xm4, xm5 pshufd xm4, xm4, q3120 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 mova [maskq], xm4 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m4, m14 paddw m4, m5 psrlw m15, m4, 2 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 call .main mova m6, [deint_shuf] paddw m4, m14 paddw m4, m5 psrlw m4, 2 packuswb m15, m4 vpermd m4, m6, m15 mova [dstq+strideq*2+32*0], m0 mova [dstq+strideq*2+32*1], m1 mova [dstq+stride3q +32*0], m2 mova [dstq+stride3q +32*1], m3 mova [maskq], m4 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova [dstq+strideq*0+32*3], m3 mova [maskq], m4 ; no available registers call .main paddw m4, [maskq] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 ; 0 2 4 6 1 3 5 7 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq], m4 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: paddw m4, m14 paddw m5, m14 mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*0+32*2], m2 mova 
[dstq+strideq*0+32*3], m3 mova [maskq+32*0], m4 mova [dstq+strideq], m5 call .main paddw m4, m14 paddw m15, m14, m5 mova [dstq+strideq*0+32*4], m0 mova [dstq+strideq*0+32*5], m1 mova [dstq+strideq*0+32*6], m2 mova [dstq+strideq*0+32*7], m3 mova [maskq+32*1], m4 call .main paddw m4, [maskq+32*0] paddw m5, [dstq+strideq] mova m6, [deint_shuf] psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*0], m0 mova [dstq+strideq*1+32*1], m1 mova [dstq+strideq*1+32*2], m2 mova [dstq+strideq*1+32*3], m3 mova [maskq+32*0], m4 call .main paddw m4, [maskq+32*1] mova m6, [deint_shuf] paddw m5, m15 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m6, m4 mova [dstq+strideq*1+32*4], m0 mova [dstq+strideq*1+32*5], m1 mova [dstq+strideq*1+32*6], m2 mova [dstq+strideq*1+32*7], m3 mova [maskq+32*1], m4 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: %macro W_MASK 2-6 11, 12, 13 ; dst/src1, mask/src2, pw_64, rnd, mul mova m%1, [tmp1q+32*%1] mova m%2, [tmp2q+32*%1] punpcklwd m8, m%2, m%1 punpckhwd m9, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m7, m10, m%1 psrlw m7, 10 ; 64-m psubw m%2, m%3, m7 ; m punpcklwd m%1, m7, m%2 punpckhwd m7, m%2 pmaddwd m%1, m8 pmaddwd m7, m9 psrad m%1, 5 psrad m7, 5 packssdw m%1, m7 pmaxsw m%1, m%4 psubsw m%1, m%4 pmulhw m%1, m%5 %endmacro W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 ret cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max vpbroadcastb m14, r7m ; sign movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+bidir_rnd+r6*4] vpbroadcastd m13, [base+bidir_mul+r6*4] mova m15, [base+deint_shuf] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti128 xm2, m2, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm3 movhps [dstq+strideq*1], xm3 vextracti128 xm3, m3, 1 movq [dstq+strideq*2], xm3 movhps [dstq+stride3q ], xm3 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm2 vextracti128 [dstq+strideq*1], m2, 1 mova [dstq+strideq*2], xm3 vextracti128 [dstq+stride3q ], m3, 1 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 mova [dstq+strideq*2], m2 mova [dstq+stride3q ], m3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0+32*0], m0 mova [dstq+strideq*0+32*1], m1 mova [dstq+strideq*1+32*0], m2 mova [dstq+strideq*1+32*1], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 dec hd jg .w64_loop RET .w128_loop: call 
.main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 mova [dstq+32*6], m2 mova [dstq+32*7], m3 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 4 W_MASK 1, 5 phaddw m4, m5 W_MASK 2, 5 W_MASK 3, 6 phaddw m5, m6 add tmp1q, 32*4 add tmp2q, 32*4 packuswb m4, m5 pxor m5, m5 psubb m4, m14 pavgb m4, m5 vpermd m4, m15, m4 mova [maskq], m4 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] vpbroadcastd m4, [base+pw_64] vpbroadcastd m5, [base+bidir_rnd+r6*4] vpbroadcastd m6, [base+bidir_mul+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 je .w4_end call .main lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti128 xm0, m0, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti128 xm1, m1, 1 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], xm1 vextracti128 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+32*0], m0 mova [dstq+32*1], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+32*0], m0 mova [dstq+32*1], m1 call .main mova [dstq+32*2], m0 mova [dstq+32*3], m1 call .main mova [dstq+32*4], m0 mova [dstq+32*5], m1 call .main mova [dstq+32*6], m0 mova [dstq+32*7], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2, 4, 5, 6 W_MASK 1, 3, 4, 5, 6 packuswb m2, m3 vpermq m2, m2, q3120 add tmp1q, 32*2 add tmp2q, 32*2 mova [maskq], m2 add maskq, 32 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). 
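; (negating m is absorbed by also swapping a and b, which gives the form below)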
; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw m3, [maskq] movq xm0, [dstq+dsq*0] movhps xm0, [dstq+dsq*1] vpbroadcastq m1, [dstq+dsq*2] vpbroadcastq m2, [dstq+r6 ] vpblendd m0, m1, 0x30 vpblendd m0, m2, 0xc0 psubw m1, m0, [tmpq] add maskq, 16 add tmpq, 32 pmullw m3, m6 pmulhrsw m1, m3 paddw m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 mova xm1, [dstq+dsq*2] vinserti128 m1, [dstq+r6 ], 1 psubw m2, m0, [tmpq+32*0] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], xm1 vextracti128 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET .w32: pmovzxbw m4, [maskq+16*0] pmovzxbw m5, [maskq+16*1] mova m0, [dstq+32*0] psubw m2, m0, [tmpq+32*0] mova m1, [dstq+32*1] psubw m3, m1, [tmpq+32*1] add maskq, 16*2 add tmpq, 32*2 pmullw m4, m6 pmullw m5, m6 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .w32 RET INIT_XMM avx2 cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h %define base r5-blend_v_avx2_table lea r5, [blend_v_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: vpbroadcastd m2, [base+obmc_masks_avx2+2*2] .w2_loop: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movq m1, [tmpq] add tmpq, 4*2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq m2, [base+obmc_masks_avx2+4*2] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] psubw m1, m0, [tmpq] add tmpq, 8*2 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m2, [base+obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 16*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: mova m4, [base+obmc_masks_avx2+16*2] .w16_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: %if WIN64 movaps [rsp+ 8], xmm6 movaps [rsp+24], xmm7 %endif mova m6, [base+obmc_masks_avx2+32*2] vbroadcasti128 m7, [base+obmc_masks_avx2+32*3] .w32_loop: mova m0, [dstq+dsq*0+32*0] psubw m3, m0, [tmpq +32*0] mova xm2, [dstq+dsq*0+32*1] mova xm5, [tmpq 
+32*1] mova m1, [dstq+dsq*1+32*0] psubw m4, m1, [tmpq +32*2] vinserti128 m2, [dstq+dsq*1+32*1], 1 vinserti128 m5, [tmpq +32*3], 1 add tmpq, 32*4 psubw m5, m2, m5 pmulhrsw m3, m6 pmulhrsw m4, m6 pmulhrsw m5, m7 paddw m0, m3 paddw m1, m4 paddw m2, m5 mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*1+32*0], m1 mova [dstq+dsq*0+32*1], xm2 vextracti128 [dstq+dsq*1+32*1], m2, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop %if WIN64 movaps xmm6, [rsp+ 8] movaps xmm7, [rsp+24] %endif RET %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp mova m0, [dstq+32*(%1+0)] psubw m2, m0, [tmpq+32*(%2+0)] mova m1, [dstq+32*(%1+1)] psubw m3, m1, [tmpq+32*(%2+1)] %if %3 add tmpq, 32*%3 %endif pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+32*(%1+0)], m0 mova [dstq+32*(%1+1)], m1 %endmacro INIT_XMM avx2 cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_h_avx2_table lea r5, [blend_h_avx2_table] tzcnt wd, wm mov hd, hm movsxd wq, [r5+wq*4] add wq, r5 lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] pinsrd m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] movq m1, [tmpq] add tmpq, 4*2 punpcklwd m2, m2 psubw m1, m0, m1 pmulhrsw m1, m2 paddw m0, m1 movd [dstq+dsq*0], m0 pextrd [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova m3, [blend_shuf] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] movd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 8*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET INIT_YMM avx2 .w8: vbroadcasti128 m3, [blend_shuf] shufpd m3, m3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti128 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 16*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 32*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 32*1] add tmpq, 32*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16 RET .w32: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0, 2 add dstq, dsq inc hq jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 4 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m4, [maskq+hq*2] BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 8 BLEND_H_ROW 4, -4 BLEND_H_ROW 6, -2 add dstq, dsq inc hq jl .w128 RET cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 lea srcq, [srcq+r10*2] ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw 
- 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastw m0, [srcq] .left_loop_%3: mova [dstq+r3*2], m0 add r3, 16 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3*2] %if %1 movu [r12+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, 16 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea r12, [r12+centerwq*2] %else lea r12, [dstq+centerwq*2] %endif xor r3, r3 vpbroadcastw m0, [srcq+centerwq*2-2] .right_loop_%3: movu [r12+r3*2], m0 add r3, 16 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 16 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 .top_x_loop: mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 16 cmp r1, bwq jl .top_x_loop .end: RET cglobal resize_16bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_64] vpbroadcastw xm7, pxmaxm pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: vpbroadcastd m10, [base+pd_63] pxor m2, m2 pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset pand m9, m10 ; filter offset (masked) ; load source pixels movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vextracti128 xm0, m0, 1 movu xm10, [srcq+r8*2] movu xm11, [srcq+r9*2] movu xm12, [srcq+r10*2] movu xm13, [srcq+r11*2] movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vinserti128 m10, [srcq+r8*2], 1 vinserti128 m11, [srcq+r9*2], 1 vinserti128 m12, [srcq+r10*2], 1 
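; (this gathers one 8-tap source window per output pixel: the windows for
;  pixels 0-3 occupy the low 128-bit lanes of m10-m13 and the windows for
;  pixels 4-7 the high lanes, matching the per-pixel filter layout loaded
;  in .filter)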
vinserti128 m13, [srcq+r11*2], 1 ptest m1, m1 jz .filter movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vextracti128 xm1, m1, 1 movu xm14, [base+resize_shuf+8+r8*2] movu xm15, [base+resize_shuf+8+r9*2] movu xm0, [base+resize_shuf+8+r10*2] movu xm2, [base+resize_shuf+8+r11*2] movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vinserti128 m14, [base+resize_shuf+8+r8*2], 1 vinserti128 m15, [base+resize_shuf+8+r9*2], 1 vinserti128 m0, [base+resize_shuf+8+r10*2], 1 vinserti128 m2, [base+resize_shuf+8+r11*2], 1 pshufb m10, m14 pshufb m11, m15 pshufb m12, m0 pshufb m13, m2 .filter: movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vextracti128 xm9, m9, 1 movq xm14, [base+resize_filter+r8*8] movq xm15, [base+resize_filter+r9*8] movq xm0, [base+resize_filter+r10*8] movq xm2, [base+resize_filter+r11*8] movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 movhps xm14, [base+resize_filter+r8*8] movhps xm15, [base+resize_filter+r9*8] movhps xm0, [base+resize_filter+r10*8] movhps xm2, [base+resize_filter+r11*8] pmovsxbw m14, xm14 pmovsxbw m15, xm15 pmovsxbw m0, xm0 pmovsxbw m2, xm2 pmaddwd m10, m14 pmaddwd m11, m15 pmaddwd m12, m0 pmaddwd m13, m2 phaddd m10, m11 phaddd m12, m13 phaddd m10, m12 psubd m10, m3, m10 psrad m10, 7 vextracti128 xm0, m10, 1 packusdw xm10, xm0 pminsw xm10, xm7 mova [dstq+xq*2], xm10 paddd m4, m5 add xd, 8 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/mc16_avx512.asm000064400000000000000000005245471046102023000144620ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41 spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17 db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49 db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25 db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45 spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21 db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53 db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29 db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61 spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94 db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126 prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46 db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62 db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110 db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126 prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78 db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94 db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110 db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126 spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46 spel_shuf4b: db 18, 19, 33, 34, 22, 23, 37, 38, 26, 27, 41, 42, 30, 31, 45, 46 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30 db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78 db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62 db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110 spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78 db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94 db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110 db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126 spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46 db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62 db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110 db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126 spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78 db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94 db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 
46,109,110 db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126 spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38 db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14 db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46 spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30 spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21 db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25 w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94 db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126 w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94 db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126 w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94 db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126 w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110 db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126 warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37 db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41 db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45 db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49 db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53 db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57 db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61 warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53 db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55 db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61 db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63 deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 dd 1 pw_2048: times 2 dw 2048 dd 3 pw_8192: times 2 dw 8192 avg_shift: dw 5, 5, 3, 3 pw_27615: times 2 dw 27615 pw_32766: times 2 dw 32766 warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29 resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31 resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13 resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15 resize_permE: dq 0, 2, 4, 6 resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13 resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 resize_shuf: db 0, 1, 0, 
1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 prep_hv_shift: dq 6, 4 put_bilin_h_rnd: dw 8, 8, 10, 10 prep_mul: dw 16, 16, 4, 4 put_8tap_h_rnd: dd 34, 40 prep_8tap_rnd: dd 128 - (8192 << 8) warp_8x8_rnd_h: dd 512, 2048 warp_8x8_rnd_v: dd 262144, 65536 warp_8x8t_rnd_v: dd 16384 - (8192 << 15) avg_round: dw -16400, -16400, -16388, -16388 w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4) mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6) w_mask_round: dd 128, 64 bidir_shift: dw 6, 6, 4, 4 pb_64: times 4 db 64 pw_m512: times 2 dw -512 pw_2: times 2 dw 2 pw_64: times 2 dw 64 pd_32: dd 32 pd_63: dd 63 pd_128: dd 128 pd_640: dd 640 pd_2176: dd 2176 pd_16384: dd 16384 pd_0_4: dd 0, 4 %define pw_16 prep_mul %define pd_512 warp_8x8_rnd_h %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep) BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern obmc_masks_avx2 cextern resize_filter SECTION .text %if WIN64 DECLARE_REG_TMP 4 %else DECLARE_REG_TMP 8 %endif INIT_ZMM avx512icl cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy mov mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt t0d, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx t0d, word [r7+t0*2+table_offset(put,)] add t0, r7 jmp t0 .put_w2: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6, 
[srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] add srcq, ssq mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r7m ; my vpbroadcastd m4, [pw_16] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)] mov r6d, r8m ; bitdepth_max add t0, r7 shr r6d, 11 vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4] jmp t0 .h_w2: movq xmm1, [srcq+ssq*0] movhps xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq xmm0, [srcq+ssq*0+0] movhps xmm0, [srcq+ssq*1+0] movq xmm1, [srcq+ssq*0+2] movhps xmm1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw xmm0, xm4 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 4 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu xm0, [srcq+ssq*0+0] vinserti32x4 ym0, [srcq+ssq*1+0], 1 movu xm1, [srcq+ssq*0+2] vinserti32x4 ym1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw ym0, ym4 pmullw ym1, ym5 paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 4 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu ym0, [srcq+ssq*0+0] vinserti32x8 m0, [srcq+ssq*1+0], 1 movu ym1, [srcq+ssq*0+2] vinserti32x8 m1, [srcq+ssq*1+2], 1 lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m6 paddw m0, m1 psrlw m0, 4 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] pmullw m1, m4, [srcq+ssq*1+0] pmullw m3, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+64*0+0] pmullw m2, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m3, m5, [srcq+64*1+2] add srcq, ssq paddw m0, m6 paddw m1, m6 paddw m0, m2 paddw m1, m3 psrlw m0, 4 psrlw m1, 4 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+64*0+0] pmullw m7, m5, [srcq+64*0+2] pmullw m1, m4, [srcq+64*1+0] pmullw m8, m5, [srcq+64*1+2] pmullw m2, m4, 
[srcq+64*2+0] pmullw m9, m5, [srcq+64*2+2] pmullw m3, m4, [srcq+64*3+0] pmullw m10, m5, [srcq+64*3+2] add srcq, ssq REPX {paddw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psrlw x, 4}, m0, m1, m2, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq dec hd jg .h_w128 RET .v: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)] shl mxyd, 11 vpbroadcastw m8, mxyd add t0, r7 jmp t0 .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq xmm2, xmm0, xmm1 movd xmm0, [srcq+ssq*0] punpckldq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm0, [srcq+ssq*0] .v_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq xmm2, xmm0, xmm1 movq xmm0, [srcq+ssq*0] punpcklqdq xmm1, xmm0 psubw xmm1, xmm2 pmulhrsw xmm1, xm8 paddw xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movu xmm0, [srcq+ssq*0] .v_w8_loop: vbroadcasti128 ymm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm2, ymm0, ymm1, 0xf0 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm1, ymm0, 0xf0 psubw ymm1, ymm2 pmulhrsw ymm1, ym8 paddw ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: movu ym0, [srcq+ssq*0] .v_w16_loop: movu ym3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw ym1, ym3, ym0 pmulhrsw ym1, ym8 paddw ym1, ym0 movu ym0, [srcq+ssq*0] psubw ym2, ym0, ym3 pmulhrsw ym2, ym8 paddw ym2, ym3 mova [dstq+dsq*0], ym1 mova [dstq+dsq*1], ym2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: movu m0, [srcq+ssq*0] .v_w32_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m8 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m8 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] .v_w64_loop: movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] psubw m4, m2, m0 pmulhrsw m4, m8 paddw m4, m0 movu m0, [srcq+ssq*0+64*0] psubw m5, m3, m1 pmulhrsw m5, m8 paddw m5, m1 movu m1, [srcq+ssq*0+64*1] psubw m6, m0, m2 pmulhrsw m6, m8 psubw m7, m1, m3 pmulhrsw m7, m8 mova [dstq+dsq*0+64*0], m4 mova [dstq+dsq*0+64*1], m5 paddw m6, m2 paddw m7, m3 mova [dstq+dsq*1+64*0], m6 mova [dstq+dsq*1+64*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*0+64*2] movu m3, [srcq+ssq*0+64*3] .v_w128_loop: movu m4, [srcq+ssq*1+64*0] movu m5, [srcq+ssq*1+64*1] movu m6, [srcq+ssq*1+64*2] movu m7, [srcq+ssq*1+64*3] lea srcq, [srcq+ssq*2] psubw m9, m4, m0 pmulhrsw m9, m8 paddw m9, m0 movu m0, [srcq+ssq*0+64*0] psubw m10, m5, m1 pmulhrsw m10, m8 paddw m10, m1 movu m1, [srcq+ssq*0+64*1] psubw m11, m6, m2 pmulhrsw m11, m8 paddw m11, m2 movu m2, [srcq+ssq*0+64*2] psubw m12, m7, m3 pmulhrsw m12, m8 paddw m12, m3 movu m3, [srcq+ssq*0+64*3] mova [dstq+dsq*0+64*0], m9 psubw m9, m0, m4 pmulhrsw m9, m8 mova [dstq+dsq*0+64*1], m10 psubw m10, m1, m5 pmulhrsw m10, m8 mova [dstq+dsq*0+64*2], m11 psubw m11, m2, m6 pmulhrsw m11, m8 mova [dstq+dsq*0+64*3], m12 psubw m12, m3, m7 pmulhrsw m12, m8 paddw m9, m4 paddw m10, m5 mova [dstq+dsq*1+64*0], m9 mova [dstq+dsq*1+64*1], m10 paddw m11, m6 paddw 
m12, m7 mova [dstq+dsq*1+64*2], m11 mova [dstq+dsq*1+64*3], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w128_loop RET .hv: movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)] shl mxyd, 11 vpbroadcastd m6, [pw_2] vpbroadcastw m7, mxyd vpbroadcastd m8, [pw_8192] add t0, r7 test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 vpbroadcastd m8, [pw_2048] .hv_12bpc: jmp t0 .hv_w2: vpbroadcastq xmm1, [srcq+ssq*0] pmullw xmm0, xmm1, xm4 psrlq xmm1, 16 pmullw xmm1, xm5 paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w2_loop: movq xmm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm2, [srcq+ssq*0] pmullw xmm1, xmm2, xm4 psrlq xmm2, 16 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 _ 2 _ shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: pmullw xmm0, xm4, [srcq+ssq*0-8] pmullw xmm1, xm5, [srcq+ssq*0-6] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 .hv_w4_loop: movq xmm1, [srcq+ssq*1+0] movq xmm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0+0] movhps xmm2, [srcq+ssq*0+2] pmullw xmm1, xm4 pmullw xmm2, xm5 paddw xmm1, xm6 paddw xmm1, xmm2 psrlw xmm1, 2 ; 1 2 shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm7 paddw xmm1, xmm2 pmulhrsw xmm1, xm8 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: pmullw xmm0, xm4, [srcq+ssq*0+0] pmullw xmm1, xm5, [srcq+ssq*0+2] paddw xmm0, xm6 paddw xmm0, xmm1 psrlw xmm0, 2 vinserti32x4 ym0, xmm0, 1 .hv_w8_loop: movu xm1, [srcq+ssq*1+0] movu xm2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x4 ym1, [srcq+ssq*0+0], 1 vinserti32x4 ym2, [srcq+ssq*0+2], 1 pmullw ym1, ym4 pmullw ym2, ym5 paddw ym1, ym6 paddw ym1, ym2 psrlw ym1, 2 ; 1 2 vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym7 paddw ym1, ym2 pmulhrsw ym1, ym8 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+ssq*0+0] pmullw ym1, ym5, [srcq+ssq*0+2] paddw ym0, ym6 paddw ym0, ym1 psrlw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+ssq*1+0] movu ym2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0+0], 1 vinserti32x8 m2, [srcq+ssq*0+2], 1 pmullw m1, m4 pmullw m2, m5 paddw m1, m6 paddw m1, m2 psrlw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m7 paddw m1, m2 pmulhrsw m1, m8 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: .hv_w64: .hv_w128: movifnidn wd, wm lea r6d, [hq+wq*8-256] mov r4, srcq mov r7, dstq .hv_w32_loop0: pmullw m0, m4, [srcq+ssq*0+0] pmullw m1, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m1 psrlw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+ssq*1+0] pmullw m1, m5, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] paddw m3, m6 paddw m3, m1 psrlw m3, 2 psubw m1, m3, m0 paddw m1, m1 pmulhw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+ssq*0+0] pmullw m2, m5, [srcq+ssq*0+2] paddw m0, m6 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m3 paddw m2, m2 pmulhw m2, m7 paddw m2, m3 pmulhrsw m1, m8 pmulhrsw m2, m8 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w32_loop0 
RET cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [r6-prep_avx512icl+pw_8192] add wq, r6 shr r5d, 11 vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4] lea stride3q, [strideq*3] jmp wq .prep_w4: movq xmm0, [srcq+strideq*0] movhps xmm0, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm0, ymm1, 0x30 vpblendd ymm0, ymm2, 0xc0 pmullw ymm0, ym4 psubw ymm0, ym5 mova [tmpq], ymm0 add tmpq, 32 sub hd, 4 jg .prep_w4 vzeroupper RET .prep_w8: movu xm0, [srcq+strideq*0] vinserti32x4 ym0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 psubw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .prep_w8 RET .prep_w16: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .prep_w16 RET .prep_w32: pmullw m0, m4, [srcq+strideq*0] pmullw m1, m4, [srcq+strideq*1] pmullw m2, m4, [srcq+strideq*2] pmullw m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmullw m0, m4, [srcq+strideq*0+64*0] pmullw m1, m4, [srcq+strideq*0+64*1] pmullw m2, m4, [srcq+strideq*1+64*0] pmullw m3, m4, [srcq+strideq*1+64*1] lea srcq, [srcq+strideq*2] REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmullw m0, m4, [srcq+64*0] pmullw m1, m4, [srcq+64*1] pmullw m2, m4, [srcq+64*2] pmullw m3, m4, [srcq+64*3] add srcq, strideq REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .prep_w128 RET .h: vpbroadcastw m5, mxyd mov mxyd, r6m ; my vpbroadcastd m4, [pw_16] vpbroadcastd m6, [pw_32766] psubw m4, m5 test dword r7m, 0x800 jnz .h_12bpc psllw m4, 2 psllw m5, 2 .h_12bpc: test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: movu xm1, [srcq+strideq*0] vinserti32x4 ym1, [srcq+strideq*2], 1 movu xm2, [srcq+strideq*1] vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym0, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4 RET .h_w8: movu xm0, [srcq+strideq*0+0] movu xm1, [srcq+strideq*0+2] vinserti32x4 ym0, [srcq+strideq*1+0], 1 vinserti32x4 ym1, [srcq+strideq*1+2], 1 vinserti32x4 m0, [srcq+strideq*2+0], 2 vinserti32x4 m1, [srcq+strideq*2+2], 2 vinserti32x4 m0, [srcq+stride3q +0], 3 vinserti32x4 m1, [srcq+stride3q +2], 3 lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8 RET .h_w16: movu ym0, [srcq+strideq*0+0] vinserti32x8 m0, [srcq+strideq*1+0], 1 movu ym1, [srcq+strideq*0+2] vinserti32x8 m1, 
[srcq+strideq*1+2], 1 lea srcq, [srcq+strideq*2] pmullw m0, m4 pmullw m1, m5 psubw m0, m6 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16 RET .h_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] pmullw m1, m4, [srcq+strideq*1+0] pmullw m3, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 2 jg .h_w32 RET .h_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] add srcq, strideq psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 dec hd jg .h_w64 RET .h_w128: pmullw m0, m4, [srcq+ 0] pmullw m7, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m8, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m9, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m10, m5, [srcq+194] add srcq, strideq REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m7 paddw m1, m8 paddw m2, m9 paddw m3, m10 REPX {psraw x, 2}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 dec hd jg .h_w128 RET .v: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] vpbroadcastw m9, mxyd vpbroadcastd m8, [pw_16] vpbroadcastd m10, [pw_32766] add wq, r6 lea stride3q, [strideq*3] psubw m8, m9 test dword r7m, 0x800 jnz .v_12bpc psllw m8, 2 psllw m9, 2 .v_12bpc: jmp wq .v_w4: movq xmm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastq xmm2, [srcq+strideq*1] vpbroadcastq ymm1, [srcq+strideq*2] vpbroadcastq ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm2, ymm1, 0x30 vpblendd ymm2, ymm3, 0xc0 vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3 movq xmm0, [srcq+strideq*0] valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4 pmullw ymm1, ym8 pmullw ymm2, ym9 psubw ymm1, ym10 paddw ymm1, ymm2 psraw ymm1, 2 mova [tmpq], ymm1 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: movu xm0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vinserti32x4 m1, [srcq+strideq*2], 2 vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] valignq m2, m0, m1, 2 ; 1 2 3 4 pmullw m1, m8 pmullw m2, m9 psubw m1, m10 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: movu ym0, [srcq+strideq*0] .v_w16_loop: vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1 movu ym3, [srcq+strideq*2] vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3 lea srcq, [srcq+strideq*4] movu ym0, [srcq+strideq*0] vshufi32x4 m3, m1, m3, q1032 ; 1 2 vshufi32x4 m4, m2, m0, q1032 ; 3 4 pmullw m1, m8 pmullw m2, m8 pmullw m3, m9 pmullw m4, m9 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: movu m0, [srcq+strideq*0] .v_w32_loop: movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m1, m8, m0 movu m0, [srcq+strideq*0] pmullw m2, m8, m3 pmullw m3, m9 pmullw m4, m9, m0 psubw m1, m10 psubw m2, m10 paddw m1, m3 paddw m2, m4 psraw m1, 2 psraw m2, 2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w64_loop: add srcq, strideq pmullw m2, m8, m0 movu m0, [srcq+64*0] pmullw m3, m8, m1 movu m1, [srcq+64*1] pmullw m4, m9, m0 pmullw m5, m9, m1 psubw m2, m10 psubw m3, m10 paddw m2, m4 paddw m3, m5 psraw m2, 2 psraw m3, 2 mova [tmpq+64*0], m2 
mova [tmpq+64*1], m3 add tmpq, 64*2 dec hd jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] movu m2, [srcq+64*2] movu m3, [srcq+64*3] .v_w128_loop: add srcq, strideq pmullw m4, m8, m0 movu m0, [srcq+64*0] pmullw m5, m8, m1 movu m1, [srcq+64*1] pmullw m6, m8, m2 movu m2, [srcq+64*2] pmullw m7, m8, m3 movu m3, [srcq+64*3] pmullw m11, m9, m0 pmullw m12, m9, m1 pmullw m13, m9, m2 pmullw m14, m9, m3 REPX {psubw x, m10}, m4, m5, m6, m7 paddw m4, m11 paddw m5, m12 paddw m6, m13 paddw m7, m14 REPX {psraw x, 2}, m4, m5, m6, m7 mova [tmpq+64*0], m4 mova [tmpq+64*1], m5 mova [tmpq+64*2], m6 mova [tmpq+64*3], m7 add tmpq, 64*4 dec hd jg .v_w128_loop RET .hv: movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m7, mxyd add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: movq xmm0, [srcq+strideq*0+0] movq xmm1, [srcq+strideq*0+2] pmullw xmm0, xm4 pmullw xmm1, xm5 psubw xmm0, xm6 paddw xmm0, xmm1 psraw xmm0, 2 vpbroadcastq ym0, xmm0 .hv_w4_loop: movu xm1, [srcq+strideq*1] vinserti128 ym1, [srcq+stride3q ], 1 movu xm2, [srcq+strideq*2] lea srcq, [srcq+strideq*4] vinserti128 ym2, [srcq+strideq*0], 1 punpcklqdq ym3, ym1, ym2 psrldq ym1, 2 psrldq ym2, 2 pmullw ym3, ym4 punpcklqdq ym1, ym2 pmullw ym1, ym5 psubw ym3, ym6 paddw ym1, ym3 psraw ym1, 2 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym7 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: pmullw xm0, xm4, [srcq+strideq*0+0] pmullw xm1, xm5, [srcq+strideq*0+2] psubw xm0, xm6 paddw xm0, xm1 psraw xm0, 2 vinserti32x4 m0, xm0, 3 .hv_w8_loop: movu xm1, [srcq+strideq*1+0] movu xm2, [srcq+strideq*1+2] vinserti32x4 ym1, [srcq+strideq*2+0], 1 vinserti32x4 ym2, [srcq+strideq*2+2], 1 vinserti32x4 m1, [srcq+stride3q +0], 2 vinserti32x4 m2, [srcq+stride3q +2], 2 lea srcq, [srcq+strideq*4] vinserti32x4 m1, [srcq+strideq*0+0], 3 vinserti32x4 m2, [srcq+strideq*0+2], 3 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: pmullw ym0, ym4, [srcq+strideq*0+0] pmullw ym1, ym5, [srcq+strideq*0+2] psubw ym0, ym6 paddw ym0, ym1 psraw ym0, 2 vinserti32x8 m0, ym0, 1 .hv_w16_loop: movu ym1, [srcq+strideq*1+0] movu ym2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] vinserti32x8 m1, [srcq+strideq*0+0], 1 vinserti32x8 m2, [srcq+strideq*0+2], 1 pmullw m1, m4 pmullw m2, m5 psubw m1, m6 paddw m1, m2 psraw m1, 2 ; 1 2 vshufi32x4 m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m7 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: pmullw m0, m4, [srcq+strideq*0+0] pmullw m1, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m1 psraw m0, 2 .hv_w32_loop: pmullw m3, m4, [srcq+strideq*1+0] pmullw m1, m5, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] psubw m3, m6 paddw m3, m1 psraw m3, 2 psubw m1, m3, m0 pmulhrsw m1, m7 paddw m1, m0 pmullw m0, m4, [srcq+strideq*0+0] pmullw m2, m5, [srcq+strideq*0+2] psubw m0, m6 paddw m0, m2 psraw m0, 2 psubw m2, m0, m3 pmulhrsw m2, m7 paddw m2, m3 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: pmullw m0, m4, [srcq+ 0] pmullw m2, m5, [srcq+ 2] pmullw m1, m4, [srcq+64] pmullw m3, m5, [srcq+66] psubw m0, m6 psubw m1, m6 paddw m0, m2 paddw m1, m3 psraw m0, 2 psraw m1, 2 .hv_w64_loop: add srcq, strideq pmullw m2, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m3, m4, [srcq+64] 
pmullw m9, m5, [srcq+66] psubw m2, m6 psubw m3, m6 paddw m2, m8 paddw m3, m9 psraw m2, 2 psraw m3, 2 psubw m8, m2, m0 psubw m9, m3, m1 pmulhrsw m8, m7 pmulhrsw m9, m7 paddw m8, m0 mova m0, m2 paddw m9, m1 mova m1, m3 mova [tmpq+64*0], m8 mova [tmpq+64*1], m9 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: pmullw m0, m4, [srcq+ 0] pmullw m8, m5, [srcq+ 2] pmullw m1, m4, [srcq+ 64] pmullw m9, m5, [srcq+ 66] pmullw m2, m4, [srcq+128] pmullw m10, m5, [srcq+130] pmullw m3, m4, [srcq+192] pmullw m11, m5, [srcq+194] REPX {psubw x, m6}, m0, m1, m2, m3 paddw m0, m8 paddw m1, m9 paddw m2, m10 paddw m3, m11 REPX {psraw x, 2}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq pmullw m8, m4, [srcq+ 0] pmullw m12, m5, [srcq+ 2] pmullw m9, m4, [srcq+ 64] pmullw m13, m5, [srcq+ 66] pmullw m10, m4, [srcq+128] pmullw m14, m5, [srcq+130] pmullw m11, m4, [srcq+192] pmullw m15, m5, [srcq+194] REPX {psubw x, m6}, m8, m9, m10, m11 paddw m8, m12 paddw m9, m13 paddw m10, m14 paddw m11, m15 REPX {psraw x, 2}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 REPX {pmulhrsw x, m7}, m12, m13, m14, m15 paddw m12, m0 mova m0, m8 paddw m13, m1 mova m1, m9 mova [tmpq+64*0], m12 mova [tmpq+64*1], m13 paddw m14, m2 mova m2, m10 paddw m15, m3 mova m3, m11 mova [tmpq+64*2], m14 mova [tmpq+64*3], m15 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v cglobal %1_8tap_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %define buf rsp+stack_offset+8 ; shadow space %else DECLARE_REG_TMP 7, 8 %define buf rsp-40 ; red zone %endif MC_8TAP_FN put, sharp, SHARP, SHARP MC_8TAP_FN put, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN put, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN put, smooth, SMOOTH, SMOOTH MC_8TAP_FN put, sharp_regular, SHARP, REGULAR MC_8TAP_FN put, regular_sharp, REGULAR, SHARP MC_8TAP_FN put, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN put, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN put, regular, REGULAR, REGULAR cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movifnidn wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 %if WIN64 pop r8 %endif jmp wq .h_w2: movzx mxd, mxb sub srcq, 2 mova ym2, [spel_h_shuf2a] pmovsxbw xmm4, [base+subpel_filters+mxq*8] pshufd xmm3, xmm4, q1111 pshufd xmm4, xmm4, q2222 .h_w2_loop: movu xm1, [srcq+ssq*0] vinserti32x4 ym1, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova xmm0, xm8 vpermb ym1, ym2, ym1 vpdpwssd xmm0, xmm3, xm1 vextracti32x4 xm1, ym1, 1 vpdpwssd xmm0, xmm4, xm1 psrad xmm0, 6 packusdw xmm0, xmm0 pminsw xmm0, xm9 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] vbroadcasti32x4 ym4, [spel_h_shufA] vbroadcasti32x4 ym5, [spel_h_shufB] pshufd xmm0, xmm0, q2211 vpbroadcastq ym6, xmm0 vpermq ym7, ymm0, q1111 .h_w4_loop: movu xm2, [srcq+ssq*0] vinserti32x4 ym2, 
[srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova ym0, ym8 pshufb ym1, ym2, ym4 vpdpwssd ym0, ym6, ym1 pshufb ym2, ym5 vpdpwssd ym0, ym7, ym2 psrad ym0, 6 vextracti32x4 xm1, ym0, 1 packusdw xm0, xm1 pminsw xmm0, xm0, xm9 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv mov r7d, r8m vpbroadcastw m9, r8m shr r7d, 11 vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4] cmp wd, 4 je .h_w4 jl .h_w2 shr mxd, 16 sub srcq, 6 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mova [buf], xmm0 vpbroadcastd m10, xmm0 vpbroadcastd m11, [buf+ 4] vpbroadcastd m12, [buf+ 8] vpbroadcastd m13, [buf+12] sub wd, 16 je .h_w16 jg .h_w32 .h_w8: mova m4, [spel_h_shufA] movu m5, [spel_h_shufB] movu m6, [spel_h_shufC] mova m7, [spel_h_shufD] .h_w8_loop: movu ym2, [srcq+ssq*0] vinserti32x8 m2, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] mova m0, m8 vpermb m1, m4, m2 vpdpwssd m0, m10, m1 vpermb m1, m5, m2 vpdpwssd m0, m11, m1 vpermb m1, m6, m2 vpdpwssd m0, m12, m1 vpermb m1, m7, m2 vpdpwssd m0, m13, m1 psrad m0, 6 vextracti32x8 ym1, m0, 1 packusdw ym0, ym1 pminsw ym0, ym9 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] .h_w16_loop: movu ym2, [srcq+ssq*0+ 0] vinserti32x8 m2, [srcq+ssq*1+ 0], 1 movu ym3, [srcq+ssq*0+16] vinserti32x8 m3, [srcq+ssq*1+16], 1 lea srcq, [srcq+ssq*2] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m11, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m13, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a2 vpdpwssd m1, m10, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a3 vpdpwssd m1, m11, m2 ; b1 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: lea srcq, [srcq+wq*2] vbroadcasti32x4 m6, [spel_h_shufA] lea dstq, [dstq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m8 mova m1, m8 pshufb m4, m2, m6 vpdpwssd m0, m10, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m10, m4 ; b0 vpdpwssd m0, m12, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m11, m3 ; b1 vpdpwssd m0, m13, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m12, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m11, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m13, m4 ; b3 psrad m0, 6 psrad m1, 6 packusdw m0, m1 pminsw m0, m9 mova [dstq+r6*2], m0 add r6, 32 jl .h_w32_loop add srcq, ssq add dstq, dsq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m10, [pd_32] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r7d, wd vpbroadcastw m11, r8m lea r6, [ssq*3] movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)] sub srcq, r6 mova [rsp+stack_offset+8], xmm0 vpbroadcastd m12, xmm0 add r7, r8 vpbroadcastd m13, [rsp+stack_offset+12] vpbroadcastd m14, [rsp+stack_offset+16] vpbroadcastd m15, [rsp+stack_offset+20] jmp r7 .v_w2: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, r6 pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, r6 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklwd xmm3, xmm1 ; 45 56 punpcklwd xmm1, xmm2, xmm4 ; 01 12 
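; (source rows are kept as interleaved pairs - 01 12, 23 34, 45 56 - so each
;  vpdpwssd in .v_w2_loop accumulates one pair of filter taps for two output
;  rows at once)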
punpckhwd xmm2, xmm4 ; 23 34 .v_w2_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xmm5, xm10 vpdpwssd xmm5, xm12, xmm1 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xm13, xmm2 ; a1 b1 mova xmm2, xmm3 vpdpwssd xmm5, xm14, xmm3 ; a2 b2 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm0, 0x02 ; 7 8 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xm15, xmm3 ; a3 b3 psrad xmm5, 6 packusdw xmm5, xmm5 pminsw xmm5, xm11 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] vpbroadcastq ymm2, [srcq+ssq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm5, [srcq+ssq*2] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 psrad ymm4, 6 vextracti128 xmm5, ymm4, 1 packusdw xmm4, xmm5 pminsw xmm4, xm11 movq [dstq+dsq*0], xmm4 movhps [dstq+dsq*1], xmm4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+ssq*2] vinserti32x4 m1, m2, [srcq+ssq*0], 0 vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+ssq*0], 1 vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+ssq*1], 3 lea srcq, [srcq+ssq*2] movu xm5, [srcq+ssq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 psrad m4, 6 vextracti32x8 ym5, m4, 1 packusdw ym4, ym5 pminsw ym4, ym11 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+ssq*1] vinserti32x8 m0, m1, [srcq+ssq*0], 0 vinserti32x8 m1, [srcq+ssq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+ssq*0] vinserti32x8 m3, [srcq+ssq*1], 1 movu ym5, [srcq+ssq*2] add srcq, r6 vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 mova m9, [deint_q_shuf] vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m5, [srcq+ssq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 psrad m6, 6 psrad m7, 6 packusdw m6, m7 pminsw m6, m11 vpermq m6, m9, m6 mova [dstq+dsq*0], ym6 vextracti32x8 [dstq+dsq*1], m6, 1 lea dstq, [dstq+dsq*2] 
sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 movaps [rsp+stack_offset+8], xmm6 %endif lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq .v_w32_loop0: movu m16, [srcq+ssq*0] movu m17, [srcq+ssq*1] movu m18, [srcq+ssq*2] add srcq, r6 movu m19, [srcq+ssq*0] movu m20, [srcq+ssq*1] movu m21, [srcq+ssq*2] add srcq, r6 movu m22, [srcq+ssq*0] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+ssq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h REPX {psrad x, 6}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] %endif vzeroupper RET .hv: vpbroadcastw m11, r8m cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 test dword r8m, 0x800 jnz .hv_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_main .hv_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m8, [buf+ 4] vpbroadcastd m9, [buf+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [buf+20] vpbroadcastd ym14, [buf+24] vpbroadcastd ym15, [buf+28] movu xm4, [srcq+ssq*0] vinserti32x4 ym4, [srcq+ssq*1], 1 vinserti32x4 m4, [srcq+ssq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3 movu xm0, [srcq+ssq*1] vinserti32x4 ym0, [srcq+ssq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6 cmp wd, 4 je .hv_w4 vbroadcasti32x4 m2, [spel_h_shufA] mova m3, [spel_h_shuf2b] mova ym6, [spel_h_shuf2a] mova xm7, [spel_shuf2] mova m1, m10 pshufb m4, m2 pshufb m0, m2 punpcklqdq m2, m4, m0 vpdpwssd m1, m8, m2 ; 04 15 26 3_ punpckhqdq m4, m0 vpdpwssd m1, m9, m4 vpermb m1, m3, m1 ; 01 12 vextracti32x4 xm2, ym1, 1 ; 23 34 vextracti32x4 xm3, m1, 2 ; 45 56 .hv_w2_loop: movu xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 mova xm4, xm10 vpermb ym5, ym6, ym5 pmaddwd xmm0, xm12, xm1 ; a0 b0 vpdpwssd xm4, xm8, xm5 vextracti32x4 xm5, ym5, 1 mova xm1, xm2 vpdpwssd xmm0, xm13, xm2 ; a1 b1 vpdpwssd xm4, xm9, xm5 ; 7 8 mova xm2, xm3 vpdpwssd xmm0, xm14, xm3 ; a2 b2 vpermt2b xm3, xm7, xm4 ; 67 78 vpdpwssd xmm0, xm15, xm3 ; a3 b3 psrad xmm0, 10 packusdw xmm0, xmm0 pminsw xmm0, xm11 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, 
[dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym18, [srcq+ssq*0], 1 mova ym4, ym10 pshufb ym17, ym18, ym19 pmaddwd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 psrad ym16, 10 vextracti128 xm17, ym16, 1 packusdw xm16, xm17 pminsw xm16, xm11 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 test dword r8m, 0x800 jnz .hv_w8_12bit vpbroadcastd m10, [pd_2176] psllw xmm0, 6 jmp .hv_w8_main .hv_w8_12bit: vpbroadcastd m10, [pd_640] psllw xmm0, 4 psllw xmm1, 2 .hv_w8_main: mova [buf+ 0], xmm0 mova [buf+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [buf+ 4] vpbroadcastd m14, [buf+ 8] vpbroadcastd m15, [buf+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [buf+20] vpbroadcastd m18, [buf+24] vpbroadcastd m19, [buf+28] cmp wd, 16 je .hv_w16 jg .hv_w32 mova m5, [spel_h_shufA] movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1 movu ym9, [srcq+ssq*2] add srcq, r6 vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3 movu ym20, [srcq+ssq*1] vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+ssq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m0, [srcq+ssq*0], 1 mova m4, m10 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 pmaddwd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 psrad m20, 10 vextracti32x8 ym21, m20, 1 packusdw ym20, ym21 pminsw ym20, ym11 mova [dstq+dsq*0], xm20 vextracti128 
[dstq+dsq*1], ym20, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: WIN64_SPILL_XMM 26 vbroadcasti32x8 m5, [srcq+ssq*0+ 8] vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0 vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0 movu ym6, [srcq+ssq*1+ 0] movu ym7, [srcq+ssq*1+16] vinserti32x8 m6, [srcq+ssq*2+ 0], 1 vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+ssq*0+ 0] movu ym23, [srcq+ssq*0+16] vinserti32x8 m22, [srcq+ssq*1+ 0], 1 vinserti32x8 m23, [srcq+ssq*1+16], 1 ; 3 4 movu ym24, [srcq+ssq*2+ 0] movu ym25, [srcq+ssq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+ssq*1+ 0] movu ym25, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vinserti32x8 m24, [srcq+ssq*0+ 0], 1 vinserti32x8 m25, [srcq+ssq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 pmaddwd m22, m16, m1 ; A0 mova m1, m3 pmaddwd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 psrad m22, 10 psrad m23, 10 vshufi32x4 m0, m22, m23, q3232 vinserti32x8 m22, ym23, 1 packusdw m22, m0 pminsw m22, m11 mova [dstq+dsq*0], ym22 vextracti32x8 [dstq+dsq*1], m22, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 32 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea wd, [hq+wq*8-256] mov r7, srcq mov r8, dstq 
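; Descriptive note on the column loop that follows: widths 32/64/128 are
; handled as 32-pixel-wide column tiles. "lea wd, [hq+wq*8-256]" above packs
; the row count h into the low byte of wd and 256*(w/32-1) into the upper
; bits, so the outer loop restores h with "movzx hd, wb" and counts columns
; down with "sub wd, 1<<8". r7/r8 keep the per-column src/dst base pointers,
; advanced by 64 bytes (32 pixels at 2 bytes each) after every column.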
.hv_w32_loop0: movu m6, [srcq+ssq*0+ 0] movu m7, [srcq+ssq*0+ 8] movu m8, [srcq+ssq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, m15, m8 ; a3h %macro PUT_8TAP_HV_W32 5 ; dst_lo, dst_hi, stride_name, stride[1-2] movu m6, [srcq+%3*%4+ 0] movu m7, [srcq+%3*%4+ 8] movu m8, [srcq+%3*%4+16] %if %4 == 2 add srcq, r6 %endif movu m29, [srcq+%3*%5+ 0] movu m30, [srcq+%3*%5+ 8] movu m31, [srcq+%3*%5+16] %if %5 == 2 add srcq, r6 %endif mova m%1, m10 mova m9, m10 pshufb m%2, m6, m20 vpdpwssd m%1, m12, m%2 ; x0l pshufb m%2, m29, m20 vpdpwssd m9, m12, m%2 ; y0l pshufb m6, m21 vpdpwssd m%1, m13, m6 ; x1l pshufb m29, m21 vpdpwssd m9, m13, m29 ; y1l pshufb m6, m7, m20 mova m%2, m10 vpdpwssd m%2, m12, m6 ; x0h pshufb m29, m30, m20 vpdpwssd m%1, m14, m6 ; y2l mova m6, m10 vpdpwssd m6, m12, m29 ; x0h pshufb m7, m21 vpdpwssd m9, m14, m29 ; y2l pshufb m30, m21 vpdpwssd m%2, m13, m7 ; x1h vpdpwssd m%1, m15, m7 ; x3l pshufb m7, m8, m20 vpdpwssd m6, m13, m30 ; y1h vpdpwssd m9, m15, m30 ; y3l pshufb m30, m31, m20 vpdpwssd m%2, m14, m7 ; x2h pshufb m8, m21 vpdpwssd m6, m14, m30 ; y2h pshufb m31, m21 vpdpwssd m%2, m15, m8 ; x3h vpdpwssd m6, m15, m31 ; y3h %if %1 == 1 vpermt2b m0, m22, m%1 ; 01l vpermt2b m23, m22, m%2 ; 01h %endif vpermt2b m%1, m22, m9 ; xyl vpermt2b m%2, m22, m6 ; xyh %endmacro PUT_8TAP_HV_W32 1, 24, ssq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, ssq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, ssq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+ssq*1+ 0] movu m9, [srcq+ssq*2+ 0] movu m6, [srcq+ssq*1+ 8] movu m8, [srcq+ssq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+ssq*1+16] lea srcq, [srcq+ssq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+ssq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h pmaddwd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h pmaddwd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h pmaddwd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h pmaddwd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h REPX {psrad x, 10}, m6, m8, m7, m9 packusdw m6, m8 packusdw m7, m9 pminsw m6, m11 pminsw m7, m11 mova [dstq+dsq*0], m6 mova [dstq+dsq*1], m7 lea 
dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 jg .hv_w32_loop0 RET %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif MC_8TAP_FN prep, sharp, SHARP, SHARP MC_8TAP_FN prep, sharp_smooth, SHARP, SMOOTH MC_8TAP_FN prep, smooth_sharp, SMOOTH, SHARP MC_8TAP_FN prep, smooth, SMOOTH, SMOOTH MC_8TAP_FN prep, sharp_regular, SHARP, REGULAR MC_8TAP_FN prep, regular_sharp, REGULAR, SHARP MC_8TAP_FN prep, smooth_regular, SMOOTH, REGULAR MC_8TAP_FN prep, regular_smooth, REGULAR, SMOOTH MC_8TAP_FN prep, regular, REGULAR, REGULAR cglobal prep_8tap_16bpc, 3, 8, 16, tmp, src, stride, w, h, mx, my, stride3 %define base r7-prep_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] mov wd, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd mov r5d, r7m ; bitdepth_max vpbroadcastd m5, [pw_8192] movzx wd, word [r7+wq*2+table_offset(prep,)] shr r5d, 11 vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h_w4: movzx mxd, mxb sub srcq, 2 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m vbroadcasti32x4 m4, [spel_h_shufA] vbroadcasti32x4 m5, [spel_h_shufB] shr r5d, 11 mova ym9, [prep_endA] psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m6, [tmpq+4] vpbroadcastd m7, [tmpq+8] .h_w4_loop: movu xm2, [srcq+strideq*0] vinserti32x4 ym2, [srcq+strideq*1], 1 vinserti32x4 m2, [srcq+strideq*2], 2 vinserti32x4 m2, [srcq+r6 ], 3 lea srcq, [srcq+strideq*4] mova m0, m10 pshufb m1, m2, m4 vpdpwssd m0, m6, m1 pshufb m2, m5 vpdpwssd m0, m7, m2 vpermb m0, m9, m0 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h: test myd, 0xf00 jnz .hv vpbroadcastd m10, [prep_8tap_rnd] lea r6, [strideq*3] cmp wd, 4 je .h_w4 shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] mov r5d, r7m sub srcq, 6 shr r5d, 11 psllw xmm0, [base+prep_hv_shift+r5*8] mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] cmp wd, 16 je .h_w16 jg .h_w32 .h_w8: mova m6, [spel_h_shufA] movu m7, [spel_h_shufB] movu m8, [spel_h_shufC] mova m9, [spel_h_shufD] mova m11, [prep_endB] .h_w8_loop: movu ym4, [srcq+strideq*0] vinserti32x8 m4, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] vinserti32x8 m5, [srcq+r6 ], 1 lea srcq, [srcq+strideq*4] mova m0, m10 mova m1, m10 vpermb m2, m6, m4 vpermb m3, m6, m5 vpdpwssd m0, m12, m2 vpdpwssd m1, m12, m3 vpermb m2, m7, m4 vpermb m3, m7, m5 vpdpwssd m0, m13, m2 vpdpwssd m1, m13, m3 vpermb m2, m8, m4 vpermb m3, m8, m5 vpdpwssd m0, m14, m2 vpdpwssd m1, m14, m3 vpermb m2, m9, m4 vpermb m3, m9, m5 vpdpwssd m0, m15, m2 vpdpwssd m1, m15, m3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: vbroadcasti32x4 m6, [spel_h_shufA] vbroadcasti32x4 m7, [spel_h_shufB] mova m11, [prep_endC] .h_w16_loop: movu ym2, [srcq+strideq*0+ 0] vinserti32x8 m2, [srcq+strideq*1+ 0], 1 movu ym3, [srcq+strideq*0+16] vinserti32x8 m3, [srcq+strideq*1+16], 1 lea srcq, [srcq+strideq*2] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m14, m4 ; b2 pshufb m4, m2, m7 vpdpwssd m0, m13, m4 ; a1 pshufb m4, m3, m7 vpdpwssd m1, m15, m4 ; b3 shufpd m2, m3, 0x55 pshufb m4, m2, m6 vpdpwssd m0, m14, m4 ; a2 vpdpwssd m1, m12, m4 ; b0 pshufb m2, m7 vpdpwssd m0, m15, m2 ; a3 vpdpwssd m1, m13, m2 ; b1 vpermt2b m0, m11, m1 mova 
[tmpq], m0 add tmpq, 64 sub hd, 2 jg .h_w16_loop RET .h_w32: vbroadcasti32x4 m6, [spel_h_shufA] lea srcq, [srcq+wq*2] vbroadcasti32x4 m7, [spel_h_shufB] neg wq mova m11, [prep_endC] .h_w32_loop0: mov r6, wq .h_w32_loop: movu m2, [srcq+r6*2+ 0] movu m3, [srcq+r6*2+ 8] mova m0, m10 mova m1, m10 pshufb m4, m2, m6 vpdpwssd m0, m12, m4 ; a0 pshufb m4, m3, m6 vpdpwssd m1, m12, m4 ; b0 vpdpwssd m0, m14, m4 ; a2 movu m4, [srcq+r6*2+16] pshufb m3, m7 vpdpwssd m1, m13, m3 ; b1 vpdpwssd m0, m15, m3 ; a3 pshufb m3, m4, m6 vpdpwssd m1, m14, m3 ; b2 pshufb m2, m7 vpdpwssd m0, m13, m2 ; a1 pshufb m4, m7 vpdpwssd m1, m15, m4 ; b3 vpermt2b m0, m11, m1 mova [tmpq], m0 add tmpq, 64 add r6, 32 jl .h_w32_loop add srcq, strideq dec hd jg .h_w32_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m vpbroadcastd m10, [prep_8tap_rnd] pmovsxbw xmm0, [base+subpel_filters+myq*8] tzcnt r6d, wd shr r5d, 11 movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)] psllw xmm0, [base+prep_hv_shift+r5*8] add r7, r6 lea r6, [strideq*3] sub srcq, r6 mova [tmpq], xmm0 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] jmp r7 .v_w4: movq xmm1, [srcq+strideq*0] vpbroadcastq ymm0, [srcq+strideq*1] vpbroadcastq ymm2, [srcq+strideq*2] add srcq, r6 vpbroadcastq ymm4, [srcq+strideq*0] vpbroadcastq ymm3, [srcq+strideq*1] vpbroadcastq ymm5, [srcq+strideq*2] mova xm11, [prep_endA] add srcq, r6 vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklwd ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm2, ymm4, 0x30 vpblendd ymm4, ymm3, 0x30 punpcklwd ymm2, ymm4 ; 23 34 vpblendd ymm3, ymm5, 0x30 vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 45 56 .v_w4_loop: vpbroadcastq ymm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] mova ymm4, ym10 vpdpwssd ymm4, ym12, ymm1 ; a0 b0 mova ymm1, ymm2 vpdpwssd ymm4, ym13, ymm2 ; a1 b1 mova ymm2, ymm3 vpdpwssd ymm4, ym14, ymm3 ; a2 b2 vpblendd ymm3, ymm0, ymm5, 0x30 vpbroadcastq ymm0, [srcq+strideq*0] vpblendd ymm5, ymm0, 0x30 punpcklwd ymm3, ymm5 ; 67 78 vpdpwssd ymm4, ym15, ymm3 ; a3 b3 vpermb ymm4, ym11, ymm4 mova [tmpq], xmm4 add tmpq, 16 sub hd, 2 jg .v_w4_loop vzeroupper RET .v_w8: vbroadcasti32x4 m2, [srcq+strideq*2] vinserti32x4 m1, m2, [srcq+strideq*0], 0 vinserti32x4 m1, [srcq+strideq*1], 1 ; 0 1 2 add srcq, r6 vinserti32x4 ym2, [srcq+strideq*0], 1 vinserti32x4 m2, [srcq+strideq*1], 2 ; 2 3 4 mova m6, [spel_v_shuf8] movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 mova ym11, [prep_endB] vpermb m1, m6, m1 ; 01 12 vpermb m2, m6, m2 ; 23 34 vpermb m3, m6, m0 ; 45 56 .v_w8_loop: vinserti32x4 m0, [srcq+strideq*1], 3 lea srcq, [srcq+strideq*2] movu xm5, [srcq+strideq*0] mova m4, m10 vpdpwssd m4, m12, m1 ; a0 b0 mova m1, m2 vshufi32x4 m0, m5, q1032 ; 6 7 8 vpdpwssd m4, m13, m2 ; a1 b1 mova m2, m3 vpdpwssd m4, m14, m3 ; a2 b2 vpermb m3, m6, m0 ; 67 78 vpdpwssd m4, m15, m3 ; a3 b3 vpermb m4, m11, m4 mova [tmpq], ym4 add tmpq, 32 sub hd, 2 jg .v_w8_loop RET .v_w16: vbroadcasti32x8 m1, [srcq+strideq*1] vinserti32x8 m0, m1, [srcq+strideq*0], 0 vinserti32x8 m1, [srcq+strideq*2], 1 mova m8, [spel_v_shuf16] add srcq, r6 movu ym3, [srcq+strideq*0] vinserti32x8 m3, [srcq+strideq*1], 1 movu ym5, [srcq+strideq*2] add srcq, r6 vinserti32x8 m5, [srcq+strideq*0], 1 mova m11, [prep_endA] vpermb m0, m8, m0 ; 01 vpermb m1, m8, m1 ; 12 vpermb m3, m8, m3 ; 34 vpermb m5, m8, m5 ; 56 vpshrdd m2, m1, m3, 16 ; 23 vpshrdd m4, m3, m5, 16 ; 45 .v_w16_loop: 
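; Two output rows are filtered per iteration: m0-m5 hold interleaved row
; pairs 01/12/23/34/45/56, rows 7 and 8 are loaded and permuted into a 78
; pair, vpshrdd rebuilds the 67 pair from 56 and 78, and the pair registers
; are shifted down by two rows before the next iteration.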
mova m6, m10 mova m7, m10 vpdpwssd m6, m12, m0 ; a0 mova m0, m2 vpdpwssd m7, m12, m1 ; b0 mova m1, m3 vpdpwssd m6, m13, m2 ; a1 mova m2, m4 vpdpwssd m7, m13, m3 ; b1 mova m3, m5 vpdpwssd m6, m14, m4 ; a2 mova m4, m5 vpdpwssd m7, m14, m5 ; b2 movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m5, [srcq+strideq*0], 1 vpermb m5, m8, m5 ; 78 vpshrdd m4, m5, 16 ; 67 vpdpwssd m6, m15, m4 ; a3 vpdpwssd m7, m15, m5 ; b3 vpermt2b m6, m11, m7 mova [tmpq], m6 add tmpq, 64 sub hd, 2 jg .v_w16_loop RET .v_w32: .v_w64: .v_w128: %if WIN64 PUSH r8 movaps [rsp+stack_offset+8], xmm6 %endif lea r5, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .v_w32_loop0: movu m16, [srcq+strideq*0] movu m17, [srcq+strideq*1] movu m18, [srcq+strideq*2] add srcq, r6 movu m19, [srcq+strideq*0] movu m20, [srcq+strideq*1] movu m21, [srcq+strideq*2] add srcq, r6 movu m22, [srcq+strideq*0] mova m11, [prep_endC] punpcklwd m0, m16, m17 ; 01l punpckhwd m16, m17 ; 01h punpcklwd m1, m17, m18 ; 12l punpckhwd m17, m18 ; 12h punpcklwd m2, m18, m19 ; 23l punpckhwd m18, m19 ; 23h punpcklwd m3, m19, m20 ; 34l punpckhwd m19, m20 ; 34h punpcklwd m4, m20, m21 ; 45l punpckhwd m20, m21 ; 45h punpcklwd m5, m21, m22 ; 56l punpckhwd m21, m22 ; 56h .v_w32_loop: mova m6, m10 vpdpwssd m6, m12, m0 ; a0l mova m8, m10 vpdpwssd m8, m12, m16 ; a0h mova m7, m10 vpdpwssd m7, m12, m1 ; b0l mova m9, m10 vpdpwssd m9, m12, m17 ; b0h mova m0, m2 vpdpwssd m6, m13, m2 ; a1l mova m16, m18 vpdpwssd m8, m13, m18 ; a1h mova m1, m3 vpdpwssd m7, m13, m3 ; b1l mova m17, m19 vpdpwssd m9, m13, m19 ; b1h mova m2, m4 vpdpwssd m6, m14, m4 ; a2l mova m18, m20 vpdpwssd m8, m14, m20 ; a2h mova m3, m5 vpdpwssd m7, m14, m5 ; b2l mova m19, m21 vpdpwssd m9, m14, m21 ; b2h movu m21, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklwd m4, m22, m21 ; 67l punpckhwd m20, m22, m21 ; 67h movu m22, [srcq+strideq*0] vpdpwssd m6, m15, m4 ; a3l vpdpwssd m8, m15, m20 ; a3h punpcklwd m5, m21, m22 ; 78l punpckhwd m21, m22 ; 78h vpdpwssd m7, m15, m5 ; b3l vpdpwssd m9, m15, m21 ; b3h vpermt2b m6, m11, m8 vpermt2b m7, m11, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w32_loop add r7, 64 add r8, 64 movzx hd, r5b mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .v_w32_loop0 %if WIN64 movaps xmm6, [rsp+stack_offset+8] POP r8 %endif vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 2 shr r5d, 11 sub srcq, r6 psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd ym11, [pd_128] mova xm21, [prep_endA] mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m8, [tmpq+ 4] vpbroadcastd m9, [tmpq+ 8] vpbroadcastd ym12, xmm1 vpbroadcastd ym13, [tmpq+20] vpbroadcastd ym14, [tmpq+24] vpbroadcastd ym15, [tmpq+28] movu xm4, [srcq+strideq*0] vinserti32x4 ym4, [srcq+strideq*1], 1 vinserti32x4 m4, [srcq+strideq*2], 2 add srcq, r6 vinserti32x4 m4, [srcq+strideq*0], 3 ; 0 1 2 3 movu xm0, [srcq+strideq*1] vinserti32x4 ym0, [srcq+strideq*2], 1 add srcq, r6 vinserti32x4 m0, [srcq+strideq*0], 2 ; 4 5 6 vbroadcasti32x4 m19, [spel_h_shufA] vbroadcasti32x4 m20, [spel_h_shufB] mova ym6, [spel_shuf4a] mova ym7, [spel_shuf4b] mova m2, m10 mova m3, m10 pshufb m1, m4, m19 vpdpwssd m2, m8, m1 pshufb m1, m0, m19 vpdpwssd m3, m8, m1 pshufb m4, m20 vpdpwssd m2, m9, m4 pshufb m0, m20 vpdpwssd m3, m9, m0 vpermb m1, m6, m2 ; 01 12 vshufi32x4 m2, m3, q1032 vpermb m3, m6, 
m3 ; 45 56 vpermb m2, m6, m2 ; 23 34 .hv_w4_loop: movu xm18, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti128 ym18, [srcq+strideq*0], 1 mova ym16, ym11 mova ym4, ym10 pshufb ym17, ym18, ym19 vpdpwssd ym16, ym12, ym1 ; a0 b0 vpdpwssd ym4, ym8, ym17 pshufb ym18, ym20 mova ym1, ym2 vpdpwssd ym16, ym13, ym2 ; a1 b1 vpdpwssd ym4, ym9, ym18 ; 7 8 mova ym2, ym3 vpdpwssd ym16, ym14, ym3 ; a2 b2 vpermt2b ym3, ym7, ym4 ; 67 78 vpdpwssd ym16, ym15, ym3 ; a3 b3 vpermb ym16, ym21, ym16 mova [tmpq], xm16 add tmpq, 16 sub hd, 2 jg .hv_w4_loop vzeroupper RET .hv_w8: shr mxd, 16 pmovsxbw xmm0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd mov r5d, r7m pmovsxbw xmm1, [base+subpel_filters+myq*8] lea r6, [strideq*3] sub srcq, 6 shr r5d, 11 sub srcq, r6 vpbroadcastd m10, [prep_8tap_rnd] vpbroadcastd m11, [pd_128] psllw xmm0, [base+prep_hv_shift+r5*8] psllw xmm1, 2 mova [tmpq+ 0], xmm0 mova [tmpq+16], xmm1 vpbroadcastd m12, xmm0 vpbroadcastd m13, [tmpq+ 4] vpbroadcastd m14, [tmpq+ 8] vpbroadcastd m15, [tmpq+12] vpbroadcastd m16, xmm1 vpbroadcastd m17, [tmpq+20] vpbroadcastd m18, [tmpq+24] vpbroadcastd m19, [tmpq+28] cmp wd, 16 je .hv_w16 jg .hv_w32 WIN64_SPILL_XMM 23 mova m5, [spel_h_shufA] movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1 movu ym9, [srcq+strideq*2] add srcq, r6 vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3 movu ym20, [srcq+strideq*1] vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5 add srcq, r6 movu ym21, [srcq+strideq*0] ; 6 movu m6, [spel_h_shufB] movu m7, [spel_h_shufC] mova ym22, [prep_endB] vpermb m8, m5, m0 mova m1, m10 vpdpwssd m1, m12, m8 ; a0 b0 vpermb m8, m5, m9 mova m2, m10 vpdpwssd m2, m12, m8 ; c0 d0 vpermb m8, m5, m20 mova m3, m10 vpdpwssd m3, m12, m8 ; e0 f0 vpermb m8, m5, m21 mova m4, m10 vpdpwssd m4, m12, m8 ; g0 vpermb m8, m6, m0 vpdpwssd m1, m13, m8 ; a1 b1 vpermb m8, m6, m9 vpdpwssd m2, m13, m8 ; c1 d1 vpermb m8, m6, m20 vpdpwssd m3, m13, m8 ; e1 f1 vpermb m8, m6, m21 vpdpwssd m4, m13, m8 ; g1 vpermb m8, m7, m0 vpdpwssd m1, m14, m8 ; a2 b2 vpermb m8, m7, m9 vpdpwssd m2, m14, m8 ; c2 d2 vpermb m8, m7, m20 vpdpwssd m3, m14, m8 ; e2 f2 vpermb m8, m7, m21 vpdpwssd m4, m14, m8 ; g2 mova m8, [spel_h_shufD] vpermb m0, m8, m0 vpdpwssd m1, m15, m0 ; a3 b3 mova m0, [spel_shuf8a] vpermb m9, m8, m9 vpdpwssd m2, m15, m9 ; c3 d3 mova m9, [spel_shuf8b] vpermb m20, m8, m20 vpdpwssd m3, m15, m20 ; e3 f3 vpermb m21, m8, m21 vpdpwssd m4, m15, m21 ; g3 vpermt2b m1, m0, m2 ; 01 12 vpermt2b m2, m0, m3 ; 23 34 vpermt2b m3, m0, m4 ; 45 56 .hv_w8_loop: movu ym0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*0], 1 mova m4, m10 mova m20, m11 vpermb m21, m5, m0 vpdpwssd m4, m12, m21 ; h0 i0 vpermb m21, m6, m0 vpdpwssd m20, m16, m1 ; A0 B0 vpdpwssd m4, m13, m21 ; h1 i1 vpermb m21, m7, m0 mova m1, m2 vpdpwssd m20, m17, m2 ; A1 B1 vpdpwssd m4, m14, m21 ; h2 i2 vpermb m21, m8, m0 mova m2, m3 vpdpwssd m20, m18, m3 ; A2 B2 vpdpwssd m4, m15, m21 ; h3 i3 vpermt2b m3, m9, m4 ; 67 78 vpdpwssd m20, m19, m3 ; A3 B3 vpermb m20, m22, m20 mova [tmpq], ym20 add tmpq, 32 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 27 vbroadcasti32x8 m5, [srcq+strideq*0+ 8] vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0 vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0 movu ym6, [srcq+strideq*1+ 0] movu ym7, [srcq+strideq*1+16] vinserti32x8 m6, [srcq+strideq*2+ 0], 1 vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2 add srcq, r6 movu ym22, [srcq+strideq*0+ 0] movu ym23, [srcq+strideq*0+16] vinserti32x8 
m22, [srcq+strideq*1+ 0], 1 vinserti32x8 m23, [srcq+strideq*1+16], 1 ; 3 4 movu ym24, [srcq+strideq*2+ 0] movu ym25, [srcq+strideq*2+16] add srcq, r6 vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 ; 5 6 vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m9, [spel_shuf16] mova m26, [prep_endB] pshufb m0, m4, m20 mova m1, m10 vpdpwssd m1, m12, m0 ; a0 pshufb m0, m6, m20 mova m2, m10 vpdpwssd m2, m12, m0 ; b0 pshufb m0, m7, m20 mova m3, m10 vpdpwssd m3, m14, m0 ; c2 pshufb m0, m4, m21 vpdpwssd m1, m13, m0 ; a1 pshufb m0, m6, m21 vpdpwssd m2, m13, m0 ; b1 pshufb m0, m7, m21 vpdpwssd m3, m15, m0 ; c3 pshufb m0, m5, m20 vpdpwssd m1, m14, m0 ; a2 shufpd m6, m7, 0x55 pshufb m7, m6, m20 vpdpwssd m2, m14, m7 ; b2 vpdpwssd m3, m12, m7 ; c0 pshufb m5, m21 vpdpwssd m1, m15, m5 ; a3 pshufb m6, m21 vpdpwssd m2, m15, m6 ; b3 vpdpwssd m3, m13, m6 ; c1 pshufb m0, m22, m20 mova m4, m10 vpdpwssd m4, m12, m0 ; d0 pshufb m0, m23, m20 mova m5, m10 vpdpwssd m5, m14, m0 ; e2 pshufb m0, m24, m20 mova m6, m10 vpdpwssd m6, m12, m0 ; f0 pshufb m0, m25, m20 mova m7, m10 vpdpwssd m7, m14, m0 ; g2 pshufb m0, m22, m21 vpdpwssd m4, m13, m0 ; d1 pshufb m0, m23, m21 vpdpwssd m5, m15, m0 ; e3 pshufb m0, m24, m21 vpdpwssd m6, m13, m0 ; f1 pshufb m0, m25, m21 vpdpwssd m7, m15, m0 ; g3 shufpd m22, m23, 0x55 pshufb m23, m22, m20 vpdpwssd m4, m14, m23 ; d2 vpdpwssd m5, m12, m23 ; e0 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m6, m14, m25 ; f2 vpdpwssd m7, m12, m25 ; g0 pshufb m22, m21 vpdpwssd m4, m15, m22 ; d3 vpdpwssd m5, m13, m22 ; e1 pshufb m24, m21 vpdpwssd m6, m15, m24 ; f3 vpdpwssd m7, m13, m24 ; g1 pslldq m1, 1 vpermt2b m2, m9, m3 ; 12 vpermt2b m4, m9, m5 ; 34 vpermt2b m6, m9, m7 ; 56 vpshrdd m1, m2, 16 ; 01 vpshrdd m3, m2, m4, 16 ; 23 vpshrdd m5, m4, m6, 16 ; 45 .hv_w16_loop: movu ym24, [srcq+strideq*1+ 0] movu ym25, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vinserti32x8 m24, [srcq+strideq*0+ 0], 1 vinserti32x8 m25, [srcq+strideq*0+16], 1 mova m7, m10 mova m8, m10 pshufb m0, m24, m20 vpdpwssd m7, m12, m0 ; h0 mova m22, m11 pshufb m0, m25, m20 vpdpwssd m8, m14, m0 ; i2 mova m23, m11 vpdpwssd m22, m16, m1 ; A0 mova m1, m3 vpdpwssd m23, m16, m2 ; B0 mova m2, m4 pshufb m0, m24, m21 vpdpwssd m7, m13, m0 ; h1 pshufb m0, m25, m21 vpdpwssd m8, m15, m0 ; i3 vpdpwssd m22, m17, m3 ; A1 mova m3, m5 vpdpwssd m23, m17, m4 ; B1 mova m4, m6 shufpd m24, m25, 0x55 pshufb m25, m24, m20 vpdpwssd m7, m14, m25 ; h2 vpdpwssd m8, m12, m25 ; i0 vpdpwssd m22, m18, m5 ; A2 vpdpwssd m23, m18, m6 ; B2 pshufb m24, m21 vpdpwssd m7, m15, m24 ; h3 vpdpwssd m8, m13, m24 ; i1 vpermt2b m7, m9, m8 ; 78 vpshrdd m5, m6, m7, 16 ; 67 vpdpwssd m22, m19, m5 ; A3 vpdpwssd m23, m19, m7 ; B3 mova m6, m7 vpermt2b m22, m26, m23 mova [tmpq], m22 add tmpq, 64 sub hd, 2 jg .hv_w16_loop RET .hv_w32: %if WIN64 %assign stack_offset stack_offset - stack_size_padded PUSH r8 %assign regs_used regs_used + 1 WIN64_SPILL_XMM 32 %endif vbroadcasti32x4 m20, [spel_h_shufA] vbroadcasti32x4 m21, [spel_h_shufB] mova m22, [spel_shuf32] lea r5d, [hq+wq*8-256] mov r7, srcq mov r8, tmpq .hv_w32_loop0: movu m6, [srcq+strideq*0+ 0] movu m7, [srcq+strideq*0+ 8] movu m8, [srcq+strideq*0+16] mova m0, m10 mova m23, m10 pshufb m9, m6, m20 vpdpwssd m0, m12, m9 ; a0l pshufb m9, m7, m20 vpdpwssd m23, m12, m9 ; a0h vpdpwssd m0, m14, m9 ; a2l pshufb m7, m21 vpdpwssd m23, m13, m7 ; a1h vpdpwssd m0, m15, m7 ; a3l pshufb m7, m8, m20 vpdpwssd m23, m14, m7 ; a2h pshufb m6, m21 vpdpwssd m0, m13, m6 ; a1l pshufb m8, m21 vpdpwssd m23, 
m15, m8 ; a3h PUT_8TAP_HV_W32 1, 24, strideq, 1, 2 ; 12 PUT_8TAP_HV_W32 3, 26, strideq, 0, 1 ; 34 PUT_8TAP_HV_W32 5, 28, strideq, 2, 0 ; 56 vpshrdd m2, m1, m3, 16 ; 23l vpshrdd m25, m24, m26, 16 ; 23h vpshrdd m4, m3, m5, 16 ; 45l vpshrdd m27, m26, m28, 16 ; 45h .hv_w32_loop: movu m7, [srcq+strideq*1+ 0] movu m9, [srcq+strideq*2+ 0] movu m6, [srcq+strideq*1+ 8] movu m8, [srcq+strideq*2+ 8] mova m29, m10 mova m31, m10 pshufb m30, m7, m20 vpdpwssd m29, m12, m30 ; h0l pshufb m30, m9, m20 vpdpwssd m31, m12, m30 ; i0l pshufb m7, m21 vpdpwssd m29, m13, m7 ; h1l pshufb m9, m21 vpdpwssd m31, m13, m9 ; i1l pshufb m7, m6, m20 vpdpwssd m29, m14, m7 ; h2l pshufb m9, m8, m20 vpdpwssd m31, m14, m9 ; i2l pshufb m6, m21 vpdpwssd m29, m15, m6 ; h3l pshufb m8, m21 vpdpwssd m31, m15, m8 ; i3l mova m30, m10 vpdpwssd m30, m12, m7 ; h0h movu m7, [srcq+strideq*1+16] lea srcq, [srcq+strideq*2] vpermt2b m29, m22, m31 ; 78l mova m31, m10 vpdpwssd m31, m12, m9 ; i0h movu m9, [srcq+strideq*0+16] vpdpwssd m30, m13, m6 ; h1h pshufb m6, m7, m20 vpdpwssd m31, m13, m8 ; i1h pshufb m8, m9, m20 vpdpwssd m30, m14, m6 ; h2h mova m6, m11 vpdpwssd m6, m16, m0 ; A0l pshufb m7, m21 vpdpwssd m31, m14, m8 ; i2h mova m8, m11 vpdpwssd m8, m16, m23 ; A0h pshufb m9, m21 vpdpwssd m30, m15, m7 ; h3h mova m7, m11 vpdpwssd m7, m16, m1 ; B0l vpdpwssd m31, m15, m9 ; i3h mova m9, m11 vpdpwssd m9, m16, m24 ; B0h mova m0, m2 vpdpwssd m6, m17, m2 ; A1l mova m23, m25 vpdpwssd m8, m17, m25 ; A1h mova m1, m3 vpdpwssd m7, m17, m3 ; B1l mova m24, m26 vpdpwssd m9, m17, m26 ; B1h vpermt2b m30, m22, m31 ; 78h mova m31, [prep_endC] vpdpwssd m6, m18, m4 ; A2l mova m2, m4 vpdpwssd m8, m18, m27 ; A2h mova m25, m27 vpdpwssd m7, m18, m5 ; B2l mova m3, m5 vpdpwssd m9, m18, m28 ; B2h mova m26, m28 vpshrdd m4, m5, m29, 16 ; 67l vpdpwssd m6, m19, m4 ; A3l vpshrdd m27, m28, m30, 16 ; 67h vpdpwssd m8, m19, m27 ; A3h mova m5, m29 vpdpwssd m7, m19, m29 ; B3l mova m28, m30 vpdpwssd m9, m19, m30 ; B3h vpermt2b m6, m31, m8 vpermt2b m7, m31, m9 mova [tmpq+wq*0], m6 mova [tmpq+wq*2], m7 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w32_loop add r7, 64 add r8, 64 movzx hd, r5b mov srcq, r7 mov tmpq, r8 sub r5d, 1<<8 jg .hv_w32_loop0 RET %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts %define base r6-pd_0to7 mov t0d, r7m lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m8, [base+warp_8x8t_rnd_v] vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main psrad m14, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m16, 15 packssdw m14, m16 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 psrad m15, m16, 15 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2 add tsq, tsq psrad m16, 15 packssdw m15, m16 jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd mov t0d, r7m ; pixel_max lea r6, [pd_0to7] shr t0d, 11 vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4] vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4] call .main psrad m14, m16, 13 call .main2 psrad m16, 13 packusdw m14, m16 call .main2 psrad m15, m16, 13 call .main2 vpbroadcastd m0, [base+bidir_shift+t0*4] vpsrlvw m14, m0 psrad m16, 13 packusdw m15, m16 vpsrlvw m15, m0 .end: mova m0, [base+warp8x8_end] vpermb m16, m0, m14 lea r2, [dsq*3] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 vpermb m16, m0, m15 lea 
dstq, [dstq+dsq*4] mova [dstq+dsq*0], xm16 vextracti128 [dstq+dsq*1], ym16, 1 vextracti32x4 [dstq+dsq*2], m16, 2 vextracti32x4 [dstq+r2 ], m16, 3 RET .main: vpbroadcastd ym3, [base+pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym3, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym3, r7m {1to8} ; my mova ym16, [base+pd_0to7] vpbroadcastd ym19, [abcdq+4*0] ; alpha vpbroadcastd ym21, [abcdq+4*1] ; gamma lea r4, [ssq*3+6] vpdpwssd ym18, ym19, ym16 ; tmx vpdpwssd ym20, ym21, ym16 ; tmy sub srcq, r4 mova m10, [base+warp8x8_permA] lea r4, [mc_warp_filter+64*8] vbroadcasti32x4 m12, [base+warp8x8_permC] kxnorb k1, k1, k1 vbroadcasti32x4 m13, [base+warp8x8_permD] movu ym5, [srcq+0] vinserti32x8 m5, [srcq+8], 1 psrad ym17, ym18, 10 mova m11, [base+warp8x8_permB] kmovb k2, k1 vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0 psrad ym19, 16 ; beta psrad ym21, 16 ; delta paddd ym18, ym19 vpermb m4, m10, m5 vpbroadcastq m9, [base+warp_shift_h+t0*8] pshufd m3, m3, q3120 paddd m7, m1, m1 pshufb m2, m3, m12 vpdpwssd m1, m4, m2 vpermb m5, m11, m5 vshufi32x4 m4, m5, q1021 pshufb m3, m13 vpdpwssd m1, m4, m3 call .h psllq m2, m1, 32 paddd m1, m2 vpmultishiftqb m1, m9, m1 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym6, ym20, 10 kmovb k1, k2 paddd ym17, ym20, ym21 ; my += delta vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1 shufps m5, m20, m6, q2020 mova m16, m8 pshufb m4, m5, m12 vpdpwssd m16, m1, m4 ; a0 b0 pshufb m5, m13 mova m1, m2 vpdpwssd m16, m2, m5 ; a1 b1 shufps m6, m20, m6, q3131 paddd ym20, ym17, ym21 pshufb m4, m6, m12 mova m2, m3 vpdpwssd m16, m3, m4 ; a2 b2 vpshrdq m3, m0, 48 ; 67 78 pshufb m6, m13 vpdpwssd m16, m3, m6 ; a3 b3 ret ALIGN function_align .h: movu ym16, [srcq+ssq*1] psrad ym6, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x8 m5, m16, [srcq+ssq*0], 1 kmovb k1, k2 paddd ym17, ym18, ym19 ; mx += beta vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1 psrad ym16, ym17, 10 kmovb k2, k1 vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2 vpermb m4, m10, m5 shufps m16, m18, m6, q2020 shufps m6, m18, m6, q3131 mova m0, m7 pshufb m18, m16, m12 vpdpwssd m0, m4, m18 ; a0 b0 vpermb m5, m11, m5 pshufb m18, m6, m13 vpdpwssd m0, m5, m18 ; a3 b3 paddd ym18, ym17, ym19 vshufi32x4 m17, m4, m5, q1021 pshufb m16, m13 vpdpwssd m0, m17, m16 ; a1 b1 vshufi32x4 m4, m5, q2132 pshufb m6, m12 vpdpwssd m0, m4, m6 ; a2 b2 vpmultishiftqb m0, m9, m0 ; a a b b ret %macro BIDIR_FN 0 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq ], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm0, ym1, 1 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 vextracti32x4 xm0, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq ], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova 
[dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET %endmacro %if WIN64 DECLARE_REG_TMP 5 %else DECLARE_REG_TMP 7 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 vpbroadcastd m2, [base+avg_round+t0*4] vpbroadcastd m3, [base+avg_shift+t0*4] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+64*0] paddsw m0, [tmp2q+64*0] mova m1, [tmp1q+64*1] paddsw m1, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 vpsrlvw m0, m3 vpsrlvw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] vpbroadcastd m5, [base+w_avg_round+t0*4] vpbroadcastd m7, [base+bidir_shift+t0*4] add wq, r6 mov r6d, r6m ; weight lea t0d, [r6-16] shl r6d, 16 sub r6d, t0d ; 16-weight, weight movifnidn hd, hm vpbroadcastd m6, r6d BIDIR_FN ALIGN function_align .main: mova m3, [tmp1q+64*0] mova m1, [tmp2q+64*0] mova m0, [tmp1q+64*1] mova m4, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m2, m1, m3 punpckhwd m1, m3 punpcklwd m3, m4, m0 punpckhwd m4, m0 mova m0, m5 vpdpwssd m0, m6, m2 mova m2, m5 vpdpwssd m2, m6, m1 mova m1, m5 vpdpwssd m1, m6, m3 mova m3, m5 vpdpwssd m3, m6, m4 REPX {psrad x, 2}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m7 vpsrlvw m1, m7 ret cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_64] vpbroadcastd m9, [base+mask_round+r6*4] vpbroadcastd m10, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 BIDIR_FN ALIGN function_align .main: pmovzxbw m1, [maskq+32*0] mova m4, [tmp1q+64*0] mova m2, [tmp2q+64*0] pmovzxbw m6, [maskq+32*1] mova m5, [tmp1q+64*1] mova m3, [tmp2q+64*1] add maskq, 32*2 add tmp1q, 64*2 add tmp2q, 64*2 punpcklwd m7, m4, m2 punpckhwd m4, m2 psubw m0, m8, m1 punpcklwd m2, m1, m0 ; m, 64-m punpckhwd m1, m0 mova m0, m9 vpdpwssd m0, m7, m2 mova m2, m9 vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m) punpcklwd m7, m5, m3 punpckhwd m5, m3 psubw m1, m8, m6 punpcklwd m3, m6, m1 punpckhwd m6, m1 mova m1, m9 vpdpwssd m1, m7, m3 mova m3, m9 vpdpwssd m3, m5, m6 REPX {psrad x, 4}, m0, m2, m1, m3 packusdw m0, m2 packusdw m1, m3 vpsrlvw m0, m10 vpsrlvw m1, m10 ret cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max 
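; pixel_max is 1023 (10-bit) or 4095 (12-bit); shifting it right by 11 below
; yields 0 or 1, which selects the per-bitdepth entries of mask_round and
; bidir_shift loaded into m12/m13.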
movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m11, [base+pw_64] vpbroadcastd m12, [base+mask_round+r6*4] vpbroadcastd m13, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m14, [base+w_mask_round+r6*4] mova ym15, [w_mask_end42x] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: mova m4, [w_mask_shuf4] vpermt2b m2, m4, m3 mova m3, m14 vpdpbusd m3, m2, [pb_64] {1to16} vpermb m3, m15, m3 movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 mova [maskq], xm3 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8: mova m8, [w_mask_shuf8] vpbroadcastd m9, [pb_64] jmp .w8_start .w8_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w8_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 mova [maskq], xm3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16: mova m8, [w_mask_shuf16] vpbroadcastd m9, [pb_64] jmp .w16_start .w16_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 16 .w16_start: vpermt2b m2, m8, m3 mova m3, m14 vpdpbusd m3, m2, m9 vpermb m3, m15, m3 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 mova [maskq], xm3 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*4] add maskq, 32 .w32: paddw m2, m3 mova m8, m14 vpdpwssd m8, m11, m2 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 call .main paddw m2, m3 mova m3, m14 vpdpwssd m3, m11, m2 vpermt2b m8, m15, m3 mova [dstq+strideq*2], m0 mova [dstq+stride3q ], m1 mova [maskq], ym8 sub hd, 4 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 32 .w64: mova m8, m2 mova m9, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main paddw m8, m2 paddw m9, m3 mova m2, m14 vpdpwssd m2, m11, m8 mova m3, m14 vpdpwssd m3, m11, m9 vpermt2b m2, m15, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 mova [maskq], ym2 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 64 .w128: mova m16, m2 mova m8, m3 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*0+64*1], m1 call .main mova m17, m2 mova m9, m3 mova [dstq+strideq*0+64*2], m0 mova [dstq+strideq*0+64*3], m1 call .main paddw m2, m16 paddw m3, m8 mova m16, m14 vpdpwssd m16, m11, m2 mova m8, m14 vpdpwssd m8, m11, m3 mova [dstq+strideq*1+64*0], m0 mova [dstq+strideq*1+64*1], m1 call .main paddw m2, m17 paddw m3, m9 mova m17, m14 vpdpwssd m17, m11, m2 mova m9, m14 vpdpwssd m9, m11, m3 vpermt2b m16, m15, m8 vpermt2b m17, m15, m9 mova 
[dstq+strideq*1+64*2], m0 mova [dstq+strideq*1+64*3], m1 mova [maskq+32*0], ym16 mova [maskq+32*1], ym17 sub hd, 2 jg .w128_loop vzeroupper RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m10, m6 psrlw m6, 10 ; 64-m psubw m2, m11, m6 ; m punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m12 vpdpwssd m0, m5, m1 mova m1, m12 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m10, m5 psrlw m5, 10 psubw m3, m11, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m12 vpdpwssd m1, m6, m4 mova m4, m12 vpdpwssd m4, m7, m5 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m13 vpsrlvw m1, m13 ret cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx512icl_table lea r7, [w_mask_422_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] vpbroadcastd m11, [base+bidir_shift+r6*4] mov r6d, r7m ; sign vpbroadcastd m12, [base+w_mask_round+r6*4] mova ym13, [w_mask_end42x] mov maskq, maskmp add wq, r7 paddw m14, m9, m9 ; pw_128 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 
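; At this point m6 holds the blend weight 64-m = max(27615 - |tmp1-tmp2|, 0) >> 10
; (range 0..26, so m lies in 38..64) and m2 holds m; the (64-m, m) word pairs
; interleaved into m1/m6 feed the vpdpwssd blend tmp1*m + tmp2*(64-m) below.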
mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 mova m5, m12 vpdpwssd m5, m14, m2 mova m2, m12 vpdpwssd m2, m14, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpermt2b m5, m13, m2 vpsrlvw m0, m11 vpsrlvw m1, m11 mova [maskq], ym5 add maskq, 32 ret cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx512icl_table lea r7, [w_mask_444_avx512icl_table] tzcnt wd, wm mov r6d, r8m ; pixel_max movifnidn hd, hm shr r6d, 11 movsxd wq, [r7+wq*4] vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 vpbroadcastd m9, [base+pw_64] vpbroadcastd m10, [base+mask_round+r6*4] mova m11, [w_mask_end444] vpbroadcastd m12, [base+bidir_shift+r6*4] mov maskq, maskmp add wq, r7 call .main lea stride3q, [strideq*3] jmp wq .w4: movq [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm0 vextracti32x4 xm2, ym0, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 cmp hd, 8 jl .w4_end vextracti32x4 xm2, m0, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm0, m0, 3 movq [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm0 je .w4_end lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm1 movhps [dstq+strideq*1], xm1 vextracti32x4 xm2, ym1, 1 movq [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm2 vextracti32x4 xm2, m1, 2 lea dstq, [dstq+strideq*4] movq [dstq+strideq*0], xm2 movhps [dstq+strideq*1], xm2 vextracti32x4 xm1, m1, 3 movq [dstq+strideq*2], xm1 movhps [dstq+stride3q ], xm1 .w4_end: RET .w8_loop: call .main lea dstq, [dstq+strideq*4] .w8: mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], ym0, 1 vextracti32x4 [dstq+strideq*2], m0, 2 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 8 jl .w8_end lea dstq, [dstq+strideq*4] mova [dstq+strideq*0], xm1 vextracti32x4 [dstq+strideq*1], ym1, 1 vextracti32x4 [dstq+strideq*2], m1, 2 vextracti32x4 [dstq+stride3q ], m1, 3 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*4] .w16: mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 mova [dstq+strideq*2], ym1 vextracti32x8 [dstq+stride3q ], m1, 1 sub hd, 4 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] .w32: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+64*0], m0 mova [dstq+64*1], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+64*0], m0 mova [dstq+64*1], m1 call .main mova [dstq+64*2], m0 mova [dstq+64*3], m1 dec hd jg .w128_loop RET ALIGN function_align .main: mova m1, [tmp1q+64*0] mova m3, [tmp2q+64*0] mova m4, [tmp1q+64*1] mova m7, [tmp2q+64*1] add tmp1q, 64*2 add tmp2q, 64*2 psubsw m6, m1, m3 punpcklwd m5, m3, m1 pabsw m6, m6 punpckhwd m3, m1 psubusw m6, m8, m6 psrlw m6, 10 psubw m2, m9, m6 punpcklwd m1, m6, m2 punpckhwd m6, m2 mova m0, m10 vpdpwssd m0, m5, m1 mova m1, m10 vpdpwssd m1, m3, m6 psubsw m5, m4, m7 punpcklwd m6, m7, m4 pabsw m5, m5 punpckhwd m7, m4 psubusw m5, m8, m5 psrlw m5, 10 psubw m3, m9, m5 punpcklwd m4, m5, m3 psrad m0, 4 punpckhwd m5, m3 psrad m1, 4 packusdw m0, m1 mova m1, m10 vpdpwssd m1, m6, m4 mova m4, m10 vpdpwssd m4, m7, m5 vpermt2b m2, m11, m3 psrad m1, 4 psrad m4, 4 packusdw m1, m4 vpsrlvw m0, m12 vpsrlvw m1, m12 mova 
[maskq], m2 add maskq, 64 ret cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx512icl_table lea r6, [blend_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp vpbroadcastd m6, [base+pw_m512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: pmovzxbw ym19, [maskq] movq xm16, [dstq+dsq*0] movhps xm16, [dstq+dsq*1] vpbroadcastq ym17, [dstq+dsq*2] vpbroadcastq ym18, [dstq+r6 ] pmullw ym19, ym6 vpblendd ym16, ym17, 0x30 vpblendd ym16, ym18, 0xc0 psubw ym17, ym16, [tmpq] add maskq, 16 add tmpq, 32 pmulhrsw ym17, ym19 paddw ym16, ym17 vextracti128 xm17, ym16, 1 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 movq [dstq+dsq*2], xm17 movhps [dstq+r6 ], xm17 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 vzeroupper RET .w8: pmovzxbw m2, [maskq] mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vinserti32x4 m0, [dstq+dsq*2], 2 vinserti32x4 m0, [dstq+r6 ], 3 pmullw m2, m6 psubw m1, m0, [tmpq] add maskq, 32 add tmpq, 64 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 vextracti32x4 [dstq+dsq*2], m0, 2 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET .w16: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 mova ym1, [dstq+dsq*2] vinserti32x8 m1, [dstq+r6 ], 1 pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+64*0] psubw m3, m1, [tmpq+64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 mova [dstq+dsq*2], ym1 vextracti32x8 [dstq+r6 ], m1, 1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w16 RET .w32: pmovzxbw m4, [maskq+32*0] pmovzxbw m5, [maskq+32*1] mova m0, [dstq+dsq*0] mova m1, [dstq+dsq*1] pmullw m4, m6 pmullw m5, m6 psubw m2, m0, [tmpq+ 64*0] psubw m3, m1, [tmpq+ 64*1] add maskq, 32*2 add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32 RET cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h lea r5, [blend_v_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: vpbroadcastd xmm2, [obmc_masks_avx2+2*2] .w2_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movq xmm1, [tmpq] add tmpq, 4*2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_loop RET .w4: vpbroadcastq xmm2, [obmc_masks_avx2+4*2] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2] .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 psubw ym1, ym0, [tmpq] add tmpq, 16*2 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: vbroadcasti32x8 m2, [obmc_masks_avx2+16*2] .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 psubw m1, m0, [tmpq] add tmpq, 32*2 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: mova m4, [obmc_masks_avx2+32*2] .w32_loop: mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw 
m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop RET cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask %define base r6-$$ lea r6, [$$] tzcnt wd, wm mov hd, hm movsxd wq, [base+blend_h_avx512icl_table+wq*4] lea maskq, [base+obmc_masks_avx2+hq*2] lea hd, [hq*3] lea wq, [base+blend_h_avx512icl_table+wq] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movq xmm1, [tmpq] add tmpq, 4*2 punpcklwd xmm2, xmm2 psubw xmm1, xmm0, xmm1 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova xmm3, [blend_shuf] .w4_loop: movq xmm0, [dstq+dsq*0] movhps xmm0, [dstq+dsq*1] movd xmm2, [maskq+hq*2] psubw xmm1, xmm0, [tmpq] add tmpq, 8*2 pshufb xmm2, xmm3 pmulhrsw xmm1, xmm2 paddw xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: vbroadcasti32x4 ym3, [blend_shuf] shufpd ym3, ym3, 0x0c .w8_loop: mova xm0, [dstq+dsq*0] vinserti32x4 ym0, [dstq+dsq*1], 1 vpbroadcastd ym2, [maskq+hq*2] psubw ym1, ym0, [tmpq] add tmpq, 16*2 pshufb ym2, ym3 pmulhrsw ym1, ym2 paddw ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET .w16: vbroadcasti32x4 m3, [blend_shuf] shufpd m3, m3, 0xf0 .w16_loop: mova ym0, [dstq+dsq*0] vinserti32x8 m0, [dstq+dsq*1], 1 vpbroadcastd m2, [maskq+hq*2] psubw m1, m0, [tmpq] add tmpq, 32*2 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET .w32: vpbroadcastw m4, [maskq+hq*2] vpbroadcastw m5, [maskq+hq*2+2] mova m0, [dstq+dsq*0] psubw m2, m0, [tmpq+ 64*0] mova m1, [dstq+dsq*1] psubw m3, m1, [tmpq+ 64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w32 RET .w64: vpbroadcastw m4, [maskq+hq*2] mova m0, [dstq+64*0] psubw m2, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m3, m1, [tmpq+64*1] add tmpq, 64*2 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m8, [maskq+hq*2] mova m0, [dstq+64*0] psubw m4, m0, [tmpq+64*0] mova m1, [dstq+64*1] psubw m5, m1, [tmpq+64*1] mova m2, [dstq+64*2] psubw m6, m2, [tmpq+64*2] mova m3, [dstq+64*3] psubw m7, m3, [tmpq+64*3] add tmpq, 64*4 REPX {pmulhrsw x, m8}, m4, m5, m6, m7 paddw m0, m4 paddw m1, m5 paddw m2, m6 paddw m3, m7 mova [dstq+64*0], m0 mova [dstq+64*1], m1 mova [dstq+64*2], m2 mova [dstq+64*3], m3 add dstq, dsq inc hq jl .w128 RET cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax sub dword mx0m, 4<<14 sub dword src_wm, 8 mov r6, ~0 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm kmovq k6, r6 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pd_16384] vpbroadcastd m7, [base+pd_63] mova m24, [base+resize_permA] mova m25, [base+resize_permB] mova m26, [base+resize_permC] mova m27, [base+resize_permD] vbroadcasti32x4 m28, [base+resize_shufA] vbroadcasti32x4 m29, [base+resize_shufB] mova m30, [base+resize_permE] vpbroadcastw ym31, pxmaxm vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] pslld m5, 4 ; dx*16 pslld m6, 14 pxor m2, m2 .loop_y: xor xd, xd mova 
m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset vptestmd k5, m1, m1 pand m9, m7 ; filter offset (masked) ktestw k5, k5 jz .load vpbroadcastq m14, [base+pd_0_4] vpermq m10, m0, q1100 vpermq m11, m0, q3322 vpermq m20, m1, q1100 vpermq m21, m1, q3322 punpckldq m10, m10 punpckldq m11, m11 punpckldq m20, m20 punpckldq m21, m21 paddd m10, m14 paddd m11, m14 paddd m20, m14 paddd m21, m14 vextracti32x8 ym12, m10, 1 vextracti32x8 ym13, m11, 1 vextracti32x8 ym22, m20, 1 vextracti32x8 ym23, m21, 1 kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3 vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7 vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2] vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2] vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2] vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2] pshufb m16, m0 pshufb m17, m1 pshufb m18, m14 pshufb m19, m15 mova m20, m24 mova m22, m24 mova m21, m25 mova m23, m25 vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd mova m15, m26 mova m17, m26 mova m16, m27 mova m18, m27 vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd kmovq k1, k6 kmovq k2, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 jmp .filter .load: kmovq k1, k6 kmovq k2, k6 kmovq k3, k6 kmovq k4, k6 vpgatherdd m11{k1}, [base+resize_filter+m9*8+0] vpgatherdd m13{k2}, [base+resize_filter+m9*8+4] pshufb m10, m11, m28 pshufb m11, m11, m29 pshufb m12, m13, m28 pshufb m13, m13, m29 vpgatherdd m15{k3}, [srcq+m0*2+ 0] vpgatherdd m16{k4}, [srcq+m0*2+ 4] kmovq k1, k6 kmovq k2, k6 vpgatherdd m17{k1}, [srcq+m0*2+ 8] vpgatherdd m18{k2}, [srcq+m0*2+12] .filter: mova m14, m2 vpdpwssd m14, m15, m10 vpdpwssd m14, m16, m11 vpdpwssd m14, m17, m12 vpdpwssd m14, m18, m13 psubd m14, m3, m14 psrad m14, 15 packusdw m14, m14 vpermq m14, m30, m14 pminsw ym14, ym31 mova [dstq+xq*2], ym14 paddd m4, m5 add xd, 16 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/mc16_sse.asm000064400000000000000000010371301046102023000142120ustar 00000000000000; Copyright © 2021, VideoLAN and dav1d authors ; Copyright © 2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. 
; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA ; dav1d_obmc_masks[] << 9 obmc_masks: dw 0, 0, 9728, 0, 12800, 7168, 2560, 0 dw 14336, 11264, 8192, 5632, 3584, 1536, 0, 0 dw 15360, 13824, 12288, 10752, 9216, 7680, 6144, 5120 dw 4096, 3072, 2048, 1536, 0, 0, 0, 0 dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240 dw 9728, 8704, 8192, 7168, 6656, 6144, 5632, 4608 dw 4096, 3584, 3072, 2560, 2048, 2048, 1536, 1024 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13 spel_h_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 spel_s_shuf2: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 spel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15 bdct_lb_q: times 8 db 0 times 8 db 4 times 8 db 8 times 8 db 12 pw_2: times 8 dw 2 pw_16: times 4 dw 16 prep_mul: times 4 dw 16 times 8 dw 4 pw_64: times 8 dw 64 pw_256: times 8 dw 256 pw_2048: times 4 dw 2048 bidir_mul: times 4 dw 2048 pw_8192: times 8 dw 8192 pw_27615: times 8 dw 27615 pw_32766: times 8 dw 32766 pw_m512: times 8 dw -512 pd_63: times 4 dd 63 pd_64: times 4 dd 64 pd_512: times 4 dd 512 pd_m524256: times 4 dd -524256 ; -8192 << 6 + 32 pd_0x3ff: times 4 dd 0x3ff pd_0x4000: times 4 dd 0x4000 pq_0x400000: times 2 dq 0x400000 pq_0x40000000: times 2 dq 0x40000000 pd_65538: times 2 dd 65538 put_bilin_h_rnd: times 4 dw 8 times 4 dw 10 s_8tap_h_rnd: times 2 dd 2 times 2 dd 8 put_s_8tap_v_rnd: times 2 dd 512 times 2 dd 128 s_8tap_h_sh: dd 2, 4 put_s_8tap_v_sh: dd 10, 8 bidir_rnd: times 4 dw -16400 times 4 dw -16388 put_8tap_h_rnd: dd 34, 34, 40, 40 prep_8tap_1d_rnd: times 2 dd 8 - (8192 << 4) prep_8tap_2d_rnd: times 4 dd 32 - (8192 << 5) warp8x8_shift: dd 11, 13 warp8x8_rnd1: dd 1024, 1024, 4096, 4096 warp8x8_rnd2: times 4 dw 4096 times 4 dw 16384 warp8x8t_rnd: times 2 dd 16384 - (8192 << 15) %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 
BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 32, 64, 128 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep) BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter SECTION .text %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif INIT_XMM ssse3 cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy %define base t0-put_ssse3 mov mxyd, r6m ; mx LEA t0, put_ssse3 movifnidn wd, wm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] add wq, t0 movifnidn hd, hm jmp wq .put_w2: mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4d mov [dstq+dsq*1], r6d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq [dstq+dsq*0], m0 movq [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] movu m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+16*0], m0 mova [dstq+dsq*0+16*1], m1 mova [dstq+dsq*1+16*0], m2 mova [dstq+dsq*1+16*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, ssq mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 add dstq, dsq dec hd jg .put_w32 RET .put_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: add srcq, 16*8 add dstq, 16*8 .put_w128_loop: movu m0, [srcq-16*8] movu m1, [srcq-16*7] movu m2, [srcq-16*6] movu m3, [srcq-16*5] mova [dstq-16*8], m0 mova [dstq-16*7], m1 mova [dstq-16*6], m2 mova [dstq-16*5], m3 movu m0, [srcq-16*4] movu m1, [srcq-16*3] movu m2, [srcq-16*2] movu m3, [srcq-16*1] mova [dstq-16*4], m0 mova [dstq-16*3], m1 mova [dstq-16*2], m2 mova [dstq-16*1], m3 movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova 
[dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, ssq mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add dstq, dsq dec hd jg .put_w128_loop RET .h: movd m5, mxyd mov mxyd, r7m ; my mova m4, [base+pw_16] pshufb m5, [base+pw_256] psubw m4, m5 test mxyd, mxyd jnz .hv ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v mov r6d, r8m ; bitdepth_max shr r6d, 11 movddup m3, [base+put_bilin_h_rnd+r6*8] movifnidn hd, hm sub wd, 8 jg .h_w16 je .h_w8 cmp wd, -4 je .h_w4 .h_w2: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmullw m0, m4, m1 psrlq m1, 16 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movd [dstq+dsq*0], m0 punpckhqdq m0, m0 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] movq m1, [srcq+ssq*0+2] movhps m1, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 4 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] lea dstq, [dstq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m0, 4 psrlw m1, 4 mova [dstq+r6*2+16*0], m0 mova [dstq+r6*2+16*1], m1 add r6, 16 jl .h_w16_loop add srcq, ssq add dstq, dsq dec hd jg .h_w16_loop0 RET .v: shl mxyd, 11 movd m5, mxyd pshufb m5, [base+pw_256] movifnidn hd, hm cmp wd, 4 jg .v_w8 je .v_w4 .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movd m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movq m0, [srcq+ssq*0] .v_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklqdq m2, m0, m1 movq m0, [srcq+ssq*0] punpcklqdq m1, m0 psubw m1, m2 pmulhrsw m1, m5 paddw m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 mov r7, srcq lea r6d, [wq+hq-256] mov r4, dstq %else mov r6, srcq %endif .v_w8_loop0: movu m0, [srcq+ssq*0] .v_w8_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] psubw m1, m3, m0 pmulhrsw m1, m5 paddw m1, m0 movu m0, [srcq+ssq*0] psubw m2, m0, m3 pmulhrsw m2, m5 paddw m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop %if ARCH_X86_64 add r7, 16 add r4, 16 movzx hd, r6b mov srcq, r7 mov dstq, r4 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, r6 mov dstmp, dstq sub wd, 8 %endif jg .v_w8_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 8 shl mxyd, 11 mova m3, [base+pw_2] movd m6, mxyd mova m7, [base+pw_8192] pshufb m6, [base+pw_256] test dword r8m, 0x800 jnz .hv_12bpc psllw m4, 2 psllw m5, 2 mova m7, [base+pw_2048] .hv_12bpc: 
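; note on the bilin hv rounding (as read from the setup above): for 10-bit the
; h coefs were scaled by 4 and the final pmulhrsw factor is pw_2048 (net >> 4),
; for 12-bit the coefs stay unscaled and pw_8192 gives a net >> 2; the h pass
; itself only shifts right by 2, and the v pass blends a + ((b-a)*my >> 4)
; via pmulhw with my << 11 on the doubled difference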
movifnidn hd, hm cmp wd, 4 jg .hv_w8 je .hv_w4 .hv_w2: movddup m0, [srcq+ssq*0] pshufhw m1, m0, q0321 pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w2_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m2, [srcq+ssq*0] pmullw m1, m4, m2 psrlq m2, 16 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 _ 2 _ shufpd m2, m0, m1, 0x01 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movd [dstq+dsq*0], m1 punpckhqdq m1, m1 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: movddup m0, [srcq+ssq*0] movddup m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w4_loop: movq m1, [srcq+ssq*1] movq m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] movhps m2, [srcq+ssq*0+2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: %if ARCH_X86_64 %if WIN64 push r7 %endif shl wd, 5 lea r6d, [wq+hq-256] mov r4, srcq mov r7, dstq %else mov r6, srcq %endif .hv_w8_loop0: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*0+2] pmullw m0, m4 pmullw m1, m5 paddw m0, m3 paddw m0, m1 psrlw m0, 2 .hv_w8_loop: movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*1+2] lea srcq, [srcq+ssq*2] pmullw m1, m4 pmullw m2, m5 paddw m1, m3 paddw m1, m2 psrlw m1, 2 psubw m2, m1, m0 paddw m2, m2 pmulhw m2, m6 paddw m2, m0 pmulhrsw m2, m7 mova [dstq+dsq*0], m2 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*0+2] pmullw m0, m4 pmullw m2, m5 paddw m0, m3 paddw m0, m2 psrlw m0, 2 psubw m2, m0, m1 paddw m2, m2 pmulhw m2, m6 paddw m2, m1 pmulhrsw m2, m7 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 %else mov dstq, dstmp add r6, 16 mov hd, hm add dstq, 16 mov srcq, r6 mov dstmp, dstq sub wd, 8 %endif jg .hv_w8_loop0 %if WIN64 pop r7 %endif RET cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3 %define base r6-prep_ssse3 movifnidn mxyd, r5m ; mx LEA r6, prep_ssse3 movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: tzcnt wd, wd movzx wd, word [base+prep_ssse3_table+wq*2] mov r5d, r7m ; bitdepth_max mova m5, [base+pw_8192] add wq, r6 shr r5d, 11 movddup m4, [base+prep_mul+r5*8] lea stride3q, [strideq*3] jmp wq .prep_w4: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmullw m0, m4 pmullw m1, m4 psubw m0, m5 psubw m1, m5 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w32: movu m0, [srcq+16*0] movu m1, 
[srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 dec hd jg .prep_w32 RET .prep_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 add tmpq, 16*8 dec hd jg .prep_w64 RET .prep_w128: movu m0, [srcq+16* 0] movu m1, [srcq+16* 1] movu m2, [srcq+16* 2] movu m3, [srcq+16* 3] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 movu m0, [srcq+16* 4] movu m1, [srcq+16* 5] movu m2, [srcq+16* 6] movu m3, [srcq+16* 7] REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq+16*4], m0 mova [tmpq+16*5], m1 mova [tmpq+16*6], m2 mova [tmpq+16*7], m3 movu m0, [srcq+16* 8] movu m1, [srcq+16* 9] movu m2, [srcq+16*10] movu m3, [srcq+16*11] add tmpq, 16*16 REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*8], m0 mova [tmpq-16*7], m1 mova [tmpq-16*6], m2 mova [tmpq-16*5], m3 movu m0, [srcq+16*12] movu m1, [srcq+16*13] movu m2, [srcq+16*14] movu m3, [srcq+16*15] add srcq, strideq REPX {pmullw x, m4}, m0, m1, m2, m3 REPX {psubw x, m5}, m0, m1, m2, m3 mova [tmpq-16*4], m0 mova [tmpq-16*3], m1 mova [tmpq-16*2], m2 mova [tmpq-16*1], m3 dec hd jg .prep_w128 RET .h: movd m4, mxyd mov mxyd, r6m ; my mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .h_12bpc psllw m3, 2 psllw m4, 2 .h_12bpc: test mxyd, mxyd jnz .hv sub wd, 8 je .h_w8 jg .h_w16 .h_w4: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*0+2] movhps m1, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 16 sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .h_w8 RET .h_w16: lea srcq, [srcq+wq*2] neg wq .h_w16_loop0: mov r6, wq .h_w16_loop: movu m0, [srcq+r6*2+ 0] movu m1, [srcq+r6*2+ 2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 movu m1, [srcq+r6*2+16] movu m2, [srcq+r6*2+18] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m0, 2 psraw m1, 2 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 16*2 add r6, 16 jl .h_w16_loop add srcq, strideq dec hd jg .h_w16_loop0 RET .v: movd m4, mxyd mova m3, [base+pw_16] pshufb m4, [base+pw_256] mova m5, [base+pw_32766] psubw m3, m4 test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 psllw m4, 2 .v_12bpc: cmp wd, 8 je .v_w8 jg .v_w16 .v_w4: movq m0, [srcq+strideq*0] .v_w4_loop: movq m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklqdq m1, m0, m2 ; 0 1 movq m0, [srcq+strideq*0] punpcklqdq m2, m0 ; 1 2 pmullw m1, m3 pmullw m2, m4 
psubw m1, m5 paddw m1, m2 psraw m1, 2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .v_w4_loop RET .v_w8: movu m0, [srcq+strideq*0] .v_w8_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+16*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+16*1], m1 add tmpq, 16*2 sub hd, 2 jg .v_w8_loop RET .v_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd %endif .v_w16_loop0: movu m0, [srcq+strideq*0] .v_w16_loop: movu m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] pmullw m0, m3 pmullw m1, m4, m2 psubw m0, m5 paddw m1, m0 movu m0, [srcq+strideq*0] psraw m1, 2 pmullw m2, m3 mova [tmpq+wq*0], m1 pmullw m1, m4, m0 psubw m2, m5 paddw m1, m2 psraw m1, 2 mova [tmpq+wq*2], m1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .v_w16_loop0 %if WIN64 pop r7 %endif RET .hv: WIN64_SPILL_XMM 7 shl mxyd, 11 movd m6, mxyd pshufb m6, [base+pw_256] cmp wd, 8 je .hv_w8 jg .hv_w16 .hv_w4: movddup m0, [srcq+strideq*0] movddup m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w4_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] movhps m1, [srcq+strideq*0] movhps m2, [srcq+strideq*0+2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 ; 1 2 shufpd m2, m0, m1, 0x01 ; 0 1 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 16 sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w8_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+16*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w16: %if WIN64 push r7 %endif mov r5, srcq %if ARCH_X86_64 lea r6d, [wq*4-32] mov wd, wd lea r6d, [hq+r6*8] mov r7, tmpq %else mov r6d, wd %endif .hv_w16_loop0: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*0+2] pmullw m0, m3 pmullw m1, m4 psubw m0, m5 paddw m0, m1 psraw m0, 2 .hv_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*1+2] lea srcq, [srcq+strideq*2] pmullw m1, m3 pmullw m2, m4 psubw m1, m5 paddw m1, m2 psraw m1, 2 psubw m2, m1, m0 pmulhrsw m2, m6 paddw m2, m0 mova [tmpq+wq*0], m2 movu m0, [srcq+strideq*0] movu m2, [srcq+strideq*0+2] pmullw m0, m3 pmullw m2, m4 psubw m0, m5 paddw m0, m2 psraw m0, 2 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w16_loop %if ARCH_X86_64 add r5, 16 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 %else mov tmpq, tmpmp add r5, 16 mov hd, hm add tmpq, 16 mov srcq, r5 mov tmpmp, tmpq sub r6d, 8 %endif jg .hv_w16_loop0 %if WIN64 pop r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; 
prefix, type, type_h, type_v cglobal %1_%2_16bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2, 6 %elif WIN64 DECLARE_REG_TMP 4, 5, 8 %else DECLARE_REG_TMP 7, 8, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my %define mxb r0b %define mxd r0 %define mxq r0 %define myb r1b %define myd r1 %define myq r1 %define m8 [esp+16*0] %define m9 [esp+16*1] %define m10 [esp+16*2] %define m11 [esp+16*3] %define m12 [esp+16*4] %define m13 [esp+16*5] %define m14 [esp+16*6] %define m15 [esp+16*7] %else cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my %endif %define base t2-put_ssse3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, put_ssse3 movifnidn wd, wm movifnidn srcq, srcmp movifnidn ssq, ssmp movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [base+put_ssse3_table+wq*2] movifnidn dstq, dstmp movifnidn dsq, dsmp add wq, t2 %if WIN64 pop r8 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv mov myd, r8m movd m5, r8m shr myd, 11 movddup m4, [base+put_8tap_h_rnd+myq*8] movifnidn dsq, dsmp pshufb m5, [base+pw_256] cmp wd, 4 jg .h_w8 movzx mxd, mxb lea srcq, [srcq-2] movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp punpcklbw m3, m3 psraw m3, 8 ; sign-extend je .h_w4 .h_w2: mova m2, [base+spel_h_shuf2] pshufd m3, m3, q2121 .h_w2_loop: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m2 pshufb m1, m2 pmaddwd m0, m3 pmaddwd m1, m3 phaddd m0, m1 paddd m0, m4 psrad m0, 6 packssdw m0, m0 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movd [dstq+dsq*0], m0 pshuflw m0, m0, q3232 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: WIN64_SPILL_XMM 8 mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] pshufd m2, m3, q1111 pshufd m3, m3, q2222 .h_w4_loop: movu m1, [srcq] add srcq, ssq pshufb m0, m1, m6 ; 0 1 1 2 2 3 3 4 pshufb m1, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m2 pmaddwd m1, m3 paddd m0, m4 paddd m0, m1 psrad m0, 6 packssdw m0, m0 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 movq [dstq], m0 add dstq, dsq dec hd jg .h_w4_loop RET .h_w8: %if WIN64 %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 12 %endif shr mxd, 16 movq m3, [base+subpel_filters+mxq*8] movifnidn dstq, dstmp mova m6, [base+spel_h_shufA] mova m7, [base+spel_h_shufB] %if UNIX64 mov wd, wd %endif lea srcq, [srcq+wq*2] punpcklbw m3, m3 lea dstq, [dstq+wq*2] psraw m3, 8 neg wq %if ARCH_X86_32 ALLOC_STACK -16*4 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m0, [srcq+r6*2- 6] movu m1, [srcq+r6*2+ 2] pshufb m2, m0, m6 ; 0 1 1 2 2 3 3 4 pshufb m0, m7 ; 2 3 3 4 4 5 5 6 pmaddwd m2, m8 ; abcd0 pmaddwd m0, m9 ; abcd1 pshufb 
m3, m1, m6 ; 4 5 5 6 6 7 7 8 pshufb m1, m7 ; 6 7 7 8 8 9 9 a paddd m2, m4 paddd m0, m2 pmaddwd m2, m10, m3 ; abcd2 pmaddwd m3, m8 ; efgh0 paddd m0, m2 pmaddwd m2, m11, m1 ; abcd3 pmaddwd m1, m9 ; efgh1 paddd m0, m2 movu m2, [srcq+r6*2+10] paddd m3, m4 paddd m1, m3 pshufb m3, m2, m6 ; 8 9 9 a a b b c pshufb m2, m7 ; a b b c c d d e pmaddwd m3, m10 ; efgh2 pmaddwd m2, m11 ; efgh3 paddd m1, m3 paddd m1, m2 psrad m0, 6 psrad m1, 6 packssdw m0, m1 pxor m1, m1 pminsw m0, m5 pmaxsw m0, m1 mova [dstq+r6*2], m0 add r6, 8 jl .h_w8_loop add srcq, ssq add dstq, dsq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif %if WIN64 WIN64_SPILL_XMM 15 %endif movd m7, r8m movifnidn dstq, dstmp movifnidn dsq, dsmp punpcklbw m3, m3 pshufb m7, [base+pw_256] psraw m3, 8 ; sign-extend %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 cmp wd, 2 jne .v_w4 .v_w2: movd m1, [srcq+ssq*0] movd m4, [srcq+ssq*1] movd m2, [srcq+ssq*2] add srcq, r6 movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m6, [srcq+ssq*2] add srcq, r6 movd m0, [srcq+ssq*0] punpckldq m1, m4 ; 0 1 punpckldq m4, m2 ; 1 2 punpckldq m2, m5 ; 2 3 punpckldq m5, m3 ; 3 4 punpckldq m3, m6 ; 4 5 punpckldq m6, m0 ; 5 6 punpcklwd m1, m4 ; 01 12 punpcklwd m2, m5 ; 23 34 punpcklwd m3, m6 ; 45 56 pxor m6, m6 .v_w2_loop: movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m8, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m9 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m10 ; a2 b2 paddd m5, m3 punpckldq m3, m0, m4 ; 6 7 movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m11, m3 ; a3 b3 paddd m5, m4 psrad m5, 5 packssdw m5, m5 pmaxsw m5, m6 pavgw m5, m6 pminsw m5, m7 movd [dstq+dsq*0], m5 pshuflw m5, m5, q3232 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 shl wd, 14 %if STACK_ALIGNMENT < 16 mov [esp+4*29], srcq mov [esp+4*30], dstq %else mov srcmp, srcq %endif lea wd, [wq+hq-(1<<16)] %else shl wd, 6 mov r7, srcq mov r8, dstq lea wd, [wq+hq-(1<<8)] %endif .v_w4_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] movq m3, [srcq+ssq*2] add srcq, r6 movq m4, [srcq+ssq*0] movq m5, [srcq+ssq*1] movq m6, [srcq+ssq*2] add srcq, r6 movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_w4_loop_start .v_w4_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_w4_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m3 psrad m1, 5 psrad m2, 5 packssdw m1, m2 pxor m2, m2 pmaxsw m1, m2 pavgw m1, m2 pminsw m1, m7 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*29] mov dstq, [esp+4*30] movzx hd, ww add srcq, 8 add dstq, 8 mov 
[esp+4*29], srcq mov [esp+4*30], dstq %else mov srcq, srcmp mov dstq, dstmp movzx hd, ww add srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif sub wd, 1<<16 %else .v_w4_loop: pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 paddd m13, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m14, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m14 psrad m12, 5 psrad m13, 5 packssdw m12, m13 pxor m13, m13 pmaxsw m12, m13 pavgw m12, m13 pminsw m12, m7 movq [dstq+dsq*0], m12 movhps [dstq+dsq*1], m12 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .v_w4_loop0 RET .hv: %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif %if ARCH_X86_32 movd m4, r8m mova m6, [base+pd_512] pshufb m4, [base+pw_256] %else %if WIN64 ALLOC_STACK 16*6, 16 %endif movd m15, r8m pshufb m15, [base+pw_256] %endif cmp wd, 4 jg .hv_w8 movzx mxd, mxb je .hv_w4 movq m0, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 mov dstq, dstmp mov dsq, dsmp mova m5, [base+spel_h_shuf2] ALLOC_STACK -16*8 %else mova m6, [base+pd_512] mova m9, [base+spel_h_shuf2] %endif pshuflw m0, m0, q2121 pxor m7, m7 punpcklbw m7, m0 punpcklbw m3, m3 psraw m3, 8 ; sign-extend test dword r8m, 0x800 jz .hv_w2_10bpc psraw m7, 2 psllw m3, 2 .hv_w2_10bpc: lea r6, [ssq*3] sub srcq, 2 sub srcq, r6 %if ARCH_X86_32 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m9, m5 mova m11, m0 mova m12, m1 mova m13, m2 mova m14, m3 mova m15, m4 %else pshufd m11, m3, q0000 pshufd m12, m3, q1111 pshufd m13, m3, q2222 pshufd m14, m3, q3333 %endif movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] movu m1, [srcq+ssq*2] add srcq, r6 movu m4, [srcq+ssq*0] %if ARCH_X86_32 REPX {pshufb x, m5}, m2, m3, m1, m4 %else REPX {pshufb x, m9}, m2, m3, m1, m4 %endif REPX {pmaddwd x, m7}, m2, m3, m1, m4 phaddd m2, m3 ; 0 1 phaddd m1, m4 ; 2 3 movu m3, [srcq+ssq*1] movu m4, [srcq+ssq*2] add srcq, r6 movu m0, [srcq+ssq*0] %if ARCH_X86_32 REPX {pshufb x, m5}, m3, m4, m0 %else REPX {pshufb x, m9}, m3, m4, m0 %endif REPX {pmaddwd x, m7}, m3, m4, m0 phaddd m3, m4 ; 4 5 phaddd m0, m0 ; 6 6 REPX {paddd x, m6}, m2, m1, m3, m0 REPX {psrad x, 10}, m2, m1, m3, m0 packssdw m2, m1 ; 0 1 2 3 packssdw m3, m0 ; 4 5 6 _ palignr m4, m3, m2, 4 ; 1 2 3 4 pshufd m5, m3, q0321 ; 5 6 _ _ punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 punpcklwd m3, m5 ; 45 56 .hv_w2_loop: movu m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movu m5, [srcq+ssq*0] pshufb m4, m9 pshufb m5, m9 pmaddwd m4, m7 pmaddwd m5, m7 phaddd m4, m5 pmaddwd m5, m11, m1 ; a0 b0 mova m1, m2 pmaddwd m2, m12 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m13 ; a2 b2 paddd m5, m3 paddd m4, m6 psrad m4, 10 ; 7 8 packssdw m0, m4 pshufd m3, m0, q2103 punpckhwd m3, m0 ; 67 78 mova m0, m4 pmaddwd m4, m14, m3 ; a3 b3 paddd m5, m6 paddd m5, m4 psrad m5, 10 packssdw m5, m5 pxor m4, m4 pminsw m5, m15 pmaxsw m5, m4 movd [dstq+dsq*0], m5 pshuflw m5, m5, q3232 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w8: shr mxd, 16 .hv_w4: movq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovb myd, mxd movq m3, 
[base+subpel_filters+myq*8] %if ARCH_X86_32 %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif mov dstq, dstmp mov dsq, dsmp mova m0, [base+spel_h_shufA] mova m1, [base+spel_h_shufB] ALLOC_STACK -16*15 mova m8, m0 mova m9, m1 mova m14, m6 %else mova m8, [base+spel_h_shufA] mova m9, [base+spel_h_shufB] %endif pxor m0, m0 punpcklbw m0, m2 punpcklbw m3, m3 psraw m3, 8 test dword r8m, 0x800 jz .hv_w4_10bpc psraw m0, 2 psllw m3, 2 .hv_w4_10bpc: lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 %if ARCH_X86_32 %define tmp esp+16*8 shl wd, 14 %if STACK_ALIGNMENT < 16 mov [esp+4*61], srcq mov [esp+4*62], dstq %else mov srcmp, srcq %endif mova [tmp+16*5], m4 lea wd, [wq+hq-(1<<16)] pshufd m1, m0, q0000 pshufd m2, m0, q1111 pshufd m5, m0, q2222 pshufd m0, m0, q3333 mova m10, m1 mova m11, m2 mova m12, m5 mova m13, m0 %else %if WIN64 %define tmp rsp %else %define tmp rsp-104 ; red zone %endif shl wd, 6 mov r7, srcq mov r8, dstq lea wd, [wq+hq-(1<<8)] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 mova [tmp+16*5], m15 %endif pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [tmp+16*1], m0 mova [tmp+16*2], m1 mova [tmp+16*3], m2 mova [tmp+16*4], m3 %macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 pmaddwd m%3, m10 pmaddwd m%1, m11 paddd m%3, %5 paddd m%1, m%3 pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a pmaddwd m%3, m12 pmaddwd m%2, m13 paddd m%1, m%3 paddd m%1, m%2 psrad m%1, %4 %endmacro .hv_w4_loop0: %if ARCH_X86_64 mova m14, [pd_512] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] movu m6, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 4, 1, 0, 10 PUT_8TAP_HV_H 5, 2, 0, 10 PUT_8TAP_HV_H 6, 3, 0, 10 movu m7, [srcq+ssq*0+0] movu m2, [srcq+ssq*0+8] movu m1, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] PUT_8TAP_HV_H 7, 2, 0, 10 PUT_8TAP_HV_H 1, 3, 0, 10 movu m2, [srcq+ssq*2+0] movu m3, [srcq+ssq*2+8] add srcq, r6 PUT_8TAP_HV_H 2, 3, 0, 10 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 10 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_w4_loop_start .hv_w4_loop: mova m1, [tmp+16*6] mova m2, m15 .hv_w4_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*6], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 9 psrad m2, 9 packssdw m1, m2 pxor m7, m7 pmaxsw m1, m7 pavgw m7, m1 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop %if STACK_ALIGNMENT < 16 mov srcq, [esp+4*61] mov dstq, [esp+4*62] add srcq, 8 add dstq, 8 mov 
[esp+4*61], srcq mov [esp+4*62], dstq %else mov srcq, srcmp mov dstq, dstmp add srcq, 8 add dstq, 8 mov srcmp, srcq mov dstmp, dstq %endif movzx hd, ww sub wd, 1<<16 %else .hv_w4_loop: mova m15, [tmp+16*1] pmaddwd m14, m15, m1 ; a0 pmaddwd m15, m2 ; b0 mova m7, [tmp+16*2] mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m14, m3 paddd m15, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m14, m5 paddd m15, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m14, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m15, m7 ; b3 psrad m14, 9 psrad m15, 9 packssdw m14, m15 pxor m7, m7 pmaxsw m14, m7 pavgw m7, m14 pminsw m7, [tmp+16*5] movq [dstq+dsq*0], m7 movhps [dstq+dsq*1], m7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop add r7, 8 add r8, 8 movzx hd, wb mov srcq, r7 mov dstq, r8 sub wd, 1<<8 %endif jg .hv_w4_loop0 RET %undef tmp %if ARCH_X86_32 DECLARE_REG_TMP 2, 1, 6, 4 %elif WIN64 DECLARE_REG_TMP 6, 4, 7, 4 %else DECLARE_REG_TMP 6, 7, 7, 8 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR %if ARCH_X86_32 cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my %define mxb r0b %define mxd r0 %define mxq r0 %define myb r2b %define myd r2 %define myq r2 %else cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my %endif %define base t2-prep_ssse3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v LEA t2, prep_ssse3 movifnidn wd, wm movifnidn srcq, srcmp test mxd, 0xf00 jnz .h movifnidn hd, hm test myd, 0xf00 jnz .v tzcnt wd, wd mov myd, r7m ; bitdepth_max movzx wd, word [base+prep_ssse3_table+wq*2] mova m5, [base+pw_8192] shr myd, 11 add wq, t2 movddup m4, [base+prep_mul+myq*8] movifnidn ssq, ssmp movifnidn tmpq, tmpmp lea r6, [ssq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv movifnidn ssq, r2mp movifnidn hd, r4m movddup m5, [base+prep_8tap_1d_rnd] cmp wd, 4 jne .h_w8 movzx mxd, mxb movq m0, [base+subpel_filters+mxq*8] mova m3, [base+spel_h_shufA] mova m4, [base+spel_h_shufB] movifnidn tmpq, tmpmp sub srcq, 2 WIN64_SPILL_XMM 8 punpcklbw m0, m0 psraw m0, 8 test dword r7m, 0x800 jnz .h_w4_12bpc psllw m0, 2 .h_w4_12bpc: pshufd m6, m0, q1111 pshufd m7, m0, q2222 .h_w4_loop: movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 pmaddwd m0, m6 pmaddwd m1, m7 paddd m0, m5 paddd m0, m1 pshufb m1, m2, m3 pshufb m2, m4 pmaddwd m1, m6 pmaddwd m2, m7 paddd m1, m5 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq], m0 add tmpq, 16 sub hd, 2 jg .h_w4_loop RET .h_w8: WIN64_SPILL_XMM 11 shr mxd, 16 movq m2, [base+subpel_filters+mxq*8] mova m4, [base+spel_h_shufA] mova m6, [base+spel_h_shufB] movifnidn tmpq, r0mp add wd, wd punpcklbw m2, m2 add srcq, wq psraw m2, 8 add tmpq, wq neg wq test dword r7m, 0x800 jnz 
.h_w8_12bpc psllw m2, 2 .h_w8_12bpc: pshufd m7, m2, q0000 %if ARCH_X86_32 ALLOC_STACK -16*3 pshufd m0, m2, q1111 pshufd m1, m2, q2222 pshufd m2, m2, q3333 mova m8, m0 mova m9, m1 mova m10, m2 %else pshufd m8, m2, q1111 pshufd m9, m2, q2222 pshufd m10, m2, q3333 %endif .h_w8_loop0: mov r6, wq .h_w8_loop: movu m0, [srcq+r6- 6] movu m1, [srcq+r6+ 2] pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 pshufb m0, m6 ; 2 3 3 4 4 5 5 6 pmaddwd m2, m7 ; abcd0 pmaddwd m0, m8 ; abcd1 pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 pshufb m1, m6 ; 6 7 7 8 8 9 9 a paddd m2, m5 paddd m0, m2 pmaddwd m2, m9, m3 ; abcd2 pmaddwd m3, m7 ; efgh0 paddd m0, m2 pmaddwd m2, m10, m1 ; abcd3 pmaddwd m1, m8 ; efgh1 paddd m0, m2 movu m2, [srcq+r6+10] paddd m3, m5 paddd m1, m3 pshufb m3, m2, m4 ; a b b c c d d e pshufb m2, m6 ; 8 9 9 a a b b c pmaddwd m3, m9 ; efgh2 pmaddwd m2, m10 ; efgh3 paddd m1, m3 paddd m1, m2 psrad m0, 4 psrad m1, 4 packssdw m0, m1 mova [tmpq+r6], m0 add r6, 16 jl .h_w8_loop add srcq, ssq sub tmpq, wq dec hd jg .h_w8_loop0 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif WIN64_SPILL_XMM 15 movddup m7, [base+prep_8tap_1d_rnd] movifnidn ssq, r2mp movifnidn tmpq, r0mp punpcklbw m3, m3 psraw m3, 8 ; sign-extend test dword r7m, 0x800 jnz .v_12bpc psllw m3, 2 .v_12bpc: %if ARCH_X86_32 ALLOC_STACK -16*7 pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova m8, m0 mova m9, m1 mova m10, m2 mova m11, m3 %else pshufd m8, m3, q0000 pshufd m9, m3, q1111 pshufd m10, m3, q2222 pshufd m11, m3, q3333 %endif lea r6, [ssq*3] sub srcq, r6 mov r6d, wd shl wd, 6 mov r5, srcq %if ARCH_X86_64 mov r7, tmpq %elif STACK_ALIGNMENT < 16 mov [esp+4*29], tmpq %endif lea wd, [wq+hq-(1<<8)] .v_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m3, [srcq+ssq*0] movq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m5, [srcq+ssq*0] movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] punpcklwd m1, m2 ; 01 punpcklwd m2, m3 ; 12 punpcklwd m3, m4 ; 23 punpcklwd m4, m5 ; 34 punpcklwd m5, m6 ; 45 punpcklwd m6, m0 ; 56 %if ARCH_X86_32 jmp .v_loop_start .v_loop: mova m1, m12 mova m2, m13 mova m3, m14 .v_loop_start: pmaddwd m1, m8 ; a0 pmaddwd m2, m8 ; b0 mova m12, m3 mova m13, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m1, m3 paddd m2, m4 mova m14, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m1, m5 paddd m2, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m3, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m1, m7 paddd m1, m3 pmaddwd m3, m11, m6 ; b3 paddd m2, m7 paddd m2, m3 psrad m1, 4 psrad m2, 4 packssdw m1, m2 movq [tmpq+r6*0], m1 movhps [tmpq+r6*2], m1 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .v_loop %if STACK_ALIGNMENT < 16 mov tmpq, [esp+4*29] add r5, 8 add tmpq, 8 mov srcq, r5 mov [esp+4*29], tmpq %else mov tmpq, tmpmp add r5, 8 add tmpq, 8 mov srcq, r5 mov tmpmp, tmpq %endif %else .v_loop: pmaddwd m12, m8, m1 ; a0 pmaddwd m13, m8, m2 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m9 ; a1 pmaddwd m4, m9 ; b1 paddd m12, m3 paddd m13, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m10 ; a2 pmaddwd m6, m10 ; b2 paddd m12, m5 paddd m13, m6 movq m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklwd m5, m0, m6 ; 67 movq m0, [srcq+ssq*0] pmaddwd m14, m11, m5 ; a3 punpcklwd m6, m0 ; 78 paddd m12, m7 paddd m12, m14 pmaddwd m14, m11, m6 ; b3 paddd m13, m7 paddd m13, m14 psrad m12, 4 psrad 
m13, 4 packssdw m12, m13 movq [tmpq+r6*0], m12 movhps [tmpq+r6*2], m12 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .v_loop add r5, 8 add r7, 8 mov srcq, r5 mov tmpq, r7 %endif movzx hd, wb sub wd, 1<<8 jg .v_loop0 RET .hv: %if STACK_ALIGNMENT < 16 %xdefine rstk rsp %else %assign stack_offset stack_offset - stack_size_padded %endif movzx t3d, mxb shr mxd, 16 cmp wd, 4 cmove mxd, t3d movifnidn hd, r4m movq m2, [base+subpel_filters+mxq*8] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd movq m3, [base+subpel_filters+myq*8] %if ARCH_X86_32 mov ssq, r2mp mov tmpq, r0mp mova m0, [base+spel_h_shufA] mova m1, [base+spel_h_shufB] mova m4, [base+prep_8tap_2d_rnd] ALLOC_STACK -16*14 mova m8, m0 mova m9, m1 mova m14, m4 %else %if WIN64 ALLOC_STACK 16*6, 16 %endif mova m8, [base+spel_h_shufA] mova m9, [base+spel_h_shufB] %endif pxor m0, m0 punpcklbw m0, m2 punpcklbw m3, m3 psraw m0, 4 psraw m3, 8 test dword r7m, 0x800 jz .hv_10bpc psraw m0, 2 .hv_10bpc: lea r6, [ssq*3] sub srcq, 6 sub srcq, r6 mov r6d, wd shl wd, 6 mov r5, srcq %if ARCH_X86_32 %define tmp esp+16*8 %if STACK_ALIGNMENT < 16 mov [esp+4*61], tmpq %endif pshufd m1, m0, q0000 pshufd m2, m0, q1111 pshufd m5, m0, q2222 pshufd m0, m0, q3333 mova m10, m1 mova m11, m2 mova m12, m5 mova m13, m0 %else %if WIN64 %define tmp rsp %else %define tmp rsp-88 ; red zone %endif mov r7, tmpq pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif lea wd, [wq+hq-(1<<8)] pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [tmp+16*1], m0 mova [tmp+16*2], m1 mova [tmp+16*3], m2 mova [tmp+16*4], m3 .hv_loop0: %if ARCH_X86_64 mova m14, [prep_8tap_2d_rnd] %endif movu m4, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] movu m5, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m6, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 4, 1, 0, 6 PUT_8TAP_HV_H 5, 2, 0, 6 PUT_8TAP_HV_H 6, 3, 0, 6 movu m7, [srcq+ssq*1+0] movu m2, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] movu m1, [srcq+ssq*0+0] movu m3, [srcq+ssq*0+8] PUT_8TAP_HV_H 7, 2, 0, 6 PUT_8TAP_HV_H 1, 3, 0, 6 movu m2, [srcq+ssq*1+0] movu m3, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 2, 3, 0, 6 packssdw m4, m7 ; 0 3 packssdw m5, m1 ; 1 4 movu m0, [srcq+ssq*0+0] movu m1, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 1, 3, 6 packssdw m6, m2 ; 2 5 packssdw m7, m0 ; 3 6 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 %if ARCH_X86_32 jmp .hv_loop_start .hv_loop: mova m1, [tmp+16*5] mova m2, m15 .hv_loop_start: mova m7, [tmp+16*1] pmaddwd m1, m7 ; a0 pmaddwd m2, m7 ; b0 mova m7, [tmp+16*2] mova [tmp+16*5], m3 pmaddwd m3, m7 ; a1 mova m15, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m1, m14 paddd m2, m14 paddd m1, m3 paddd m2, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m1, m5 paddd m2, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 6 packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 6 mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m1, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m2, m7 ; b3 psrad m1, 6 psrad m2, 6 packssdw m1, m2 movq [tmpq+r6*0], m1 movhps [tmpq+r6*2], m1 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .hv_loop %if STACK_ALIGNMENT < 16 mov tmpq, [esp+4*61] add r5, 8 add tmpq, 8 mov srcq, r5 mov [esp+4*61], tmpq %else mov tmpq, tmpmp add 
r5, 8 add tmpq, 8 mov srcq, r5 mov tmpmp, tmpq %endif %else .hv_loop: mova m15, [tmp+16*1] mova m7, [prep_8tap_2d_rnd] pmaddwd m14, m15, m1 ; a0 pmaddwd m15, m2 ; b0 paddd m14, m7 paddd m15, m7 mova m7, [tmp+16*2] mova m1, m3 pmaddwd m3, m7 ; a1 mova m2, m4 pmaddwd m4, m7 ; b1 mova m7, [tmp+16*3] paddd m14, m3 paddd m15, m4 mova m3, m5 pmaddwd m5, m7 ; a2 mova m4, m6 pmaddwd m6, m7 ; b2 paddd m14, m5 paddd m15, m6 movu m7, [srcq+ssq*1+0] movu m5, [srcq+ssq*1+8] lea srcq, [srcq+ssq*2] PUT_8TAP_HV_H 7, 5, 6, 6, [prep_8tap_2d_rnd] packssdw m0, m7 ; 6 7 mova [tmp+16*0], m0 movu m0, [srcq+ssq*0+0] movu m5, [srcq+ssq*0+8] PUT_8TAP_HV_H 0, 5, 6, 6, [prep_8tap_2d_rnd] mova m6, [tmp+16*0] packssdw m7, m0 ; 7 8 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, [tmp+16*4] paddd m14, m7 ; a3 pmaddwd m7, m6, [tmp+16*4] paddd m15, m7 ; b3 psrad m14, 6 psrad m15, 6 packssdw m14, m15 movq [tmpq+r6*0], m14 movhps [tmpq+r6*2], m14 lea tmpq, [tmpq+r6*4] sub hd, 2 jg .hv_loop add r5, 8 add r7, 8 mov srcq, r5 mov tmpq, r7 %endif movzx hd, wb sub wd, 1<<8 jg .hv_loop0 RET %undef tmp %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro SAVE_REG 1 %xdefine r%1_save r%1 %xdefine r%1q_save r%1q %xdefine r%1d_save r%1d %if ARCH_X86_32 %define r%1m_save [rstk+stack_offset+(%1+1)*4] %endif %endmacro %macro LOAD_REG 1 %xdefine r%1 r%1_save %xdefine r%1q r%1q_save %xdefine r%1d r%1d_save %if ARCH_X86_32 %define r%1m r%1m_save %endif %undef r%1d_save %undef r%1q_save %undef r%1_save %endmacro %macro REMAP_REG 2-3 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %if ARCH_X86_32 %if %3 == 0 %xdefine r%1m r%2m %else %define r%1m [rstk+stack_offset+(%1+1)*4] %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %if ARCH_X86_64 SAVE_REG 14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %else SAVE_REG 5 %assign %%i 5 %rep 5 %assign %%j %%i-1 REMAP_REG %%i, %%j, 0 %assign %%i %%i-1 %endrep %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %if ARCH_X86_64 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep LOAD_REG 14 %else %rep 4 %assign %%j %%i+1 REMAP_REG %%i, %%j, 1 %assign %%i %%i+1 %endrep LOAD_REG 5 %endif %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %if ARCH_X86_32 %macro MC_4TAP_SCALED_H 1 ; dst_mem movu m7, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m5, [r4 +ssq*0] movu m6, [r4 +ssq*1] lea srcq, [srcq+ssq*2] lea r4, [r4 +ssq*2] REPX {pshufb x, m12}, m7, m2 REPX {pmaddwd x, m13}, m7, m2 REPX {pshufb x, m14}, m5, m6 REPX {pmaddwd x, m15}, m5, m6 phaddd m7, m5 phaddd m2, m6 mova m5, [esp+0x00] movd m6, [esp+0x10] paddd m7, m5 paddd m2, m5 psrad m7, m6 psrad m2, m6 packssdw m7, m2 mova [stk+%1], m7 %endmacro %endif %if ARCH_X86_64 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movu m%1, [srcq+ r4*2] movu m%2, [srcq+ r6*2] movu m%3, [srcq+ r7*2] movu m%4, [srcq+ r9*2] movu m%5, [srcq+r10*2] movu m%6, [srcq+r11*2] movu m%7, [srcq+r13*2] movu m%8, [srcq+ rX*2] add srcq, ssq pmaddwd m%1, [stk+0x10] pmaddwd m%2, [stk+0x20] pmaddwd m%3, [stk+0x30] pmaddwd m%4, [stk+0x40] pmaddwd m%5, [stk+0x50] pmaddwd m%6, [stk+0x60] pmaddwd m%7, [stk+0x70] pmaddwd m%8, [stk+0x80] phaddd m%1, m%2 phaddd m%3, m%4 phaddd m%5, m%6 phaddd m%7, m%8 phaddd m%1, m%3 phaddd m%5, m%7 paddd m%1, hround paddd m%5, hround psrad m%1, m12 psrad m%5, m12 packssdw m%1, m%5 
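    ; m%1 now holds the 8 horizontally filtered pixels of this source row:
    ; one 8-tap dot product per output column (phaddd tree), rounded with
    ; hround, shifted by m12 and packed back to words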
%endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets %if %3 == 1 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] %endif movu m0, [srcq+r0*2] movu m1, [srcq+rX*2] movu m2, [srcq+r4*2] movu m3, [srcq+r5*2] mov r0, [stk+16] mov rX, [stk+20] mov r4, [stk+24] mov r5, [stk+28] pmaddwd m0, [stk+%1+0x00] pmaddwd m1, [stk+%1+0x10] pmaddwd m2, [stk+%1+0x20] pmaddwd m3, [stk+%1+0x30] phaddd m0, m1 phaddd m2, m3 movu m4, [srcq+r0*2] movu m5, [srcq+rX*2] movu m6, [srcq+r4*2] movu m7, [srcq+r5*2] add srcq, ssq pmaddwd m4, [stk+%1+0xa0] pmaddwd m5, [stk+%1+0xb0] pmaddwd m6, [stk+%1+0xc0] pmaddwd m7, [stk+%1+0xd0] phaddd m4, m5 phaddd m6, m7 phaddd m0, m2 phaddd m4, m6 paddd m0, hround paddd m4, hround psrad m0, m12 psrad m4, m12 packssdw m0, m4 %if %2 != 0 mova [stk+%2], m0 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isput 1 %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax %endif %endif %xdefine base_reg r12 %else ; prep %assign isput 0 %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %define tmp_stridem qword [stk+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %else cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax %endif %define tmp_stridem dword [stk+0x138] %endif %endif %if ARCH_X86_32 mov [esp+0x1f0], t0d mov [esp+0x1f4], t1d %if isput && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x21c] %define dym [esp+0x220] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isput mov r3, pxmaxm %define pxmaxm r3 %else mov r2, pxmaxm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif %endif LEA base_reg, %1_8tap_scaled_16bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_64 %if isput mov r7d, pxmaxm %endif %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm %if isput movd m15, pxmaxm %endif pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isput pshuflw m15, m15, q0000 punpcklqdq m15, m15 %endif %if isprep %if UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if 
ARCH_X86_64 mov r6d, pxmaxm %endif %endif %if ARCH_X86_64 mov dyd, dym %endif %if isput %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %else %endif %if ARCH_X86_64 %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %xdefine hm r7m %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 shr r7d, 11 mova m10, [base+pd_0x3ff] movddup m11, [base+s_8tap_h_rnd+r7*8] movd m12, [base+s_8tap_h_sh+r7*4] %if isput movddup m13, [base+put_s_8tap_v_rnd+r7*8] movd m7, [base+put_s_8tap_v_sh+r7*4] %define pxmaxm [rsp] mova pxmaxm, m15 punpcklqdq m12, m7 %endif lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else %define m10 [base+pd_0x3ff] %define m11 [esp+0x00] %define m12 [esp+0x10] shr r3, 11 movddup m1, [base+s_8tap_h_rnd+r3*8] movd m2, [base+s_8tap_h_sh+r3*4] %if isput %define m13 [esp+0x20] %define pxmaxm [esp+0x30] %define stk esp+0x40 movddup m5, [base+put_s_8tap_v_rnd+r3*8] movd m6, [base+put_s_8tap_v_sh+r3*4] mova pxmaxm, m15 punpcklqdq m2, m6 mova m13, m5 %else %define m13 [base+pd_m524256] %endif mov ssd, ssm mova m11, m1 mova m12, m2 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssd*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 %if isprep mov r1, r1m %endif mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %if isput .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6, m7 REPX {pmaddwd x, m15}, m4, m5, m6, m7 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m7 REPX {paddd 
x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 7 SWAP m1, m4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m14}, m1, m7, m6, m3 REPX {pmaddwd x, m15}, m1, m7, m6, m3 phaddd m1, m7 phaddd m6, m3 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pmaddwd m5, m3, m7 pmaddwd m6, m0, m8 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m7, m2, m9 pmaddwd m8, m4, m10 paddd m5, m6 paddd m7, m8 %else mov r1, [esp+0x1f4] xor r3, r3 mov r5, myd shr r5, 6 lea r1, [r1+r5] mov r5, 64 << 24 cmovnz r3, [base+subpel_filters+r1*8+4] cmovnz r5, [base+subpel_filters+r1*8+0] movd m6, r3 movd m7, r5 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 SWAP m7, m2 %define m8 m3 %endif paddd m5, m13 pshufd m6, m12, q1032 pxor m8, m8 paddd m5, m7 psrad m5, m6 packssdw m5, m5 pmaxsw m5, m8 pminsw m5, pxmaxm movd [dstq], m5 add dstq, dsmp dec hd jz .ret %if ARCH_X86_64 add myd, dyd %else add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [stk+0x20] mova m0, [stk+0x30] mova m2, [stk+0x40] mova m4, [stk+0x50] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movu m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddwd m5, m15 phaddd m5, m5 paddd m5, m11 psrad m5, m12 packssdw m5, m5 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop .w2_skip_line: movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pshufb m6, m14 pmaddwd m5, m15 pmaddwd m6, m15 phaddd m5, m6 paddd m5, m11 psrad m5, m12 packssdw m5, m5 ; 6 7 6 7 punpckhqdq m1, m5 ; 4 5 6 7 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [stk+0x20], m3 mova [stk+0x30], m0 mova [stk+0x40], m2 mova [stk+0x50], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 movzx r4, byte [esp+0x1f0] sub srcq, 2 movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %else %define m9 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 
movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x20], m13 mova [stk+0x30], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX {pshufb x, m14}, m1, m2, m3, m4 REPX {pmaddwd x, m15}, m1, m2, m3, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m4, [srcq+ss3q ] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] movu m11, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m2, m3, m4 REPX {pmaddwd x, m13}, m1, m2, m3, m4 REPX {pshufb x, m14}, m0, m9, m10, m11 REPX {pmaddwd x, m15}, m0, m9, m10, m11 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 phaddd m4, m11 REPX {paddd x, m5}, m1, m2, m3, m4 REPX {psrad x, xm6}, m1, m2, m3, m4 packssdw m1, m2 ; 4 5 packssdw m3, m4 ; 6 7 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 pshufd m10, m3, q1032 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m10 ; 67 mova [rsp+0x40], m7 mova [rsp+0x50], m8 mova [rsp+0x60], m9 %else mova [stk+0x00], m12 mova [stk+0x10], m14 add r4, srcq MC_4TAP_SCALED_H 0x40 ; 0 1 MC_4TAP_SCALED_H 0x50 ; 2 3 MC_4TAP_SCALED_H 0x60 ; 4 5 MC_4TAP_SCALED_H 0x70 ; 6 7 mova m4, [stk+0x40] mova m5, [stk+0x50] mova m6, [stk+0x60] mova m7, [stk+0x70] mov [stk+0xc0], r4 shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 pshufd m0, m7, q1032 ; 7 _ mova [stk+0xb0], m0 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 punpcklwd m3, m7, [stk+0xb0] ; 67 mov myd, mym mov r0, r0m mova [stk+0x40], m0 ; 01 mova [stk+0x50], m1 ; 23 mova [stk+0x60], m2 ; 45 mova [stk+0x70], m3 ; 67 mova [stk+0x80], m4 ; 12 mova 
[stk+0x90], m5 ; 34 mova [stk+0xa0], m6 ; 56 %define m12 [stk+0x00] %define m14 [stk+0x10] %define m13 [stk+0x20] %define m15 [stk+0x30] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r11d, 64 << 24 mov r13d, myd shr r13d, 6 lea r13d, [t1+r13] cmovnz r11q, [base+subpel_filters+r13*8] movq m9, r11q punpcklbw m9, m9 psraw m9, 8 pshufd m7, m9, q0000 pshufd m8, m9, q1111 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pshufd m7, m9, q2222 pshufd m9, m9, q3333 pmaddwd m6, m2, m7 pmaddwd m8, m3, m9 %if isput movd m9, [rsp+0x28] %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif paddd m4, m5 paddd m6, m8 paddd m4, m6 paddd m4, vrnd_mem %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 %if isput movd m4, [esp+0x18] %endif paddd m0, m1 paddd m2, m3 paddd m0, vrnd_mem paddd m0, m2 SWAP m4, m0 %define m9 m0 %endif %if isput pxor m5, m5 psrad m4, m9 packssdw m4, m4 pmaxsw m4, m5 pminsw m4, pxmaxm movq [dstq], m4 add dstq, dsmp %else psrad m4, 6 packssdw m4, m4 movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop mova m8, [rsp+0x10] movd m9, [rsp+0x20] movu m4, [srcq] movu m5, [srcq+r4] test myd, 0x400 jz .w4_skip_line mova m0, [rsp+0x40] mova [rsp+0x40], m1 mova m1, [rsp+0x50] mova [rsp+0x50], m2 mova m2, [rsp+0x60] mova [rsp+0x60], m3 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, m8 psrad m4, m9 packssdw m4, m4 punpcklwd m3, m10, m4 mova m10, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu m6, [srcq+ssq*1] movu m7, [srcq+r6] mova m0, [rsp+0x50] mova m11, [rsp+0x60] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd m7, m15 mova [rsp+0x40], m0 mova [rsp+0x50], m11 phaddd m4, m5 phaddd m6, m7 paddd m4, m8 paddd m6, m8 psrad m4, m9 psrad m6, m9 packssdw m4, m6 punpcklwd m9, m10, m4 mova [rsp+0x60], m9 pshufd m10, m4, q1032 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m10 lea srcq, [srcq+ssq*2] jmp .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [stk+0x40] mova m1, [stk+0x50] mova m2, [stk+0x60] mova m3, [stk+0x70] jmp .w4_loop .w4_next_line: mov r5, [stk+0xc0] movu m4, [srcq] movu m5, [r5] test myd, 0x400 jz .w4_skip_line add [stk+0xc0], ssq mova m0, [stk+0x80] mova m3, [stk+0x50] mova [stk+0x40], m0 mova [stk+0x80], m3 mova m1, [stk+0x90] mova m6, [stk+0x60] mova [stk+0x50], m1 mova [stk+0x90], m6 mova m2, [stk+0xa0] mova m7, [stk+0x70] mova [stk+0x60], m2 mova [stk+0xa0], m7 pshufb m4, m12 pshufb m5, m14 pmaddwd m4, m13 pmaddwd m5, m15 phaddd m4, m5 paddd m4, hrnd_mem psrad m4, hsh_mem packssdw m4, m4 punpcklwd m3, [stk+0xb0], m4 mova [stk+0xb0], m4 mova [stk+0x70], m3 add srcq, ssq jmp .w4_loop .w4_skip_line: movu m6, [srcq+ssq*1] movu m7, [r5 +ssq*1] lea r5, [r5 +ssq*2] mov [stk+0xc0], r5 mova m0, [stk+0x50] mova m1, [stk+0x60] mova m2, [stk+0x70] mova m3, [stk+0x90] pshufb m4, m12 pshufb m6, m12 pshufb m5, m14 pshufb m7, m14 pmaddwd m4, m13 pmaddwd m6, m13 pmaddwd m5, m15 pmaddwd 
m7, m15 mova [stk+0x40], m0 mova [stk+0x50], m1 mova [stk+0x60], m2 mova [stk+0x80], m3 phaddd m4, m5 phaddd m6, m7 mova m5, [stk+0xa0] mova m7, [stk+0xb0] paddd m4, hrnd_mem paddd m6, hrnd_mem psrad m4, hsh_mem psrad m6, hsh_mem packssdw m4, m6 punpcklwd m7, m4 pshufd m6, m4, q1032 mova [stk+0x90], m5 mova [stk+0xa0], m7 mova [stk+0xb0], m6 punpcklwd m3, m4, m6 mova [stk+0x70], m3 lea srcq, [srcq+ssq*2] jmp .w4_loop %endif INIT_XMM ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .w_start: %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 %define hround m11 shr t0d, 16 movd m15, t0d %if isprep mova m13, [base+pd_m524256] %endif %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [stk+0x0f4], myd mov [stk+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov myd, [stk+0x0f4] mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, 
m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m10 pmaddwd m7, [stk+0xa0], m10 pmaddwd m8, [stk+0xb0], m11 pmaddwd m9, [stk+0xc0], m11 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd 
m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [stk+0x140], myd mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] jz .skip_line mova m14, [base+unpckw] movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m10, [srcq+r13*2] movu m11, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq mov myd, [stk+0x140] mov dyd, dym pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m10, [stk+0x70] pmaddwd m11, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m10, m11 mova m11, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m10 phaddd m4, m6 paddd m4, m11 paddd m8, m11 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m5, [stk+0x90], m14 ; 4a 5a pshufb m6, [stk+0xa0], m14 ; 4b 5b pshufb m7, [stk+0xb0], m15 ; 7a 6a pshufb m8, [stk+0xc0], m15 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m8 jmp .vloop .skip_line: MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 mov myd, [stk+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [stk+0x20] mova m1, [stk+0x30] mova m2, [stk+0x40] mova m3, [stk+0x50] jmp .vloop .next_line: test myd, 0x400 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] jz .skip_line MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], 
m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 punpcklwd m5, m6 mov myd, mym mova [stk+0x90], m5 jmp .vloop .skip_line: MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mov myd, mym mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova [stk+0x20], m0 mova [stk+0x30], m1 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %if isput .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define m13 [esp+0x20] movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 mov r1, r1m %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2, m3 REPX {pmaddwd x, m15}, m0, m1, m2, m3 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m2, m3 phaddd m4, m5 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m4, m6 REPX {psrad x, m12}, m0, m2, m4, m6 packssdw m0, m2 ; 0 1 2 3 packssdw m4, m6 ; 4 5 6 SWAP m1, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m2, m3 movu m1, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q REPX {pshufb x, m14}, m1, m7, m6 REPX {pmaddwd x, m15}, m1, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m1, m7 phaddd m6, m6 REPX {paddd x, m11}, m0, m2, m1, m6 REPX {psrad x, m12}, m0, m2, m1, m6 packssdw m0, m2 packssdw m1, m6 %define m8 m6 %define m9 m4 %define 
m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m4, m1, q2121 ; 5 6 5 6 punpcklwd m2, m1, m4 ; 45 56 %if ARCH_X86_32 mov r0, r0m %endif .dy1_w2_loop: movu m1, [srcq+ssq*0] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m7 mova m3, m0 pmaddwd m0, m8 pshufb m1, m14 pshufb m6, m14 pmaddwd m1, m15 pmaddwd m6, m15 phaddd m1, m6 paddd m1, m11 psrad m1, m12 packssdw m1, m1 paddd m5, m0 mova m0, m2 pmaddwd m2, m9 paddd m5, m2 palignr m2, m1, m4, 12 punpcklwd m2, m1 ; 67 78 pmaddwd m4, m2, m10 paddd m5, m13 paddd m5, m4 pxor m6, m6 mova m4, m1 pshufd m1, m12, q1032 psrad m5, m1 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd rX, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 %if isprep mov r3, r3m %endif %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] movu m1, [srcq+r4 ] movu m3, [srcq+r6 ] movu m2, [srcq+r11 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m7, m9, m8, m10 REPX {pmaddwd x, m13}, m7, m9, m8, m10 REPX 
{pshufb x, m14}, m1, m3, m2, m4 REPX {pmaddwd x, m15}, m1, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m7, m1 phaddd m9, m3 phaddd m8, m2 phaddd m10, m4 movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] REPX {paddd x, m5}, m7, m9, m8, m10 REPX {psrad x, xm6}, m7, m9, m8, m10 packssdw m7, m9 ; 0 1 packssdw m8, m10 ; 2 3 movu m0, [srcq+r4 ] movu m9, [srcq+r6 ] movu m10, [srcq+r11 ] add srcq, ss3q REPX {pshufb x, m12}, m1, m2, m3 REPX {pmaddwd x, m13}, m1, m2, m3 REPX {pshufb x, m14}, m0, m9, m10 REPX {pmaddwd x, m15}, m0, m9, m10 phaddd m1, m0 phaddd m2, m9 phaddd m3, m10 shr myd, 6 mov r13d, 64 << 24 lea myd, [t1+myq] cmovnz r13q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m1, m2, m3 REPX {psrad x, xm6}, m1, m2, m3 packssdw m1, m2 ; 4 5 packssdw m3, m3 ; 6 6 SWAP m9, m1 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 movq m10, r13 mova [stk+0x00], m1 mova [stk+0x10], m8 mova [stk+0x20], m2 mova [stk+0x30], m9 mova [stk+0x40], m3 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova [stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 movu m7, [srcq] movu m2, [r4] add srcq, ssq add r4, ssq mov [stk+0xb0], r4 pshufb m7, m12 pshufb m2, m14 pmaddwd m7, m13 pmaddwd m2, m15 phaddd m7, m2 paddd m7, [esp+0x00] psrad m7, [esp+0x10] packssdw m7, m7 ; 6 6 mova m4, [stk+0x60] mova m5, [stk+0x70] mova m6, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m shufps m1, m4, m5, q1032 ; 1 2 shufps m2, m5, m6, q1032 ; 3 4 shufps m3, m6, m7, q1032 ; 5 6 mova [stk+0xa0], m7 punpcklwd m0, m4, m1 ; 01 punpckhwd m4, m1 ; 12 punpcklwd m1, m5, m2 ; 23 punpckhwd m5, m2 ; 34 punpcklwd m2, m6, m3 ; 45 punpckhwd m6, m3 ; 56 movd m7, r4 movd m3, r5 mov r0, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xb0] mova [stk+0xc0], m4 ; 12 mova [stk+0x60], m1 ; 23 mova [stk+0x70], m2 ; 45 mova [stk+0x80], m5 ; 34 mova [stk+0x90], m6 ; 56 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m3 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] mova m7, [stk+0xc0] mova m8, [stk+0x80] %endif .dy1_w4_loop: movu m11, [srcq+ssq*0] movu m6, [srcq+ssq*1] pmaddwd m0, m3 pmaddwd m7, m3 pmaddwd m1, m4 pmaddwd m8, m4 pmaddwd m2, m5 pmaddwd m9, m5 paddd m1, m0 paddd m8, m7 %if ARCH_X86_64 movu m0, [srcq+r4] movu m7, [srcq+r6] %else movu m0, [r4+ssq*0] movu m7, [r4+ssq*1] lea r4, [r4+ssq*2] %endif lea srcq, [srcq+ssq*2] paddd m1, m2 paddd m8, m9 pshufb m11, m12 pshufb m6, m12 pmaddwd m11, m13 pmaddwd m6, m13 pshufb m0, m14 
pshufb m7, m14 pmaddwd m0, m15 pmaddwd m7, m15 phaddd m11, m0 phaddd m6, m7 paddd m11, hrnd_mem paddd m6, hrnd_mem psrad m11, hsh_mem psrad m6, hsh_mem packssdw m11, m6 ; 7 8 %if ARCH_X86_64 shufps m9, [stk+0x40], m11, q1032 ; 6 7 mova m0, [stk+0x00] mova [stk+0x40], m11 %else shufps m9, [stk+0xa0], m11, q1032 ; 6 7 mova m0, [stk+0x60] mova [stk+0xa0], m11 %endif punpcklwd m2, m9, m11 ; 67 punpckhwd m9, m11 ; 78 pmaddwd m6, m2, m10 pmaddwd m7, m9, m10 %if isput movd m11, vsh_mem %endif paddd m1, vrnd_mem paddd m8, vrnd_mem paddd m1, m6 paddd m8, m7 %if ARCH_X86_64 mova m7, [stk+0x10] %else mova m7, [stk+0x80] %endif %if isput psrad m1, m11 psrad m8, m11 %else psrad m1, 6 psrad m8, 6 %endif packssdw m1, m8 %if ARCH_X86_64 mova m8, [stk+0x30] %else mova m8, [stk+0x90] %endif %if isput pxor m6, m6 pmaxsw m1, m6 pminsw m1, pxmaxm movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m1 add tmpq, 16 %endif %if ARCH_X86_64 mova m1, [stk+0x20] mova [stk+0x10], m8 mova [stk+0x00], m1 mova [stk+0x20], m2 mova [stk+0x30], m9 %else mova m1, [stk+0x70] mova [stk+0x80], m8 mova [stk+0x60], m1 mova [stk+0x70], m2 mova [stk+0x90], m9 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy1_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: %if ARCH_X86_64 mova 
m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, [stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy1_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, 
[base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 .dy1_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movu m8, [srcq+r10*2] movu m9, [srcq+r11*2] movu m12, [srcq+r13*2] movu m13, [srcq+ rX*2] movu m4, [srcq+ r4*2] movu m5, [srcq+ r6*2] movu m6, [srcq+ r7*2] movu m7, [srcq+ r9*2] add srcq, ssq pmaddwd m8, [stk+0x50] pmaddwd m9, [stk+0x60] pmaddwd m12, [stk+0x70] pmaddwd m13, [stk+0x80] pmaddwd m4, [stk+0x10] pmaddwd m5, [stk+0x20] pmaddwd m6, [stk+0x30] pmaddwd m7, [stk+0x40] phaddd m8, m9 phaddd m12, m13 mova m9, [base+unpckw] mova m13, hround phaddd m4, m5 phaddd m6, m7 phaddd m8, m12 phaddd m4, m6 pshufd m5, m9, q1032 pshufb m0, m9 ; 0a 1a pshufb m1, m9 ; 0b 1b pshufb m2, m5 ; 3a 2a pshufb m3, m5 ; 3b 2b mova m12, shift paddd m4, m13 paddd m8, m13 psrad m4, m12 psrad m8, m12 packssdw m4, m8 pshufb m6, [stk+0x90], m9 ; 4a 5a pshufb m7, [stk+0xa0], m9 ; 4b 5b pshufb m8, [stk+0xb0], m5 ; 7a 6a pshufb m13, [stk+0xc0], m5 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m7 ; 34b punpckhwd m6, m8 ; 56a punpckhwd m7, m13 ; 56b punpcklwd m8, m4 ; 78a punpckhqdq m4, m4 punpcklwd m13, m4 ; 78b mova [stk+0x90], m6 mova [stk+0xa0], m7 mova [stk+0xb0], 
m8 mova [stk+0xc0], m13 mova m13, vround %else mov r0m, r0 mov r3, r3m mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 mova m7, [base+unpckw] pshufd m4, m7, q1032 pshufb m0, [stk+0x20], m7 ; 0a 1a pshufb m1, [stk+0x30], m7 ; 0b 1b pshufb m2, [stk+0x40], m4 ; 3a 2a pshufb m3, [stk+0x50], m4 ; 3b 2b pshufb m5, [stk+0x60], m7 ; 4a 5a pshufb m6, [stk+0x70], m7 ; 4b 5b pshufb m7, [stk+0x80], m4 ; 7a 6a punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 punpckhwd m5, m7 ; 56a mova [stk+0x60], m5 pshufb m5, [stk+0x90], m4 ; 7b 6b punpcklwd m7, [stk+0xe0] ; 78a mova m4, [stk+0x180] punpckhwd m6, m5 ; 56b mova [stk+0x70], m6 movq m6, [stk+0xe8] mova [stk+0x80], m7 mova m7, [stk+0x1b0] punpcklwd m5, m6 mova m6, [stk+0x1a0] mova [stk+0x90], m5 mova m5, [stk+0x190] mov r0, r0m %endif jmp .dy1_vloop INIT_XMM ssse3 %if ARCH_X86_64 %define stk rsp+0x20 %endif .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %if isput .dy2_w2: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m13 %define vrnd_mem [rsp+0x10] movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %define m11 [esp+0x00] %define m12 [esp+0x10] %define vrnd_mem [esp+0x20] mov r1, r1m movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pxor m9, m9 punpckldq m9, m8 paddd m14, m9 ; mx+dx*[0-1] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 pshufd m15, m15, q0321 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_q] mova m6, [base+spel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m2, m2 pcmpeqd m8, m2 psrld m14, 10 paddd m14, m14 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [stk], m14 SWAP m5, m0 SWAP m6, m3 %define m15 m6 %endif movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*2] movu m2, [srcq+ssq*4] punpckldq m15, m7 %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 pand m9, m8 pandn m8, m15 SWAP m15, m8 por m15, m9 movu m4, [srcq+ssq*1] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else pand m7, m5, [base+pd_0x4000] pandn m5, m15 por m5, m7 %define m15 m5 mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] mov [stk+0x20], r3 mov r3, r3m %endif punpcklbw m15, m15 psraw m15, 8 REPX {pshufb x, m14}, m0, m1, m2 REPX {pmaddwd x, m15}, m0, m1, m2 %if ARCH_X86_64 REPX {pshufb x, m14}, m4, m5, m6 REPX {pmaddwd x, m15}, m4, m5, m6 phaddd m0, m1 phaddd m1, m2 phaddd m4, m5 phaddd m5, m6 REPX {paddd x, m11}, m0, m1, m4, m5 REPX {psrad x, m12}, m0, m1, m4, m5 packssdw m0, m1 ; 0 2 2 4 packssdw m4, m5 ; 1 3 3 5 SWAP m2, m4 movq m10, r4 %else mova [stk+0x10], m15 phaddd m0, m1 phaddd m1, m2 movu m2, [srcq+ssq*1] movu m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m6, [srcq+ssq*1] lea srcq, [srcq+ssq*2] REPX {pshufb x, m14}, m2, m7, m6 REPX {pmaddwd x, m15}, m2, m7, m6 %define m14 [stk+0x00] %define m15 [stk+0x10] phaddd m2, m7 phaddd m7, m6 REPX {paddd x, m11}, m0, m1, m2, m7 REPX {psrad x, m12}, m0, m1, m2, m7 packssdw m0, m1 
packssdw m2, m7 %define m8 m6 %define m9 m4 %define m10 m5 movd m10, r4 movd m9, [stk+0x20] punpckldq m10, m9 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 mova [stk+0x50], m7 mova [stk+0x60], m8 mova [stk+0x70], m9 mova [stk+0x80], m10 %xdefine m13 m7 %define m7 [stk+0x50] %define m8 [stk+0x60] %define m9 [stk+0x70] %define m10 [stk+0x80] %endif punpcklwd m1, m0, m2 ; 01 23 punpckhwd m3, m0, m2 ; 23 45 %if ARCH_X86_32 mov r4, r0m %define dstq r4 mova [stk+0x20], m3 mova [stk+0x30], m0 %endif .dy2_w2_loop: movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m6, [srcq+ssq*2] movu m13, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m3, m8 REPX {pshufb x, m14}, m4, m5, m6, m13 REPX {pmaddwd x, m15}, m4, m5, m6, m13 phaddd m4, m5 phaddd m6, m13 pmaddwd m5, m1, m7 paddd m4, m11 paddd m6, m11 psrad m4, m12 psrad m6, m12 packssdw m4, m6 ; 6 7 8 9 paddd m5, m3 pshufd m3, m4, q2200 pshufd m4, m4, q3311 palignr m3, m0, 12 ; 4 6 6 8 palignr m4, m2, 12 ; 5 7 7 9 mova m0, m3 mova m2, m4 punpcklwd m1, m3, m4 punpckhwd m3, m4 pmaddwd m6, m1, m9 pmaddwd m4, m3, m10 paddd m5, vrnd_mem paddd m6, m4 paddd m5, m6 pshufd m4, m12, q1032 pxor m6, m6 psrad m5, m4 packssdw m5, m5 pmaxsw m5, m6 pminsw m5, pxmaxm movd [dstq+dsq*0], m5 pshuflw m5, m5, q1032 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym mova [rsp+0x10], m11 mova [rsp+0x20], m12 %if isput mova [rsp+0x30], m13 %define vrnd_mem [rsp+0x30] %define stk rsp+0x40 %else %define vrnd_mem [base+pd_m524256] %define stk rsp+0x30 %endif movzx t0d, t0b sub srcq, 2 movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m9 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r5, byte [esp+0x1f0] sub srcq, 2 movd m15, r5 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m9, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 pshufd m7, m15, q1032 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r6d, m15 movd r13d, m7 mova m10, [base+bdct_lb_q+ 0] mova m11, [base+bdct_lb_q+16] movd m13, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+ r6*8+2] movd m15, [base+subpel_filters+r11*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r1, m15 movd r4, m7 pshufd m15, m15, q0321 pshufd m7, m7, q0321 movd r3, m15 movd r5, m7 mova m5, [base+bdct_lb_q+ 0] mova m6, [base+bdct_lb_q+16] movd m1, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] SWAP m4, m7 mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %define m10 m5 %define m11 m6 %define m12 m1 %define m13 m1 %endif psrld m14, 10 paddd m14, m14 punpckldq m13, m2 punpckldq m15, m4 punpcklqdq m13, m15 pxor m2, m2 pcmpeqd m0, m2 %if ARCH_X86_64 pand m9, m0 %else pand m2, m9, m0 %define m9 m2 SWAP m7, m4 %endif pandn m0, m13 %if ARCH_X86_64 SWAP m13, m0 %else %define m13 m0 %endif por m13, m9 punpckhbw m15, m13, m13 punpcklbw m13, m13 psraw m15, 8 psraw m13, 8 pshufb m12, m14, m10 pshufb m14, m11 mova m10, [base+spel_s_shuf2] movd r4d, m14 shr r4d, 24 %if ARCH_X86_32 mova [stk+0x40], m13 mova [stk+0x50], m15 pxor m2, m2 %endif pshufb m7, m14, m2 psubb m14, m7 paddb m12, m10 paddb m14, m10 %if ARCH_X86_64 lea r6, [r4+ssq*1] lea r11, [r4+ssq*2] lea r13, [r4+ss3q ] movu m1, [srcq+ssq*0] 
movu m8, [srcq+ssq*2] movu m9, [srcq+ssq*1] movu m10, [srcq+ss3q ] movu m7, [srcq+r4 ] movu m2, [srcq+r11 ] movu m3, [srcq+r6 ] movu m4, [srcq+r13 ] lea srcq, [srcq+ssq*4] REPX {pshufb x, m12}, m1, m9, m8, m10 REPX {pmaddwd x, m13}, m1, m9, m8, m10 REPX {pshufb x, m14}, m7, m3, m2, m4 REPX {pmaddwd x, m15}, m7, m3, m2, m4 mova m5, [rsp+0x10] movd xm6, [rsp+0x20] phaddd m1, m7 phaddd m8, m2 phaddd m9, m3 phaddd m10, m4 movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] REPX {paddd x, m5}, m1, m9, m8, m10 REPX {psrad x, xm6}, m1, m9, m8, m10 packssdw m1, m8 ; 0 2 packssdw m9, m10 ; 1 3 movu m0, [srcq+r4 ] movu m8, [srcq+r6 ] lea srcq, [srcq+ssq*2] REPX {pshufb x, m12}, m2, m3 REPX {pmaddwd x, m13}, m2, m3 REPX {pshufb x, m14}, m0, m8 REPX {pmaddwd x, m15}, m0, m8 phaddd m2, m0 phaddd m3, m8 shr myd, 6 mov r9d, 64 << 24 lea myd, [t1+myq] cmovnz r9q, [base+subpel_filters+myq*8] REPX {paddd x, m5}, m2, m3 REPX {psrad x, xm6}, m2, m3 packssdw m2, m3 ; 4 5 pshufd m3, m2, q1032 ; 5 _ punpcklwd m0, m1, m9 ; 01 punpckhwd m1, m9 ; 23 punpcklwd m2, m3 ; 45 movq m10, r9 %define hrnd_mem [rsp+0x10] %define hsh_mem [rsp+0x20] %define vsh_mem [rsp+0x28] %if isput %define vrnd_mem [rsp+0x30] %else %define vrnd_mem [base+pd_m524256] %endif %else mova [stk+0x20], m12 mova [stk+0x30], m14 add r4, srcq MC_4TAP_SCALED_H 0x60 ; 0 1 MC_4TAP_SCALED_H 0x70 ; 2 3 MC_4TAP_SCALED_H 0x80 ; 4 5 mov [stk+0xe0], r4 mova m3, [base+spel_s_shuf8] mova m0, [stk+0x60] mova m1, [stk+0x70] mova m2, [stk+0x80] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m pshufb m0, m3 ; 01 pshufb m1, m3 ; 23 pshufb m2, m3 ; 45 movd m7, r4 movd m4, r5 mov r5, r0m %if isput mov r1, r1m %endif mov r4, [stk+0xe0] %define dstq r5 %define tmpq r5 %define m12 [stk+0x20] %define m14 [stk+0x30] %define m13 [stk+0x40] %define m15 [stk+0x50] %define hrnd_mem [esp+0x00] %define hsh_mem [esp+0x10] %define vsh_mem [esp+0x18] %if isput %define vrnd_mem [esp+0x20] %else %define vrnd_mem [base+pd_m524256] %endif %define m10 m7 punpckldq m10, m4 %endif punpcklbw m10, m10 psraw m10, 8 pshufd m3, m10, q0000 pshufd m4, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_32 %xdefine m8 m3 %xdefine m9 m6 %xdefine m11 m5 %xdefine m6 m4 mova [stk+0x100], m3 mova [stk+0x110], m4 mova [stk+0x120], m5 mova [stk+0x130], m10 %define m3 [stk+0x100] %define m4 [stk+0x110] %define m5 [stk+0x120] %define m10 [stk+0x130] %endif .dy2_w4_loop: pmaddwd m8, m0, m3 pmaddwd m9, m1, m3 mova m0, m2 pmaddwd m1, m4 pmaddwd m11, m2, m4 paddd m8, vrnd_mem paddd m9, vrnd_mem pmaddwd m2, m5 paddd m8, m1 paddd m9, m11 paddd m8, m2 movu m6, [srcq+ssq*0] movu m1, [srcq+ssq*2] %if ARCH_X86_64 movu m11, [srcq+r4 ] movu m2, [srcq+r11] %else movu m11, [r4+ssq*0] movu m2, [r4+ssq*2] %endif pshufb m6, m12 pshufb m1, m12 pmaddwd m6, m13 pmaddwd m1, m13 pshufb m11, m14 pshufb m2, m14 pmaddwd m11, m15 pmaddwd m2, m15 phaddd m6, m11 phaddd m1, m2 paddd m6, hrnd_mem paddd m1, hrnd_mem psrad m6, hsh_mem psrad m1, hsh_mem movu m7, [srcq+ssq*1] movu m11, [srcq+ss3q ] packssdw m6, m1 ; 6 8 %if ARCH_X86_64 movu m2, [srcq+r6 ] movu m1, [srcq+r13] %else movu m2, [r4+ssq*1] movu m1, [r4+ss3q ] %endif pshufb m7, m12 pshufb m11, m12 pmaddwd m7, m13 pmaddwd m11, m13 pshufb m2, m14 pshufb m1, m14 pmaddwd m2, m15 pmaddwd m1, m15 phaddd m7, m2 phaddd m11, m1 paddd m7, hrnd_mem paddd m11, hrnd_mem psrad m7, hsh_mem psrad m11, hsh_mem packssdw m7, m11 ; 7 9 %if ARCH_X86_32 lea r4, 
[r4+ssq*4] %endif lea srcq, [srcq+ssq*4] punpcklwd m1, m6, m7 ; 67 punpckhwd m6, m7 ; 89 mova m2, m6 pmaddwd m11, m1, m5 pmaddwd m7, m1, m10 pmaddwd m6, m10 paddd m9, m11 %if isput movd m11, vsh_mem %endif paddd m8, m7 paddd m9, m6 %if isput psrad m8, m11 psrad m9, m11 packssdw m8, m9 pxor m7, m7 pmaxsw m8, m7 pminsw m8, pxmaxm movq [dstq+dsq*0], m8 movhps [dstq+dsq*1], m8 lea dstq, [dstq+dsq*2] %else psrad m8, 6 psrad m9, 6 packssdw m8, m9 mova [tmpq], m8 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET ; why not jz .ret? INIT_XMM ssse3 .dy2_w8: mov dword [stk+0xf0], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [stk+0xf0], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [stk+0xf0], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [stk+0xf0], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [stk+0xf0], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %if ARCH_X86_64 %ifidn %1, put movifnidn dsm, dsq %endif mova [rsp+0x10], m11 mova [rsp+0x20], m12 %define hround m11 %if isput mova [rsp+0x30], m13 %else mova m13, [base+pd_m524256] %endif shr t0d, 16 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define hround [esp+0x00] %define m12 [esp+0x10] %define m10 [base+pd_0x3ff] %define m8 m0 %xdefine m14 m4 %xdefine m15 m3 %if isput %define dstq r0 %else %define tmpq r0 %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif sub srcq, 6 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q %else movd m5, r4 movd m6, r5 punpckldq m5, m6 SWAP m3, m5 %endif punpcklbw m3, m3 psraw m3, 8 mova [stk+0x100], m7 mova [stk+0x120], m15 mov [stk+0x0f8], srcq mov [stk+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 %if ARCH_X86_64 mova [stk+0x140], m0 mova [stk+0x150], m1 mova [stk+0x160], m2 mova [stk+0x170], m3 %if UNIX64 mov hm, hd %endif %else mova [stk+0x180], m0 mova [stk+0x190], m1 mova [stk+0x1a0], m2 mova [stk+0x1b0], m3 SWAP m5, m3 mov r5, hm mov [stk+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [stk+0x0f0] jz .ret %if ARCH_X86_64 add qword [stk+0x130], 16 mov hd, hm %else add dword [stk+0x130], 16 mov r5, [stk+0x134] mov r0, [stk+0x130] %endif mova m7, [stk+0x100] mova m14, [stk+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m11, [rsp+0x10] %endif mova m15, [stk+0x120] mov srcq, [stk+0x0f8] %if ARCH_X86_64 mov r0q, [stk+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: %if ARCH_X86_64 mova m9, [base+pq_0x40000000] %else %define m9 [base+pq_0x40000000] %endif pxor m1, m1 psrld m2, m14, 10 mova [stk], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m1 pshufd m2, m5, q1032 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 pshufd m5, m5, q0321 pshufd m2, m2, q0321 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, 
[base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pxor m2, m2 pcmpeqd m5, m2 mova [stk+0x110], m14 pshufd m4, m15, q1032 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 movq r11, m14 punpckhqdq m14, m14 movq rX, m14 mov r10d, r11d shr r11, 32 mov r13d, rXd shr rX, 32 mov r4d, [stk+ 0] mov r6d, [stk+ 4] mov r7d, [stk+ 8] mov r9d, [stk+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m9, m4 pand m8, m9, m6 pand m15, m9, m14 pand m9, m9, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m9, m5 punpcklbw m0, m7, m7 punpckhbw m7, m7 punpcklbw m1, m8, m8 punpckhbw m8, m8 psraw m0, 8 psraw m7, 8 psraw m1, 8 psraw m8, 8 punpcklbw m2, m15, m15 punpckhbw m15, m15 punpcklbw m3, m9, m9 punpckhbw m9, m9 psraw m2, 8 psraw m15, 8 psraw m3, 8 psraw m9, 8 mova [stk+0x10], m0 mova [stk+0x20], m7 mova [stk+0x30], m1 mova [stk+0x40], m8 mova [stk+0x50], m2 mova [stk+0x60], m15 mova [stk+0x70], m3 mova [stk+0x80], m9 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 mova [stk+0x90], m1 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 mova [stk+0xa0], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 mova [stk+0xb0], m3 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 mova [stk+0xc0], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 mova [stk+0xd0], m5 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 mova m5, [stk+0xd0] mova m1, [stk+0x90] mova m2, [stk+0xa0] mova m3, [stk+0xb0] mova m9, [stk+0xc0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova m10, [stk+0x140] mova m11, [stk+0x150] mova m14, [stk+0x160] mova m15, [stk+0x170] mova [stk+0x90], m4 mova [stk+0xa0], m5 mova [stk+0xb0], m6 mova [stk+0xc0], m7 %define hround [rsp+0x10] %define shift [rsp+0x20] %if isput %define vround [rsp+0x30] %else %define vround [base+pd_m524256] %endif .dy2_vloop: pmaddwd m4, m0, m10 pmaddwd m5, m1, m10 pmaddwd m6, m2, m11 pmaddwd m7, m3, m11 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [stk+0x90], m14 pmaddwd m7, [stk+0xa0], m14 pmaddwd m8, [stk+0xb0], m15 pmaddwd m9, [stk+0xc0], m15 paddd m4, m6 paddd m5, m7 %if isput pshufd m6, m12, q1032 %endif paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 pshufd m15, m15, q0321 pshufd m4, m4, q0321 movd r4, m15 movd r5, m4 mova m14, [stk+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [stk+16], m14 mov r0, [stk+ 0] mov rX, [stk+ 4] mov r4, [stk+ 8] mov r5, [stk+12] mova [stk+0x20], m0 mova [stk+0x30], m1 mova [stk+0x40], m2 mova [stk+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m9, m4 pand m1, m9, m6 pand m2, m9, m7 pand m3, m9, m5 pandn m4, [stk+0x20] pandn m6, [stk+0x30] pandn m7, [stk+0x40] pandn m5, [stk+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 punpcklbw m4, m0, m0 punpckhbw m0, m0 punpcklbw m5, m1, m1 punpckhbw m1, m1 psraw 
m4, 8 psraw m0, 8 psraw m5, 8 psraw m1, 8 punpcklbw m6, m2, m2 punpckhbw m2, m2 punpcklbw m7, m3, m3 punpckhbw m3, m3 psraw m6, 8 psraw m2, 8 psraw m7, 8 psraw m3, 8 mova [stk+0x0a0], m4 mova [stk+0x0b0], m0 mova [stk+0x0c0], m5 mova [stk+0x0d0], m1 mova [stk+0x140], m6 mova [stk+0x150], m2 mova [stk+0x160], m7 mova [stk+0x170], m3 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 mova m5, [stk+0x60] mova m6, [stk+0x70] mova m7, [stk+0x80] mova m0, [stk+0x90] mov r0, r0m punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [stk+0x60], m4 mova [stk+0x70], m5 mova [stk+0x80], m6 mova [stk+0x90], m7 mova m1, [stk+0x20] mova m2, [stk+0x30] mova m3, [stk+0x40] mova m4, [stk+0x50] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova m4, [stk+0x180] mova m5, [stk+0x190] mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mova [stk+0x40], m2 mova [stk+0x50], m3 .dy2_vloop: pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 paddd m0, m2 paddd m1, m3 pmaddwd m2, [stk+0x60], m6 pmaddwd m3, [stk+0x70], m6 pmaddwd m4, [stk+0x80], m7 pmaddwd m5, [stk+0x90], m7 %if isput movd m6, [esp+0x18] %endif paddd m0, m2 paddd m1, m3 paddd m0, vrnd_mem paddd m1, vrnd_mem paddd m4, m0 paddd m5, m1 %endif %ifidn %1, put psrad m4, m6 psrad m5, m6 packssdw m4, m5 pxor m7, m7 pmaxsw m4, m7 pminsw m4, pxmaxm mova [dstq], m4 add dstq, dsm %else psrad m4, 6 psrad m5, 6 packssdw m4, m5 mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep %if ARCH_X86_64 MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 mova [stk+0xd0], m4 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 mova m4, [stk+0xd0] mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [stk+0x90] ; 23a mova m3, [stk+0xa0] ; 23b mova m5, [stk+0xb0] ; 45a mova m6, [stk+0xc0] ; 45b punpcklwd m7, m4, m8 ; 67a punpckhwd m4, m8 ; 67b mova [stk+0x90], m5 mova [stk+0xa0], m6 mova [stk+0xb0], m7 mova [stk+0xc0], m4 %else mov r0m, r0 mov r3, r3m MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 MC_8TAP_SCALED_H 0xa0, 0 ; 9 mova m7, [stk+0xe0] mova m2, [stk+0x60] ; 23a mova m3, [stk+0x70] ; 23b mova m4, [stk+0x80] ; 45a mova m5, [stk+0x90] ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova m0, [stk+0x40] ; 01a mova m1, [stk+0x50] ; 01b mova [stk+0x40], m2 mova [stk+0x50], m3 mova [stk+0x60], m4 mova [stk+0x70], m5 mova m4, [stk+0x180] mova m5, [stk+0x190] mova [stk+0x80], m6 mova [stk+0x90], m7 mova m6, [stk+0x1a0] mova m7, [stk+0x1b0] mov r0, r0m %endif jmp .dy2_vloop INIT_XMM ssse3 .ret: MC_8TAP_SCALED_RET 0 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT %define r0m [rstk+stack_offset+ 4] %define r1m [rstk+stack_offset+ 8] %define r2m [rstk+stack_offset+12] %define r3m [rstk+stack_offset+16] %endif %undef isput %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_16bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, (5*15 << 16) | 5*15 jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN put FN put_8tap_scaled, sharp, SHARP, SHARP FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN put_8tap_scaled, smooth, SMOOTH, SMOOTH FN put_8tap_scaled, sharp_regular, SHARP, REGULAR 
FN put_8tap_scaled, regular_sharp, REGULAR, SHARP FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN put_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 7 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN prep FN prep_8tap_scaled, sharp, SHARP, SHARP FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN prep_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED prep %if ARCH_X86_64 DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 2 %endif %if ARCH_X86_64 ; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that ; by allocating 16 bytes more stack space so that stack offsets match up. %if WIN64 && STACK_ALIGNMENT == 16 %assign stksz 16*14 %else %assign stksz 16*13 %endif cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \ mx, tmp, alpha, beta, \ filter, my, gamma, cnt %assign stack_size_padded_8x8t stack_size_padded %else cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my %define m8 [esp+16*13] %define m9 [esp+16*14] %define cntd dword [esp+4*63] %define dstq tmpq %define dsq 0 %if STACK_ALIGNMENT < 16 %define dstm [esp+4*65] %define dsm [esp+4*66] %else %define dstm r0m %define dsm r1m %endif %endif %define base filterq-$$ mov t0d, r7m LEA filterq, $$ shr t0d, 11 %if ARCH_X86_64 movddup m8, [base+warp8x8t_rnd] %else movddup m1, [base+warp8x8t_rnd] mov r1, r1m add r1, r1 mova m8, m1 mov r1m, r1 ; ds *= 2 %endif call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main jmp .start .loop: %if ARCH_X86_64 lea dstq, [dstq+dsq*4] %else add dstq, dsm mov dstm, dstq %endif call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2 .start: %if ARCH_X86_32 mov dstq, dstm %endif paddd m1, m8 paddd m2, m8 psrad m1, 15 psrad m2, 15 packssdw m1, m2 mova [dstq+dsq*0], m1 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3 %if ARCH_X86_32 mov dstq, dstm add dstq, dsm %endif paddd m1, m8 paddd m2, m8 psrad m1, 15 psrad m2, 15 packssdw m1, m2 mova [dstq+dsq*2], m1 dec cntd jg .loop RET %if ARCH_X86_64 cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \ mx, tmp, alpha, beta, \ filter, my, gamma, cnt ASSERT stack_size_padded == stack_size_padded_8x8t %else cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \ filter, mx, my %endif mov t0d, r7m LEA filterq, $$ shr t0d, 11 %if ARCH_X86_64 movddup m8, [base+warp8x8_rnd2+t0*8] movd m9, r7m ; pixel_max pshufb m9, [base+pw_256] %else movddup m1, [base+warp8x8_rnd2+t0*8] movd m2, r7m ; pixel_max pshufb m2, [base+pw_256] mova m8, m1 mova m9, m2 %endif call .main jmp .start .loop: %if ARCH_X86_64 lea dstq, [dstq+dsq*2] %else add dstq, dsm mov dstm, dstq %endif call .main2 .start: %if ARCH_X86_32 mov dstq, dstm %endif psrad m1, 16 psrad m2, 16 packssdw m1, m2 pmaxsw m1, m6 pmulhrsw m1, m8 pminsw m1, m9 mova [dstq+dsq*0], m1 call .main3 %if ARCH_X86_32 mov dstq, dstm add dstq, dsm %endif psrad m1, 16 psrad m2, 16 packssdw m1, m2 pmaxsw m1, m6 pmulhrsw m1, m8 pminsw m1, m9 mova [dstq+dsq*1], m1 dec cntd jg .loop RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov deltaq, r5m mov mxd, r6m %endif movd m0, [base+warp8x8_shift+t0*4] movddup m7, [base+warp8x8_rnd1+t0*8] add filterq, mc_warp_filter-$$ %if ARCH_X86_64 movsx alphad, word [deltaq+2*0] movsx betad, word [deltaq+2*1] movsx gammad, word [deltaq+2*2] movsx deltad, word [deltaq+2*3] lea tmpq, [ssq*3] add mxd, 512+(64<<10) sub srcq, tmpq ; src -= ss*3 imul tmpd, alphad, -7 mov myd, r7m add betad, tmpd ; beta -= alpha*7 imul tmpd, gammad, -7 add myd, 512+(64<<10) mov cntd, 4 add deltad, tmpd ; delta -= gamma*7 %else %if STACK_ALIGNMENT < 16 %assign stack_offset stack_offset - gprsize %endif mov r3d, r5m ; abcd %if STACK_ALIGNMENT < 16 mov r0, r1m ; dst mov r1, r2m ; ds mov [esp+gprsize+4*65], r0 mov [esp+gprsize+4*66], r1 %endif movsx alphad, word [r3+2*0] movsx r2d, word [r3+2*1] movsx gammad, word [r3+2*2] movsx r3d, word [r3+2*3] imul r5d, alphad, -7 add r2d, r5d ; beta -= alpha*7 imul r5d, gammad, -7 mov [esp+gprsize+4*60], r2d add r3d, r5d ; delta -= gamma*7 mov [esp+gprsize+4*61], r3d mov r3d, r4m ; ss mov srcq, r3m mov mxd, r6m mov myd, r7m mov dword [esp+gprsize+4*63], 4 ; cnt mov [esp+gprsize+4*62], r3 lea r3, [r3*3] add mxd, 512+(64<<10) add myd, 512+(64<<10) sub srcq, r3 ; src -= ss*3 %if STACK_ALIGNMENT < 16 %assign stack_offset stack_offset + gprsize %endif %endif mova [rsp+gprsize], m0 pxor m6, m6 call .h mova m5, m0 call .h punpcklwd m1, m5, m0 ; 01 punpckhwd m5, m0 mova [rsp+gprsize+16* 1], m1 mova [rsp+gprsize+16* 4], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 12 punpckhwd m5, m0 mova [rsp+gprsize+16* 7], m1 mova [rsp+gprsize+16*10], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 23 punpckhwd m5, m0 mova [rsp+gprsize+16* 2], m1 mova [rsp+gprsize+16* 5], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 34 punpckhwd m5, m0 mova [rsp+gprsize+16* 8], m1 mova [rsp+gprsize+16*11], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 45 punpckhwd m5, m0 mova [rsp+gprsize+16* 3], m1 mova [rsp+gprsize+16* 6], m5 mova m5, m0 call .h punpcklwd m1, m5, m0 ; 56 punpckhwd m5, m0 mova [rsp+gprsize+16* 9], m1 mova [rsp+gprsize+16*12], m5 mova m5, m0 .main2: call .h %macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h lea tmpd, [myq+gammaq] shr myd, 10 movq m4, [filterq+myq*8] ; a lea myd, [tmpq+gammaq] shr tmpd, 10 movq m2, [filterq+tmpq*8] ; b lea tmpd, [myq+gammaq] shr myd, 10 movq m3, [filterq+myq*8] ; c lea myd, [tmpq+gammaq] shr tmpd, 10 movq m1, [filterq+tmpq*8] ; d lea tmpd, [myq+gammaq] shr myd, 10 punpcklwd m4, m2 punpcklwd m3, m1 punpckldq m2, m4, m3 punpckhdq m4, m3 punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 pmaddwd m1, [rsp+gprsize+16*%1] punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 mova m2, [rsp+gprsize+16*%2] pmaddwd m3, m2 mova [rsp+gprsize+16*%1], m2 paddd m1, m3 punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 mova m2, [rsp+gprsize+16*%3] pmaddwd m3, m2 mova [rsp+gprsize+16*%2], m2 paddd m1, m3 punpcklwd m3, m5, m0 ; 67 punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 pmaddwd m2, m3 mova [rsp+gprsize+16*%3], m3 paddd m1, m2 movq m4, [filterq+myq*8] ; e lea myd, [tmpq+gammaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] ; f lea tmpd, [myq+gammaq] shr myd, 10 movq m2, [filterq+myq*8] ; g %if ARCH_X86_64 lea myd, [tmpq+deltaq] ; my += delta %else mov myd, [esp+gprsize+4*61] add myd, tmpd %endif shr tmpd, 10 punpcklwd m4, m3 movq m3, [filterq+tmpq*8] ; h punpcklwd m2, m3 punpckldq m3, m4, m2 punpckhdq m4, m2 punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 pmaddwd m2, [rsp+gprsize+16*%4] punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 mova m3, 
[rsp+gprsize+16*%5] pmaddwd m6, m3 mova [rsp+gprsize+16*%4], m3 pxor m3, m3 paddd m2, m6 punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 mova m6, [rsp+gprsize+16*%6] pmaddwd m3, m6 mova [rsp+gprsize+16*%5], m6 punpckhwd m5, m0 pxor m6, m6 paddd m2, m3 punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 pmaddwd m3, m5 mova [rsp+gprsize+16*%6], m5 mova m5, m0 paddd m2, m3 %endmacro WARP_V 1, 2, 3, 4, 5, 6 ret .main3: call .h WARP_V 7, 8, 9, 10, 11, 12 ret ALIGN function_align .h: lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] punpcklbw m0, m6, m3 movu m3, [srcq-6] pmaddwd m0, m3 ; 0 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m2, m6, m3 movu m3, [srcq-4] pmaddwd m2, m3 ; 1 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m0, m2 ; 0 1 punpcklbw m2, m6, m3 movu m3, [srcq-2] pmaddwd m2, m3 ; 2 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m1, m6, m3 movu m3, [srcq+0] pmaddwd m1, m3 ; 3 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m2, m1 ; 2 3 punpcklbw m1, m6, m3 movu m3, [srcq+2] pmaddwd m1, m3 ; 4 lea mxd, [tmpq+alphaq] shr tmpd, 10 movq m3, [filterq+tmpq*8] phaddd m0, m2 ; 0 1 2 3 punpcklbw m2, m6, m3 movu m3, [srcq+4] pmaddwd m2, m3 ; 5 lea tmpd, [mxq+alphaq] shr mxd, 10 movq m3, [filterq+mxq*8] phaddd m1, m2 ; 4 5 punpcklbw m2, m6, m3 movu m3, [srcq+6] pmaddwd m2, m3 ; 6 %if ARCH_X86_64 lea mxd, [tmpq+betaq] ; mx += beta %else mov mxd, [esp+gprsize*2+4*60] add mxd, tmpd %endif shr tmpd, 10 movq m3, [filterq+tmpq*8] punpcklbw m4, m6, m3 movu m3, [srcq+8] %if ARCH_X86_64 add srcq, ssq %else add srcq, [esp+gprsize*2+4*62] %endif pmaddwd m3, m4 ; 7 phaddd m2, m3 ; 6 7 phaddd m1, m2 ; 4 5 6 7 paddd m0, m7 paddd m1, m7 psrad m0, [rsp+gprsize*2] psrad m1, [rsp+gprsize*2] packssdw m0, m1 ret %macro BIDIR_FN 0 call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .ret: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jne .w8_loop RET .w16_loop: call .main add dstq, strideq .w16: mova [dstq+16*0], m0 mova [dstq+16*1], m1 dec hd jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call .main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h %define base r6-avg_ssse3_table LEA r6, avg_ssse3_table tzcnt wd, wm mov t0d, r6m ; pixel_max movsxd wq, [r6+wq*4] shr t0d, 11 movddup m2, [base+bidir_rnd+t0*8] movddup 
m3, [base+bidir_mul+t0*8] movifnidn hd, hm add wq, r6 BIDIR_FN ALIGN function_align .main: mova m0, [tmp1q+16*0] paddsw m0, [tmp2q+16*0] mova m1, [tmp1q+16*1] paddsw m1, [tmp2q+16*1] add tmp1q, 16*2 add tmp2q, 16*2 pmaxsw m0, m2 pmaxsw m1, m2 psubsw m0, m2 psubsw m1, m2 pmulhw m0, m3 pmulhw m1, m3 ret cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h %define base r6-w_avg_ssse3_table LEA r6, w_avg_ssse3_table tzcnt wd, wm mov t0d, r6m ; weight movd m6, r7m ; pixel_max movddup m5, [base+pd_65538] movsxd wq, [r6+wq*4] pshufb m6, [base+pw_256] add wq, r6 lea r6d, [t0-16] shl t0d, 16 sub t0d, r6d ; 16-weight, weight paddw m5, m6 mov r6d, t0d shl t0d, 2 test dword r7m, 0x800 cmovnz r6d, t0d movifnidn hd, hm movd m4, r6d pslld m5, 7 pxor m7, m7 pshufd m4, m4, q0000 BIDIR_FN ALIGN function_align .main: mova m2, [tmp1q+16*0] mova m0, [tmp2q+16*0] punpckhwd m3, m0, m2 punpcklwd m0, m2 mova m2, [tmp1q+16*1] mova m1, [tmp2q+16*1] add tmp1q, 16*2 add tmp2q, 16*2 pmaddwd m3, m4 pmaddwd m0, m4 paddd m3, m5 paddd m0, m5 psrad m3, 8 psrad m0, 8 packssdw m0, m3 punpckhwd m3, m1, m2 punpcklwd m1, m2 pmaddwd m3, m4 pmaddwd m1, m4 paddd m3, m5 paddd m1, m5 psrad m3, 8 psrad m1, 8 packssdw m1, m3 pminsw m0, m6 pminsw m1, m6 pmaxsw m0, m7 pmaxsw m1, m7 ret %if ARCH_X86_64 cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %else cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask %define hd dword r5m %define m8 [base+pw_64] %endif %define base r6-mask_ssse3_table LEA r6, mask_ssse3_table tzcnt wd, wm mov t0d, r7m ; pixel_max shr t0d, 11 movsxd wq, [r6+wq*4] movddup m6, [base+bidir_rnd+t0*8] movddup m7, [base+bidir_mul+t0*8] %if ARCH_X86_64 mova m8, [base+pw_64] movifnidn hd, hm %endif add wq, r6 mov maskq, r6mp BIDIR_FN ALIGN function_align .main: movq m3, [maskq+8*0] mova m0, [tmp1q+16*0] mova m4, [tmp2q+16*0] pxor m5, m5 punpcklbw m3, m5 punpckhwd m2, m0, m4 punpcklwd m0, m4 psubw m1, m8, m3 punpckhwd m4, m3, m1 ; m, 64-m punpcklwd m3, m1 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) pmaddwd m0, m3 movq m3, [maskq+8*1] mova m1, [tmp1q+16*1] mova m4, [tmp2q+16*1] add maskq, 8*2 add tmp1q, 16*2 add tmp2q, 16*2 psrad m2, 5 psrad m0, 5 packssdw m0, m2 punpcklbw m3, m5 punpckhwd m2, m1, m4 punpcklwd m1, m4 psubw m5, m8, m3 punpckhwd m4, m3, m5 ; m, 64-m punpcklwd m3, m5 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) pmaddwd m1, m3 psrad m2, 5 psrad m1, 5 packssdw m1, m2 pmaxsw m0, m6 pmaxsw m1, m6 psubsw m0, m6 psubsw m1, m6 pmulhw m0, m7 pmulhw m1, m7 ret cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_420_ssse3_table LEA t0, w_mask_420_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max movd m0, r7m ; sign shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32 mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m4, [base+bidir_mul+r6*8] ALLOC_STACK -16*4 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 mova [rsp+16*3], m4 %define m8 [rsp+gprsize+16*0] %define m9 [rsp+gprsize+16*1] %define m10 [rsp+gprsize+16*2] %define m11 [rsp+gprsize+16*3] %endif movd m7, [base+pw_2] psubw m7, m0 pshufb m7, [base+pw_256] add wq, t0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 4 .w4: movq [dstq+strideq*0], m0 phaddw m2, m3 movhps [dstq+strideq*1], m0 phaddd m2, m2 lea 
dstq, [dstq+strideq*2] paddw m2, m7 movq [dstq+strideq*0], m1 psrlw m2, 2 movhps [dstq+strideq*1], m1 packuswb m2, m2 movd [maskq], m2 sub hd, 4 jg .w4_loop RET .w8_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 4 .w8: mova [dstq+strideq*0], m0 paddw m2, m3 phaddw m2, m2 mova [dstq+strideq*1], m1 paddw m2, m7 psrlw m2, 2 packuswb m2, m2 movd [maskq], m2 sub hd, 2 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 8 .w16: mova [dstq+strideq*1+16*0], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*1], m3 mova [dstq+strideq*0+16*1], m1 call .main paddw m2, [dstq+strideq*1+16*0] paddw m3, [dstq+strideq*1+16*1] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*1], m1 paddw m2, m7 psrlw m2, 2 packuswb m2, m2 movq [maskq], m2 sub hd, 2 jg .w16_loop RET .w32_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16 .w32: mova [dstq+strideq*1+16*0], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*1], m3 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*0+16*2], m0 phaddw m2, m3 mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m1 call .main paddw m2, [dstq+strideq*1+16*0] paddw m3, [dstq+strideq*1+16*1] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*1], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*2] paddw m2, [dstq+strideq*1+16*3] mova [dstq+strideq*1+16*2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*3], m1 packuswb m3, m2 mova [maskq], m3 sub hd, 2 jg .w32_loop RET .w64_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16*2 .w64: mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*1+16*2], m3 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*2], m0 mova [dstq+strideq*1+16*4], m3 mova [dstq+strideq*0+16*3], m1 call .main mova [dstq+strideq*1+16*5], m2 mova [dstq+strideq*0+16*4], m0 mova [dstq+strideq*1+16*6], m3 mova [dstq+strideq*0+16*5], m1 call .main mova [dstq+strideq*0+16*6], m0 phaddw m2, m3 mova [dstq+strideq*1+16*7], m2 mova [dstq+strideq*0+16*7], m1 call .main paddw m2, [dstq+strideq*1+16*1] paddw m3, [dstq+strideq*1+16*2] mova [dstq+strideq*1+16*0], m0 phaddw m2, m3 mova [dstq+strideq*1+16*2], m2 mova [dstq+strideq*1+16*1], m1 call .main paddw m2, [dstq+strideq*1+16*3] paddw m3, [dstq+strideq*1+16*4] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*2] mova [dstq+strideq*1+16*2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*3], m1 packuswb m3, m2 mova [maskq+16*0], m3 call .main paddw m2, [dstq+strideq*1+16*5] paddw m3, [dstq+strideq*1+16*6] mova [dstq+strideq*1+16*4], m0 phaddw m2, m3 mova [dstq+strideq*1+16*6], m2 mova [dstq+strideq*1+16*5], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*6] paddw m2, [dstq+strideq*1+16*7] mova [dstq+strideq*1+16*6], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*7], m1 packuswb m3, m2 mova [maskq+16*1], m3 sub hd, 2 jg .w64_loop RET .w128_loop: call .main lea dstq, [dstq+strideq*2] add maskq, 16*4 .w128: mova [dstq+strideq*1+16* 1], m2 mova [dstq+strideq*0+16* 0], m0 mova [dstq+strideq*1+16* 2], m3 mova [dstq+strideq*0+16* 1], m1 call .main mova [dstq+strideq*1+16* 3], m2 mova [dstq+strideq*0+16* 2], m0 mova [dstq+strideq*1+16* 4], m3 mova [dstq+strideq*0+16* 3], m1 call .main mova [dstq+strideq*1+16* 5], m2 mova [dstq+strideq*0+16* 4], m0 mova [dstq+strideq*1+16* 6], m3 mova [dstq+strideq*0+16* 5], m1 call .main mova [dstq+strideq*1+16* 7], m2 
mova [dstq+strideq*0+16* 6], m0 mova [dstq+strideq*1+16* 8], m3 mova [dstq+strideq*0+16* 7], m1 call .main mova [dstq+strideq*1+16* 9], m2 mova [dstq+strideq*0+16* 8], m0 mova [dstq+strideq*1+16*10], m3 mova [dstq+strideq*0+16* 9], m1 call .main mova [dstq+strideq*1+16*11], m2 mova [dstq+strideq*0+16*10], m0 mova [dstq+strideq*1+16*12], m3 mova [dstq+strideq*0+16*11], m1 call .main mova [dstq+strideq*1+16*13], m2 mova [dstq+strideq*0+16*12], m0 mova [dstq+strideq*1+16*14], m3 mova [dstq+strideq*0+16*13], m1 call .main mova [dstq+strideq*0+16*14], m0 phaddw m2, m3 mova [dstq+strideq*1+16*15], m2 mova [dstq+strideq*0+16*15], m1 call .main paddw m2, [dstq+strideq*1+16* 1] paddw m3, [dstq+strideq*1+16* 2] mova [dstq+strideq*1+16* 0], m0 phaddw m2, m3 mova [dstq+strideq*1+16* 2], m2 mova [dstq+strideq*1+16* 1], m1 call .main paddw m2, [dstq+strideq*1+16* 3] paddw m3, [dstq+strideq*1+16* 4] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16* 2] mova [dstq+strideq*1+16* 2], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16* 3], m1 packuswb m3, m2 mova [maskq+16*0], m3 call .main paddw m2, [dstq+strideq*1+16* 5] paddw m3, [dstq+strideq*1+16* 6] mova [dstq+strideq*1+16* 4], m0 phaddw m2, m3 mova [dstq+strideq*1+16* 6], m2 mova [dstq+strideq*1+16* 5], m1 call .main paddw m2, [dstq+strideq*1+16* 7] paddw m3, [dstq+strideq*1+16* 8] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16* 6] mova [dstq+strideq*1+16* 6], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16* 7], m1 packuswb m3, m2 mova [maskq+16*1], m3 call .main paddw m2, [dstq+strideq*1+16* 9] paddw m3, [dstq+strideq*1+16*10] mova [dstq+strideq*1+16* 8], m0 phaddw m2, m3 mova [dstq+strideq*1+16*10], m2 mova [dstq+strideq*1+16* 9], m1 call .main paddw m2, [dstq+strideq*1+16*11] paddw m3, [dstq+strideq*1+16*12] phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*10] mova [dstq+strideq*1+16*10], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*11], m1 packuswb m3, m2 mova [maskq+16*2], m3 call .main paddw m2, [dstq+strideq*1+16*13] paddw m3, [dstq+strideq*1+16*14] mova [dstq+strideq*1+16*12], m0 phaddw m2, m3 mova [dstq+strideq*1+16*14], m2 mova [dstq+strideq*1+16*13], m1 call .main phaddw m2, m3 paddw m3, m7, [dstq+strideq*1+16*14] paddw m2, [dstq+strideq*1+16*15] mova [dstq+strideq*1+16*14], m0 paddw m2, m7 psrlw m3, 2 psrlw m2, 2 mova [dstq+strideq*1+16*15], m1 packuswb m3, m2 mova [maskq+16*3], m3 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: %macro W_MASK 2 ; dst/tmp_offset, mask mova m%1, [tmp1q+16*%1] mova m%2, [tmp2q+16*%1] punpcklwd m4, m%2, m%1 punpckhwd m5, m%2, m%1 psubsw m%1, m%2 pabsw m%1, m%1 psubusw m6, m8, m%1 psrlw m6, 10 ; 64-m psubw m%2, m9, m6 ; m punpcklwd m%1, m6, m%2 punpckhwd m6, m%2 pmaddwd m%1, m4 pmaddwd m6, m5 psrad m%1, 5 psrad m6, 5 packssdw m%1, m6 pmaxsw m%1, m10 psubsw m%1, m10 pmulhw m%1, m11 %endmacro W_MASK 0, 2 W_MASK 1, 3 add tmp1q, 16*2 add tmp2q, 16*2 ret cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_422_ssse3_table LEA t0, w_mask_422_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max movd m7, r7m ; sign shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m4, [base+bidir_mul+r6*8] ALLOC_STACK -16*4 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 mova [rsp+16*3], m4 %endif pxor m0, m0 add wq, t0 pshufb 
m7, m0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .end: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call .main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2 W_MASK 1, 3 phaddw m2, m3 add tmp1q, 16*2 add tmp2q, 16*2 packuswb m2, m2 pxor m3, m3 psubb m2, m7 pavgb m2, m3 movq [maskq], m2 add maskq, 8 ret cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_444_ssse3_table LEA t0, w_mask_444_ssse3_table tzcnt wd, wm mov r6d, r8m ; pixel_max shr r6d, 11 movsxd wq, [t0+wq*4] %if ARCH_X86_64 mova m8, [base+pw_27615] mova m9, [base+pw_64] movddup m10, [base+bidir_rnd+r6*8] movddup m11, [base+bidir_mul+r6*8] %else mova m1, [base+pw_27615] mova m2, [base+pw_64] movddup m3, [base+bidir_rnd+r6*8] movddup m7, [base+bidir_mul+r6*8] ALLOC_STACK -16*3 mova [rsp+16*0], m1 mova [rsp+16*1], m2 mova [rsp+16*2], m3 %define m11 m7 %endif add wq, t0 movifnidn hd, r5m mov maskq, r6mp call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] movq [dstq+strideq*0], m1 movhps [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop .end: RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 sub hd, 2 jg .w8_loop .w8_end: RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0+16*0], m0 mova [dstq+strideq*0+16*1], m1 call .main mova [dstq+strideq*1+16*0], m0 mova [dstq+strideq*1+16*1], m1 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 mova [dstq+16*1], m1 call .main mova [dstq+16*2], m0 mova [dstq+16*3], m1 call .main mova [dstq+16*4], m0 mova [dstq+16*5], m1 call .main mova [dstq+16*6], m0 mova [dstq+16*7], m1 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16* 0], m0 mova [dstq+16* 1], m1 call .main mova [dstq+16* 2], m0 mova [dstq+16* 3], m1 call 
.main mova [dstq+16* 4], m0 mova [dstq+16* 5], m1 call .main mova [dstq+16* 6], m0 mova [dstq+16* 7], m1 call .main mova [dstq+16* 8], m0 mova [dstq+16* 9], m1 call .main mova [dstq+16*10], m0 mova [dstq+16*11], m1 call .main mova [dstq+16*12], m0 mova [dstq+16*13], m1 call .main mova [dstq+16*14], m0 mova [dstq+16*15], m1 dec hd jg .w128_loop RET ALIGN function_align .main: W_MASK 0, 2 W_MASK 1, 3 packuswb m2, m3 add tmp1q, 16*2 add tmp2q, 16*2 mova [maskq], m2 add maskq, 16 ret ; (a * (64 - m) + b * m + 32) >> 6 ; = (((b - a) * m + 32) >> 6) + a ; = (((b - a) * (m << 9) + 16384) >> 15) + a ; except m << 9 overflows int16_t when m == 64 (which is possible), ; but if we negate m it works out (-64 << 9 == -32768). ; = (((a - b) * (m * -512) + 16384) >> 15) + a cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3 %define base r6-blend_ssse3_table LEA r6, blend_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r6+wq*4] movifnidn maskq, maskmp mova m7, [base+pw_m512] add wq, r6 lea stride3q, [strideq*3] pxor m6, m6 jmp wq .w4: mova m5, [maskq] movq m0, [dstq+strideq*0] movhps m0, [dstq+strideq*1] movq m1, [dstq+strideq*2] movhps m1, [dstq+stride3q ] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 movq [dstq+strideq*2], m1 movhps [dstq+stride3q ], m1 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w4 RET .w8: mova m5, [maskq] mova m0, [dstq+strideq*0] mova m1, [dstq+strideq*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8 RET .w16: mova m5, [maskq] mova m0, [dstq+16*0] mova m1, [dstq+16*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] add maskq, 16 add tmpq, 32 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16 RET .w32: mova m5, [maskq+16*0] mova m0, [dstq+16*0] mova m1, [dstq+16*1] psubw m2, m0, [tmpq+16*0] psubw m3, m1, [tmpq+16*1] punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova m5, [maskq+16*1] mova m0, [dstq+16*2] mova m1, [dstq+16*3] psubw m2, m0, [tmpq+16*2] psubw m3, m1, [tmpq+16*3] add maskq, 32 add tmpq, 64 punpcklbw m4, m5, m6 punpckhbw m5, m6 pmullw m4, m7 pmullw m5, m7 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*2], m0 mova [dstq+16*3], m1 add dstq, strideq dec hd jg .w32 RET cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] add wq, r5 jmp wq .w2: movd m4, [base+obmc_masks+2*2] .w2_loop: movd m0, [dstq+strideq*0] movd m2, [tmpq+4*0] movd m1, [dstq+strideq*1] movd m3, [tmpq+4*1] add tmpq, 4*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 movd [dstq+strideq*0], m0 movd [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w2_loop RET .w4: movddup m2, [base+obmc_masks+4*2] .w4_loop: movq m0, [dstq+strideq*0] movhps m0, [dstq+strideq*1] mova m1, [tmpq] add 
tmpq, 8*2 psubw m1, m0 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w4_loop RET .w8: mova m4, [base+obmc_masks+8*2] .w8_loop: mova m0, [dstq+strideq*0] mova m2, [tmpq+16*0] mova m1, [dstq+strideq*1] mova m3, [tmpq+16*1] add tmpq, 16*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m4 paddw m0, m2 paddw m1, m3 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w8_loop RET .w16: mova m4, [base+obmc_masks+16*2] movq m5, [base+obmc_masks+16*3] .w16_loop: mova m0, [dstq+16*0] mova m2, [tmpq+16*0] mova m1, [dstq+16*1] mova m3, [tmpq+16*1] add tmpq, 16*2 psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, strideq dec hd jg .w16_loop RET .w32: %if WIN64 movaps [rsp+8], m6 %endif mova m4, [base+obmc_masks+16*4] mova m5, [base+obmc_masks+16*5] mova m6, [base+obmc_masks+16*6] .w32_loop: mova m0, [dstq+16*0] mova m2, [tmpq+16*0] mova m1, [dstq+16*1] mova m3, [tmpq+16*1] psubw m2, m0 psubw m3, m1 pmulhrsw m2, m4 pmulhrsw m3, m5 paddw m0, m2 mova m2, [dstq+16*2] paddw m1, m3 mova m3, [tmpq+16*2] add tmpq, 16*4 psubw m3, m2 pmulhrsw m3, m6 paddw m2, m3 mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 add dstq, strideq dec hd jg .w32_loop %if WIN64 movaps m6, [rsp+8] %endif RET %macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp mova m0, [dstq+16*(%1+0)] mova m2, [tmpq+16*(%2+0)] mova m1, [dstq+16*(%1+1)] mova m3, [tmpq+16*(%2+1)] %if %3 add tmpq, 16*%3 %endif psubw m2, m0 psubw m3, m1 pmulhrsw m2, m5 pmulhrsw m3, m5 paddw m0, m2 paddw m1, m3 mova [dstq+16*(%1+0)], m0 mova [dstq+16*(%1+1)], m1 %endmacro cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base r6-blend_h_ssse3_table LEA r6, blend_h_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] movddup m4, [base+blend_shuf] lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] add wq, r6 shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] movd m2, [dstq+dsq*1] movd m3, [maskq+hq*2] movq m1, [tmpq] add tmpq, 4*2 punpckldq m0, m2 punpcklwd m3, m3 psubw m1, m0 pmulhrsw m1, m3 paddw m0, m1 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova m3, [base+blend_shuf] .w4_loop: movq m0, [dstq+dsq*0] movhps m0, [dstq+dsq*1] movd m2, [maskq+hq*2] mova m1, [tmpq] add tmpq, 8*2 psubw m1, m0 pshufb m2, m3 pmulhrsw m1, m2 paddw m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: movddup m5, [base+blend_shuf+8] %if WIN64 movaps [rsp+ 8], m6 movaps [rsp+24], m7 %endif .w8_loop: movd m7, [maskq+hq*2] mova m0, [dstq+dsq*0] mova m2, [tmpq+16*0] mova m1, [dstq+dsq*1] mova m3, [tmpq+16*1] add tmpq, 16*2 pshufb m6, m7, m4 psubw m2, m0 pshufb m7, m5 psubw m3, m1 pmulhrsw m2, m6 pmulhrsw m3, m7 paddw m0, m2 paddw m1, m3 mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop %if WIN64 movaps m6, [rsp+ 8] movaps m7, [rsp+24] %endif RET .w16: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0, 2 add dstq, dsq inc hq jl .w16 RET .w32: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2, 4 add dstq, dsq inc hq jl .w32 RET .w64: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2 BLEND_H_ROW 4, 4 BLEND_H_ROW 6, 6, 8 add dstq, dsq inc hq jl .w64 RET .w128: movd m5, [maskq+hq*2] pshufb m5, m4 BLEND_H_ROW 0, 0 BLEND_H_ROW 2, 2 
BLEND_H_ROW 4, 4 BLEND_H_ROW 6, 6, 16 BLEND_H_ROW 8, -8 BLEND_H_ROW 10, -6 BLEND_H_ROW 12, -4 BLEND_H_ROW 14, -2 add dstq, dsq inc hq jl .w128 RET ; emu_edge args: ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, ; const pixel *ref, const ptrdiff_t ref_stride ; ; bw, bh total filled size ; iw, ih, copied block -> fill bottom, right ; x, y, offset in bw/bh -> fill top, left cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \ y, dst, dstride, src, sstride, \ bottomext, rightext, blk ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes %if ARCH_X86_64 %define reg_zero r12q %define reg_tmp r10 %define reg_src srcq %define reg_bottomext bottomextq %define reg_rightext rightextq %define reg_blkm r9m %else %define reg_zero r6 %define reg_tmp r0 %define reg_src r1 %define reg_bottomext r0 %define reg_rightext r1 %define reg_blkm r2m %endif ; ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor reg_zero, reg_zero lea reg_tmp, [ihq-1] cmp yq, ihq cmovs reg_tmp, yq test yq, yq cmovs reg_tmp, reg_zero %if ARCH_X86_64 imul reg_tmp, sstrideq add srcq, reg_tmp %else imul reg_tmp, sstridem mov reg_src, srcm add reg_src, reg_tmp %endif ; ; ref += iclip(x, 0, iw - 1) lea reg_tmp, [iwq-1] cmp xq, iwq cmovs reg_tmp, xq test xq, xq cmovs reg_tmp, reg_zero lea reg_src, [reg_src+reg_tmp*2] %if ARCH_X86_32 mov srcm, reg_src %endif ; ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) %if ARCH_X86_32 mov r1, r1m ; restore bh %endif lea reg_bottomext, [yq+bhq] sub reg_bottomext, ihq lea r3, [bhq-1] cmovs reg_bottomext, reg_zero ; DEFINE_ARGS bw, bh, iw, ih, x, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, reg_zero cmp reg_bottomext, bhq cmovns reg_bottomext, r3 cmp topextq, bhq cmovg topextq, r3 %if ARCH_X86_32 mov r4m, reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshuflw m0, m0, q0000 punpcklqdq m0, 
m0 xor r3, r3 .left_loop_%3: mova [dstq+r3*2], m0 add r3, mmsize/2 cmp r3, leftextq jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq*2] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu m0, [srcq+r3*2] %else mov r1, srcm movu m0, [r1+r3*2] %endif %if %1 movu [reg_tmp+r3*2], m0 %else movu [dstq+r3*2], m0 %endif add r3, mmsize/2 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 lea reg_tmp, [reg_tmp+centerwq*2] %else lea reg_tmp, [dstq+centerwq*2] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq*2-2] %else mov r3, srcm movd m0, [r3+centerwq*2-2] %endif pshuflw m0, m0, q0000 punpcklqdq m0, m0 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3*2], m0 add r3, mmsize/2 %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1*2] lea r3, [dstq+r1*2] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1*2] lea r3, [dstq+r1*2] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .bottom_y_loop add r1, mmsize/2 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1*2] %else mov r3, reg_blkm mova m0, [r3+r1*2] %endif lea r3, [dstq+r1*2] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize/2 cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro %if ARCH_X86_64 cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %elif STACK_ALIGNMENT >= 16 cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %else cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0, pxmax %endif movifnidn dstq, dstmp movifnidn srcq, srcmp %if STACK_ALIGNMENT >= 16 movifnidn dst_wd, dst_wm %endif %if ARCH_X86_64 movifnidn hd, hm %endif sub dword mx0m, 4<<14 sub dword src_wm, 8 movd m4, pxmaxm movd m7, dxm movd m6, mx0m movd m5, src_wm punpcklwd m4, m4 pshufd m4, m4, q0000 pshufd m7, m7, q0000 pshufd m6, m6, q0000 pshufd m5, m5, q0000 mova 
[rsp+16*3*ARCH_X86_32], m4 %if ARCH_X86_64 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x %define hd dword r5m %if STACK_ALIGNMENT >= 16 LEA r6, $$ %define base r6-$$ %else LEA r4, $$ %define base r4-$$ %endif %endif %if ARCH_X86_64 mova m12, [base+pd_64] mova m11, [base+pd_63] %else %define m12 [base+pd_64] %define m11 [base+pd_63] %endif pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] pslld m7, 2 ; dx*4 pslld m5, 14 paddd m6, m4 ; mx+[0..3]*dx SCRATCH 7, 15, 0 SCRATCH 6, 14, 1 SCRATCH 5, 13, 2 pxor m1, m1 .loop_y: xor xd, xd mova m0, m14 ; per-line working version of mx .loop_x: pcmpgtd m1, m0 pandn m1, m0 psrad m2, m0, 8 ; filter offset (unmasked) pcmpgtd m3, m13, m1 pand m1, m3 pandn m3, m13 por m1, m3 psubd m3, m0, m1 ; pshufb offset psrad m1, 14 ; clipped src_x offset psrad m3, 14 ; pshufb edge_emu offset pand m2, m11 ; filter offset (masked) ; load source pixels %if ARCH_X86_64 movd r8d, m1 pshuflw m1, m1, q3232 movd r9d, m1 punpckhqdq m1, m1 movd r10d, m1 psrlq m1, 32 movd r11d, m1 movu m4, [srcq+r8*2] movu m5, [srcq+r9*2] movu m6, [srcq+r10*2] movu m7, [srcq+r11*2] ; if no emulation is required, we don't need to shuffle or emulate edges packssdw m3, m3 movq r11, m3 test r11, r11 jz .filter movsx r8, r11w sar r11, 16 movsx r9, r11w sar r11, 16 movsx r10, r11w sar r11, 16 movu m1, [base+resize_shuf+8+r8*2] movu m3, [base+resize_shuf+8+r9*2] movu m8, [base+resize_shuf+8+r10*2] movu m9, [base+resize_shuf+8+r11*2] pshufb m4, m1 pshufb m5, m3 pshufb m6, m8 pshufb m7, m9 .filter: movd r8d, m2 pshuflw m2, m2, q3232 movd r9d, m2 punpckhqdq m2, m2 movd r10d, m2 psrlq m2, 32 movd r11d, m2 movq m8, [base+resize_filter+r8*8] movq m2, [base+resize_filter+r9*8] pxor m9, m9 punpcklbw m1, m9, m8 punpcklbw m3, m9, m2 psraw m1, 8 psraw m3, 8 movq m10, [base+resize_filter+r10*8] movq m2, [base+resize_filter+r11*8] punpcklbw m8, m9, m10 punpcklbw m9, m2 psraw m8, 8 psraw m9, 8 pmaddwd m4, m1 pmaddwd m5, m3 pmaddwd m6, m8 pmaddwd m7, m9 phaddd m4, m5 %else movd r3, m1 pshuflw m1, m1, q3232 movd r1, m1 punpckhqdq m1, m1 movu m4, [srcq+r3*2] movu m5, [srcq+r1*2] movd r3, m1 psrlq m1, 32 movd r1, m1 movu m6, [srcq+r3*2] movu m7, [srcq+r1*2] ; if no emulation is required, we don't need to shuffle or emulate edges pxor m1, m1 pcmpeqb m1, m3 pmovmskb r3d, m1 cmp r3d, 0xffff je .filter movd r3, m3 movu m1, [base+resize_shuf+8+r3*2] pshuflw m3, m3, q3232 movd r1, m3 pshufb m4, m1 movu m1, [base+resize_shuf+8+r1*2] punpckhqdq m3, m3 movd r3, m3 pshufb m5, m1 movu m1, [base+resize_shuf+8+r3*2] psrlq m3, 32 movd r1, m3 pshufb m6, m1 movu m1, [base+resize_shuf+8+r1*2] pshufb m7, m1 .filter: mova [esp+4*16], m6 mova [esp+5*16], m7 movd r3, m2 pshuflw m2, m2, q3232 movd r1, m2 movq m6, [base+resize_filter+r3*8] movq m7, [base+resize_filter+r1*8] pxor m3, m3 punpcklbw m1, m3, m6 punpcklbw m3, m7 psraw m1, 8 psraw m3, 8 pmaddwd m4, m1 pmaddwd m5, m3 punpckhqdq m2, m2 movd r3, m2 psrlq m2, 32 movd r1, m2 phaddd m4, m5 movq m2, [base+resize_filter+r3*8] movq m5, [base+resize_filter+r1*8] mova m6, [esp+4*16] mova m7, [esp+5*16] pxor m3, m3 punpcklbw m1, m3, m2 punpcklbw m3, m5 psraw m1, 8 psraw m3, 8 pmaddwd m6, m1 pmaddwd m7, m3 %endif phaddd m6, m7 phaddd m4, m6 pxor m1, m1 psubd m2, m12, m4 psrad m2, 7 packssdw m2, m2 pmaxsw m2, m1 pminsw m2, [rsp+16*3*ARCH_X86_32] movq [dstq+xq*2], m2 paddd m0, m15 add xd, 4 %if STACK_ALIGNMENT >= 16 cmp xd, dst_wd %else cmp xd, dst_wm %endif jl .loop_x add dstq, dst_stridemp add 
srcq, src_stridemp dec hd jg .loop_y RET rav1e-0.7.1/src/x86/mc_avx2.asm000064400000000000000000005700751046102023000141420ustar 00000000000000; Copyright © 2018-2021, VideoLAN and dav1d authors ; Copyright © 2018-2021, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 ; dav1d_obmc_masks[] with 64-x interleaved obmc_masks: db 0, 0, 0, 0 ; 2 db 45, 19, 64, 0 ; 4 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8 bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 
resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f pb_64: times 4 db 64 pw_m256: times 2 dw -256 pw_15: times 2 dw 15 pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_63: dd 63 pd_512: dd 512 pd_32768: dd 32768 pd_0x3ff: dd 0x3ff pd_0x4000: dd 0x4000 pq_0x40000000: dq 0x40000000 cextern mc_subpel_filters cextern mc_warp_filter2 cextern resize_filter %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_8bpc_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_8bpc_avx2.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE put_8tap_scaled, avx2, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx2, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx2, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx2, 2, 4, 8, 16, 32, 32, 32 SECTION .text INIT_XMM avx2 cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx2] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] lea srcq, 
[srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET INIT_YMM avx2 .put_w32: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0+32*0] movu m1, [srcq+ssq*0+32*1] movu m2, [srcq+ssq*1+32*0] movu m3, [srcq+ssq*1+32*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+32*0], m0 mova [dstq+dsq*0+32*1], m1 mova [dstq+dsq*1+32*0], m2 mova [dstq+dsq*1+32*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+32*0] movu m1, [srcq+32*1] movu m2, [srcq+32*2] movu m3, [srcq+32*3] add srcq, ssq mova [dstq+32*0], m0 mova [dstq+32*1], m1 mova [dstq+32*2], m2 mova [dstq+32*3], m3 add dstq, dsq dec hd jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 movd xm5, mxyd mov mxyd, r7m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xm0, [srcq+ssq*0] pinsrd xm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xm4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm5 pmulhrsw xm0, xm3 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pshufb xm1, xm4 pmaddubsw xm0, xm5 pmaddubsw xm1, xm5 pmulhrsw xm0, xm3 pmulhrsw xm1, xm3 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+8*4] movu m2, [srcq+8*5] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+32*0], m0 mova [dstq+32*1], m1 add dstq, dsq dec hd jg .h_w64 RET .h_w128: mov r6, -32*3 .h_w128_loop: movu m0, 
[srcq+r6+32*3+8*0] movu m1, [srcq+r6+32*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+32*3], m0 add r6, 32 jle .h_w128_loop add srcq, ssq add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 255 vpbroadcastd m5, [pw_2048] add mxyd, 16 add wq, r7 movd xm4, mxyd vpbroadcastw m4, xm4 jmp wq .v_w2: movd xm0, [srcq+ssq*0] .v_w2_loop: pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xm1, xm1, q2301 ; 1 0 punpcklbw xm1, xm0 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 1 pextrw [dstq+dsq*1], xm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm1, xm2, xm0, 0x01 ; 0 1 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm2, xm0, 0x02 ; 1 2 punpcklbw xm1, xm2 pmaddubsw xm1, xm4 pmulhrsw xm1, xm5 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+ssq*0] .v_w8_loop: movq xm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xm1, xm0, xm2 movq xm0, [srcq+ssq*0] punpcklbw xm2, xm0 pmaddubsw xm1, xm4 pmaddubsw xm2, xm4 pmulhrsw xm1, xm5 pmulhrsw xm2, xm5 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: movu xm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd m2, m3, m0, 0x0f ; 0 1 vbroadcasti128 m0, [srcq+ssq*0] vpblendd m3, m0, 0xf0 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: %macro PUT_BILIN_V_W32 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m0, m3 punpckhbw m2, m0, m3 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro PUT_BILIN_V_W32 RET .v_w64: movu m0, [srcq+32*0] movu m1, [srcq+32*1] .v_w64_loop: add srcq, ssq movu m3, [srcq+32*0] punpcklbw m2, m0, m3 punpckhbw m0, m3 pmaddubsw m2, m4 pmaddubsw m0, m4 pmulhrsw m2, m5 pmulhrsw m0, m5 packuswb m2, m0 mova m0, m3 movu m3, [srcq+32*1] mova [dstq+32*0], m2 punpcklbw m2, m1, m3 punpckhbw m1, m3 pmaddubsw m2, m4 pmaddubsw m1, m4 pmulhrsw m2, m5 pmulhrsw m1, m5 packuswb m2, m1 mova m1, m3 mova [dstq+32*1], m2 add dstq, dsq dec hd jg .v_w64_loop RET .v_w128: lea r6d, [hq+(3<<8)] mov r4, srcq mov r7, dstq .v_w128_loop: PUT_BILIN_V_W32 add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_15] movd xm6, mxyd add wq, r7 paddb m5, m5 vpbroadcastw m6, xm6 jmp wq .hv_w2: vpbroadcastd xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w2_loop: movd xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd xm1, 
[srcq+ssq*0], 1 pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 _ 2 _ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _ mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 pextrw [dstq+dsq*0], xm1, 0 pextrw [dstq+dsq*1], xm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xm4, [bilin_h_shuf4] movddup xm0, [srcq+ssq*0] pshufb xm0, xm4 pmaddubsw xm0, xm5 .hv_w4_loop: movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm1, [srcq+ssq*0] pshufb xm1, xm4 pmaddubsw xm1, xm5 ; 1 2 shufps xm2, xm0, xm1, q1032 ; 0 1 mova xm0, xm1 psubw xm1, xm2 pmulhw xm1, xm6 pavgw xm2, xm7 paddw xm1, xm2 psrlw xm1, 4 packuswb xm1, xm1 movd [dstq+dsq*0], xm1 pextrd [dstq+dsq*1], xm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m1, [srcq+ssq*0], 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m2, m0, m1, 0x21 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 vextracti128 xm2, m1, 1 packuswb xm1, xm2 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: movu m0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm2, [srcq+ssq*1+8*0] vinserti128 m2, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] movu xm3, [srcq+ssq*0+8*0] vinserti128 m3, [srcq+ssq*0+8*1], 1 pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 vpermq m1, m1, q3120 mova [dstq+dsq*0], xm1 vextracti128 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w128: lea r6d, [hq+(3<<16)] jmp .hv_w32_start .hv_w64: lea r6d, [hq+(1<<16)] .hv_w32_start: mov r4, srcq mov r7, dstq .hv_w32: %if WIN64 movaps r4m, xmm8 %endif .hv_w32_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 pmulhw m8, m6 pavgw m0, m7 paddw m8, m0 mova m0, m2 psubw m2, m3, m1 pmulhw m2, m6 pavgw m1, m7 paddw m2, m1 mova m1, m3 psrlw m8, 4 psrlw m2, 4 packuswb m8, m2 mova [dstq], m8 add dstq, dsq dec hd jg .hv_w32_loop add r4, 32 add r7, 32 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<16 jg .hv_w32_loop0 %if WIN64 movaps xmm8, r4m %endif RET cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea r6, [prep%+SUFFIX] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [r6+wq*2+table_offset(prep,)] add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: movd xm0, [srcq+strideq*0] pinsrd xm0, [srcq+strideq*1], 1 pinsrd xm0, [srcq+strideq*2], 2 pinsrd xm0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmovzxbw m0, xm0 psllw m0, 4 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] movq xm1, [srcq+strideq*2] movhps xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmovzxbw m0, xm0 pmovzxbw m1, xm1 psllw m0, 4 psllw m1, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: pmovzxbw m0, [srcq+strideq*0] pmovzxbw m1, [srcq+strideq*1] pmovzxbw m2, 
[srcq+strideq*2] pmovzxbw m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmovzxbw m0, [srcq+strideq*0+16*0] pmovzxbw m1, [srcq+strideq*0+16*1] pmovzxbw m2, [srcq+strideq*1+16*0] pmovzxbw m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .prep_w32 RET .prep_w64: pmovzxbw m0, [srcq+16*0] pmovzxbw m1, [srcq+16*1] pmovzxbw m2, [srcq+16*2] pmovzxbw m3, [srcq+16*3] add srcq, strideq psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .prep_w64 RET .prep_w128: pmovzxbw m0, [srcq+16*0] pmovzxbw m1, [srcq+16*1] pmovzxbw m2, [srcq+16*2] pmovzxbw m3, [srcq+16*3] psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 pmovzxbw m0, [srcq+16*4] pmovzxbw m1, [srcq+16*5] pmovzxbw m2, [srcq+16*6] pmovzxbw m3, [srcq+16*7] add tmpq, 32*8 add srcq, strideq psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .prep_w128 RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 255 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 movd xm5, mxyd mov mxyd, r6m ; my vpbroadcastw m5, xm5 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] add wq, r6 lea stride3q, [strideq*3] jmp wq .h_w4: vbroadcasti128 m4, [bilin_h_shuf4] .h_w4_loop: movq xm0, [srcq+strideq*0] movhps xm0, [srcq+strideq*1] movq xm1, [srcq+strideq*2] movhps xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 m0, xm1, 1 pshufb m0, m4 pmaddubsw m0, m5 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: .h_w8_loop: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 movu xm1, [srcq+strideq*2] vinserti128 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 add tmpq, 32*2 sub hd, 4 jg .h_w8_loop RET .h_w16: .h_w16_loop: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 movu xm2, [srcq+strideq*2+8*0] vinserti128 m2, [srcq+strideq*2+8*1], 1 movu xm3, [srcq+stride3q +8*0] vinserti128 m3, [srcq+stride3q +8*1], 1 lea srcq, [srcq+strideq*4] pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 4 jg .h_w16_loop RET .h_w32: .h_w32_loop: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 movu xm1, [srcq+strideq*0+8*2] vinserti128 m1, [srcq+strideq*0+8*3], 1 movu xm2, [srcq+strideq*1+8*0] vinserti128 m2, [srcq+strideq*1+8*1], 1 movu xm3, [srcq+strideq*1+8*2] vinserti128 m3, [srcq+strideq*1+8*3], 1 lea srcq, [srcq+strideq*2] pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 sub hd, 2 jg .h_w32_loop RET 
.h_w64: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 movu xm2, [srcq+8*4] vinserti128 m2, [srcq+8*5], 1 movu xm3, [srcq+8*6] vinserti128 m3, [srcq+8*7], 1 add srcq, strideq pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 add tmpq, 32*4 dec hd jg .h_w64 RET .h_w128: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 movu xm2, [srcq+8*4] vinserti128 m2, [srcq+8*5], 1 movu xm3, [srcq+8*6] vinserti128 m3, [srcq+8*7], 1 pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+32*0], m0 mova [tmpq+32*1], m1 mova [tmpq+32*2], m2 mova [tmpq+32*3], m3 movu xm0, [srcq+8* 8] vinserti128 m0, [srcq+8* 9], 1 movu xm1, [srcq+8*10] vinserti128 m1, [srcq+8*11], 1 movu xm2, [srcq+8*12] vinserti128 m2, [srcq+8*13], 1 movu xm3, [srcq+8*14] vinserti128 m3, [srcq+8*15], 1 add tmpq, 32*8 add srcq, strideq pshufb m0, m4 pshufb m1, m4 pshufb m2, m4 pshufb m3, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq-32*4], m0 mova [tmpq-32*3], m1 mova [tmpq-32*2], m2 mova [tmpq-32*1], m3 dec hd jg .h_w128 RET .v: WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 255 add mxyd, 16 add wq, r6 lea stride3q, [strideq*3] movd xm6, mxyd vpbroadcastw m6, xm6 jmp wq .v_w4: movd xm0, [srcq+strideq*0] .v_w4_loop: vpbroadcastd m1, [srcq+strideq*2] vpbroadcastd xm2, [srcq+strideq*1] vpbroadcastd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0x05 ; 0 2 2 2 vpbroadcastd m0, [srcq+strideq*0] vpblendd m3, m2, 0x0f ; 1 1 3 3 vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4 vpblendd m1, m3, 0xaa ; 0 1 2 3 vpblendd m2, m3, 0x55 ; 1 2 3 4 punpcklbw m1, m2 pmaddubsw m1, m6 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm0, [srcq+strideq*0] .v_w8_loop: vpbroadcastq m1, [srcq+strideq*2] vpbroadcastq m2, [srcq+strideq*1] vpbroadcastq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m0, 0x03 ; 0 2 2 2 vpbroadcastq m0, [srcq+strideq*0] vpblendd m2, m3, 0xcc ; 1 3 1 3 vpblendd m3, m2, m1, 0xf0 ; 1 3 2 2 vpblendd m2, m1, 0x0f ; 0 2 1 3 vpblendd m3, m0, 0xc0 ; 1 3 2 4 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m6 pmaddubsw m2, m6 mova [tmpq+32*0], m1 mova [tmpq+32*1], m2 add tmpq, 32*2 sub hd, 4 jg .v_w8_loop RET .v_w16: vbroadcasti128 m0, [srcq+strideq*0] .v_w16_loop: vbroadcasti128 m1, [srcq+strideq*1] vbroadcasti128 m2, [srcq+strideq*2] vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] shufpd m4, m0, m2, 0x0c ; 0 2 vbroadcasti128 m0, [srcq+strideq*0] shufpd m1, m3, 0x0c ; 1 3 shufpd m2, m0, 0x0c ; 2 4 punpcklbw m3, m4, m1 punpcklbw m5, m1, m2 punpckhbw m4, m1 punpckhbw m1, m2 pmaddubsw m3, m6 pmaddubsw m5, m6 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+32*0], m3 mova [tmpq+32*1], m5 mova [tmpq+32*2], m4 mova [tmpq+32*3], m1 add tmpq, 32*4 sub hd, 4 jg .v_w16_loop RET .v_w32: vpermq m0, [srcq+strideq*0], q3120 .v_w32_loop: vpermq m1, [srcq+strideq*1], q3120 vpermq m2, [srcq+strideq*2], q3120 vpermq m3, [srcq+stride3q ], q3120 lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m5, m0, m1 vpermq m0, [srcq+strideq*0], q3120 pmaddubsw m4, m6 pmaddubsw m5, m6 mova [tmpq+32*0], m4 mova [tmpq+32*1], m5 punpcklbw m4, m1, m2 punpckhbw m1, m2 pmaddubsw m4, m6 pmaddubsw m1, m6 punpcklbw 
m5, m2, m3 punpckhbw m2, m3 pmaddubsw m5, m6 pmaddubsw m2, m6 mova [tmpq+32*2], m4 mova [tmpq+32*3], m1 add tmpq, 32*8 punpcklbw m1, m3, m0 punpckhbw m3, m0 pmaddubsw m1, m6 pmaddubsw m3, m6 mova [tmpq-32*4], m5 mova [tmpq-32*3], m2 mova [tmpq-32*2], m1 mova [tmpq-32*1], m3 sub hd, 4 jg .v_w32_loop RET .v_w64: vpermq m0, [srcq+strideq*0+32*0], q3120 vpermq m1, [srcq+strideq*0+32*1], q3120 .v_w64_loop: vpermq m2, [srcq+strideq*1+32*0], q3120 vpermq m3, [srcq+strideq*1+32*1], q3120 lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+32*0], m4 mova [tmpq+32*1], m0 punpcklbw m4, m1, m3 punpckhbw m5, m1, m3 vpermq m0, [srcq+strideq*0+32*0], q3120 vpermq m1, [srcq+strideq*0+32*1], q3120 pmaddubsw m4, m6 pmaddubsw m5, m6 mova [tmpq+32*2], m4 mova [tmpq+32*3], m5 add tmpq, 32*8 punpcklbw m4, m2, m0 punpckhbw m2, m0 punpcklbw m5, m3, m1 punpckhbw m3, m1 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m5, m6 pmaddubsw m3, m6 mova [tmpq-32*4], m4 mova [tmpq-32*3], m2 mova [tmpq-32*2], m5 mova [tmpq-32*1], m3 sub hd, 2 jg .v_w64_loop RET .v_w128: lea r6d, [hq+(3<<8)] mov r3, srcq mov r5, tmpq .v_w128_loop0: vpermq m0, [srcq+strideq*0], q3120 .v_w128_loop: vpermq m1, [srcq+strideq*1], q3120 lea srcq, [srcq+strideq*2] punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 vpermq m0, [srcq+strideq*0], q3120 pmaddubsw m2, m6 pmaddubsw m3, m6 punpcklbw m4, m1, m0 punpckhbw m1, m0 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+32*0], m2 mova [tmpq+32*1], m3 mova [tmpq+32*8], m4 mova [tmpq+32*9], m1 add tmpq, 32*16 sub hd, 2 jg .v_w128_loop add r3, 32 add r5, 64 movzx hd, r6b mov srcq, r3 mov tmpq, r5 sub r6d, 1<<8 jg .v_w128_loop0 RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 movd xm6, mxyd vpbroadcastw m6, xm6 add wq, r6 lea stride3q, [strideq*3] jmp wq .hv_w4: vbroadcasti128 m4, [bilin_h_shuf4] vpbroadcastq m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq xm1, [srcq+strideq*1] movhps xm1, [srcq+strideq*2] movq xm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movhps xm2, [srcq+strideq*0] vinserti128 m1, xm2, 1 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 3 4 vpblendd m2, m1, m0, 0xc0 vpermq m2, m2, q2103 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xm1, [srcq+strideq*1] vinserti128 m1, [srcq+strideq*2], 1 movu xm2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 m2, [srcq+strideq*0], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 ; 1 2 vperm2i128 m3, m0, m1, 0x21 ; 0 1 pmaddubsw m0, m2, m5 ; 3 4 vperm2i128 m2, m1, m0, 0x21 ; 2 3 psubw m1, m3 pmulhrsw m1, m6 paddw m1, m3 psubw m3, m0, m2 pmulhrsw m3, m6 paddw m3, m2 mova [tmpq+32*0], m1 mova [tmpq+32*1], m3 add tmpq, 32*2 sub hd, 4 jg .hv_w8_loop RET .hv_w16: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w16_loop: movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] movu xm2, [srcq+strideq*0+8*0] vinserti128 m2, [srcq+strideq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova 
[tmpq+32*0], m3 mova [tmpq+32*1], m2 add tmpq, 32*2 sub hd, 2 jg .hv_w16_loop RET .hv_w32: movu xm0, [srcq+8*0] vinserti128 m0, [srcq+8*1], 1 movu xm1, [srcq+8*2] vinserti128 m1, [srcq+8*3], 1 pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w32_loop: add srcq, strideq movu xm2, [srcq+8*0] vinserti128 m2, [srcq+8*1], 1 pshufb m2, m4 pmaddubsw m2, m5 psubw m3, m2, m0 pmulhrsw m3, m6 paddw m3, m0 mova m0, m2 movu xm2, [srcq+8*2] vinserti128 m2, [srcq+8*3], 1 pshufb m2, m4 pmaddubsw m2, m5 mova [tmpq+32*0], m3 psubw m3, m2, m1 pmulhrsw m3, m6 paddw m3, m1 mova m1, m2 mova [tmpq+32*1], m3 add tmpq, 32*2 dec hd jg .hv_w32_loop RET .hv_w128: lea r3d, [hq+(7<<8)] mov r6d, 256 jmp .hv_w64_start .hv_w64: lea r3d, [hq+(3<<8)] mov r6d, 128 .hv_w64_start: %if WIN64 PUSH r7 %endif mov r5, srcq mov r7, tmpq .hv_w64_loop0: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 pshufb m0, m4 pmaddubsw m0, m5 .hv_w64_loop: movu xm1, [srcq+strideq*1+8*0] vinserti128 m1, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] movu xm2, [srcq+strideq*0+8*0] vinserti128 m2, [srcq+strideq*0+8*1], 1 pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+r6*0], m3 mova [tmpq+r6*1], m2 lea tmpq, [tmpq+r6*2] sub hd, 2 jg .hv_w64_loop add r5, 16 add r7, 32 movzx hd, r3b mov srcq, r5 mov tmpq, r7 sub r3d, 1<<8 jg .hv_w64_loop0 %if WIN64 POP r7 %endif RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; fn, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx2] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 lea r6, [ssq*3] lea r7, [dsq*3] %if WIN64 pop r8 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [pw_34] ; 2 + (8 << 2) WIN64_SPILL_XMM 11 cmp wd, 4 jl .h_w2 vbroadcasti128 m6, [subpel_h_shufA] je .h_w4 tzcnt wd, wd vbroadcasti128 m7, [subpel_h_shufB] vbroadcasti128 m8, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0] vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4] add wq, r8 jmp wq .h_w2: movzx mxd, mxb dec srcq mova xm4, [subpel_h_shuf4] vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] .h_w2_loop: movq xm0, [srcq+ssq*0] movhps xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm4 pmaddubsw xm0, xm3 phaddw xm0, xm0 paddw xm0, xm5 psraw xm0, 6 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], 
xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb dec srcq vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2] .h_w4_loop: movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xm0, xm6 pshufb xm1, xm6 pmaddubsw xm0, xm3 pmaddubsw xm1, xm3 phaddw xm0, xm1 paddw xm0, xm5 psraw xm0, 6 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] pshufb m%2, m%1, m7 pshufb m%3, m%1, m8 pshufb m%1, m6 pmaddubsw m%4, m%2, m9 pmaddubsw m%2, m10 pmaddubsw m%3, m10 pmaddubsw m%1, m9 paddw m%3, m%4 paddw m%1, m%2 phaddw m%1, m%3 paddw m%1, m5 psraw m%1, 6 %endmacro movu xm0, [srcq+ssq*0] vinserti128 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 1, 2, 3 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+ssq*0+8*0] vinserti128 m0, [srcq+ssq*1+8*0], 1 movu xm1, [srcq+ssq*0+8*1] vinserti128 m1, [srcq+ssq*1+8*1], 1 PUT_8TAP_H 0, 2, 3, 4 lea srcq, [srcq+ssq*2] PUT_8TAP_H 1, 2, 3, 4 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16 RET .h_w32: xor r6d, r6d jmp .h_start .h_w64: mov r6, -32*1 jmp .h_start .h_w128: mov r6, -32*3 .h_start: sub srcq, r6 sub dstq, r6 mov r4, r6 .h_loop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 2, 3, 4 packuswb m0, m1 mova [dstq+r6], m0 add r6, 32 jle .h_loop add srcq, ssq add dstq, dsq mov r6, r4 dec hd jg .h_loop RET .v: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] vpbroadcastd m7, [pw_512] lea myq, [r8+myq*8+subpel_filters-put_avx2] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] add r6, r8 lea ss3q, [ssq*3] sub srcq, ss3q jmp r6 .v_w2: movd xm2, [srcq+ssq*0] pinsrw xm2, [srcq+ssq*1], 2 pinsrw xm2, [srcq+ssq*2], 4 add srcq, ss3q pinsrw xm2, [srcq+ssq*0], 6 ; 0 1 2 3 movd xm3, [srcq+ssq*1] vpbroadcastd xm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklbw xm3, xm1 ; 45 56 punpcklbw xm1, xm2, xm4 ; 01 12 punpckhbw xm2, xm4 ; 23 34 .v_w2_loop: pmaddubsw xm5, xm1, xm8 ; a0 b0 mova xm1, xm2 pmaddubsw xm2, xm9 ; a1 b1 paddw xm5, xm2 mova xm2, xm3 pmaddubsw xm3, xm10 ; a2 b2 paddw xm5, xm3 vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 paddw xm5, xm4 pmulhrsw xm5, xm7 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xm2, [srcq+ssq*0] pinsrd xm2, [srcq+ssq*1], 1 pinsrd xm2, [srcq+ssq*2], 2 add srcq, ss3q pinsrd xm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xm3, [srcq+ssq*1] vpbroadcastd xm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm3, xm1, 0x02 ; 4 5 vpblendd xm1, xm0, 0x02 ; 5 6 palignr xm4, xm3, xm2, 4 ; 1 2 3 4 punpcklbw xm3, xm1 ; 45 56 punpcklbw xm1, xm2, xm4 ; 01 12 punpckhbw xm2, xm4 ; 23 34 .v_w4_loop: pmaddubsw xm5, xm1, xm8 ; a0 b0 mova xm1, xm2 pmaddubsw xm2, xm9 ; a1 b1 paddw xm5, xm2 mova xm2, xm3 pmaddubsw xm3, xm10 
; a2 b2 paddw xm5, xm3 vpbroadcastd xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xm3, xm0, xm4, 0x02 ; 6 7 vpbroadcastd xm0, [srcq+ssq*0] vpblendd xm4, xm0, 0x02 ; 7 8 punpcklbw xm3, xm4 ; 67 78 pmaddubsw xm4, xm3, xm11 ; a3 b3 paddw xm5, xm4 pmulhrsw xm5, xm7 packuswb xm5, xm5 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m2, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m5, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpbroadcastq m6, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m0, [srcq+ssq*0] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 vpblendd m2, m5, 0x30 vpblendd m5, m3, 0x30 punpcklbw m2, m5 ; 23 34 vpblendd m3, m6, 0x30 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m2, m9 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, m10 ; a2 b2 paddw m5, m3 vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+ssq*0] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, m11 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 vextracti128 xm4, m5, 1 packuswb xm5, xm4 movq [dstq+dsq*0], xm5 movhps [dstq+dsq*1], xm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: .v_w32: .v_w64: .v_w128: lea r6d, [wq*8-128] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*2] .v_w16_loop0: vbroadcasti128 m4, [srcq+ssq*0] vbroadcasti128 m5, [srcq+ssq*1] vbroadcasti128 m6, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m0, [srcq+ssq*0] vbroadcasti128 m1, [srcq+ssq*1] vbroadcasti128 m2, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m3, [srcq+ssq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 shufpd m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 shufpd m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: vbroadcasti128 m12, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti128 m13, [srcq+ssq*0] pmaddubsw m14, m1, m8 ; a0 pmaddubsw m15, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 paddw m14, m5 paddw m15, m6 shufpd m6, m0, m12, 0x0d shufpd m0, m12, m13, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 pmaddubsw m13, m6, m11 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 packuswb m14, m15 vpermq m14, m14, q3120 mova [dstq+dsq*0], xm14 vextracti128 [dstq+dsq*1], m14, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] lea ss3q, [ssq*3] sub srcq, ss3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m8, [pw_8192] vpbroadcastd m9, [pd_512] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 m6, [subpel_h_shuf4] movq xm2, [srcq+ssq*0] movhps xm2, [srcq+ssq*1] movq xm0, [srcq+ssq*2] add srcq, ss3q movhps xm0, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpbroadcastq m4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m1, [srcq+ssq*0] vpblendd m2, m3, 0x30 vpblendd m0, m1, 0x30 vpblendd 
m2, m4, 0xc0 pshufb m2, m6 pshufb m0, m6 pmaddubsw m2, m7 pmaddubsw m0, m7 phaddw m2, m0 pmulhrsw m2, m8 vextracti128 xm3, m2, 1 palignr xm4, xm3, xm2, 4 punpcklwd xm1, xm2, xm4 ; 01 12 punpckhwd xm2, xm4 ; 23 34 pshufd xm0, xm3, q2121 punpcklwd xm3, xm0 ; 45 56 .hv_w2_loop: movq xm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xm4, [srcq+ssq*0] pshufb xm4, xm6 pmaddubsw xm4, xm7 pmaddwd xm5, xm1, xm10 ; a0 b0 mova xm1, xm2 pmaddwd xm2, xm11 ; a1 b1 paddd xm5, xm2 mova xm2, xm3 pmaddwd xm3, xm12 ; a2 b2 phaddw xm4, xm4 pmulhrsw xm4, xm8 paddd xm5, xm3 palignr xm3, xm4, xm0, 12 mova xm0, xm4 punpcklwd xm3, xm0 ; 67 78 pmaddwd xm4, xm3, xm13 ; a3 b3 paddd xm5, xm9 paddd xm5, xm4 psrad xm5, 10 packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova m6, [subpel_h_shuf4] vpbroadcastq m2, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] vpbroadcastq m0, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m5, [srcq+ssq*0] vpbroadcastq m3, [srcq+ssq*1] vpblendd m2, m4, 0xcc ; 0 1 vpbroadcastq m4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq m1, [srcq+ssq*0] vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m4, 0xcc ; 4 5 pshufb m2, m6 pshufb m0, m6 pshufb m3, m6 pshufb m1, m6 pmaddubsw m2, m7 pmaddubsw m0, m7 pmaddubsw m3, m7 pmaddubsw m1, m7 phaddw m2, m0 phaddw m3, m1 pmulhrsw m2, m8 pmulhrsw m3, m8 palignr m4, m3, m2, 4 punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m1, m10 ; a0 b0 mova m1, m2 pmaddwd m2, m11 ; a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, m12 ; a2 b2 paddd m5, m3 vpbroadcastq m3, [srcq+ssq*0] vpblendd m4, m3, 0xcc ; 7 8 pshufb m4, m6 pmaddubsw m4, m7 phaddw m4, m4 pmulhrsw m4, m8 palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; 67 78 pmaddwd m4, m3, m13 ; a3 b3 paddd m5, m9 paddd m5, m4 psrad m5, 10 vextracti128 xm4, m5, 1 packssdw xm5, xm4 packuswb xm5, xm5 pshuflw xm5, xm5, q3120 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0] vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2] lea ss3q, [ssq*3] sub srcq, ss3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 lea r6d, [wq*8-64] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+ssq*0] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+ssq*1] vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+ssq*2] add srcq, ss3q vbroadcasti128 m0, [srcq+ssq*0] vpblendd m4, m0, 0xf0 ; 0 3 vinserti128 m5, [srcq+ssq*1], 1 ; 1 4 vinserti128 m6, [srcq+ssq*2], 1 ; 2 5 add srcq, ss3q vinserti128 m0, [srcq+ssq*0], 1 ; 3 6 %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] pshufb %3, %1, %6 pshufb %4, %1, %7 pshufb %1, %5 pmaddubsw %2, %3, m10 pmaddubsw %4, m11 pmaddubsw %3, m11 pmaddubsw %1, m10 paddw %2, %4 paddw %1, %3 phaddw %1, %2 %endmacro HV_H_W8 m4, m1, m2, m3, m7, m8, m9 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 vpermq m6, m6, q3120 pmulhrsw m0, m7 pmulhrsw m4, m7 pmulhrsw m5, m7 pmulhrsw m6, m7 vpermq m7, m0, q3120 punpcklwd 
m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 r6m, m0, 1 ; not enough registers movu xm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 m0, [srcq+ssq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m13 ; a1 pmaddwd m4, m13 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m14 ; a2 pmaddwd m6, m14 ; b2 paddd m8, m5 paddd m9, m6 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] HV_H_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_512] vbroadcasti128 m6, r6m pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 paddd m8, m7 pmaddwd m7, m6, m15 ; b3 paddd m7, m9 psrad m8, 10 psrad m7, 10 packssdw m8, m7 vextracti128 xm7, m8, 1 packuswb xm8, xm7 pshufd xm7, xm8, q3120 movq [dstq+dsq*0], xm7 movhps [dstq+dsq*1], xm7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop add r4, 8 add r7, 8 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET %macro PREP_8TAP_H 0 pshufb m1, m0, m5 pshufb m2, m0, m6 pshufb m3, m0, m7 pmaddubsw m1, m8 pmaddubsw m0, m2, m8 pmaddubsw m2, m9 pmaddubsw m3, m9 paddw m1, m2 paddw m0, m3 phaddw m0, m1, m0 pmulhrsw m0, m4 %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep%+SUFFIX] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m4, [pw_8192] vbroadcasti128 m5, [subpel_h_shufA] WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] add wq, r7 jmp wq .h_w4: movzx mxd, mxb dec srcq vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] lea stride3q, [strideq*3] .h_w4_loop: movq xm0, [srcq+strideq*0] vpbroadcastq m2, [srcq+strideq*2] movq xm1, [srcq+strideq*1] vpblendd m0, m2, 0xf0 vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd m1, m2, 0xf0 pshufb m0, m5 pshufb m1, m5 pmaddubsw m0, m6 pmaddubsw m1, m6 phaddw m0, m1 pmulhrsw m0, m4 mova [tmpq], m0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+strideq*0] vinserti128 m0, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] PREP_8TAP_H mova [tmpq], m0 add tmpq, 32 sub hd, 2 jg .h_w8 RET .h_w16: movu xm0, [srcq+strideq*0+8*0] vinserti128 m0, [srcq+strideq*0+8*1], 1 PREP_8TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+strideq*1+8*0] vinserti128 
m0, [srcq+strideq*1+8*1], 1 lea srcq, [srcq+strideq*2] PREP_8TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 sub hd, 2 jg .h_w16 RET .h_w32: xor r6d, r6d jmp .h_start .h_w64: mov r6, -32*1 jmp .h_start .h_w128: mov r6, -32*3 .h_start: sub srcq, r6 mov r5, r6 .h_loop: movu xm0, [srcq+r6+8*0] vinserti128 m0, [srcq+r6+8*1], 1 PREP_8TAP_H mova [tmpq+32*0], m0 movu xm0, [srcq+r6+8*2] vinserti128 m0, [srcq+r6+8*3], 1 PREP_8TAP_H mova [tmpq+32*1], m0 add tmpq, 32*2 add r6, 32 jle .h_loop add srcq, strideq mov r6, r5 dec hd jg .h_loop RET .v: %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 16 movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. ; TODO: Would a 6-tap code path be worth it? lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q vpbroadcastd m7, [pw_8192] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] cmp wd, 8 jg .v_w16 je .v_w8 .v_w4: movd xm0, [srcq+strideq*0] vpbroadcastd m1, [srcq+strideq*2] vpbroadcastd xm2, [srcq+strideq*1] add srcq, stride3q vpbroadcastd m3, [srcq+strideq*0] vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _ vpbroadcastd m0, [srcq+strideq*1] vpbroadcastd m2, [srcq+strideq*2] vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _ vpbroadcastd m0, [srcq+stride3q ] vbroadcasti128 m5, [deint_shuf4] vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5 vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5 vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _ punpcklbw m1, m2, m3 ; 01 12 23 34 vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6 punpckhbw m2, m3 ; 23 34 45 56 .v_w4_loop: lea srcq, [srcq+strideq*4] pinsrd xm0, [srcq+strideq*0], 1 vpbroadcastd m3, [srcq+strideq*1] vpbroadcastd m4, [srcq+strideq*2] vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 _ _ _ vpbroadcastd m0, [srcq+stride3q ] vpblendd m3, m4, 0x20 ; 6 7 8 _ 8 9 _ _ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _ pshufb m3, m5 ; 67 78 89 9a pmaddubsw m4, m1, m8 vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78 pmaddubsw m2, m9 paddw m4, m2 mova m2, m3 pmaddubsw m3, m11 paddw m3, m4 pmaddubsw m4, m1, m10 paddw m3, m4 pmulhrsw m3, m7 mova [tmpq], m3 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: movq xm1, [srcq+strideq*0] vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m5, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m6, [srcq+strideq*1] vpbroadcastq m0, [srcq+strideq*2] vpblendd m1, m4, 0x30 vpblendd m4, m2, 0x30 punpcklbw m1, m4 ; 01 12 vpblendd m2, m5, 0x30 vpblendd m5, m3, 0x30 punpcklbw m2, m5 ; 23 34 vpblendd m3, m6, 0x30 vpblendd m6, m0, 0x30 punpcklbw m3, m6 ; 45 56 .v_w8_loop: vpbroadcastq m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmaddubsw m5, m2, m9 ; a1 pmaddubsw m6, m2, m8 ; b0 vpblendd m2, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*0] vpblendd m4, m0, 0x30 punpcklbw m2, m4 ; 67 78 pmaddubsw m1, m8 ; a0 pmaddubsw m4, m3, m9 ; b1 paddw m5, m1 mova m1, m3 pmaddubsw m3, m10 ; a2 paddw m6, m4 paddw m5, m3 vpbroadcastq m4, [srcq+strideq*1] vpblendd m3, m0, m4, 0x30 vpbroadcastq m0, [srcq+strideq*2] vpblendd m4, m0, 0x30 punpcklbw m3, m4 ; 89 9a pmaddubsw m4, m2, m11 ; a3 paddw m5, m4 pmaddubsw m4, m2, m10 ; b2 paddw m6, m4 pmaddubsw m4, m3, m11 ; b3 paddw m6, m4 pmulhrsw m5, m7 pmulhrsw m6, m7 mova [tmpq+32*0], m5 mova [tmpq+32*1], m6 add tmpq, 32*2 sub hd, 4 jg .v_w8_loop RET .v_w16: add wd, wd 
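; .v_w16 also serves w32/w64/w128: wd is doubled because the packed output is
; 16-bit, making wq the tmp row stride in bytes. r6d keeps the remaining
; 16-pixel column count in its upper bits and h in its low byte; each pass of
; .v_w16_loop0 filters one 16-pixel-wide column, then r5/r7 step src/tmp to
; the next column.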
mov r5, srcq mov r7, tmpq lea r6d, [hq+wq*8-256] .v_w16_loop0: vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m0, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*0] lea srcq, [srcq+strideq*2] vbroadcasti128 m1, [srcq+strideq*0] vbroadcasti128 m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m3, [srcq+strideq*0] shufpd m4, m4, m0, 0x0c shufpd m5, m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 punpckhbw m4, m5 ; 34 shufpd m6, m6, m2, 0x0c punpcklbw m2, m5, m6 ; 12 punpckhbw m5, m6 ; 45 shufpd m0, m0, m3, 0x0c punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w16_loop: vbroadcasti128 m12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m13, [srcq+strideq*0] pmaddubsw m14, m1, m8 ; a0 pmaddubsw m15, m2, m8 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, m9 ; a1 pmaddubsw m4, m9 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, m10 ; a2 pmaddubsw m6, m10 ; b2 paddw m14, m5 paddw m15, m6 shufpd m6, m0, m12, 0x0d shufpd m0, m12, m13, 0x0c punpcklbw m5, m6, m0 ; 67 punpckhbw m6, m0 ; 78 pmaddubsw m12, m5, m11 ; a3 pmaddubsw m13, m6, m11 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 mova [tmpq+wq*0], m14 mova [tmpq+wq*1], m15 lea tmpq, [tmpq+wq*2] sub hd, 2 jg .v_w16_loop add r5, 16 add r7, 32 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .v_w16_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0] vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 jmp .hv_w8 .hv_w4: movzx mxd, mxb dec srcq vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX] lea stride3q, [strideq*3] sub srcq, stride3q mova m7, [subpel_h_shuf4] pmovzxbd m9, [deint_shuf4] vpbroadcastd m10, [pw_8192] punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 vpbroadcastq m2, [srcq+strideq*0] vpbroadcastq m4, [srcq+strideq*1] vpbroadcastq m0, [srcq+strideq*2] vpbroadcastq m5, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m6, [srcq+strideq*1] vpbroadcastq m1, [srcq+strideq*2] vpblendd m2, m4, 0xcc ; 0 1 vpblendd m0, m5, 0xcc ; 2 3 vpblendd m3, m6, 0xcc ; 4 5 pshufb m2, m7 ; 00 01 10 11 02 03 12 13 pshufb m0, m7 ; 20 21 30 31 22 23 32 33 pshufb m3, m7 ; 40 41 50 51 42 43 52 53 pshufb m1, m7 ; 60 61 60 61 62 63 62 63 pmaddubsw m2, m8 pmaddubsw m0, m8 pmaddubsw m3, m8 pmaddubsw m1, m8 phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __ pmulhrsw m2, m10 pmulhrsw m3, m10 palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b punpcklwd m1, m2, m4 ; 01 12 punpckhwd m2, m4 ; 23 34 pshufd m0, m3, q2121 punpcklwd m3, m0 ; 45 56 .hv_w4_loop: pmaddwd m5, m1, m12 ; a0 b0 pmaddwd m6, m2, m12 ; c0 d0 pmaddwd m2, m13 ; a1 b1 pmaddwd m4, m3, m13 ; c1 d1 mova m1, m3 pmaddwd m3, m14 ; a2 b2 paddd m5, m2 vpbroadcastq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] paddd m6, m4 vpbroadcastq m4, 
[srcq+strideq*0] paddd m5, m3 vpbroadcastq m3, [srcq+strideq*1] vpblendd m2, m4, 0xcc vpbroadcastq m4, [srcq+strideq*2] vpblendd m3, m4, 0xcc pshufb m2, m7 pshufb m3, m7 pmaddubsw m2, m8 pmaddubsw m3, m8 phaddw m2, m3 pmulhrsw m2, m10 palignr m3, m2, m0, 12 mova m0, m2 punpcklwd m2, m3, m0 ; 67 78 punpckhwd m3, m0 ; 89 9a pmaddwd m4, m2, m14 ; c2 d2 paddd m6, m11 paddd m5, m11 paddd m6, m4 pmaddwd m4, m2, m15 ; a3 b3 paddd m5, m4 pmaddwd m4, m3, m15 ; c3 d3 paddd m6, m4 psrad m5, 6 psrad m6, 6 packssdw m5, m6 vpermd m5, m9, m5 mova [tmpq], m5 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: lea r6d, [wq*8-64] mov r5, srcq mov r7, tmpq lea r6d, [hq+r6*4] .hv_w8_loop0: vbroadcasti128 m7, [subpel_h_shufA] movu xm4, [srcq+strideq*0] vbroadcasti128 m8, [subpel_h_shufB] movu xm5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vbroadcasti128 m9, [subpel_h_shufC] movu xm6, [srcq+strideq*0] vbroadcasti128 m0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpblendd m4, m0, 0xf0 ; 0 3 vinserti128 m5, [srcq+strideq*0], 1 ; 1 4 vinserti128 m6, [srcq+strideq*1], 1 ; 2 5 lea srcq, [srcq+strideq*2] vinserti128 m0, [srcq+strideq*0], 1 ; 3 6 HV_H_W8 m4, m1, m2, m3, m7, m8, m9 HV_H_W8 m5, m1, m2, m3, m7, m8, m9 HV_H_W8 m6, m1, m2, m3, m7, m8, m9 HV_H_W8 m0, m1, m2, m3, m7, m8, m9 vpbroadcastd m7, [pw_8192] vpermq m4, m4, q3120 vpermq m5, m5, q3120 vpermq m6, m6, q3120 pmulhrsw m0, m7 pmulhrsw m4, m7 pmulhrsw m5, m7 pmulhrsw m6, m7 vpermq m7, m0, q3120 punpcklwd m1, m4, m5 ; 01 punpckhwd m4, m5 ; 34 punpcklwd m2, m5, m6 ; 12 punpckhwd m5, m6 ; 45 punpcklwd m3, m6, m7 ; 23 punpckhwd m6, m7 ; 56 .hv_w8_loop: vextracti128 [tmpq], m0, 1 ; not enough registers movu xm0, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti128 m0, [srcq+strideq*0], 1 ; 7 8 pmaddwd m8, m1, m12 ; a0 pmaddwd m9, m2, m12 ; b0 mova m1, m3 mova m2, m4 pmaddwd m3, m13 ; a1 pmaddwd m4, m13 ; b1 paddd m8, m3 paddd m9, m4 mova m3, m5 mova m4, m6 pmaddwd m5, m14 ; a2 pmaddwd m6, m14 ; b2 paddd m8, m5 paddd m9, m6 vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] vbroadcasti128 m5, [subpel_h_shufA] HV_H_W8 m0, m5, m6, m7, m5, m6, m7 vpbroadcastd m5, [pw_8192] vpbroadcastd m7, [pd_32] vbroadcasti128 m6, [tmpq] pmulhrsw m0, m5 paddd m8, m7 paddd m9, m7 vpermq m7, m0, q3120 ; 7 8 shufpd m6, m6, m7, 0x04 ; 6 7 punpcklwd m5, m6, m7 ; 67 punpckhwd m6, m7 ; 78 pmaddwd m7, m5, m15 ; a3 paddd m8, m7 pmaddwd m7, m6, m15 ; b3 paddd m7, m9 psrad m8, 6 psrad m7, 6 packssdw m8, m7 vpermq m7, m8, q3120 mova [tmpq+wq*0], xm7 vextracti128 [tmpq+wq*2], m7, 1 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .hv_w8_loop add r5, 8 add r7, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .hv_w8_loop0 RET %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro REMAP_REG 2 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %xdefine r14_save r14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG %%i, %%j %assign %%i %%i-1 %endrep %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep %xdefine r14 r14_save %undef r14_save %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] movq xm%1, [srcq+ r4] movq xm%2, [srcq+ r6] movhps xm%1, [srcq+ r7] movhps xm%2, [srcq+ r9] vinserti128 m%1, [srcq+r10], 1 vinserti128 m%2, 
[srcq+r11], 1 vpbroadcastq m%5, [srcq+r13] vpbroadcastq m%6, [srcq+ rX] add srcq, ssq movq xm%3, [srcq+ r4] movq xm%4, [srcq+ r6] movhps xm%3, [srcq+ r7] movhps xm%4, [srcq+ r9] vinserti128 m%3, [srcq+r10], 1 vinserti128 m%4, [srcq+r11], 1 vpbroadcastq m%7, [srcq+r13] vpbroadcastq m%8, [srcq+ rX] add srcq, ssq vpblendd m%1, m%5, 0xc0 vpblendd m%2, m%6, 0xc0 vpblendd m%3, m%7, 0xc0 vpblendd m%4, m%8, 0xc0 pmaddubsw m%1, m15 pmaddubsw m%2, m10 pmaddubsw m%3, m15 pmaddubsw m%4, m10 phaddw m%1, m%2 phaddw m%3, m%4 phaddw m%1, m%3 pmulhrsw m%1, m12 %endmacro %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 cglobal put_8tap_scaled_8bpc, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy %xdefine base_reg r12 %define rndshift 10 %else %assign isprep 1 cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+120] %xdefine base_reg r11 %define rndshift 6 %endif lea base_reg, [%1_8tap_scaled_8bpc_avx2] %define base base_reg-%1_8tap_scaled_8bpc_avx2 tzcnt wd, wm vpbroadcastd m8, dxm %if isprep && UNIX64 movd xm14, mxd vpbroadcastd m14, xm14 mov r5d, t0d DECLARE_REG_TMP 5, 7 %else vpbroadcastd m14, mxm %endif mov dyd, dym %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %else DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %define dsm [rsp+112] %define rX r1 %define rXd r1d %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %else DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+112] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define rX r14 %define rXd r14d %endif vpbroadcastd m10, [base+pd_0x3ff] vpbroadcastd m12, [base+pw_8192] %ifidn %1, put vpbroadcastd m13, [base+pd_512] %else vpbroadcastd m13, [base+pd_32] %endif pxor m9, m9 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0,1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] vpblendd m15, m7, 0xaa vpblendd m0, m2, 0xc0 ; 0 1 4 5 vpblendd m1, m3, 0xc0 ; 2 3 6 7 pblendvb m15, m11, m8 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 phaddw m0, m1 pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 vextracti128 xm1, m0, 1 ; 4 5 6 7 palignr xm2, xm1, xm0, 4 ; 1 2 3 4 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 pshufd xm4, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm4 ; 45 56 punpckhwd xm4, xm1, xm4 ; 67 __ .w2_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q pmovsxbw xm11, xm11 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, 
xm11, q3333 pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 pmaddwd xm8, xm4, xm11 paddd xm5, xm6 paddd xm7, xm8 paddd xm5, xm13 paddd xm5, xm7 psrad xm5, 10 packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq], xm5, 0 add dstq, dsq dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w2_loop movq xm5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps xm3, xm0, q1032 ; 01 12 shufps xm0, xm2, q1032 ; 23 34 shufps xm2, xm4, q1032 ; 45 56 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 palignr xm1, xm5, xm1, 12 punpcklqdq xm1, xm1 ; 6 7 6 7 punpcklwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop .w2_skip_line: movhps xm5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova xm3, xm0 ; 01 12 mova xm0, xm2 ; 23 34 pshufb xm5, xm14 pmaddubsw xm5, xm15 phaddw xm5, xm5 pmulhrsw xm5, xm12 ; 6 7 6 7 palignr xm1, xm5, xm1, 8 ; 4 5 6 7 pshufd xm5, xm1, q0321 ; 5 6 7 _ punpcklwd xm2, xm1, xm5 ; 45 56 punpckhwd xm4, xm1, xm5 ; 67 __ jmp .w2_loop %endif .w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd xm15, xm0 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m0, m9 psrld m14, 10 movu xm7, [srcq+ssq*0] movu xm9, [srcq+ssq*1] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm8, [srcq+ssq*2] movu xm10, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vinserti128 m7, [srcq+ssq*0], 1 vinserti128 m9, [srcq+ssq*1], 1 vinserti128 m15, xm15, 1 vinserti128 m8, [srcq+ssq*2], 1 vinserti128 m10, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pblendvb m15, m11, m0 pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 phaddw m7, m9 phaddw m8, m10 pmulhrsw m7, m12 ; 0 1 4 5 pmulhrsw m8, m12 ; 2 3 6 7 vextracti128 xm9, m7, 1 ; 4 5 vextracti128 xm3, m8, 1 ; 6 7 shufps xm4, xm7, xm8, q1032 ; 1 2 shufps xm5, xm8, xm9, q1032 ; 3 4 shufps xm6, xm9, xm3, q1032 ; 5 6 psrldq xm11, xm3, 8 ; 7 _ punpcklwd xm0, xm7, xm4 ; 01 punpckhwd xm7, xm4 ; 12 punpcklwd xm1, xm8, xm5 ; 23 punpckhwd xm8, xm5 ; 34 punpcklwd xm2, xm9, xm6 ; 45 punpckhwd xm9, xm6 ; 56 punpcklwd xm3, xm11 ; 67 mova [rsp+0x00], xm7 mova [rsp+0x10], xm8 mova [rsp+0x20], xm9 .w4_loop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm10, r6q pmovsxbw xm10, xm10 pshufd xm7, xm10, q0000 pshufd xm8, xm10, q1111 pshufd xm9, xm10, q2222 pshufd xm10, xm10, q3333 pmaddwd xm4, xm0, xm7 pmaddwd xm5, xm1, xm8 pmaddwd xm6, xm2, xm9 pmaddwd xm7, xm3, xm10 paddd xm4, xm5 paddd xm6, xm7 paddd xm4, xm13 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 %ifidn %1, put packuswb xm4, xm4 movd [dstq], xm4 add dstq, dsq %else movq [tmpq], xm4 add tmpq, 8 %endif dec hd jz .ret add myd, dyd test myd, ~0x3ff jz .w4_loop movu xm4, [srcq] test myd, 0x400 jz .w4_skip_line mova xm0, [rsp+0x00] mova [rsp+0x00], xm1 mova xm1, [rsp+0x10] mova [rsp+0x10], xm2 mova xm2, [rsp+0x20] mova [rsp+0x20], xm3 pshufb xm4, xm14 pmaddubsw xm4, xm15 phaddw xm4, xm4 pmulhrsw xm4, xm12 punpcklwd xm3, xm11, xm4 mova xm11, xm4 add srcq, ssq jmp .w4_loop .w4_skip_line: movu xm5, [srcq+ssq*1] movu m6, [rsp+0x10] 
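; two new source rows are needed here: the spilled 12/34/56 row pairs move
; down one slot with a single 32-byte copy, and both fresh rows are filtered
; together (one phaddw combines them) before the 67 pair is rebuilt.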
pshufb xm4, xm14 pshufb xm5, xm14 pmaddubsw xm4, xm15 pmaddubsw xm5, xm15 movu [rsp+0x00], m6 phaddw xm4, xm5 pmulhrsw xm4, xm12 punpcklwd xm9, xm11, xm4 mova [rsp+0x20], xm9 psrldq xm11, xm4, 8 mova xm0, xm1 mova xm1, xm2 mova xm2, xm3 punpcklwd xm3, xm4, xm11 lea srcq, [srcq+ssq*2] jmp .w4_loop .w8: mov dword [rsp+48], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+48], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+48], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+48], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+48], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+72], t0d mov [rsp+56], srcq mov [rsp+64], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] jmp .hloop .hloop_prep: dec dword [rsp+48] jz .ret add qword [rsp+64], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+16] vpbroadcastd m15, [rsp+72] pxor m9, m9 mov srcq, [rsp+56] mov r0q, [rsp+64] ; dstq / tmpq .hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+16], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 mova [rsp], xm14 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b mov myd, mym mov dyd, dym pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq xm11, r6q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufd m8, m11, q2222 pshufd m11, m11, q3333 pmaddwd m6, m2, m8 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+52], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 add srcq, ssq 
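; one new source row: it is filtered below, then the 01/23/45/67 word-pair
; window rotates forward by one row (wswap shuffle + pblendw interleave with
; the freshly filtered line).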
mov myd, [rsp+52] mov dyd, dym pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .vloop .skip_line: mova m0, m1 mova m1, m2 mova m2, m3 vpbroadcastq m7, [srcq+r13] vpbroadcastq m8, [srcq+ rX] movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 add srcq, ssq movq xm5, [srcq+ r4] movq xm6, [srcq+ r6] movhps xm5, [srcq+ r7] movhps xm6, [srcq+ r9] vinserti128 m5, [srcq+r10], 1 vinserti128 m6, [srcq+r11], 1 vpbroadcastq m9, [srcq+r13] vpbroadcastq m11, [srcq+ rX] add srcq, ssq mov myd, [rsp+52] mov dyd, dym vpblendd m3, m7, 0xc0 vpblendd m4, m8, 0xc0 vpblendd m5, m9, 0xc0 vpblendd m6, m11, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 pmaddubsw m5, m15 pmaddubsw m6, m10 phaddw m3, m4 phaddw m5, m6 psrld m4, m3, 16 pslld m6, m5, 16 paddw m3, m4 paddw m5, m6 pblendw m3, m5, 0xaa pmulhrsw m3, m12 jmp .vloop .dy1: movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] movq xm1, [srcq+ssq*2] movhps xm0, [srcq+ssq*1] movhps xm1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 vinserti128 m0, [srcq+ssq*0], 1 vinserti128 m1, [srcq+ssq*2], 1 vpbroadcastq m2, [srcq+ssq*1] add srcq, ss3q movq xm10, r4q pmovsxbw xm10, xm10 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 pshufd xm8, xm10, q0000 pshufd xm9, xm10, q1111 pshufd xm11, xm10, q3333 pshufd xm10, xm10, q2222 vpblendd m0, m2, 0xc0 pshufb m1, m14 pshufb m0, m14 pmaddubsw m1, m15 pmaddubsw m0, m15 phaddw m0, m1 pmulhrsw m0, m12 vextracti128 xm1, m0, 1 palignr xm2, xm1, xm0, 4 pshufd xm4, xm1, q2121 punpcklwd xm3, xm0, xm2 ; 01 12 punpckhwd xm0, xm2 ; 23 34 punpcklwd xm2, xm1, xm4 ; 45 56 .dy1_w2_loop: movq xm1, [srcq+ssq*0] movhps xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd xm5, xm3, xm8 pmaddwd xm6, xm0, xm9 pmaddwd xm7, xm2, xm10 mova xm3, xm0 mova xm0, xm2 paddd xm5, xm13 paddd xm6, xm7 pshufb xm1, xm14 pmaddubsw xm1, xm15 phaddw xm1, xm1 pmulhrsw xm1, xm12 palignr xm7, xm1, xm4, 12 punpcklwd xm2, xm7, xm1 ; 67 78 pmaddwd xm7, xm2, xm11 mova xm4, xm1 paddd xm5, xm6 paddd xm5, xm7 psrad xm5, rndshift packssdw xm5, xm5 packuswb xm5, xm5 pextrw [dstq+dsq*0], xm5, 0 pextrw [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif .dy1_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 vpermq m8, m8, q3120 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r11d, xm15, 1 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] movu xm2, [srcq+ssq*0] movu xm3, [srcq+ssq*2] 
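; .dy1_w4: each of the 4 output columns has its own horizontal phase
; (mx+dx*[0-3]); the subpel fraction of each column selects its 4-tap filter
; (gathered via r4/r6/r11/r13 and merged into m15 below), while the integer
; offsets become a per-column byte shuffle in m14.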
vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pcmpeqd m8, m9 psrld m14, 10 pinsrd xm15, [base+subpel_filters+r11*8+2], 1 vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 vinserti128 m2, [srcq+ssq*1], 1 vinserti128 m3, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movu xm4, [srcq+ssq*0] movu xm5, [srcq+ssq*2] vinserti128 m4, [srcq+ssq*1], 1 add srcq, ss3q vpblendd m15, m7, 0x30 punpcklqdq m15, m15 pblendvb m15, m11, m8 movq xm10, r4q punpcklqdq xm10, xm10 pmovsxbw m10, xm10 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb xm5, xm14 vpermq m2, m2, q3120 vpermq m3, m3, q3120 vpermq m4, m4, q3120 vpermq m5, m5, q3120 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m2, m3 phaddw m4, m5 pmulhrsw m2, m12 pmulhrsw m4, m12 palignr m5, m4, m2, 4 pshufd m3, m4, q2121 punpcklwd m0, m2, m5 ; 01 12 punpckhwd m1, m2, m5 ; 23 34 punpcklwd m2, m4, m3 ; 45 56 .dy1_w4_loop: movu xm11, [srcq+ssq*0] vinserti128 m11, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 mova m0, m1 mova m1, m2 paddd m4, m13 paddd m5, m6 pshufb m11, m14 vpermq m11, m11, q3120 pmaddubsw m11, m15 phaddw m11, m11 pmulhrsw m11, m12 palignr m6, m11, m3, 12 punpcklwd m2, m6, m11 ; 67 78 mova m3, m11 pmaddwd m6, m2, m10 paddd m4, m5 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 pshuflw xm4, xm4, q3120 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else pshufd xm4, xm4, q3120 mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy1_w4_loop MC_8TAP_SCALED_RET .dy1_w8: mov dword [rsp+72], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+72], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+72], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+72], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+72], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+76], t0d mov [rsp+80], srcq mov [rsp+88], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+96], xm0 jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+72] jz .ret add qword [rsp+88], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp+32] vpbroadcastd m15, [rsp+76] pxor m9, m9 mov srcq, [rsp+80] mov r0q, [rsp+88] ; dstq / tmpq .dy1_hloop: vpbroadcastq m11, [base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp+32], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ 
rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movq [rsp+64], xm14 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b movu [rsp], m10 vpbroadcastd m8, [rsp+0x60] vpbroadcastd m9, [rsp+0x64] vpbroadcastd m10, [rsp+0x68] vpbroadcastd m11, [rsp+0x6c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b vbroadcasti128 m14, [base+wswap] .dy1_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m10 pmaddwd m7, m3, m11 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy1_hloop_prep movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, [rsp] phaddw m4, m5 pslld m5, m4, 16 paddw m4, m5 pmulhrsw m4, m12 pblendw m0, m1, 0xaa pblendw m1, m2, 0xaa pblendw m2, m3, 0xaa pblendw m3, m4, 0xaa jmp .dy1_vloop .dy2: movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: mov myd, mym movzx t0d, t0b dec srcq movd xm15, t0d punpckldq m8, m9, m8 paddd m14, m8 ; mx+dx*[0-1] vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 vbroadcasti128 m5, [base+bdct_lb_dw] vbroadcasti128 m6, [base+subpel_s_shuf2] vpbroadcastd m15, [base+subpel_filters+r4*8+2] vpbroadcastd m7, [base+subpel_filters+r6*8+2] pcmpeqd m8, m9 psrld m14, 10 movq xm0, [srcq+ssq*0] vpbroadcastq m2, [srcq+ssq*1] movhps xm0, [srcq+ssq*2] vpbroadcastq m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m14, m5 paddb m14, m6 vpblendd m15, m7, 0xaa pblendvb m15, m11, m8 movhps xm1, [srcq+ssq*0] vpbroadcastq m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vpblendd m0, m2, 0x30 vpblendd m1, m4, 0xc0 vpblendd m0, m3, 0xc0 pshufb m0, m14 pshufb m1, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 movq xm11, r4q pmovsxbw xm11, xm11 phaddw m0, m1 pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5 pshufd xm8, xm11, q0000 pshufd xm9, xm11, q1111 pshufd xm10, xm11, q2222 pshufd xm11, xm11, q3333 pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 01 23 punpckhwd xm2, xm1 ; 23 45 .dy2_w2_loop: movq xm6, [srcq+ssq*0] vpbroadcastq m7, [srcq+ssq*1] movhps xm6, [srcq+ssq*2] vpbroadcastq m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd xm4, xm3, xm8 pmaddwd xm5, xm2, xm9 vpblendd m6, m7, 0x30 vpblendd m6, m1, 0xc0 pshufb m6, m14 pmaddubsw m6, m15 phaddw m6, m6 pmulhrsw m6, m12 palignr m0, m6, m0, 8 pshufd m2, m0, q3221 vextracti128 xm1, m2, 1 punpcklwd xm3, xm2, xm1 ; 45 67 punpckhwd xm2, xm1 ; 67 89 pmaddwd xm6, xm3, xm10 pmaddwd xm7, 
xm2, xm11 paddd xm4, xm5 paddd xm4, xm13 paddd xm6, xm7 paddd xm4, xm6 psrad xm4, rndshift packssdw xm4, xm4 packuswb xm4, xm4 pextrw [dstq+dsq*0], xm4, 0 pextrw [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif .dy2_w4: mov myd, mym vbroadcasti128 m7, [base+rescale_mul] movzx t0d, t0b dec srcq movd xm15, t0d pmaddwd m8, m7 vpbroadcastd m11, [base+pd_0x4000] vpbroadcastd xm15, xm15 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd xm15, xm8 movd r4d, xm15 pextrd r6d, xm15, 1 pextrd r11d, xm15, 2 pextrd r13d, xm15, 3 movd xm15, [base+subpel_filters+r4*8+2] vbroadcasti128 m5, [base+bdct_lb_dw] vpbroadcastq m6, [base+subpel_s_shuf2] pinsrd xm15, [base+subpel_filters+r6*8+2], 1 pcmpeqd m8, m9 psrld m14, 10 movu xm0, [srcq+ssq*0] movu xm2, [srcq+ssq*2] pinsrd xm15, [base+subpel_filters+r11*8+2], 2 movu xm1, [srcq+ssq*1] movu xm3, [srcq+ss3q ] pinsrd xm15, [base+subpel_filters+r13*8+2], 3 lea srcq, [srcq+ssq*4] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] vinserti128 m15, xm15, 1 pshufb m14, m5 paddb m14, m6 vinserti128 m2, [srcq+ssq*0], 1 vinserti128 m3, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pblendvb m15, m11, m8 pshufb xm0, xm14 pshufb m2, m14 pshufb xm1, xm14 pshufb m3, m14 pmaddubsw xm0, xm15 pmaddubsw m2, m15 pmaddubsw xm1, xm15 pmaddubsw m3, m15 movq xm11, r4q punpcklqdq xm11, xm11 pmovsxbw m11, xm11 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 punpcklwd xm2, xm0, xm1 punpckhwd m1, m0, m1 ; 23 45 vinserti128 m0, m2, xm1, 1 ; 01 23 .dy2_w4_loop: movu xm6, [srcq+ssq*0] movu xm7, [srcq+ssq*1] vinserti128 m6, [srcq+ssq*2], 1 vinserti128 m7, [srcq+ss3q ], 1 lea srcq, [srcq+ssq*4] pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 psrld m2, m6, 16 pslld m3, m7, 16 paddw m6, m2 paddw m7, m3 pblendw m6, m7, 0xaa ; 67 89 pmulhrsw m6, m12 paddd m4, m5 vperm2i128 m0, m1, m6, 0x21 ; 45 67 mova m1, m6 pmaddwd m6, m0, m10 pmaddwd m7, m1, m11 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movd [dstq+dsq*0], xm4 pextrd [dstq+dsq*1], xm4, 1 lea dstq, [dstq+dsq*2] %else mova [tmpq], xm4 add tmpq, 16 %endif sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET .dy2_w8: mov dword [rsp+40], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+40], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+40], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+40], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+40], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pmaddwd m8, [base+rescale_mul] movd xm15, t0d mov [rsp+64], t0d mov [rsp+48], srcq mov [rsp+56], r0q ; dstq / tmpq %if UNIX64 mov hm, hd %endif shl dword dxm, 3 ; dx*8 vpbroadcastd m15, xm15 paddd m14, m8 ; mx+dx*[0-7] movq xm0, r4q pmovsxbw xm0, xm0 mova [rsp+0x50], xm0 jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+40] jz .ret add qword [rsp+56], 8*(isprep+1) mov hd, hm vpbroadcastd m8, dxm vpbroadcastd m10, [base+pd_0x3ff] paddd m14, m8, [rsp] vpbroadcastd m15, [rsp+64] pxor m9, m9 mov srcq, [rsp+48] mov r0q, [rsp+56] ; dstq / tmpq .dy2_hloop: vpbroadcastq m11, 
[base+pq_0x40000000] pand m6, m14, m10 psrld m6, 6 paddd m15, m6 pcmpeqd m6, m9 vextracti128 xm7, m15, 1 movd r4d, xm15 pextrd r6d, xm15, 2 pextrd r7d, xm15, 1 pextrd r9d, xm15, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 movu [rsp], m14 movq xm15, [base+subpel_filters+ r4*8] movq xm10, [base+subpel_filters+ r6*8] movhps xm15, [base+subpel_filters+ r7*8] movhps xm10, [base+subpel_filters+ r9*8] vinserti128 m15, [base+subpel_filters+r10*8], 1 vinserti128 m10, [base+subpel_filters+r11*8], 1 vpbroadcastq m9, [base+subpel_filters+r13*8] vpbroadcastq m8, [base+subpel_filters+ rX*8] psrld m14, 10 vextracti128 xm7, m14, 1 movd r4d, xm14 pextrd r6d, xm14, 2 pextrd r7d, xm14, 1 pextrd r9d, xm14, 3 movd r10d, xm7 pextrd r11d, xm7, 2 pextrd r13d, xm7, 1 pextrd rXd, xm7, 3 pshufd m5, m6, q1100 pshufd m6, m6, q3322 vpblendd m15, m9, 0xc0 vpblendd m10, m8, 0xc0 pblendvb m15, m11, m5 pblendvb m10, m11, m6 vbroadcasti128 m14, [base+subpel_s_shuf8] MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b vpbroadcastd m8, [rsp+0x50] vpbroadcastd m9, [rsp+0x54] vpbroadcastd m11, [rsp+0x58] vpbroadcastd m4, [rsp+0x5c] pshufb m0, m14 ; 01a 01b pshufb m1, m14 ; 23a 23b pshufb m2, m14 ; 45a 45b pshufb m3, m14 ; 67a 67b SWAP m14, m4 .dy2_vloop: pmaddwd m4, m0, m8 pmaddwd m5, m1, m9 pmaddwd m6, m2, m11 pmaddwd m7, m3, m14 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 psrad m4, rndshift vextracti128 xm5, m4, 1 packssdw xm4, xm5 %ifidn %1, put packuswb xm4, xm4 movq [dstq], xm4 add dstq, dsm %else mova [tmpq], xm4 add tmpq, tmp_stridem %endif dec hd jz .dy2_hloop_prep mova m0, m1 mova m1, m2 mova m2, m3 movq xm3, [srcq+ r4] movq xm4, [srcq+ r6] movhps xm3, [srcq+ r7] movhps xm4, [srcq+ r9] vinserti128 m3, [srcq+r10], 1 vinserti128 m4, [srcq+r11], 1 vpbroadcastq m5, [srcq+r13] vpbroadcastq m6, [srcq+ rX] add srcq, ssq vpblendd m3, m5, 0xc0 vpblendd m4, m6, 0xc0 pmaddubsw m3, m15 pmaddubsw m4, m10 phaddw m3, m4 movq xm4, [srcq+ r4] movq xm5, [srcq+ r6] movhps xm4, [srcq+ r7] movhps xm5, [srcq+ r9] vinserti128 m4, [srcq+r10], 1 vinserti128 m5, [srcq+r11], 1 vpbroadcastq m6, [srcq+r13] vpbroadcastq m7, [srcq+ rX] add srcq, ssq vpblendd m4, m6, 0xc0 vpblendd m5, m7, 0xc0 pmaddubsw m4, m15 pmaddubsw m5, m10 phaddw m4, m5 psrld m5, m3, 16 pslld m6, m4, 16 paddw m3, m5 paddw m4, m6 pblendw m3, m4, 0xaa pmulhrsw m3, m12 jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_8bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, t0d jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %else DECLARE_REG_TMP 6, 8 %endif %define PUT_8TAP_SCALED_FN FN put_8tap_scaled, %define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, BILIN_SCALED_FN put PUT_8TAP_SCALED_FN sharp, SHARP, SHARP PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %else DECLARE_REG_TMP 6, 7 %endif BILIN_SCALED_FN prep PREP_8TAP_SCALED_FN sharp, SHARP, SHARP PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH 
PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR MC_8TAP_SCALED prep %macro WARP_V 5 ; dst, 02, 46, 13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm8, [filterq+myq *8] vinserti128 m8, [filterq+tmp1q*8], 1 ; a e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; b f lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq xm9, [filterq+myq *8] vinserti128 m9, [filterq+tmp1q*8], 1 ; c g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma shr tmp2d, 10 shr tmp1d, 10 punpcklwd m8, m0 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 ; d h punpcklwd m0, m9, m0 punpckldq m9, m8, m0 punpckhdq m0, m8, m0 punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 pmaddwd m%2, m8 pmaddwd m9, m%3 punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 pmaddwd m8, m%4 pmaddwd m0, m%5 paddd m%2, m9 paddd m0, m8 paddd m%1, m0, m%2 %endmacro cglobal warp_affine_8x8t_8bpc, 0, 14, 0, tmp, ts %if WIN64 sub rsp, 0xa0 %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main .loop: psrad m7, 13 psrad m0, 13 packssdw m7, m0 pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7 vpermq m7, m7, q3120 mova [tmpq+tsq*0], xm7 vextracti128 [tmpq+tsq*2], m7, 1 dec r4d jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).end call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx2).main2 lea tmpq, [tmpq+tsq*4] jmp .loop cglobal warp_affine_8x8_8bpc, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \ beta, filter, tmp1, delta, my, gamma %if WIN64 sub rsp, 0xa0 %assign xmm_regs_used 16 %assign stack_size_padded 0xa0 %assign stack_offset stack_offset+stack_size_padded %endif call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m7, 18 psrad m0, 18 packusdw m7, m0 pavgw m7, m11 ; (x + (1 << 10)) >> 11 vextracti128 xm0, m7, 1 packuswb xm7, xm0 pshufd xm7, xm7, q3120 movq [dstq+dsq*0], xm7 movhps [dstq+dsq*1], xm7 dec r4d jg .loop .end: RET ALIGN function_align .main: ; Stack args offset by one (r4m -> r5m etc.) 
due to call %if WIN64 mov abcdq, r5m mov mxd, r6m movaps [rsp+stack_offset+0x10], xmm6 movaps [rsp+stack_offset+0x20], xmm7 movaps [rsp+0x28], xmm8 movaps [rsp+0x38], xmm9 movaps [rsp+0x48], xmm10 movaps [rsp+0x58], xmm11 movaps [rsp+0x68], xmm12 movaps [rsp+0x78], xmm13 movaps [rsp+0x88], xmm14 movaps [rsp+0x98], xmm15 %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] mova m12, [warp_8x8_shufA] mova m13, [warp_8x8_shufB] vpbroadcastd m14, [pw_8192] vpbroadcastd m15, [pd_32768] pxor m11, m11 lea filterq, [mc_warp_filter2] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 sub betad, tmp2d ; beta -= alpha*3 mov myd, r7m call .h psrld m1, m0, 16 call .h psrld m4, m0, 16 call .h pblendw m1, m0, 0xaa ; 02 call .h pblendw m4, m0, 0xaa ; 13 call .h psrld m2, m1, 16 pblendw m2, m0, 0xaa ; 24 call .h psrld m5, m4, 16 pblendw m5, m0, 0xaa ; 35 call .h psrld m3, m2, 16 pblendw m3, m0, 0xaa ; 46 movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] add myd, 512+(64<<10) mov r4d, 4 lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 .main2: call .h psrld m6, m5, 16 pblendw m6, m0, 0xaa ; 57 WARP_V 7, 1, 3, 4, 6 call .h mova m1, m2 mova m2, m3 psrld m3, 16 pblendw m3, m0, 0xaa ; 68 WARP_V 0, 4, 6, 1, 3 mova m4, m5 mova m5, m6 ret ALIGN function_align .h: lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] vbroadcasti128 m10, [srcq] shr mxd, 10 shr tmp1d, 10 movq xm8, [filterq+mxq *8] vinserti128 m8, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] shr tmp2d, 10 shr tmp1d, 10 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 movq xm9, [filterq+mxq *8] vinserti128 m9, [filterq+tmp1q*8], 1 lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 punpcklqdq m8, m0 ; 0 1 4 5 movq xm0, [filterq+tmp2q*8] vinserti128 m0, [filterq+tmp1q*8], 1 punpcklqdq m9, m0 ; 2 3 6 7 pshufb m0, m10, m12 pmaddubsw m0, m8 pshufb m10, m13 pmaddubsw m10, m9 add srcq, ssq phaddw m0, m10 pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword ret %macro BIDIR_FN 1 ; op %1 0 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 4 je .ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 cmp hd, 8 je .ret %1 2 lea dstq, [dstq+strideq*4] vextracti128 xm1, m0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .ret: RET .w8_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*4] .w8: vextracti128 xm1, m0, 1 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 sub hd, 4 jg .w8_loop RET .w16_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq*4] .w16: vpermq m0, m0, q3120 mova [dstq ], xm0 vextracti128 [dstq+strideq*1], m0, 1 %1 2 vpermq m0, m0, q3120 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 sub hd, 4 jg .w16_loop RET .w32_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq*2] .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 %1 2 vpermq m0, m0, q3120 mova 
[dstq+strideq*1], m0 sub hd, 2 jg .w32_loop RET .w64_loop: %1_INC_PTR 4 %1 0 add dstq, strideq .w64: vpermq m0, m0, q3120 mova [dstq], m0 %1 2 vpermq m0, m0, q3120 mova [dstq+32], m0 dec hd jg .w64_loop RET .w128_loop: %1 0 add dstq, strideq .w128: vpermq m0, m0, q3120 mova [dstq+0*32], m0 %1 2 vpermq m0, m0, q3120 mova [dstq+1*32], m0 %1_INC_PTR 8 %1 -4 vpermq m0, m0, q3120 mova [dstq+2*32], m0 %1 -2 vpermq m0, m0, q3120 mova [dstq+3*32], m0 dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset mova m0, [tmp1q+(%1+0)*32] paddw m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] paddw m1, [tmp2q+(%1+1)*32] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*32 add tmp2q, %1*32 %endmacro cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg %+ SUFFIX %+ _table lea r6, [avg %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m2, [base+pw_1024] add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp1q+(%1+0)*32] psubw m2, m0, [tmp2q+(%1+0)*32] mova m1, [tmp1q+(%1+1)*32] psubw m3, m1, [tmp2q+(%1+1)*32] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg %+ SUFFIX %+ _table lea r6, [w_avg %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight movsxd wq, dword [r6+wq*4] vpbroadcastd m5, [base+pw_2048] psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q pxor m0, m0 mov tmp1q, tmp2q psubw m4, m0, m4 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 vpermq m3, [maskq+%1*16], q3120 mova m0, [tmp2q+(%1+0)*32] psubw m1, m0, [tmp1q+(%1+0)*32] psubb m3, m4, m3 paddw m1, m1 ; (b - a) << 1 paddb m3, m3 punpcklbw m2, m4, m3 ; -m << 9 pmulhw m1, m2 paddw m0, m1 mova m1, [tmp2q+(%1+1)*32] psubw m2, m1, [tmp1q+(%1+1)*32] paddw m2, m2 punpckhbw m3, m4, m3 pmulhw m2, m3 paddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*16 add tmp2q, %1*32 add tmp1q, %1*32 %endmacro cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask %+ SUFFIX %+ _table lea r7, [mask %+ SUFFIX %+ _table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m5, [base+pw_2048] pxor m4, m4 add wq, r7 BIDIR_FN MASK %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 mova m%1, [tmp1q+32*%3] mova m1, [tmp2q+32*%3] psubw m1, m%1 pabsw m%2, m1 psubusw m%2, m6, m%2 psrlw m%2, 8 ; 64 - m psllw m2, m%2, 10 pmulhw m1, m2 paddw m%1, m1 mova m1, [tmp1q+32*%4] mova m2, [tmp2q+32*%4] psubw m2, m1 pabsw m3, m2 psubusw m3, m6, m3 psrlw m3, 8 %if %5 packuswb m%2, m3 psubb m%2, m5, m%2 vpermq m%2, m%2, q3120 %else phaddw m%2, m3 %endif psllw m3, 10 pmulhw m2, m3 paddw m1, m2 pmulhrsw m%1, m7 pmulhrsw m1, m7 packuswb m%1, m1 %endmacro cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_avx2_table lea r6, [blend_avx2_table] tzcnt wd, wm movifnidn maskq, maskmp 
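; blend note (derived from the code below): dst = (dst*(64-m) + tmp*m + 32) >> 6;
; the interleaved (64-m, m) byte pairs feed pmaddubsw and the pw_512 pmulhrsw
; supplies the +32 rounding and the >> 6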
movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pb_64] vpbroadcastd m5, [base+pw_512] sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq .w4: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 vpbroadcastd xm1, [dstq+dsq*2] pinsrd xm1, [dstq+r6 ], 3 mova xm6, [maskq] psubb xm3, xm4, xm6 punpcklbw xm2, xm3, xm6 punpckhbw xm3, xm6 mova xm6, [maskq+tmpq] add maskq, 4*4 punpcklbw xm0, xm6 punpckhbw xm1, xm6 pmaddubsw xm0, xm2 pmaddubsw xm1, xm3 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 packuswb xm0, xm1 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 pextrd [dstq+dsq*2], xm0, 2 pextrd [dstq+r6 ], xm0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET ALIGN function_align .w8: movq xm1, [dstq+dsq*0] movhps xm1, [dstq+dsq*1] vpbroadcastq m2, [dstq+dsq*2] vpbroadcastq m3, [dstq+r6 ] mova m0, [maskq] mova m6, [maskq+tmpq] add maskq, 8*4 vpblendd m1, m2, 0x30 vpblendd m1, m3, 0xc0 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 vextracti128 xm1, m0, 1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 movq [dstq+dsq*2], xm1 movhps [dstq+r6 ], xm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 RET ALIGN function_align .w16: mova m0, [maskq] mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 mova m6, [maskq+tmpq] add maskq, 16*2 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16 RET ALIGN function_align .w32: mova m0, [maskq] mova m1, [dstq] mova m6, [maskq+tmpq] add maskq, 32 psubb m3, m4, m0 punpcklbw m2, m3, m0 punpckhbw m3, m0 punpcklbw m0, m1, m6 punpckhbw m1, m6 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_avx2_table lea r5, [blend_v_avx2_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_avx2_table jmp wq .w2: vpbroadcastd xm2, [maskq+2*2] .w2_s0_loop: movd xm0, [dstq+dsq*0] pinsrw xm0, [dstq+dsq*1], 1 movd xm1, [tmpq] add tmpq, 2*2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_s0_loop RET ALIGN function_align .w4: vpbroadcastq xm2, [maskq+4*2] .w4_loop: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 movq xm1, [tmpq] add tmpq, 4*2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET ALIGN function_align .w8: mova xm3, [maskq+8*2] .w8_loop: movq xm0, [dstq+dsq*0] vpbroadcastq xm1, [dstq+dsq*1] mova xm2, [tmpq] add tmpq, 8*2 punpcklbw xm0, xm2 punpckhbw xm1, xm2 pmaddubsw xm0, xm3 pmaddubsw xm1, xm3 pmulhrsw xm0, xm5 pmulhrsw xm1, xm5 packuswb xm0, xm1 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m3, [maskq+16*2] vbroadcasti128 m4, [maskq+16*3] .w16_loop: mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 mova m2, [tmpq] add tmpq, 16*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova 
[dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET ALIGN function_align .w32: mova xm3, [maskq+16*4] vinserti128 m3, [maskq+16*6], 1 mova xm4, [maskq+16*5] vinserti128 m4, [maskq+16*7], 1 .w32_loop: mova m1, [dstq] mova m2, [tmpq] add tmpq, 32 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .w32_loop RET cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mask %define base r5-blend_h_avx2_table lea r5, [blend_h_avx2_table] mov r6d, wd tzcnt wd, wd mov hd, hm movsxd wq, dword [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xm0, [dstq+dsq*0] pinsrw xm0, [dstq+dsq*1], 1 movd xm2, [maskq+hq*2] movd xm1, [tmpq] add tmpq, 2*2 punpcklwd xm2, xm2 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 pextrw [dstq+dsq*0], xm0, 0 pextrw [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET ALIGN function_align .w4: mova xm3, [blend_shuf] .w4_loop: movd xm0, [dstq+dsq*0] pinsrd xm0, [dstq+dsq*1], 1 movd xm2, [maskq+hq*2] movq xm1, [tmpq] add tmpq, 4*2 pshufb xm2, xm3 punpcklbw xm0, xm1 pmaddubsw xm0, xm2 pmulhrsw xm0, xm5 packuswb xm0, xm0 movd [dstq+dsq*0], xm0 pextrd [dstq+dsq*1], xm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET ALIGN function_align .w8: vbroadcasti128 m4, [blend_shuf] shufpd m4, m4, 0x03 .w8_loop: vpbroadcastq m1, [dstq+dsq*0] movq xm0, [dstq+dsq*1] vpblendd m0, m1, 0x30 vpbroadcastd m3, [maskq+hq*2] movq xm1, [tmpq+8*1] vinserti128 m1, [tmpq+8*0], 1 add tmpq, 8*2 pshufb m3, m4 punpcklbw m0, m1 pmaddubsw m0, m3 pmulhrsw m0, m5 vextracti128 xm1, m0, 1 packuswb xm0, xm1 movhps [dstq+dsq*0], xm0 movq [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop RET ALIGN function_align .w16: vbroadcasti128 m4, [blend_shuf] shufpd m4, m4, 0x0c .w16_loop: mova xm1, [dstq+dsq*0] vinserti128 m1, [dstq+dsq*1], 1 vpbroadcastd m3, [maskq+hq*2] mova m2, [tmpq] add tmpq, 16*2 pshufb m3, m4 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET ALIGN function_align .w32: ; w32/w64/w128 sub dsq, r6 .w32_loop0: vpbroadcastw m3, [maskq+hq*2] mov wd, r6d .w32_loop: mova m1, [dstq] mova m2, [tmpq] add tmpq, 32 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, 32 sub wd, 32 jg .w32_loop add dstq, dsq inc hq jl .w32_loop0 RET cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ bottomext, rightext ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor r12d, r12d lea r10, [ihq-1] cmp yq, ihq cmovs r10, yq test yq, yq cmovs r10, r12 imul r10, sstrideq add srcq, r10 ; ref += iclip(x, 0, iw - 1) lea r10, [iwq-1] cmp xq, iwq cmovs r10, xq test xq, xq cmovs r10, r12 add srcq, r10 ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) lea bottomextq, [yq+bhq] sub bottomextq, ihq lea r3, [bhq-1] cmovs bottomextq, r12 DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \ bottomext, rightext ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, r12 cmp bottomextq, bhq cmovns bottomextq, r3 
cmp topextq, bhq cmovg topextq, r3 ; right_ext = iclip(x + bw - iw, 0, bw - 1) lea rightextq, [xq+bwq] sub rightextq, iwq lea r2, [bwq-1] cmovs rightextq, r12 DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \ bottomext, rightext ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, r12 cmp rightextq, bwq cmovns rightextq, r2 cmp leftextq, bwq cmovns leftextq, r2 DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \ dst, dstride, src, sstride, bottomext, rightext ; center_h = bh - top_ext - bottom_ext lea r3, [bottomextq+topextq] sub centerhq, r3 ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq imul r2, dstrideq add dstq, r2 mov r9m, dstq ; center_w = bw - left_ext - right_ext mov centerwq, bwq lea r3, [rightextq+leftextq] sub centerwq, r3 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 ; left extension xor r3, r3 vpbroadcastb m0, [srcq] .left_loop_%3: mova [dstq+r3], m0 add r3, 32 cmp r3, leftextq jl .left_loop_%3 ; body lea r12, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: movu m0, [srcq+r3] %if %1 movu [r12+r3], m0 %else movu [dstq+r3], m0 %endif add r3, 32 cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add r12, centerwq %else lea r12, [dstq+centerwq] %endif xor r3, r3 vpbroadcastb m0, [srcq+centerwq-1] .right_loop_%3: movu [r12+r3], m0 add r3, 32 cmp r3, rightextq jl .right_loop_%3 %endif add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %endmacro test leftextq, leftextq jnz .need_left_ext test rightextq, rightextq jnz .need_right_ext v_loop 0, 0, 0 jmp .body_done .need_left_ext: test rightextq, rightextq jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; bottom edge extension test bottomextq, bottomextq jz .top mov srcq, dstq sub srcq, dstrideq xor r1, r1 .bottom_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq .bottom_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .bottom_y_loop add r1, 32 cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end mov srcq, r9m mov dstq, dstm xor r1, r1 .top_x_loop: mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, topextq .top_y_loop: mova [r3], m0 add r3, dstrideq dec r4 jg .top_y_loop add r1, 32 cmp r1, bwq jl .top_x_loop .end: RET cglobal resize_8bpc, 6, 12, 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 sub dword mx0m, 4<<14 sub dword src_wm, 8 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ vpbroadcastd xm3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti128 m15, [base+pb_8x0_8x8] pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] pslld m5, 3 ; dx*8 pslld m6, 14 paddd m8, m2 ; mx+[0..7]*dx pxor m2, m2 ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset pand m9, m7 ; filter offset (masked) ; load source pixels - this ugly code is vpgatherdq emulation since ; directly using vpgatherdq on Haswell is quite a bit slower :( movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vextracti128 xm0, m0, 1 movq xm12, 
[srcq+r8] movq xm13, [srcq+r10] movhps xm12, [srcq+r9] movhps xm13, [srcq+r11] movd r8d, xm0 pextrd r9d, xm0, 1 pextrd r10d, xm0, 2 pextrd r11d, xm0, 3 vinserti128 m12, [srcq+r8], 1 vinserti128 m13, [srcq+r10], 1 vpbroadcastq m10, [srcq+r9] vpbroadcastq m11, [srcq+r11] vpblendd m12, m10, 11000000b vpblendd m13, m11, 11000000b ; if no emulation is required, we don't need to shuffle or emulate edges ; this also saves 2 quasi-vpgatherdqs vptest m1, m1 jz .filter movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vextracti128 xm1, m1, 1 movq xm14, [base+resize_shuf+4+r8] movq xm0, [base+resize_shuf+4+r10] movhps xm14, [base+resize_shuf+4+r9] movhps xm0, [base+resize_shuf+4+r11] movq r9, xm1 pextrq r11, xm1, 1 movsxd r8, r9d sar r9, 32 movsxd r10, r11d sar r11, 32 vinserti128 m14, [base+resize_shuf+4+r8], 1 vinserti128 m0, [base+resize_shuf+4+r10], 1 vpbroadcastq m10, [base+resize_shuf+4+r9] vpbroadcastq m11, [base+resize_shuf+4+r11] vpblendd m14, m10, 11000000b vpblendd m0, m11, 11000000b paddb m14, m15 paddb m0, m15 pshufb m12, m14 pshufb m13, m0 .filter: movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vextracti128 xm9, m9, 1 movq xm10, [base+resize_filter+r8*8] movq xm11, [base+resize_filter+r10*8] movhps xm10, [base+resize_filter+r9*8] movhps xm11, [base+resize_filter+r11*8] movd r8d, xm9 pextrd r9d, xm9, 1 pextrd r10d, xm9, 2 pextrd r11d, xm9, 3 vinserti128 m10, [base+resize_filter+r8*8], 1 vinserti128 m11, [base+resize_filter+r10*8], 1 vpbroadcastq m14, [base+resize_filter+r9*8] vpbroadcastq m1, [base+resize_filter+r11*8] vpblendd m10, m14, 11000000b vpblendd m11, m1, 11000000b pmaddubsw m12, m10 pmaddubsw m13, m11 phaddw m12, m13 vextracti128 xm13, m12, 1 phaddsw xm12, xm13 pmulhrsw xm12, xm3 ; x=(x+64)>>7 packuswb xm12, xm12 movq [dstq+xq], xm12 paddd m4, m5 add xd, 8 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx2_table lea r7, [w_mask_420_avx2_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m9, [base+deint_shuf4] vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign add wq, r7 W_MASK 0, 4, 0, 1 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 jg .w4_h16 .w4_end: vextracti128 xm0, m4, 1 vpblendd xm1, xm4, xm0, 0x05 vpblendd xm4, xm0, 0x0a pshufd xm1, xm1, q2301 psubw xm4, xm8, xm4 psubw xm4, xm1 psrlw xm4, 2 packuswb xm4, xm4 movq [maskq], xm4 RET .w4_h16: W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] phaddd m4, m5 vextracti128 xm1, m0, 1 psubw m4, m8, m4 psrlw m4, 2 vpermd m4, m9, m4 vextracti128 xm5, m4, 1 packuswb xm4, xm5 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], xm4 RET .w8_loop: add tmp1q, 2*32 add tmp2q, 2*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 8 .w8: 
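; each 2x2 block of luma weights collapses to one 4:2:0 chroma weight:
; (m00 + m01 + m10 + m11 + 2 - sign) >> 2, evaluated here as
; ((258 - sign) - the pair-summed (64 - m) values) >> 2 using the m8 constant above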
vextracti128 xm2, m4, 1 vextracti128 xm1, m0, 1 psubw xm4, xm8, xm4 psubw xm4, xm2 psrlw xm4, 2 packuswb xm4, xm4 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 movq [maskq], xm4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 punpckhqdq m1, m4, m5 punpcklqdq m4, m5 psubw m1, m8, m1 psubw m1, m4 psrlw m1, 2 vpermq m0, m0, q3120 packuswb m1, m1 vpermd m1, m9, m1 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], xm1 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 16 .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 psubw m4, m8, m4 psubw m4, m5 psrlw m4, 2 vpermq m0, m0, q3120 packuswb m4, m4 vpermd m4, m9, m4 mova [dstq+strideq*1], m0 mova [maskq], xm4 sub hd, 2 jg .w32_loop RET .w64_loop_even: psubw m10, m8, m4 psubw m11, m8, m5 dec hd .w64_loop: add tmp1q, 4*32 add tmp2q, 4*32 W_MASK 0, 4, 0, 1 add dstq, strideq .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 vpermq m0, m0, q3120 mova [dstq+32*1], m0 test hd, 1 jz .w64_loop_even psubw m4, m10, m4 psubw m5, m11, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq], m4 add maskq, 32 dec hd jg .w64_loop RET .w128_loop_even: psubw m12, m8, m4 psubw m13, m8, m5 dec hd .w128_loop: W_MASK 0, 4, 0, 1 add dstq, strideq .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 vpermq m0, m0, q3120 mova [dstq+32*1], m0 add tmp1q, 8*32 add tmp2q, 8*32 test hd, 1 jz .w128_even psubw m4, m10, m4 psubw m5, m11, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq+32*0], m4 jmp .w128_odd .w128_even: psubw m10, m8, m4 psubw m11, m8, m5 .w128_odd: W_MASK 0, 4, -4, -3 vpermq m0, m0, q3120 mova [dstq+32*2], m0 W_MASK 0, 5, -2, -1 vpermq m0, m0, q3120 mova [dstq+32*3], m0 test hd, 1 jz .w128_loop_even psubw m4, m12, m4 psubw m5, m13, m5 psrlw m4, 2 psrlw m5, 2 packuswb m4, m5 vpermd m4, m9, m4 mova [maskq+32*1], m4 add maskq, 64 dec hd jg .w128_loop RET cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx2_table lea r7, [w_mask_422_avx2_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm pxor m9, m9 movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] pmovzxbd m10, [base+deint_shuf4] vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign add wq, r7 mov maskq, maskmp W_MASK 0, 4, 0, 1 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 jg .w4_h16 .w4_end: vextracti128 xm5, m4, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 mova [maskq], xm5 RET .w4_h16: W_MASK 0, 5, 2, 3 lea dstq, [dstq+strideq*4] packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermd m5, m10, m5 vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd 
[dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq], m5 RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 16 .w8: vextracti128 xm5, m4, 1 vextracti128 xm1, m0, 1 packuswb xm4, xm5 psubb xm5, xm8, xm4 pavgb xm5, xm9 pshufd xm5, xm5, q3120 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], xm5 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+strideq*2], xm0 vextracti128 [dstq+stride3q ], m0, 1 mova [maskq], m5 sub hd, 4 jg .w16_loop RET .w32_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w32: vpermq m0, m0, q3120 mova [dstq+strideq*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+strideq*1], m0 mova [maskq], m5 sub hd, 2 jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1 add dstq, strideq add maskq, 32 .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*1], m0 mova [maskq], m5 dec hd jg .w64_loop RET .w128_loop: add tmp1q, 32*8 add tmp2q, 32*8 W_MASK 0, 4, 0, 1 add dstq, strideq add maskq, 32*2 .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 W_MASK 0, 5, 2, 3 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*1], m0 mova [maskq+32*0], m5 W_MASK 0, 4, 4, 5 vpermq m0, m0, q3120 mova [dstq+32*2], m0 W_MASK 0, 5, 6, 7 packuswb m4, m5 psubb m5, m8, m4 pavgb m5, m9 vpermq m0, m0, q3120 vpermd m5, m10, m5 mova [dstq+32*3], m0 mova [maskq+32*1], m5 dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx2_table lea r7, [w_mask_444_avx2_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m5, [base+pb_64] vpbroadcastd m7, [base+pw_2048] add wq, r7 W_MASK 0, 4, 0, 1, 1 lea stride3q, [strideq*3] jmp wq .w4: vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 mova [maskq+32*0], m4 cmp hd, 8 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 je .w4_end W_MASK 0, 4, 2, 3, 1 lea dstq, [dstq+strideq*4] vextracti128 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 mova [maskq+32*1], m4 .w4_end: RET .w8_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 lea dstq, [dstq+strideq*4] add maskq, 32 .w8: vextracti128 xm1, m0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 mova [maskq], m4 sub hd, 4 jg .w8_loop RET .w16_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 
0, 1, 1 lea dstq, [dstq+strideq*2] add maskq, 32 .w16: vpermq m0, m0, q3120 mova [dstq+strideq*0], xm0 vextracti128 [dstq+strideq*1], m0, 1 mova [maskq], m4 sub hd, 2 jg .w16_loop RET .w32_loop: add tmp1q, 32*2 add tmp2q, 32*2 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32 .w32: vpermq m0, m0, q3120 mova [dstq], m0 mova [maskq], m4 dec hd jg .w32_loop RET .w64_loop: add tmp1q, 32*4 add tmp2q, 32*4 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32*2 .w64: vpermq m0, m0, q3120 mova [dstq+32*0], m0 mova [maskq+32*0], m4 W_MASK 0, 4, 2, 3, 1 vpermq m0, m0, q3120 mova [dstq+32*1], m0 mova [maskq+32*1], m4 dec hd jg .w64_loop RET .w128_loop: add tmp1q, 32*8 add tmp2q, 32*8 W_MASK 0, 4, 0, 1, 1 add dstq, strideq add maskq, 32*4 .w128: vpermq m0, m0, q3120 mova [dstq+32*0], m0 mova [maskq+32*0], m4 W_MASK 0, 4, 2, 3, 1 vpermq m0, m0, q3120 mova [dstq+32*1], m0 mova [maskq+32*1], m4 W_MASK 0, 4, 4, 5, 1 vpermq m0, m0, q3120 mova [dstq+32*2], m0 mova [maskq+32*2], m4 W_MASK 0, 4, 6, 7, 1 vpermq m0, m0, q3120 mova [dstq+32*3], m0 mova [maskq+32*3], m4 dec hd jg .w128_loop RET %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/mc_avx512.asm000064400000000000000000005001651046102023000143010ustar 00000000000000; Copyright © 2020, VideoLAN and dav1d authors ; Copyright © 2020, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
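; This file carries the AVX-512 (Ice Lake) motion-compensation kernels:
; put/prep for bilinear and 8-tap subpel filters, the bidirectional averagers
; (avg, w_avg, mask), w_mask 420/422/444, blend/blend_v/blend_h, 8x8 affine
; warp and resize, most of them dispatched through the width jump tables
; defined below.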
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 64 obmc_masks: pw_512: times 2 dw 512 ; 2 db 45, 19, 64, 0 ; 4 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0 warp_8x8_permA: db 4, 5, 6, 7, 16, 17, 18, 19, 5, 6, 7, 8, 17, 18, 19, 20 db 6, 7, 8, 9, 18, 19, 20, 21, 7, 8, 9, 10, 19, 20, 21, 22 db 8, 9, 10, 11, 20, 21, 22, 23, 9, 10, 11, 12, 21, 22, 23, 24 db 10, 11, 12, 13, 22, 23, 24, 25, 11, 12, 13, 14, 23, 24, 25, 26 warp_8x8_permB: db 0, 1, 2, 3, 20, 21, 22, 23, 1, 2, 3, 4, 21, 22, 23, 24 db 2, 3, 4, 5, 22, 23, 24, 25, 3, 4, 5, 6, 23, 24, 25, 26 db 4, 5, 6, 7, 24, 25, 26, 27, 5, 6, 7, 8, 25, 26, 27, 28 db 6, 7, 8, 9, 26, 27, 28, 29, 7, 8, 9, 10, 27, 28, 29, 30 warp_8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13 warp_8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15 pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7 warp_8x8_hpack: db 3, 11, 3, 11, 35, 43, 35, 43 pd_16384: dd 16384 pd_262144: dd 262144 warp_8x8_end: db 0, 4, 16, 20, 32, 36, 48, 52, 2, 6, 18, 22, 34, 38, 50, 54 warp_8x8t_end: db 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59 db 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63 bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31 db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63 db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30 db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62 wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31 db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63 db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30 db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62 wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47 db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63 db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46 db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62 wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62 db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125 wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63 db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62 bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39 db 41, 40, 42, 41, 43, 42, 44, 43, 45, 
44, 46, 45, 47, 46, 48, 47 bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15 db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23 db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31 bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87 db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39 bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15 db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23 db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31 bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7 db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15 db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23 db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7 spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42 db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50 spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46 db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54 spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26 db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34 spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22 db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30 db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38 spel_v_perm16: db 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 db 40, 16, 41, 17, 42, 18, 43, 19, 44, 20, 45, 21, 46, 22, 47, 23 db 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 spel_v_perm32: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39 db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55 db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23 db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31 spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39 db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 
45, 38, 39, 46, 47 spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55 db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63 spel_hv_perm4d: db 18, 19, 0, 1, 22, 23, 4, 5, 26, 27, 8, 9, 30, 31, 12, 13 db 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29 spel_hv_perm8a: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39 db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47 spel_hv_perm8b: db 32, 33, 48, 49, 34, 35, 50, 51, 36, 37, 52, 53, 38, 39, 54, 55 db 40, 41, 56, 57, 42, 43, 58, 59, 44, 45, 60, 61, 46, 47, 62, 63 db 48, 49, 64, 65, 50, 51, 66, 67, 52, 53, 68, 69, 54, 55, 70, 71 db 56, 57, 72, 73, 58, 59, 74, 75, 60, 61, 76, 77, 62, 63, 78, 79 spel_hv_perm8c: db 34, 35, 0, 1, 38, 39, 4, 5, 42, 43, 8, 9, 46, 47, 12, 13 db 50, 51, 16, 17, 54, 55, 20, 21, 58, 59, 24, 25, 62, 63, 28, 29 db 0, 1, 32, 33, 4, 5, 36, 37, 8, 9, 40, 41, 12, 13, 44, 45 db 16, 17, 48, 49, 20, 21, 52, 53, 24, 25, 56, 57, 28, 29, 60, 61 spel_hv_end16: db 1, 3, 17, 19, 5, 7, 21, 23, 33, 35, 49, 51, 37, 39, 53, 55 db 9, 11, 25, 27, 13, 15, 29, 31, 41, 43, 57, 59, 45, 47, 61, 63 spel_hv_perm16a:db 0, 1, 2, 3, 32, 33, 34, 35, 1, 2, 3, 4, 33, 34, 35, 36 db 2, 3, 4, 5, 34, 35, 36, 37, 3, 4, 5, 6, 35, 36, 37, 38 spel_hv_perm16c:db 8, 9, 10, 11, 40, 41, 42, 43, 9, 10, 11, 12, 41, 42, 43, 44 db 10, 11, 12, 13, 42, 43, 44, 45, 11, 12, 13, 14, 43, 44, 45, 46 db 16, 17, 18, 19, 48, 49, 50, 51, 17, 18, 19, 20, 49, 50, 51, 52 db 18, 19, 20, 21, 50, 51, 52, 53, 19, 20, 21, 22, 51, 52, 53, 54 spel_hv_perm16b:db 4, 5, 6, 7, 36, 37, 38, 39, 5, 6, 7, 8, 37, 38, 39, 40 db 6, 7, 8, 9, 38, 39, 40, 41, 7, 8, 9, 10, 39, 40, 41, 42 db 12, 13, 14, 15, 44, 45, 46, 47, 13, 14, 15, 16, 45, 46, 47, 48 db 14, 15, 16, 17, 46, 47, 48, 49, 15, 16, 17, 18, 47, 48, 49, 50 spel_hv_perm16d:db 0, 1, 2, 3, 1, 2, 3, 4, 4, 5, 6, 7, 5, 6, 7, 8 db 2, 3, 4, 5, 3, 4, 5, 6, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9, 10, 11, 9, 10, 11, 12, 12, 13, 14, 15, 13, 14, 15, 16 db 10, 11, 12, 13, 11, 12, 13, 14, 14, 15, 16, 17, 15, 16, 17, 18 spel_hv_perm16e:db 4, 5, 6, 7, 5, 6, 7, 8, 8, 9, 10, 11, 9, 10, 11, 12 db 6, 7, 8, 9, 7, 8, 9, 10, 10, 11, 12, 13, 11, 12, 13, 14 db 12, 13, 14, 15, 13, 14, 15, 16, 16, 17, 18, 19, 17, 18, 19, 20 db 14, 15, 16, 17, 15, 16, 17, 18, 18, 19, 20, 21, 19, 20, 21, 22 spel_hv_end: db 1, 3, 5, 7, 17, 19, 21, 23, 33, 35, 37, 39, 49, 51, 53, 55 deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 resize_permA: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 resize_permB: dd 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 resize_permC: dd 0, 4, 8, 12 pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7 wm_420_perm64: 
dq 0xfedcba9876543210 wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040 pb_8x0_8x8: times 8 db 0 times 8 db 8 pb_127: times 4 db 127 pw_m128 times 2 dw -128 pw_m256: times 2 dw -256 pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 pd_32: dd 32 pd_34: dd 34 pd_63: dd 63 pd_512: dd 512 pd_32768: dd 32768 %define pb_m64 (wm_sign+4) %define pb_64 (wm_sign+8) %define pd_2 (pd_0to7+8) cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) cextern mc_warp_filter cextern resize_filter %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro %macro BIDIR_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro %xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_8bpc_avx512icl.put) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_8bpc_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, avx512icl, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128 SECTION .text %macro WRAP_YMM 1+ INIT_YMM cpuname %1 INIT_ZMM cpuname %endmacro INIT_ZMM avx512icl cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx lea r7, [put_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [r7+wq*2+table_offset(put,)] add wq, r7 jmp wq .put_w2: movzx r6d, word [srcq+ssq*0] movzx r7d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6w mov [dstq+dsq*1], r7w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r6d, [srcq+ssq*0] mov r7d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6d mov [dstq+dsq*1], r7d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: mov r6, [srcq+ssq*0] mov r7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r6 mov [dstq+dsq*1], r7 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu xmm0, [srcq+ssq*0] movu xmm1, [srcq+ssq*1] lea 
srcq, [srcq+ssq*2] mova [dstq+dsq*0], xmm0 mova [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu ym0, [srcq+ssq*0] movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], ym0 mova [dstq+dsq*1], ym1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w64 RET .put_w128: movu m0, [srcq+ssq*0+64*0] movu m1, [srcq+ssq*0+64*1] movu m2, [srcq+ssq*1+64*0] movu m3, [srcq+ssq*1+64*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+64*0], m0 mova [dstq+dsq*0+64*1], m1 mova [dstq+dsq*1+64*0], m2 mova [dstq+dsq*1+64*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0xff01 vbroadcasti128 m4, [bilin_h_shuf8] add mxyd, 16 << 8 vpbroadcastw m5, mxyd mov mxyd, r7m ; my test mxyd, mxyd jnz .hv movzx wd, word [r7+wq*2+table_offset(put, _bilin_h)] vpbroadcastd m3, [pw_2048] add wq, r7 jmp wq .h_w2: movd xmm0, [srcq+ssq*0] pinsrd xmm0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2 RET .h_w4: mova xmm4, [bilin_h_shuf4] .h_w4_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 pmulhrsw xmm0, xm3 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pshufb ym0, ym4 pmaddubsw ym0, ym5 pmulhrsw ym0, ym3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mova m4, [bilin_h_perm16] .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] vpermb m0, m4, m0 pmaddubsw m0, m5 pmulhrsw m0, m3 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 add srcq, ssq mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m6, [srcq+8*9] add srcq, ssq REPX {pshufb x, m4}, m0, m2, m1, m6 REPX {pmaddubsw x, m5}, m0, m2, m1, m6 REPX {pmulhrsw x, m3}, m0, m2, m1, m6 packuswb m0, m2 packuswb m1, m6 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0xff01 vpbroadcastd m5, [pw_2048] add mxyd, 16 << 8 add wq, r7 vpbroadcastw m4, mxyd jmp wq .v_w2: movd xmm0, [srcq+ssq*0] .v_w2_loop: pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pinsrw xmm0, xmm1, [srcq+ssq*0], 0 ; 2 1 pshuflw xmm1, xmm1, q2301 ; 1 0 punpcklbw xmm1, xmm0, xmm1 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 
packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 1 pextrw [dstq+dsq*1], xmm1, 0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm0, [srcq+ssq*0] .v_w4_loop: vpbroadcastd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm2, xmm1, xmm0, 0x01 ; 0 1 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm1, xmm0, 0x02 ; 1 2 punpcklbw xmm1, xmm2 pmaddubsw xmm1, xm4 pmulhrsw xmm1, xm5 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xmm0, [srcq+ssq*0] .v_w8_loop: movq xmm3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw xmm1, xmm3, xmm0 movq xmm0, [srcq+ssq*0] punpcklbw xmm2, xmm0, xmm3 pmaddubsw xmm1, xm4 pmaddubsw xmm2, xm4 pmulhrsw xmm1, xm5 pmulhrsw xmm2, xm5 packuswb xmm1, xmm2 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET .v_w16: movu xmm0, [srcq+ssq*0] .v_w16_loop: vbroadcasti128 ymm2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd ymm3, ymm2, ymm0, 0x0f ; 0 1 vbroadcasti128 ymm0, [srcq+ssq*0] vpblendd ymm2, ymm2, ymm0, 0xf0 ; 1 2 punpcklbw ymm1, ymm2, ymm3 punpckhbw ymm2, ymm3 pmaddubsw ymm1, ym4 pmaddubsw ymm2, ym4 pmulhrsw ymm1, ym5 pmulhrsw ymm2, ym5 packuswb ymm1, ymm2 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop vzeroupper RET .v_w32: movu ym0, [srcq+ssq*0] kxnorb k1, k1, k1 .v_w32_loop: vbroadcasti32x8 m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendmd m3{k1}, m2, m0 ; 0 1 vbroadcasti32x8 m0, [srcq+ssq*0] vpblendmd m2{k1}, m0, m2 ; 1 2 punpcklbw m1, m2, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop RET .v_w64: movu m0, [srcq+ssq*0] .v_w64_loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m1, m3, m0 punpckhbw m6, m3, m0 movu m0, [srcq+ssq*0] pmaddubsw m1, m4 pmaddubsw m6, m4 punpcklbw m2, m0, m3 punpckhbw m7, m0, m3 pmaddubsw m2, m4 pmaddubsw m7, m4 REPX {pmulhrsw x, m5}, m1, m6, m2, m7 packuswb m1, m6 packuswb m2, m7 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w64_loop RET .v_w128: movu m0, [srcq+64*0] movu m1, [srcq+64*1] .v_w128_loop: add srcq, ssq movu m2, [srcq+64*0] movu m3, [srcq+64*1] punpcklbw m6, m2, m0 pmaddubsw m6, m4 punpckhbw m0, m2, m0 pmaddubsw m0, m4 punpcklbw m7, m3, m1 pmaddubsw m7, m4 punpckhbw m1, m3, m1 pmaddubsw m1, m4 REPX {pmulhrsw x, m5}, m6, m0, m7, m1 packuswb m6, m0 mova m0, m2 packuswb m7, m1 mova m1, m3 mova [dstq+64*0], m6 mova [dstq+64*1], m7 add dstq, dsq dec hd jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow vpbroadcastd m7, [pw_2048] add wq, r7 vpbroadcastw m6, mxyd jmp wq .hv_w2: vpbroadcastd xmm0, [srcq+ssq*0] pshufb xmm0, xm4 pmaddubsw xmm0, xm5 .hv_w2_loop: movd xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pinsrd xmm1, [srcq+ssq*0], 1 pshufb xmm1, xm4 pmaddubsw xmm1, xm5 ; 1 _ 2 _ shufps xmm2, xmm0, xmm1, q1032 ; 0 _ 1 _ mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 pextrw [dstq+dsq*0], xmm1, 0 pextrw [dstq+dsq*1], xmm1, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova xmm4, [bilin_h_shuf4] movddup xmm0, 
[srcq+ssq*0] pshufb xmm0, xmm4 pmaddubsw xmm0, xm5 .hv_w4_loop: movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm1, [srcq+ssq*0] pshufb xmm1, xmm4 pmaddubsw xmm1, xm5 ; 1 2 shufps xmm2, xmm0, xmm1, q1032 ; 0 1 mova xmm0, xmm1 psubw xmm1, xmm2 paddw xmm1, xmm1 pmulhw xmm1, xm6 paddw xmm1, xmm2 pmulhrsw xmm1, xm7 packuswb xmm1, xmm1 movd [dstq+dsq*0], xmm1 pextrd [dstq+dsq*1], xmm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: vbroadcasti128 ym0, [srcq+ssq*0] pshufb ym0, ym4 pmaddubsw ym0, ym5 .hv_w8_loop: movu xm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti128 ym1, [srcq+ssq*0], 1 pshufb ym1, ym4 pmaddubsw ym1, ym5 ; 1 2 valignq ym2, ym1, ym0, 2 mova ym0, ym1 psubw ym1, ym2 paddw ym1, ym1 pmulhw ym1, ym6 paddw ym1, ym2 pmulhrsw ym1, ym7 vpmovuswb xm1, ym1 movq [dstq+dsq*0], xm1 movhps [dstq+dsq*1], xm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w16: vbroadcasti32x8 m0, [srcq+ssq*0] mova m4, [bilin_h_perm16] vpermb m0, m4, m0 pmaddubsw m0, m5 .hv_w16_loop: movu ym1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m1, [srcq+ssq*0], 1 vpermb m1, m4, m1 pmaddubsw m1, m5 ; 1 2 valignq m2, m1, m0, 4 ; 0 1 mova m0, m1 psubw m1, m2 paddw m1, m1 pmulhw m1, m6 paddw m1, m2 pmulhrsw m1, m7 vpmovuswb ym1, m1 mova [dstq+dsq*0], xm1 vextracti32x4 [dstq+dsq*1], ym1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop RET .hv_w32: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+ssq*0] pmovzxbq m8, [pb_02461357] pmaddubsw m0, m5 .hv_w32_loop: vpermb m2, m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpermb m3, m4, [srcq+ssq*0] pmaddubsw m2, m5 psubw m1, m2, m0 paddw m1, m1 pmulhw m1, m6 paddw m1, m0 pmaddubsw m0, m3, m5 psubw m3, m0, m2 paddw m3, m3 pmulhw m3, m6 paddw m3, m2 pmulhrsw m1, m7 pmulhrsw m3, m7 packuswb m1, m3 vpermq m1, m8, m1 mova [dstq+dsq*0], ym1 vextracti32x8 [dstq+dsq*1], m1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w32_loop RET .hv_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w64_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m8, m2, m0 psubw m9, m3, m1 paddw m8, m8 pmulhw m8, m6 paddw m9, m9 pmulhw m9, m6 paddw m8, m0 pmulhrsw m8, m7 paddw m9, m1 pmulhrsw m9, m7 mova m0, m2 mova m1, m3 packuswb m8, m9 mova [dstq], m8 add dstq, dsq dec hd jg .hv_w64_loop RET .hv_w128: movu m0, [srcq+8*0] movu m1, [srcq+8*1] movu m2, [srcq+8*8] movu m3, [srcq+8*9] REPX {pshufb x, m4}, m0, m1, m2, m3 REPX {pmaddubsw x, m5}, m0, m1, m2, m3 .hv_w128_loop: add srcq, ssq movu m8, [srcq+8*0] movu m9, [srcq+8*1] movu m10, [srcq+8*8] movu m11, [srcq+8*9] REPX {pshufb x, m4}, m8, m9, m10, m11 REPX {pmaddubsw x, m5}, m8, m9, m10, m11 psubw m12, m8, m0 psubw m13, m9, m1 psubw m14, m10, m2 psubw m15, m11, m3 paddw m12, m12 pmulhw m12, m6 paddw m13, m13 pmulhw m13, m6 paddw m14, m14 pmulhw m14, m6 paddw m15, m15 pmulhw m15, m6 paddw m12, m0 pmulhrsw m12, m7 paddw m13, m1 pmulhrsw m13, m7 paddw m14, m2 pmulhrsw m14, m7 paddw m15, m3 pmulhrsw m15, m7 mova m0, m8 mova m1, m9 mova m2, m10 mova m3, m11 packuswb m12, m13 packuswb m14, m15 mova [dstq+64*0], m12 mova [dstq+64*1], m14 add dstq, dsq dec hd jg .hv_w128_loop RET DECLARE_REG_TMP 3, 5, 6 cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx lea t2, [prep_avx512icl] tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: movzx wd, word [t2+wq*2+table_offset(prep,)] add wq, t2 lea stride3q, [strideq*3] 
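; Note on the unfiltered .prep_w* path below: pixels are zero-extended to
; 16 bits and shifted left by 4, which presumably keeps the copy path at the
; same scale as the filtered paths (the two bilinear taps sum to 16).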
jmp wq .prep_w4: movd xmm0, [srcq+strideq*0] pinsrd xmm0, [srcq+strideq*1], 1 pinsrd xmm0, [srcq+strideq*2], 2 pinsrd xmm0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pmovzxbw ym0, xmm0 psllw ym0, 4 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .prep_w4 RET .prep_w8: movq xmm0, [srcq+strideq*0] movq xmm1, [srcq+strideq*1] vinserti128 ym0, ymm0, [srcq+strideq*2], 1 vinserti128 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1 pmovzxbw m0, ym0 psllw m0, 4 mova [tmpq], m0 add tmpq, 32*2 sub hd, 4 jg .prep_w8 RET .prep_w16: movu xmm0, [srcq+strideq*0] vinserti128 ym0, ymm0, [srcq+strideq*1], 1 movu xmm1, [srcq+strideq*2] vinserti128 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] pmovzxbw m0, ym0 pmovzxbw m1, ym1 psllw m0, 4 psllw m1, 4 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 32*4 sub hd, 4 jg .prep_w16 RET .prep_w32: pmovzxbw m0, [srcq+strideq*0] pmovzxbw m1, [srcq+strideq*1] pmovzxbw m2, [srcq+strideq*2] pmovzxbw m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .prep_w32 RET .prep_w64: pmovzxbw m0, [srcq+strideq*0+32*0] pmovzxbw m1, [srcq+strideq*0+32*1] pmovzxbw m2, [srcq+strideq*1+32*0] pmovzxbw m3, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .prep_w64 RET .prep_w128: pmovzxbw m0, [srcq+32*0] pmovzxbw m1, [srcq+32*1] pmovzxbw m2, [srcq+32*2] pmovzxbw m3, [srcq+32*3] REPX {psllw x, 4}, m0, m1, m2, m3 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 add srcq, strideq dec hd jg .prep_w128 RET .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 0xff01 add mxyd, 16 << 8 vpbroadcastw m5, mxyd mov mxyd, r6m ; my test mxyd, mxyd jnz .hv movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] add wq, t2 lea stride3q, [strideq*3] jmp wq .h_w4: vbroadcasti32x4 ym4, [bilin_h_shuf4] .h_w4_loop: movq xmm0, [srcq+strideq*0] movq xmm1, [srcq+strideq*1] vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1 vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] punpcklqdq ym0, ym1 pshufb ym0, ym4 pmaddubsw ym0, ym5 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: vbroadcasti32x4 m4, [bilin_h_shuf8] .h_w8_loop: movu xmm0, [srcq+strideq*0] vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1 vinserti32x4 m0, [srcq+strideq*2], 2 vinserti32x4 m0, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pshufb m0, m4 pmaddubsw m0, m5 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: mova m4, [bilin_h_perm16] .h_w16_loop: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 movu ym1, [srcq+strideq*2] vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vpermb m0, m4, m0 vpermb m1, m4, m1 pmaddubsw m0, m5 pmaddubsw m1, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 add tmpq, 64*2 sub hd, 4 jg .h_w16_loop RET .h_w32: mova m4, [bilin_h_perm32] .h_w32_loop: vpermb m0, m4, [srcq+strideq*0] vpermb m1, m4, [srcq+strideq*1] vpermb m2, m4, [srcq+strideq*2] vpermb m3, m4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 4 jg .h_w32_loop RET .h_w64: mova m4, [bilin_h_perm32] 
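; Worked example of the tap packing in .h above (mx = 5 chosen for
; illustration): mxyd*0xff01 + (16 << 8) has low word 0x0b05, i.e. the
; broadcast word holds (16-mx) in its high byte and mx in its low byte, so
; pmaddubsw produces (16-mx)*src[x] + mx*src[x+1] per output pixel, as stated
; in the .h header comment.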
.h_w64_loop: vpermb m0, m4, [srcq+strideq*0+32*0] vpermb m1, m4, [srcq+strideq*0+32*1] vpermb m2, m4, [srcq+strideq*1+32*0] vpermb m3, m4, [srcq+strideq*1+32*1] lea srcq, [srcq+strideq*2] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 sub hd, 2 jg .h_w64_loop RET .h_w128: mova m4, [bilin_h_perm32] .h_w128_loop: vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] vpermb m2, m4, [srcq+32*2] vpermb m3, m4, [srcq+32*3] pmaddubsw m0, m5 pmaddubsw m1, m5 pmaddubsw m2, m5 pmaddubsw m3, m5 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 mova [tmpq+64*2], m2 mova [tmpq+64*3], m3 add tmpq, 64*4 add srcq, strideq dec hd jg .h_w128_loop RET .v: WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 0xff01 add mxyd, 16 << 8 add wq, t2 lea stride3q, [strideq*3] vpbroadcastw m6, mxyd jmp wq .v_w4: vpbroadcastd xm0, [srcq+strideq*0] mov r3d, 0x29 vbroadcasti32x4 ym3, [bilin_v_shuf4] kmovb k1, r3d .v_w4_loop: vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____ vpbroadcastd ym2, [srcq+strideq*2] vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__ lea srcq, [srcq+strideq*4] vpbroadcastd ym0, [srcq+strideq*0] punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_ pshufb ym2, ym3 pmaddubsw ym2, ym6 mova [tmpq], ym2 add tmpq, 32 sub hd, 4 jg .v_w4_loop RET .v_w8: mova m5, [bilin_v_perm8] vbroadcasti32x4 ym0, [srcq+strideq*0] .v_w8_loop: vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vpbroadcastq ym0, [srcq+strideq*2] vinserti32x4 m1, [srcq+stride3q ], 2 lea srcq, [srcq+strideq*4] vinserti32x4 ym0, [srcq+strideq*0], 0 vpermt2b m1, m5, m0 pmaddubsw m1, m6 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: mova m5, [bilin_v_perm16] movu xm0, [srcq+strideq*0] .v_w16_loop: movu xm2, [srcq+strideq*2] vinserti32x4 ym1, ym0, [srcq+strideq*1], 1 vpermt2b m1, m5, m2 vinserti32x4 ym2, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] movu xm0, [srcq+strideq*0] vpermt2b m2, m5, m0 pmaddubsw m1, m6 pmaddubsw m2, m6 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: mova m5, [bilin_v_perm32] movu ym0, [srcq+strideq*0] .v_w32_loop: movu ym2, [srcq+strideq*1] movu ym3, [srcq+strideq*2] movu ym4, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpermt2b m0, m5, m2 vpermt2b m2, m5, m3 vpermt2b m3, m5, m4 pmaddubsw m1, m0, m6 movu ym0, [srcq+strideq*0] vpermt2b m4, m5, m0 pmaddubsw m2, m6 pmaddubsw m3, m6 pmaddubsw m4, m6 mova [tmpq+64*0], m1 mova [tmpq+64*1], m2 mova [tmpq+64*2], m3 mova [tmpq+64*3], m4 add tmpq, 64*4 sub hd, 4 jg .v_w32_loop RET .v_w64: mova m5, [bilin_v_perm64] vpermq m0, m5, [srcq+strideq*0] .v_w64_loop: vpermq m1, m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m1, m0 punpckhbw m2, m1, m0 vpermq m0, m5, [srcq+strideq*0] punpcklbw m3, m0, m1 punpckhbw m1, m0, m1 pmaddubsw m4, m6 pmaddubsw m2, m6 pmaddubsw m3, m6 pmaddubsw m1, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m2 mova [tmpq+64*2], m3 mova [tmpq+64*3], m1 add tmpq, 64*4 sub hd, 2 jg .v_w64_loop RET .v_w128: mova m5, [bilin_v_perm64] vpermq m0, m5, [srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] .v_w128_loop: vpermq m2, m5, [srcq+strideq*1+ 0] vpermq m3, m5, [srcq+strideq*1+64] lea srcq, [srcq+strideq*2] punpcklbw m4, m2, m0 punpckhbw m0, m2, m0 pmaddubsw m4, m6 pmaddubsw m0, m6 mova [tmpq+64*0], m4 mova [tmpq+64*1], m0 punpcklbw m4, m3, m1 punpckhbw m1, m3, m1 pmaddubsw m4, m6 pmaddubsw m1, m6 mova [tmpq+64*2], m4 mova [tmpq+64*3], m1 vpermq m0, m5, 
[srcq+strideq*0+ 0] vpermq m1, m5, [srcq+strideq*0+64] punpcklbw m4, m0, m2 punpckhbw m2, m0, m2 pmaddubsw m4, m6 pmaddubsw m2, m6 mova [tmpq+64*4], m4 mova [tmpq+64*5], m2 punpcklbw m4, m1, m3 punpckhbw m3, m1, m3 pmaddubsw m4, m6 pmaddubsw m3, m6 mova [tmpq+64*6], m4 mova [tmpq+64*7], m3 add tmpq, 64*8 sub hd, 2 jg .v_w128_loop RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 7 movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] shl mxyd, 11 vpbroadcastw m6, mxyd add wq, t2 lea stride3q, [strideq*3] jmp wq .hv_w4: vbroadcasti32x4 ym4, [bilin_h_shuf4] vpbroadcastq ym0, [srcq+strideq*0] pshufb ym0, ym4 pmaddubsw ym0, ym5 .hv_w4_loop: movq xmm1, [srcq+strideq*1] movq xmm2, [srcq+strideq*2] vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1 punpcklqdq ym1, ym2 pshufb ym1, ym4 pmaddubsw ym1, ym5 ; 1 2 3 4 valignq ym2, ym1, ym0, 3 ; 0 1 2 3 mova ym0, ym1 psubw ym1, ym2 pmulhrsw ym1, ym6 paddw ym1, ym2 mova [tmpq], ym1 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: vbroadcasti32x4 m4, [bilin_h_shuf8] vbroadcasti32x4 m0, [srcq+strideq*0] pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu xmm1, [srcq+strideq*1] vinserti128 ym1, ymm1, [srcq+strideq*2], 1 vinserti128 m1, [srcq+stride3q ], 2 lea srcq, [srcq+strideq*4] vinserti128 m1, [srcq+strideq*0], 3 pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 3 4 valignq m2, m1, m0, 6 ; 0 1 2 3 mova m0, m1 psubw m1, m2 pmulhrsw m1, m6 paddw m1, m2 mova [tmpq], m1 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: mova m4, [bilin_h_perm16] vbroadcasti32x8 m0, [srcq+strideq*0] vpermb m0, m4, m0 pmaddubsw m0, m5 .hv_w16_loop: movu ym1, [srcq+strideq*1] vinserti32x8 m1, [srcq+strideq*2], 1 movu ym2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti32x8 m2, [srcq+strideq*0], 1 vpermb m1, m4, m1 vpermb m2, m4, m2 pmaddubsw m1, m5 ; 1 2 vshufi32x4 m3, m0, m1, q1032 ; 0 1 pmaddubsw m0, m2, m5 ; 3 4 vshufi32x4 m2, m1, m0, q1032 ; 2 3 psubw m1, m3 pmulhrsw m1, m6 paddw m1, m3 psubw m3, m0, m2 pmulhrsw m3, m6 paddw m3, m2 mova [tmpq+64*0], m1 mova [tmpq+64*1], m3 add tmpq, 64*2 sub hd, 4 jg .hv_w16_loop RET .hv_w32: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+strideq*0] pmaddubsw m0, m5 .hv_w32_loop: vpermb m1, m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermb m2, m4, [srcq+strideq*0] pmaddubsw m1, m5 psubw m3, m1, m0 pmulhrsw m3, m6 paddw m3, m0 pmaddubsw m0, m2, m5 psubw m2, m0, m1 pmulhrsw m2, m6 paddw m2, m1 mova [tmpq+64*0], m3 mova [tmpq+64*1], m2 add tmpq, 64*2 sub hd, 2 jg .hv_w32_loop RET .hv_w64: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w64_loop: add srcq, strideq vpermb m2, m4, [srcq+32*0] vpermb m3, m4, [srcq+32*1] pmaddubsw m2, m5 pmaddubsw m3, m5 psubw m7, m2, m0 psubw m8, m3, m1 pmulhrsw m7, m6 pmulhrsw m8, m6 paddw m7, m0 mova m0, m2 paddw m8, m1 mova m1, m3 mova [tmpq+64*0], m7 mova [tmpq+64*1], m8 add tmpq, 64*2 dec hd jg .hv_w64_loop RET .hv_w128: mova m4, [bilin_h_perm32] vpermb m0, m4, [srcq+32*0] vpermb m1, m4, [srcq+32*1] vpermb m2, m4, [srcq+32*2] vpermb m3, m4, [srcq+32*3] REPX {pmaddubsw x, m5}, m0, m1, m2, m3 .hv_w128_loop: add srcq, strideq vpermb m7, m4, [srcq+32*0] vpermb m8, m4, [srcq+32*1] vpermb m9, m4, [srcq+32*2] vpermb m10, m4, [srcq+32*3] REPX {pmaddubsw x, m5}, m7, m8, m9, m10 psubw m11, m7, m0 psubw m12, m8, m1 psubw m13, m9, m2 
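; Note on the my scaling used throughout .hv: with m6 = my << 11,
; pmulhrsw(d, m6) = (d*(my << 11) + 0x4000) >> 15 = (d*my + 8) >> 4,
; which is exactly the "(((my * diff) + 8) >> 4)" term from the .hv header.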
psubw m14, m10, m3 REPX {pmulhrsw x, m6}, m11, m12, m13, m14 paddw m11, m0 mova m0, m7 paddw m12, m1 mova m1, m8 paddw m13, m2 mova m2, m9 paddw m14, m3 mova m3, m10 mova [tmpq+64*0], m11 mova [tmpq+64*1], m12 mova [tmpq+64*2], m13 mova [tmpq+64*3], m14 add tmpq, 64*4 dec hd jg .hv_w128_loop RET ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; fn, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %macro PUT_8TAP_H 4-5 0 ; dst/src, tmp[1-3], vpermb %if %5 vpermb m%2, m6, m%1 vpermb m%3, m7, m%1 vpermb m%4, m8, m%1 %else %if %2 < %4 ; reuse a previous value if possible pshufb m%2, m%1, m6 %endif pshufb m%3, m%1, m7 pshufb m%4, m%1, m8 %endif mova m%1, m5 vpdpbusd m%1, m%2, m9 mova m%2, m5 vpdpbusd m%2, m%3, m9 vpdpbusd m%1, m%3, m10 vpdpbusd m%2, m%4, m10 packusdw m%1, m%2 psrlw m%1, 6 %endmacro %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif %define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN sharp, SHARP, SHARP PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP PUT_8TAP_FN smooth, SMOOTH, SMOOTH PUT_8TAP_FN sharp_regular, SHARP, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH PUT_8TAP_FN regular, REGULAR, REGULAR cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %define base r8-put_avx512icl imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r8, [put_avx512icl] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r8+wq*2+table_offset(put,)] add wq, r8 lea r6, [ssq*3] lea r7, [dsq*3] %if WIN64 pop r8 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m5, [pd_34] ; 2 + (8 << 2) WIN64_SPILL_XMM 11 cmp wd, 4 jl .h_w2 vbroadcasti128 m6, [subpel_h_shufA] je .h_w4 tzcnt wd, wd vbroadcasti128 m7, [subpel_h_shufB] vbroadcasti128 m8, [subpel_h_shufC] shr mxd, 16 sub srcq, 3 movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)] vpbroadcastd m9, [base+mxq*8+subpel_filters+0] vpbroadcastd m10, [base+mxq*8+subpel_filters+4] add wq, r8 jmp wq .h_w2: movzx mxd, mxb dec srcq mova xmm4, [subpel_h_shuf4] vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w2_loop: movq xmm0, [srcq+ssq*0] movhps xmm0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xmm4 mova xmm1, xm5 vpdpbusd xmm1, xmm0, xmm3 packssdw xmm0, xmm1, xmm1 psraw xmm0, 6 packuswb xmm0, xm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movzx mxd, mxb dec srcq vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2] .h_w4_loop: movq xmm0, [srcq+ssq*0] movq xmm1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb xmm0, xm6 pshufb xmm1, xm6 mova xmm2, xm5 vpdpbusd xmm2, xmm0, xmm3 mova xmm0, xm5 vpdpbusd xmm0, xmm1, xmm3 packssdw xmm0, xmm2, xmm0 psraw xmm0, 6 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET .h_w8: movu xm0, [srcq+ssq*0] vinserti32x4 ym0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] WRAP_YMM PUT_8TAP_H 0, 1, 2, 3 vpmovuswb xm0, ym0 movq [dstq+dsq*0], xm0 movhps [dstq+dsq*1], xm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: mova 
m6, [spel_h_perm16a] mova m7, [spel_h_perm16b] mova m8, [spel_h_perm16c] .h_w16_loop: movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 1, 2, 3, 1 vpmovuswb ym0, m0 mova [dstq+dsq*0], xm0 vextracti128 [dstq+dsq*1], ym0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w16_loop RET .h_w32: movu ym0, [srcq+ssq*0+8*0] vinserti32x8 m0, [srcq+ssq*1+8*0], 1 movu ym1, [srcq+ssq*0+8*1] vinserti32x8 m1, [srcq+ssq*1+8*1], 1 lea srcq, [srcq+ssq*2] PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w32 RET .h_w64: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq PUT_8TAP_H 0, 2, 3, 4 PUT_8TAP_H 1, 4, 3, 2 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w64 RET .h_w128: movu m0, [srcq+8*0] movu m2, [srcq+8*1] movu m1, [srcq+8*8] movu m3, [srcq+8*9] add srcq, ssq PUT_8TAP_H 0, 4, 11, 12 PUT_8TAP_H 2, 12, 11, 4 PUT_8TAP_H 1, 4, 11, 12 PUT_8TAP_H 3, 12, 11, 4 packuswb m0, m2 packuswb m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq dec hd jg .h_w128 RET .v: movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd tzcnt r6d, wd movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)] vpbroadcastd m7, [pw_512] lea myq, [base+subpel_filters+myq*8] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] add r6, r8 lea ss3q, [ssq*3] sub srcq, ss3q jmp r6 .v_w2: movd xmm2, [srcq+ssq*0] pinsrw xmm2, [srcq+ssq*1], 2 pinsrw xmm2, [srcq+ssq*2], 4 add srcq, ss3q pinsrw xmm2, [srcq+ssq*0], 6 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w2_loop: pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 2 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd xmm2, [srcq+ssq*0] pinsrd xmm2, [srcq+ssq*1], 1 pinsrd xmm2, [srcq+ssq*2], 2 add srcq, ss3q pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3 movd xmm3, [srcq+ssq*1] vpbroadcastd xmm1, [srcq+ssq*2] add srcq, ss3q vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm3, xmm3, xmm1, 0x02 ; 4 5 vpblendd xmm1, xmm1, xmm0, 0x02 ; 5 6 palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4 punpcklbw xmm3, xmm1 ; 45 56 punpcklbw xmm1, xmm2, xmm4 ; 01 12 punpckhbw xmm2, xmm4 ; 23 34 .v_w4_loop: vpbroadcastd xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw xmm5, xmm1, xm8 ; a0 b0 mova xmm1, xmm2 pmaddubsw xmm2, xm9 ; a1 b1 paddw xmm5, xmm2 mova xmm2, xmm3 pmaddubsw xmm3, xm10 ; a2 b2 paddw xmm5, xmm3 vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7 vpbroadcastd xmm0, [srcq+ssq*0] vpblendd xmm4, xmm4, xmm0, 0x02 ; 7 8 punpcklbw xmm3, xmm4 ; 67 78 pmaddubsw xmm4, xmm3, xm11 ; a3 b3 paddw xmm5, xmm4 pmulhrsw xmm5, xm7 packuswb xmm5, xmm5 movd [dstq+dsq*0], xmm5 pextrd [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq xmm1, [srcq+ssq*0] vpbroadcastq ymm0, [srcq+ssq*1] 
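; .v overview: heights below 6 select the 4-tap variant of the filter
; (cmovs picks the low byte of myd; see the "8tap_v, my, 4tap_v" packing at
; function entry). Input rows are interleaved bytewise in pairs (01 12,
; 23 34, 45 56, then 67 78 per iteration), so each pmaddubsw against a
; broadcast coefficient pair accumulates partial sums for two output rows.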
vpbroadcastq ymm2, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm5, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpblendd ymm1, ymm0, 0x30 vpblendd ymm0, ymm2, 0x30 punpcklbw ymm1, ymm0 ; 01 12 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm2, ymm5, 0x30 vpblendd ymm5, ymm3, 0x30 punpcklbw ymm2, ymm5 ; 23 34 vpblendd ymm3, ymm4, 0x30 vpblendd ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 45 56 .v_w8_loop: vpbroadcastq ymm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw ymm5, ymm1, ym8 ; a0 b0 mova ymm1, ymm2 pmaddubsw ymm2, ym9 ; a1 b1 paddw ymm5, ymm2 mova ymm2, ymm3 pmaddubsw ymm3, ym10 ; a2 b2 paddw ymm5, ymm3 vpblendd ymm3, ymm0, ymm4, 0x30 vpbroadcastq ymm0, [srcq+ssq*0] vpblendd ymm4, ymm4, ymm0, 0x30 punpcklbw ymm3, ymm4 ; 67 78 pmaddubsw ymm4, ymm3, ym11 ; a3 b3 paddw ymm5, ymm4 pmulhrsw ymm5, ym7 vextracti128 xmm4, ymm5, 1 packuswb xmm5, xmm4 movq [dstq+dsq*0], xmm5 movhps [dstq+dsq*1], xmm5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop vzeroupper RET .v_w16: mova m12, [spel_v_perm16] vbroadcasti32x4 m1, [srcq+ssq*0] vbroadcasti32x4 ym4, [srcq+ssq*1] mov r6d, 0x0f vbroadcasti32x4 m2, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 ym5, [srcq+ssq*0] kmovb k1, r6d vbroadcasti32x4 m3, [srcq+ssq*1] vbroadcasti32x4 ym6, [srcq+ssq*2] add srcq, ss3q vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m1{k1}, m4, m2, 0xcc vshufpd m2{k1}, m5, m3, 0xcc vshufpd m3{k1}, m6, m0, 0xcc vpermb m1, m12, m1 ; 01 12 vpermb m2, m12, m2 ; 23 34 vpermb m3, m12, m3 ; 45 56 .v_w16_loop: pmaddubsw m4, m1, m8 ; a0 b0 mova m1, m2 pmaddubsw m5, m2, m9 ; a1 b1 mova m2, m3 pmaddubsw m6, m3, m10 ; a2 b2 mova m3, m0 paddw m4, m5 vbroadcasti32x4 ym5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m0, [srcq+ssq*0] vshufpd m3{k1}, m5, m0, 0xcc vpermb m3, m12, m3 ; 67 78 pmaddubsw m5, m3, m11 ; a3 b3 paddw m4, m6 paddw m4, m5 pmulhrsw m4, m7 vextracti32x8 ym5, m4, 1 packuswb ym4, ym5 mova [dstq+dsq*0], xm4 vextracti32x4 [dstq+dsq*1], ym4, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w16_loop RET .v_w32: mova m12, [spel_v_perm32] pmovzxbq m14, [pb_02461357] vpshrdw m13, m12, m12, 8 movu ym0, [srcq+ssq*0] vinserti32x8 m0, [srcq+ssq*1], 1 vpermb m1, m12, m0 ; 01 vinserti32x8 m0, [srcq+ssq*2], 0 add srcq, ss3q vpermb m2, m13, m0 ; 12 vinserti32x8 m0, [srcq+ssq*0], 1 vpermb m3, m12, m0 ; 23 vinserti32x8 m0, [srcq+ssq*1], 0 vpermb m4, m13, m0 ; 34 vinserti32x8 m0, [srcq+ssq*2], 1 add srcq, ss3q vpermb m5, m12, m0 ; 45 vinserti32x8 m0, [srcq+ssq*0], 0 vpermb m6, m13, m0 ; 56 .v_w32_loop: vinserti32x8 m0, [srcq+ssq*1], 1 lea srcq, [srcq+ssq*2] pmaddubsw m15, m1, m8 mova m1, m3 pmaddubsw m16, m2, m8 mova m2, m4 pmaddubsw m17, m3, m9 mova m3, m5 pmaddubsw m18, m4, m9 mova m4, m6 pmaddubsw m19, m5, m10 vpermb m5, m12, m0 ; 67 vinserti32x8 m0, [srcq+ssq*0], 0 pmaddubsw m20, m6, m10 vpermb m6, m13, m0 ; 78 paddw m15, m17 pmaddubsw m17, m5, m11 paddw m16, m18 pmaddubsw m18, m6, m11 paddw m15, m19 paddw m16, m20 paddw m15, m17 paddw m16, m18 pmulhrsw m15, m7 pmulhrsw m16, m7 packuswb m15, m16 vpermq m15, m14, m15 mova [dstq+dsq*0], ym15 vextracti32x8 [dstq+dsq*1], m15, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w32_loop vzeroupper RET .v_w64: .v_w128: lea r6d, [hq+wq*4-256] mov r4, srcq mov r7, dstq .v_loop0: movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q movu m13, [srcq+ssq*0] movu m15, [srcq+ssq*1] movu m17, [srcq+ssq*2] add srcq, ss3q movu m0, [srcq+ssq*0] punpcklbw m1, m2, m4 ; 01l punpckhbw m2, m4 ; 01h punpcklbw m3, m4, m6 ; 12l punpckhbw m4, m6 ; 12h punpcklbw m5, m6, 
m13 ; 23l punpckhbw m6, m13 ; 23h punpcklbw m12, m13, m15 ; 34l punpckhbw m13, m15 ; 34h punpcklbw m14, m15, m17 ; 45l punpckhbw m15, m17 ; 45h punpcklbw m16, m17, m0 ; 56l punpckhbw m17, m0 ; 56h .v_loop: pmaddubsw m18, m1, m8 ; a0l mova m1, m5 pmaddubsw m19, m2, m8 ; a0h mova m2, m6 pmaddubsw m20, m3, m8 ; b0l mova m3, m12 pmaddubsw m21, m4, m8 ; b0h mova m4, m13 pmaddubsw m5, m9 ; a1l pmaddubsw m6, m9 ; a1h pmaddubsw m12, m9 ; b1l pmaddubsw m13, m9 ; b1h paddw m18, m5 mova m5, m14 pmaddubsw m14, m10 ; a2l paddw m19, m6 mova m6, m15 pmaddubsw m15, m10 ; a2h paddw m20, m12 mova m12, m16 pmaddubsw m16, m10 ; b2l paddw m21, m13 mova m13, m17 pmaddubsw m17, m10 ; b2h paddw m18, m14 paddw m19, m15 paddw m20, m16 paddw m21, m17 movu m17, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m14, m0, m17 ; 67l punpckhbw m15, m0, m17 ; 67h pmaddubsw m16, m14, m11 ; a3l pmaddubsw m0, m15, m11 ; a3h paddw m18, m16 paddw m19, m0 movu m0, [srcq+ssq*0] punpcklbw m16, m17, m0 ; 78l punpckhbw m17, m0 ; 78h pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*0], m18 pmaddubsw m18, m16, m11 ; b3l pmaddubsw m19, m17, m11 ; b3h paddw m18, m20 paddw m19, m21 pmulhrsw m18, m7 pmulhrsw m19, m7 packuswb m18, m19 mova [dstq+dsq*1], m18 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_loop add r4, 64 add r7, 64 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 256 jg .v_loop0 vzeroupper RET .hv: cmp wd, 4 jg .hv_w8 movzx mxd, mxb dec srcq vpbroadcastd m7, [base+subpel_filters+mxq*8+2] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m8, [pd_2] vpbroadcastq ym0, [base+subpel_filters+myq*8] lea ss3q, [ssq*3] vpbroadcastd ym9, [pd_32768] mov r6, srcq punpcklbw ym0, ym8, ym0 sub r6, ss3q psraw ym0, 2 ; << 6 mova xm14, [spel_hv_end] pshufd ym10, ym0, q0000 pshufd ym11, ym0, q1111 pshufd ym12, ym0, q2222 pshufd ym13, ym0, q3333 cmp wd, 4 je .hv_w4 vbroadcasti128 ym6, [subpel_h_shuf4] movq xmm2, [r6+ssq*0] movhps xmm2, [r6+ssq*1] movq xmm0, [r6+ssq*2] movhps xmm0, [srcq+ssq*0] vpbroadcastq ymm3, [srcq+ssq*1] vpbroadcastq ymm4, [srcq+ssq*2] add srcq, ss3q vpbroadcastq ymm1, [srcq+ssq*0] vpblendd ymm2, ymm3, 0x30 vpblendd ymm0, ymm1, 0x30 ; 2 3 6 _ vpblendd ymm2, ymm4, 0xc0 ; 0 1 4 5 pshufb ymm2, ym6 pshufb ymm0, ym6 mova ymm1, ym8 vpdpbusd ymm1, ymm2, ym7 mova ymm2, ym8 vpdpbusd ymm2, ymm0, ym7 packssdw ymm2, ymm1, ymm2 psraw ymm2, 2 vextracti128 xmm3, ymm2, 1 palignr xmm4, xmm3, xmm2, 4 punpcklwd xmm1, xmm2, xmm4 ; 01 12 punpckhwd xmm2, xmm4 ; 23 34 pshufd xmm0, xmm3, q2121 punpcklwd xmm3, xmm0 ; 45 56 .hv_w2_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps xmm4, [srcq+ssq*0] mova xmm5, xm9 vpdpwssd xmm5, xmm1, xm10 ; a0 b0 mova xmm1, xmm2 vpdpwssd xmm5, xmm2, xm11 ; a1 b1 pshufb xmm4, xm6 mova xmm2, xmm3 vpdpwssd xmm5, xmm3, xm12 ; a2 b2 mova xmm3, xm8 vpdpbusd xmm3, xmm4, xm7 packssdw xmm4, xmm3, xmm3 psraw xmm4, 2 palignr xmm3, xmm4, xmm0, 12 mova xmm0, xmm4 punpcklwd xmm3, xmm4 ; 67 78 vpdpwssd xmm5, xmm3, xm13 ; a3 b3 packuswb xmm5, xmm5 pshufb xmm5, xm14 pextrw [dstq+dsq*0], xmm5, 0 pextrw [dstq+dsq*1], xmm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop vzeroupper RET .hv_w4: movq xmm1, [r6+ssq*0] vpbroadcastq ym2, [r6+ssq*1] vinserti32x4 ym1, ymm1, [r6+ssq*2], 1 vinserti32x4 m2, [srcq+ssq*0], 2 vinserti32x4 m1, [srcq+ssq*1], 2 vinserti32x4 m2, [srcq+ssq*2], 3 ; _ 1 3 5 vbroadcasti32x4 m6, [subpel_h_shufA] add srcq, ss3q vinserti32x4 m1, [srcq+ssq*0], 3 ; 0 2 4 6 pshufb m2, m6 pshufb m1, m6 mova m0, m8 vpdpbusd m0, m2, m7 mova m4, m8 vpdpbusd m4, m1, m7 mova ym1, [spel_hv_perm4a] mova 
ym2, [spel_hv_perm4b] mova ym3, [spel_hv_perm4c] packssdw m0, m4 psraw m0, 2 ; _ 0 1 2 3 4 5 6 mov r6d, 0x5555 vpermb ym1, ym1, ym0 ; 01 12 vpermb m2, m2, m0 ; 23 34 vpermb m3, m3, m0 ; 45 56 kmovw k1, r6d mova ym15, [spel_hv_perm4d] .hv_w4_loop: movq xmm4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x4 ym4, ymm4, [srcq+ssq*0], 1 mova ym5, ym9 vpdpwssd ym5, ym1, ym10 ; a0 b0 mova ym1, ym2 pshufb ym4, ym6 mova ym0, ym8 vpdpbusd ym0, ym4, ym7 vpdpwssd ym5, ym2, ym11 ; a1 b1 mova ym2, ym3 vpdpwssd ym5, ym3, ym12 ; a2 b2 vpsraw ym3{k1}, ym0, 2 ; 7 8 vpermb ym3, ym15, ym3 ; 67 78 vpdpwssd ym5, ym3, ym13 ; a3 b3 packuswb ym5, ym5 vpermb ym5, ym14, ym5 movd [dstq+dsq*0], xm5 pextrd [dstq+dsq*1], xm5, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [base+subpel_filters+mxq*8+0] vpbroadcastd m11, [base+subpel_filters+mxq*8+4] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd vpbroadcastd m8, [pd_2] vpbroadcastq m0, [base+subpel_filters+myq*8] vpbroadcastd m9, [pd_32768] punpcklbw m0, m8, m0 lea ss3q, [ssq*3] psraw m0, 2 ; << 6 pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 cmp wd, 8 jne .hv_w16 mov r6, srcq sub r6, ss3q movu xmm1, [r6+ssq*0] vinserti128 ymm1, [r6+ssq*1], 1 movu xmm2, [srcq+ssq*1] vinserti32x4 m6, zmm1, [r6+ssq*2], 2 vinserti128 ymm2, [srcq+ssq*2], 1 vinserti32x4 m6, [srcq+ssq*0], 3 ; 0 1 2 3 add srcq, ss3q vbroadcasti32x4 m4, [subpel_h_shufA] vinserti32x4 m0, zmm2, [srcq+ssq*0], 2 ; 4 5 6 _ vbroadcasti32x4 m7, [subpel_h_shufB] vbroadcasti32x4 m17, [subpel_h_shufC] pshufb m1, m6, m4 ; 0 1 2 3 0123 mova m2, m8 vpdpbusd m2, m1, m10 pshufb m5, m6, m7 ; 0 1 2 3 4567 mova m1, m8 vpdpbusd m1, m5, m10 pshufb m4, m0, m4 ; 4 5 6 _ 0123 mova m3, m8 vpdpbusd m3, m4, m10 pshufb m7, m0, m7 ; 4 5 6 _ 4567 mova m4, m8 vpdpbusd m4, m7, m10 pshufb m6, m17 vpdpbusd m2, m5, m11 vpdpbusd m1, m6, m11 pshufb m6, m0, m17 vpdpbusd m3, m7, m11 vpdpbusd m4, m6, m11 mova m5, [spel_hv_perm8a] mova m0, [spel_hv_perm8b] mov r6, 0x55555555ff00 packssdw m2, m1 packssdw m3, m4 mova m18, [spel_hv_perm8c] psraw m2, 2 ; 0 1 2 3 psraw m3, 2 ; 4 5 6 _ vpermb m1, m5, m2 ; 01 12 vbroadcasti32x8 m6, [subpel_h_shufA] kmovq k1, r6 vpermt2b m2, m0, m3 ; 23 34 vbroadcasti32x8 m7, [subpel_h_shufB] kshiftrq k2, k1, 16 mova xm16, [spel_hv_end] vpermb m3, m5, m3 ; 45 56 .hv_w8_loop: vbroadcasti32x4 ym4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vbroadcasti32x4 m4{k1}, [srcq+ssq*0] mova m0, m9 vpdpwssd m0, m1, m12 ; a0 b0 pshufb m1, m4, m6 ; 7 8 0123 4567 mova m5, m8 vpdpbusd m5, m1, m10 pshufb m4, m7 ; 7 8 4567 89ab vpdpwssd m0, m2, m13 ; a1 b1 mova m1, m2 vpdpbusd m5, m4, m11 mova m2, m3 vpdpwssd m0, m3, m14 ; a2 b2 psraw m3{k2}, m5, 2 ; 75 86 vpermb m3, m18, m3 ; 67 78 vpdpwssd m0, m3, m15 ; a3 b3 packuswb m0, m0 vpermb zmm1, m16, m0 movq [dstq+dsq*0], xmm1 movhps [dstq+dsq*1], xmm1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop vzeroupper RET .hv_w16: movu m7, [spel_hv_perm16a] sub srcq, ss3q mova m20, [spel_hv_perm16b] lea r6d, [wq*2-32] mova m21, [spel_hv_perm16c] mov r4, srcq mov r7, dstq mova ym16, [spel_hv_end16] lea r6d, [hq+r6*8] .hv_w16_loop0: movu ym17, [srcq+ssq*0] vinserti32x8 m17, [srcq+ssq*1], 1 ; 0 1 movu ym18, [srcq+ssq*2] add srcq, ss3q vinserti32x8 m18, [srcq+ssq*0], 1 ; 2 3 movu ym19, [srcq+ssq*1] vinserti32x8 m19, [srcq+ssq*2], 1 ; 4 5 add srcq, ss3q vpermb m2, m7, m17 ; 0 1 0123 89ab vpermb m0, m20, m17 ; 0 1 4567 cdef vpermb m4, m7, m18 ; 2 3 0123 89ab mova m1, m8 vpdpbusd m1, m2, m10 vpermb m5, m20, m18 ; 2 3 
4567 cdef mova m2, m8 vpdpbusd m2, m0, m10 vpermb m17, m21, m17 ; 0 1 89ab ghij mova m3, m8 vpdpbusd m3, m4, m10 vpermb m6, m7, m19 ; 4 5 0123 89ab mova m4, m8 vpdpbusd m4, m5, m10 vpermb m18, m21, m18 ; 2 3 89ab ghij vpdpbusd m1, m0, m11 movu ym0, [srcq+ssq*0] ; 6 vpdpbusd m2, m17, m11 vpermb m17, m20, m19 ; 4 5 4567 cdef vpdpbusd m3, m5, m11 mova m5, m8 vpdpbusd m5, m6, m10 mova m6, m8 vpdpbusd m6, m17, m10 vpdpbusd m4, m18, m11 mova m18, [spel_hv_perm16d] vpermb m18, m18, m0 ; 6 0145 2367 89cd abef vpdpbusd m5, m17, m11 vpermb m19, m21, m19 ; 4 5 89ab ghij mova m17, m8 vpdpbusd m17, m18, m10 mova m18, [spel_hv_perm16e] vpermb m0, m18, m0 ; 6 4589 67ab cdgh efij packssdw m1, m2 ; 01 vpdpbusd m6, m19, m11 packssdw m3, m4 ; 23 vpdpbusd m17, m0, m11 psraw m1, 2 packssdw m5, m6 ; 45 psraw m3, 2 vpshrdd m2, m1, m3, 16 ; 12 psraw m5, 2 vpshrdd m4, m3, m5, 16 ; 34 psraw m17, 2 vpshrdd m6, m5, m17, 16 ; 56 .hv_w16_loop: movu ym18, [srcq+ssq*1] lea srcq, [srcq+ssq*2] vinserti32x8 m18, [srcq+ssq*0], 1 mova m0, m9 vpdpwssd m0, m1, m12 ; a0 vpermb m1, m7, m18 ; 7 8 0123 89ab mova m17, m9 vpdpwssd m17, m2, m12 ; b0 vpermb m2, m20, m18 ; 7 8 4567 cdef mova m19, m8 vpdpbusd m19, m1, m10 vpermb m18, m21, m18 mova m1, m8 vpdpbusd m1, m2, m10 vpdpwssd m0, m3, m13 ; a1 vpdpwssd m17, m4, m13 ; b1 vpdpbusd m19, m2, m11 mova m2, m4 vpdpbusd m1, m18, m11 mova m4, m6 vpdpwssd m0, m5, m14 ; a2 vpdpwssd m17, m6, m14 ; b2 packssdw m19, m1 mova m1, m3 mova m3, m5 psraw m6, m19, 2 ; 7 8 vpshrdd m5, m4, m6, 16 ; 6 7 vpdpwssd m17, m6, m15 ; b3 vpdpwssd m0, m5, m15 ; a3 packuswb m0, m17 vpermb zmm1, m16, m0 mova [dstq+dsq*0], xmm1 vextracti128 [dstq+dsq*1], ymm1, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w16_loop add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .hv_w16_loop0 vzeroupper RET %macro PREP_8TAP_H 0 vpermb m10, m5, m0 vpermb m11, m5, m1 vpermb m12, m6, m0 vpermb m13, m6, m1 vpermb m14, m7, m0 vpermb m15, m7, m1 mova m0, m4 vpdpbusd m0, m10, m8 mova m2, m4 vpdpbusd m2, m12, m8 mova m1, m4 vpdpbusd m1, m11, m8 mova m3, m4 vpdpbusd m3, m13, m8 vpdpbusd m0, m12, m9 vpdpbusd m2, m14, m9 vpdpbusd m1, m13, m9 vpdpbusd m3, m15, m9 packssdw m0, m2 packssdw m1, m3 psraw m0, 2 psraw m1, 2 mova [tmpq+64*0], m0 mova [tmpq+64*1], m1 %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %define PREP_8TAP_FN FN prep_8tap, PREP_8TAP_FN sharp, SHARP, SHARP PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP PREP_8TAP_FN smooth, SMOOTH, SMOOTH PREP_8TAP_FN sharp_regular, SHARP, REGULAR PREP_8TAP_FN regular_sharp, REGULAR, SHARP PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH PREP_8TAP_FN regular, REGULAR, REGULAR cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v lea r7, [prep_avx512icl] movsxd wq, wm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v tzcnt wd, wd movzx wd, word [r7+wq*2+table_offset(prep,)] add wq, r7 lea r6, [strideq*3] %if WIN64 pop r7 %endif jmp wq .h: test myd, 0xf00 jnz .hv vpbroadcastd m4, [pd_2] WIN64_SPILL_XMM 10 cmp wd, 4 je .h_w4 tzcnt wd, wd shr mxd, 16 sub srcq, 3 movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)] vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0] vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4] add wq, r7 jmp wq .h_w4: movzx mxd, mxb vbroadcasti128 ym5, [subpel_h_shufA] mov r3d, 0x4 dec 
srcq vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2] kmovb k1, r3d lea stride3q, [strideq*3] .h_w4_loop: movq xm2, [srcq+strideq*0] movq xm3, [srcq+strideq*1] vpbroadcastq ym2{k1}, [srcq+strideq*2] vpbroadcastq ym3{k1}, [srcq+stride3q ] lea srcq, [srcq+strideq*4] pshufb ym2, ym5 pshufb ym3, ym5 mova ym0, ym4 vpdpbusd ym0, ym2, ym6 mova ym1, ym4 vpdpbusd ym1, ym3, ym6 packssdw ym0, ym1 psraw ym0, 2 mova [tmpq], ym0 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: vbroadcasti128 m5, [subpel_h_shufA] vbroadcasti128 m6, [subpel_h_shufB] vbroadcasti128 m7, [subpel_h_shufC] lea stride3q, [strideq*3] .h_w8_loop: movu xmm3, [srcq+strideq*0] vinserti128 ym3, ymm3, [srcq+strideq*1], 1 vinserti128 m3, [srcq+strideq*2], 2 vinserti128 m3, [srcq+stride3q ], 3 lea srcq, [srcq+strideq*4] pshufb m1, m3, m5 pshufb m2, m3, m6 mova m0, m4 vpdpbusd m0, m1, m8 mova m1, m4 vpdpbusd m1, m2, m8 pshufb m3, m7 vpdpbusd m0, m2, m9 vpdpbusd m1, m3, m9 packssdw m0, m1 psraw m0, 2 mova [tmpq], m0 add tmpq, 64 sub hd, 4 jg .h_w8_loop RET .h_w16: mova m5, [spel_h_perm16a] mova m6, [spel_h_perm16b] mova m7, [spel_h_perm16c] lea stride3q, [strideq*3] .h_w16_loop: movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*2] vinserti32x8 m0, [srcq+strideq*1], 1 vinserti32x8 m1, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] PREP_8TAP_H add tmpq, 64*2 sub hd, 4 jg .h_w16_loop RET .h_w32: mova m5, [spel_h_perm32a] mova m6, [spel_h_perm32b] mova m7, [spel_h_perm32c] .h_w32_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] PREP_8TAP_H add tmpq, 64*2 sub hd, 2 jg .h_w32_loop RET .h_w64: xor r6d, r6d jmp .h_start .h_w128: mov r6, -64*1 .h_start: mova m5, [spel_h_perm32a] mova m6, [spel_h_perm32b] mova m7, [spel_h_perm32c] sub srcq, r6 mov r5, r6 .h_loop: movu m0, [srcq+r6+32*0] movu m1, [srcq+r6+32*1] PREP_8TAP_H add tmpq, 64*2 add r6, 64 jle .h_loop add srcq, strideq mov r6, r5 dec hd jg .h_loop RET .v: movzx mxd, myb ; Select 4-tap/8-tap filter multipliers. shr myd, 16 ; Note that the code is 8-tap only, having tzcnt wd, wd cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4 cmove myd, mxd ; had a negligible effect on performance. ; TODO: Would a 6-tap code path be worth it? 
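; Worked example of the filter-index packing (e.g. my = 7 with the sharp
; filter, FILTER_SHARP = (2*15 << 16) | 3*15 = 0x1e002d):
; myd = 7*0x010101 + 0x1e002d = 0x250734, so myb = 0x34 = 3*15 + 7 (4-tap
; row) and myd >> 16 = 0x25 = 2*15 + 7 (8-tap row); the cmov above selects
; between the two.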
lea myq, [r7+myq*8+subpel_filters-prep_avx512icl] movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)] add wq, r7 lea stride3q, [strideq*3] sub srcq, stride3q vpbroadcastd m7, [pw_8192] vpbroadcastw m8, [myq+0] vpbroadcastw m9, [myq+2] vpbroadcastw m10, [myq+4] vpbroadcastw m11, [myq+6] jmp wq .v_w4: movd xmm0, [srcq+strideq*0] vpbroadcastd ymm1, [srcq+strideq*2] vpbroadcastd xmm2, [srcq+strideq*1] vpbroadcastd ymm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _ vpbroadcastd ymm0, [srcq+strideq*0] vpbroadcastd ymm2, [srcq+strideq*1] vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _ vpbroadcastd ymm0, [srcq+strideq*2] vbroadcasti128 ymm5, [deint_shuf4] vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5 vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5 vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34 vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6 punpckhbw ymm2, ymm3 ; 23 34 45 56 .v_w4_loop: pinsrd xmm0, [srcq+stride3q ], 1 lea srcq, [srcq+strideq*4] vpbroadcastd ymm3, [srcq+strideq*0] vpbroadcastd ymm4, [srcq+strideq*1] vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _ vpbroadcastd ymm0, [srcq+strideq*2] vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _ pshufb ymm3, ymm5 ; 67 78 89 9a pmaddubsw ymm4, ymm1, ym8 vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78 pmaddubsw ymm2, ym9 paddw ymm4, ymm2 mova ymm2, ymm3 pmaddubsw ymm3, ym11 paddw ymm3, ymm4 pmaddubsw ymm4, ymm1, ym10 paddw ymm3, ymm4 pmulhrsw ymm3, ym7 mova [tmpq], ymm3 add tmpq, 32 sub hd, 4 jg .v_w4_loop vzeroupper RET .v_w8: mov r3d, 0xf044 kmovw k1, r3d kshiftrw k2, k1, 8 movq xm0, [srcq+strideq*0] vpbroadcastq ym1, [srcq+strideq*1] vpbroadcastq m2, [srcq+strideq*2] vpbroadcastq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m4, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] vpbroadcastq m6, [srcq+strideq*2] vmovdqa64 ym0{k1}, ym1 vmovdqa64 ym1{k1}, ym2 vmovdqa64 m2{k1}, m3 vmovdqa64 m3{k1}, m4 vmovdqa64 m4{k1}, m5 vmovdqa64 m5{k1}, m6 punpcklbw ym0, ym1 ; 01 12 __ __ punpcklbw m2, m3 ; 23 34 23 34 punpcklbw m4, m5 ; 45 56 45 56 vmovdqa64 m0{k2}, m2 ; 01 12 23 34 vmovdqa64 m2{k2}, m4 ; 23 34 45 56 .v_w8_loop: vpbroadcastq m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3, [srcq+strideq*0] vpbroadcastq m5, [srcq+strideq*1] pmaddubsw m14, m0, m8 pmaddubsw m15, m2, m9 vpblendmq m0{k1}, m6, m1 vpblendmq m2{k1}, m1, m3 vpbroadcastq m6, [srcq+strideq*2] paddw m14, m15 punpcklbw m2, m0, m2 ; 67 78 67 78 vpblendmq m12{k1}, m3, m5 vpblendmq m13{k1}, m5, m6 vpblendmq m0{k2}, m4, m2 ; 45 56 67 78 punpcklbw m4, m12, m13 ; 89 9a 89 9a vmovdqa64 m2{k2}, m4 ; 67 78 89 9a pmaddubsw m12, m0, m10 pmaddubsw m13, m2, m11 paddw m14, m12 paddw m14, m13 pmulhrsw m14, m7 mova [tmpq], m14 add tmpq, 64 sub hd, 4 jg .v_w8_loop RET .v_w16: mov r3d, 0xf0 kmovb k1, r3d vbroadcasti128 m0, [srcq+strideq*0] vbroadcasti128 m1, [srcq+strideq*1] vbroadcasti128 m2, [srcq+strideq*2] vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vbroadcasti128 m4, [srcq+strideq*0] vbroadcasti128 m5, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*2] vmovdqa64 m0{k1}, m1 vmovdqa64 m1{k1}, m2 vmovdqa64 m2{k1}, m3 vmovdqa64 m3{k1}, m4 vmovdqa64 m4{k1}, m5 vmovdqa64 m5{k1}, m6 shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_-- shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_-- punpckhbw m2, m0, m1 ; 23a 23b 34a 34b 
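; The .v_w* loops below keep a sliding window of interleaved row pairs:
; each iteration loads only the rows it has not seen yet and rotates the
; existing 01/23/45-style pairs down before re-multiplying them against the
; broadcast taps in m8-m11.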
punpcklbw m0, m1 ; 01a 01b 12a 12b punpcklbw m4, m5 ; 45a 45b 56a 56b .v_w16_loop: vbroadcasti128 m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vbroadcasti128 m5, [srcq+strideq*0] vpblendmq m1{k1}, m6, m3 vmovdqa64 m3{k1}, m5 pmaddubsw m12, m0, m8 pmaddubsw m13, m2, m8 pmaddubsw m14, m2, m9 pmaddubsw m15, m4, m9 pmaddubsw m0, m4, m10 vbroadcasti128 m2, [srcq+strideq*1] vbroadcasti128 m6, [srcq+strideq*2] paddw m12, m14 paddw m13, m15 paddw m12, m0 vmovdqa64 m5{k1}, m2 vmovdqa64 m2{k1}, m6 mova m0, m4 shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab punpcklbw m2, m1, m3 ; 67a 67b 78a 78b punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab pmaddubsw m14, m2, m10 pmaddubsw m15, m2, m11 paddw m13, m14 paddw m12, m15 pmaddubsw m14, m4, m11 paddw m13, m14 pmulhrsw m12, m7 pmulhrsw m13, m7 mova [tmpq+ 0], m12 mova [tmpq+64], m13 add tmpq, 64*2 sub hd, 4 jg .v_w16_loop RET .v_w32: mova m18, [bilin_v_perm64] movu ym0, [srcq+strideq*0] movu ym1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym2, [srcq+strideq*0] movu ym3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym4, [srcq+strideq*0] movu ym5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym6, [srcq+strideq*0] vpermq m0, m18, m0 vpermq m1, m18, m1 vpermq m2, m18, m2 vpermq m3, m18, m3 vpermq m4, m18, m4 vpermq m5, m18, m5 vpermq m6, m18, m6 punpcklbw m0, m1 punpcklbw m1, m2 punpcklbw m2, m3 punpcklbw m3, m4 punpcklbw m4, m5 punpcklbw m5, m6 .v_w32_loop: movu ym12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu ym13, [srcq+strideq*0] pmaddubsw m14, m0, m8 pmaddubsw m16, m2, m9 pmaddubsw m15, m1, m8 pmaddubsw m17, m3, m9 mova m0, m2 mova m1, m3 vpermq m12, m18, m12 vpermq m13, m18, m13 paddw m14, m16 paddw m15, m17 pmaddubsw m16, m4, m10 pmaddubsw m17, m5, m10 punpcklbw m6, m12 punpcklbw m12, m13 mova m2, m4 mova m3, m5 paddw m14, m16 paddw m15, m17 pmaddubsw m16, m6, m11 pmaddubsw m17, m12, m11 mova m4, m6 mova m5, m12 paddw m14, m16 paddw m15, m17 pmulhrsw m14, m7 pmulhrsw m15, m7 mova m6, m13 mova [tmpq+ 0], m14 mova [tmpq+64], m15 add tmpq, 64*2 sub hd, 2 jg .v_w32_loop vzeroupper RET .v_w64: mov wd, 64 jmp .v_start .v_w128: mov wd, 128 .v_start: WIN64_SPILL_XMM 27 mova m26, [bilin_v_perm64] lea r6d, [hq+wq*2] mov r5, srcq mov r7, tmpq .v_loop0: vpermq m0, m26, [srcq+strideq*0] vpermq m1, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m2, m26, [srcq+strideq*0] vpermq m3, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m4, m26, [srcq+strideq*0] vpermq m5, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m6, m26, [srcq+strideq*0] punpckhbw m12, m0, m1 punpcklbw m0, m1 punpckhbw m13, m1, m2 punpcklbw m1, m2 punpckhbw m14, m2, m3 punpcklbw m2, m3 punpckhbw m15, m3, m4 punpcklbw m3, m4 punpckhbw m16, m4, m5 punpcklbw m4, m5 punpckhbw m17, m5, m6 punpcklbw m5, m6 .v_loop: vpermq m18, m26, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vpermq m19, m26, [srcq+strideq*0] pmaddubsw m20, m0, m8 pmaddubsw m21, m12, m8 pmaddubsw m22, m1, m8 pmaddubsw m23, m13, m8 mova m0, m2 mova m12, m14 mova m1, m3 mova m13, m15 pmaddubsw m2, m9 pmaddubsw m14, m9 pmaddubsw m3, m9 pmaddubsw m15, m9 punpckhbw m24, m6, m18 punpcklbw m6, m18 paddw m20, m2 paddw m21, m14 paddw m22, m3 paddw m23, m15 mova m2, m4 mova m14, m16 mova m3, m5 mova m15, m17 pmaddubsw m4, m10 pmaddubsw m16, m10 pmaddubsw m5, m10 pmaddubsw m17, m10 punpckhbw m25, m18, m19 punpcklbw m18, m19 paddw m20, m4 paddw m21, m16 paddw m22, m5 paddw m23, m17 mova m4, m6 mova m16, m24 mova m5, m18 mova m17, m25 pmaddubsw m6, m11 pmaddubsw m24, m11 
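; .v_loop0/.v_loop tiling for w64/w128: r6d carries the row count in its low
; byte and the remaining 64-pixel column strips in bits 8 and up, while r5/r7
; hold the current strip's src/tmp base; "movzx hd, r6b" restores h and
; "sub r6d, 1<<8" steps to the next strip.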
pmaddubsw m18, m11 pmaddubsw m25, m11 paddw m20, m6 paddw m21, m24 paddw m22, m18 paddw m23, m25 pmulhrsw m20, m7 pmulhrsw m21, m7 pmulhrsw m22, m7 pmulhrsw m23, m7 mova m6, m19 mova [tmpq+wq*0+ 0], m20 mova [tmpq+wq*0+64], m21 mova [tmpq+wq*2+ 0], m22 mova [tmpq+wq*2+64], m23 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_loop add r5, 64 add r7, 128 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .v_loop0 RET .hv: %assign stack_offset stack_offset - stack_size_padded %assign stack_size_padded 0 WIN64_SPILL_XMM 16 cmp wd, 4 je .hv_w4 shr mxd, 16 sub srcq, 3 vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0] vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd tzcnt wd, wd vpbroadcastd m8, [pd_2] movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)] vpbroadcastd m9, [pd_32] add wq, r7 vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] lea stride3q, [strideq*3] sub srcq, stride3q punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 jmp wq .hv_w4: movzx mxd, mxb dec srcq vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2] movzx mxd, myb shr myd, 16 cmp hd, 4 cmove myd, mxd vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl] lea stride3q, [strideq*3] sub srcq, stride3q mov r3d, 0x04 kmovb k1, r3d kshiftlb k2, k1, 2 kshiftlb k3, k1, 4 vpbroadcastd m10, [pd_2] vbroadcasti128 m16, [subpel_h_shufA] punpcklbw m0, m0 psraw m0, 8 ; sign-extend vpbroadcastd m11, [pd_32] pshufd m12, m0, q0000 pshufd m13, m0, q1111 pshufd m14, m0, q2222 pshufd m15, m0, q3333 movq xm3, [srcq+strideq*0] vpbroadcastq ym2, [srcq+strideq*1] vpbroadcastq ym3{k1}, [srcq+strideq*2] vpbroadcastq m2{k2}, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vpbroadcastq m3{k2}, [srcq+strideq*0] vpbroadcastq m2{k3}, [srcq+strideq*1] vpbroadcastq m3{k3}, [srcq+strideq*2] mova m17, [spel_hv_perm4a] movu m18, [spel_hv_perm4b] mova m0, m10 mova m1, m10 pshufb m2, m16 pshufb m3, m16 vpdpbusd m0, m2, m8 vpdpbusd m1, m3, m8 packssdw m0, m1 ; _ 0 1 2 3 4 5 6 psraw m0, 2 vpermb m1, m17, m0 ; 01 12 23 34 vpermb m2, m18, m0 ; 23 34 45 56 .hv_w4_loop: movq xm3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] movq xm4, [srcq+strideq*0] vpbroadcastq ym3{k1}, [srcq+strideq*1] vpbroadcastq ym4{k1}, [srcq+strideq*2] mova ym5, ym10 mova ym6, ym10 pshufb ym3, ym16 pshufb ym4, ym16 vpdpbusd ym5, ym3, ym8 vpdpbusd ym6, ym4, ym8 mova m7, m11 packssdw ym5, ym6 ; 7 8 9 a _ _ _ _ psraw ym5, 2 valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a vpdpwssd m7, m1, m12 vpdpwssd m7, m2, m13 vpermb m1, m17, m0 ; 45 56 67 78 vpermb m2, m18, m0 ; 67 78 89 9a vpdpwssd m7, m1, m14 vpdpwssd m7, m2, m15 psrad m7, 6 vpmovdw [tmpq], m7 add tmpq, 32 sub hd, 4 jg .hv_w4_loop vzeroupper RET .hv_w8: WIN64_SPILL_XMM 24 vbroadcasti128 m16, [subpel_h_shufA] vbroadcasti128 m17, [subpel_h_shufB] vbroadcasti128 m18, [subpel_h_shufC] vinserti128 ym0, [srcq+strideq*0], 1 vinserti128 m0, [srcq+strideq*1], 2 vinserti128 m0, [srcq+strideq*2], 3 movu xm1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 ym1, [srcq+strideq*0], 1 vinserti128 m1, [srcq+strideq*1], 2 vinserti128 m1, [srcq+strideq*2], 3 mova m2, m8 mova m4, m8 mova m3, m8 mova m5, m8 pshufb m20, m0, m16 pshufb m21, m0, m17 pshufb m22, m0, m18 pshufb m23, m1, m16 pshufb m6, m1, m17 pshufb m7, m1, m18 vpdpbusd m2, m20, m10 vpdpbusd m4, m21, m10 vpdpbusd m2, m21, m11 vpdpbusd m4, m22, m11 vpdpbusd m3, m23, m10 vpdpbusd m5, m6, m10 vpdpbusd m3, m6, m11 vpdpbusd m5, m7, m11 
packssdw m2, m4 packssdw m3, m5 psraw m2, 2 ; _ 0 1 2 psraw m3, 2 ; 3 4 5 6 valignq m0, m3, m2, 2 ; 0 1 2 3 valignq m1, m3, m2, 4 ; 1 2 3 4 valignq m2, m3, m2, 6 ; 2 3 4 5 punpcklwd m4, m0, m1 ; 01a 12a 23a 34a punpckhwd m5, m0, m1 ; 01b 12b 23b 34b punpcklwd m6, m2, m3 ; 23a 34a 45a 56a punpckhwd m7, m2, m3 ; 23b 34b 45b 56b .hv_w8_loop: movu xm19, [srcq+stride3q ] lea srcq, [srcq+strideq*4] vinserti128 ym19, [srcq+strideq*0], 1 vinserti128 m19, [srcq+strideq*1], 2 vinserti128 m19, [srcq+strideq*2], 3 mova m20, m9 mova m21, m9 mova m22, m8 mova m23, m8 vpdpwssd m20, m4, m12 vpdpwssd m21, m5, m12 vpdpwssd m20, m6, m13 vpdpwssd m21, m7, m13 pshufb m0, m19, m16 pshufb m1, m19, m17 pshufb m2, m19, m18 vpdpbusd m22, m0, m10 vpdpbusd m23, m1, m10 vpdpbusd m22, m1, m11 vpdpbusd m23, m2, m11 packssdw m22, m23 psraw m22, 2 ; 7 8 9 A valignq m0, m22, m3, 2 ; 4 5 6 7 valignq m1, m22, m3, 4 ; 5 6 7 8 valignq m2, m22, m3, 6 ; 6 7 8 9 mova m3, m22 punpcklwd m4, m0, m1 ; 45a 56a 67a 78a punpckhwd m5, m0, m1 ; 45b 56b 67b 78b punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab vpdpwssd m20, m4, m14 vpdpwssd m21, m5, m14 vpdpwssd m20, m6, m15 vpdpwssd m21, m7, m15 psrad m20, 6 psrad m21, 6 packssdw m20, m21 mova [tmpq], m20 add tmpq, 64 sub hd, 4 jg .hv_w8_loop RET .hv_w16: mov wd, 16*2 jmp .hv_start .hv_w32: mov wd, 32*2 jmp .hv_start .hv_w64: mov wd, 64*2 jmp .hv_start .hv_w128: mov wd, 128*2 .hv_start: WIN64_SPILL_XMM 31 mova m16, [spel_h_perm16a] mova m17, [spel_h_perm16b] mova m18, [spel_h_perm16c] lea r6d, [hq+wq*8-256] mov r5, srcq mov r7, tmpq .hv_loop0: movu ym0, [srcq+strideq*0] vinserti32x8 m0, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym1, [srcq+strideq*0] vinserti32x8 m1, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym2, [srcq+strideq*0] vinserti32x8 m2, [srcq+strideq*1], 1 lea srcq, [srcq+strideq*2] movu ym3, [srcq+strideq*0] mova m4, m8 mova m5, m8 mova m6, m8 mova m7, m8 vpermb m19, m16, m0 vpermb m20, m17, m0 vpermb m21, m18, m0 vpermb m22, m16, m1 vpermb m23, m17, m1 vpermb m24, m18, m1 vpermb m25, m16, m2 vpermb m26, m17, m2 vpermb m27, m18, m2 vpermb ym28, ym16, ym3 vpermb ym29, ym17, ym3 vpermb ym30, ym18, ym3 mova m0, m8 mova m1, m8 mova ym2, ym8 mova ym3, ym8 vpdpbusd m4, m19, m10 vpdpbusd m5, m20, m10 vpdpbusd m6, m22, m10 vpdpbusd m7, m23, m10 vpdpbusd m0, m25, m10 vpdpbusd m1, m26, m10 vpdpbusd ym2, ym28, ym10 vpdpbusd ym3, ym29, ym10 vpdpbusd m4, m20, m11 vpdpbusd m5, m21, m11 vpdpbusd m6, m23, m11 vpdpbusd m7, m24, m11 vpdpbusd m0, m26, m11 vpdpbusd m1, m27, m11 vpdpbusd ym2, ym29, ym11 vpdpbusd ym3, ym30, ym11 packssdw m4, m5 packssdw m6, m7 packssdw m0, m1 packssdw ym2, ym3 psraw m4, 2 ; 0a 0b 1a 1b psraw m6, 2 ; 2a 2b 3a 3b psraw m0, 2 ; 4a 4b 5a 5b psraw ym2, 2 ; 6a 6b __ __ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b punpcklwd m2, m4, m5 ; 01a 01c 12a 12c punpckhwd m3, m4, m5 ; 01b 01d 12b 12d punpcklwd m4, m6, m7 ; 23a 23c 34a 34c punpckhwd m5, m6, m7 ; 23b 23d 34b 34d punpcklwd m6, m0, m1 ; 45a 45c 56a 56c punpckhwd m7, m0, m1 ; 45b 45d 56b 56d .hv_loop: movu ym19, [srcq+strideq*1] lea srcq, [srcq+strideq*2] vinserti32x8 m19, [srcq+strideq*0], 1 mova m20, m9 mova m21, m9 mova m22, m8 mova m23, m8 vpdpwssd m20, m2, m12 vpdpwssd m21, m3, m12 vpdpwssd m20, m4, m13 vpdpwssd m21, m5, m13 vpermb m24, m16, m19 vpermb m25, m17, m19 vpermb m26, m18, m19 vpdpbusd m22, m24, m10 vpdpbusd m23, m25, m10 vpdpbusd m22, m25, m11 vpdpbusd m23, m26, m11 packssdw 
m22, m23 psraw m22, 2 ; 7a 7b 8a 8b vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b mova m2, m4 mova m3, m5 mova m1, m22 mova m4, m6 mova m5, m7 punpcklwd m6, m0, m1 ; 67a 67c 78a 78c punpckhwd m7, m0, m1 ; 67b 67d 78b 78d vpdpwssd m20, m4, m14 vpdpwssd m21, m5, m14 vpdpwssd m20, m6, m15 vpdpwssd m21, m7, m15 psrad m20, 6 psrad m21, 6 packssdw m20, m21 mova [tmpq+wq*0], ym20 vextracti32x8 [tmpq+wq*1], m20, 1 lea tmpq, [tmpq+wq*2] sub hd, 2 jg .hv_loop add r5, 16 add r7, 32 movzx hd, r6b mov srcq, r5 mov tmpq, r7 sub r6d, 1<<8 jg .hv_loop0 RET cglobal warp_affine_8x8t_8bpc, 4, 7, 22, tmp, ts vpbroadcastd m9, [pd_16384] mova ym15, [warp_8x8t_end] call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main jmp .start .loop: call mangle(private_prefix %+ _warp_affine_8x8_8bpc_avx512icl).main2 lea tmpq, [tmpq+tsq*4] .start: paddd m16, m16 vpermb m16, m15, m16 mova [tmpq+tsq*0], xm16 vextracti128 [tmpq+tsq*2], ym16, 1 sub r6d, 0x1800 jg .loop RET cglobal warp_affine_8x8_8bpc, 4, 7, 22, dst, ds, src, ss, abcd, filter vpbroadcastd m9, [pd_262144] mova xm15, [warp_8x8_end] call .main jmp .start .loop: call .main2 lea dstq, [dstq+dsq*2] .start: psrad m16, 19 packuswb m16, m16 vpermb m16, m15, m16 movq [dstq+dsq*0], xm16 movhps [dstq+dsq*1], xm16 sub r6d, 0x1800 jg .loop RET ALIGN function_align .main: vpbroadcastd m1, [pd_512] %if WIN64 mov abcdq, r5mp vpaddd ym18, ym1, r6m {1to8} ; mx %else add r5d, 512 vpbroadcastd ym18, r5d %endif vpaddd ym20, ym1, r7m {1to8} ; my mova ym16, [pd_0to7] vpbroadcastd ym19, [abcdq+4*0] vpbroadcastd ym21, [abcdq+4*1] lea r4, [ssq*3+3] mova m10, [warp_8x8_permA] mov r6d, 0x5555 mova m11, [warp_8x8_permB] lea filterq, [mc_warp_filter+64*8] vpbroadcastq m12, [warp_8x8_hpack] sub srcq, r4 ; src -= src_stride*3 + 3 vbroadcasti32x4 m13, [warp_8x8_permC] kxnorb k2, k2, k2 vbroadcasti32x4 m14, [warp_8x8_permD] vpdpwssd ym18, ym19, ym16 ; alpha vpdpwssd ym20, ym21, ym16 ; gamma vbroadcasti32x4 m0, [srcq] psrad ym19, 16 ; beta psrad ym21, 16 ; delta kmovw k1, r6d psrad ym16, ym18, 10 kmovb k3, k2 paddd ym18, ym19 vpgatherdq m2{k2}, [filterq+ym16*8] ; filter_x0 psrld m1, 8 ; pd_2 pshufb m0, m11 paddd m8, m1, m1 ; pd_4 vpdpbusd m1, m0, m2 call .h psllq m2, m1, 45 pslld m1, 13 paddd m1, m2 vpshrdq m1, m0, 48 ; 01 12 call .h vpshrdq m2, m1, m0, 48 ; 23 34 call .h vpshrdq m3, m2, m0, 48 ; 45 56 .main2: call .h psrad ym17, ym20, 10 kmovb k2, k3 paddd ym20, ym21 vpgatherdq m7{k3}, [filterq+ym17*8] ; filter_y0 psrad ym16, ym20, 10 kmovb k3, k2 paddd ym20, ym21 vpgatherdq m17{k2}, [filterq+ym16*8] ; filter_y1 shufps m5, m7, m17, q2020 ; a0 a1 a2 a3 b0 b1 b2 b3 A0 A1 A2 A3 B0 B1 B2 B3 mova m16, m9 pshufb m4, m5, m13 ; a0 a1 A0 A1 b0 b1 B0 B1 vpdpwssd m16, m1, m4 pshufb m5, m14 ; a2 a3 A2 A3 b2 b3 B2 B3 mova m1, m2 vpdpwssd m16, m2, m5 shufps m5, m7, m17, q3131 ; a4 a5 a6 a7 b4 b5 b6 b7 A4 A5 A6 A7 B4 B5 B6 B7 mova m2, m3 pshufb m4, m5, m13 ; a4 a5 A4 A5 b4 b5 B4 B5 vpdpwssd m16, m3, m4 vpshrdq m3, m0, 48 ; 67 78 pshufb m5, m14 ; a6 a7 A6 A7 b6 b7 B6 B7 vpdpwssd m16, m3, m5 ret ALIGN function_align .h: movu xm5, [srcq+ssq*1] psrad ym16, ym18, 10 lea srcq, [srcq+ssq*2] vinserti32x4 ym5, [srcq+ssq*0], 1 kmovb k2, k3 paddd ym18, ym19 vpgatherdq m6{k3}, [filterq+ym16*8] ; filter_x1 psrad ym17, ym18, 10 kmovb k3, k2 paddd ym18, ym19 vpgatherdq m16{k2}, [filterq+ym17*8] ; filter_x2 mova m0, m8 vpermb m4, m10, m5 ; a4 b0 a5 b1 a6 b2 a7 b3 a8 b4 a9 b5 aa b6 ab b7 vpshldq m17, m16, m6, 32 ; a4 a5 a6 a7 b0 b1 b2 b3 vpdpbusd m0, m4, m17 vpermb m5, m11, m5 ; a0 b4 a1 b5 a2 b6 a3 b7 a4 b8 a5 b9 a6 
ba a7 bb vmovdqa32 m16{k1}, m6 ; a0 a1 a2 a3 b4 b5 b6 b7 vpdpbusd m0, m5, m16 vpmultishiftqb m0, m12, m0 ; 1 1 2 2 (>> 3) ret %macro BIDIR_FN 1 ; op lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM %1 0 vextracti32x4 xm1, ym0, 1 movd [dstq ], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_ret lea dstq, [dstq+strideq*4] pextrd [dstq ], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_ret: RET .w4_h16: vpbroadcastd m7, strided pmulld m7, [bidir_sctr_w4] %1 0 kxnorw k1, k1, k1 vpscatterdd [dstq+m7]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM %1 0 vextracti32x4 xm1, ym0, 1 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w8_h8: %1 0 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq ], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq ], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET .w16_loop: %1_INC_PTR 2 lea dstq, [dstq+strideq*4] .w16: %1 0 vpermq m0, m0, q3120 mova [dstq ], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m7, [pb_02461357] .w32_loop: %1 0 %1_INC_PTR 2 vpermq m0, m7, m0 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m7, [pb_02461357] .w64_loop: %1 0 %1_INC_PTR 2 vpermq m0, m7, m0 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m7, [pb_02461357] .w128_loop: %1 0 vpermq m6, m7, m0 %1 2 mova [dstq+64*0], m6 %1_INC_PTR 4 vpermq m6, m7, m0 mova [dstq+64*1], m6 add dstq, strideq dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset mova m0, [tmp1q+(%1+0)*mmsize] paddw m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] pmulhrsw m0, m4 pmulhrsw m1, m4 packuswb m0, m1 %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal avg_8bpc, 4, 7, 5, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-avg_avx512icl_table lea r6, [avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r6+wq*4] vpbroadcastd m4, [base+pw_1024] add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m0, [tmp1q+(%1+0)*mmsize] psubw m2, m0, [tmp2q+(%1+0)*mmsize] mova m1, [tmp1q+(%1+1)*mmsize] psubw m3, m1, [tmp2q+(%1+1)*mmsize] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 %define base r6-w_avg_avx512icl_table lea r6, [w_avg_avx512icl_table] tzcnt wd, wm movifnidn hd, hm vpbroadcastw m4, r6m ; weight movsxd wq, dword [r6+wq*4] vpbroadcastd m5, [base+pw_2048] psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q pxor m0, m0 mov tmp1q, tmp2q psubw m4, m0, m4 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 
- m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 %if mmsize == 64 vpermq m3, m8, [maskq+%1*32] %else vpermq m3, [maskq+%1*16], q3120 %endif mova m0, [tmp2q+(%1+0)*mmsize] psubw m1, m0, [tmp1q+(%1+0)*mmsize] psubb m3, m4, m3 paddw m1, m1 ; (b - a) << 1 paddb m3, m3 punpcklbw m2, m4, m3 ; -m << 9 pmulhw m1, m2 paddw m0, m1 mova m1, [tmp2q+(%1+1)*mmsize] psubw m2, m1, [tmp1q+(%1+1)*mmsize] paddw m2, m2 punpckhbw m3, m4, m3 pmulhw m2, m3 paddw m1, m2 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*32 add tmp2q, %1*64 add tmp1q, %1*64 %endmacro cglobal mask_8bpc, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-mask_avx512icl_table lea r7, [mask_avx512icl_table] tzcnt wd, wm movifnidn hd, hm mov maskq, maskmp movsxd wq, dword [r7+wq*4] pxor m4, m4 mova m8, [base+bilin_v_perm64] vpbroadcastd m5, [base+pw_2048] add wq, r7 BIDIR_FN MASK %macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4 mova m%1, [tmp1q+mmsize*%3] mova m1, [tmp2q+mmsize*%3] psubw m1, m%1 pabsw m%2, m1 psubusw m%2, m6, m%2 psrlw m%2, 8 ; 64 - m psllw m2, m%2, 10 pmulhw m1, m2 paddw m%1, m1 mova m1, [tmp1q+mmsize*%4] mova m2, [tmp2q+mmsize*%4] psubw m2, m1 pabsw m3, m2 psubusw m3, m6, m3 vpshldw m%2, m3, 8 psllw m3, m%2, 10 %if %5 psubb m%2, m5, m%2 %endif pmulhw m2, m3 paddw m1, m2 pmulhrsw m%1, m7 pmulhrsw m1, m7 packuswb m%1, m1 %endmacro cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_420_avx512icl_table lea r7, [w_mask_420_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] vpbroadcastd m9, [base+pb_m64] ; -1 << 6 mova ym10, [base+wm_420_mask+32] vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6 add wq, r7 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: mova m5, [wm_420_perm4] cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1 vinserti128 ym5, [wm_420_perm4+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: vpermb ym8, ym10, ym8 movq [maskq], xm8 RET .w4_h16: vpbroadcastd m11, strided pmulld m11, [bidir_sctr_w4] W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 vpdpbusd m8, m4, m9 kxnorw k1, k1, k1 vpermb m8, m10, m8 mova [maskq], xm8 vpscatterdd [dstq+m11]{k1}, m0 RET .w8: mova m5, [wm_420_perm8] cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1 vinserti128 ym5, [wm_420_perm8+32], 1 vpermb ym4, ym5, ym4 vpdpbusd ym8, ym4, ym9 vpermb m8, m10, m8 mova [maskq], xm8 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 16 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 mova m1, m8 vpdpbusd m1, m4, m9 vpermb m1, m10, m1 mova [maskq], xm1 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], 
xm3 sub hd, 8 jg .w8_loop RET .w16: mova m5, [wm_420_perm16] .w16_loop: W_MASK 0, 4, 0, 1 vpermb m4, m5, m4 mova m1, m8 vpdpbusd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m0, q3120 mova [maskq], xm1 add maskq, 16 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 lea dstq, [dstq+strideq*4] sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m5, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpbusd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 mova [maskq], xm1 add maskq, 16 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14 psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15 .w64_loop: W_MASK 0, 4, 0, 2 W_MASK 11, 5, 1, 3 mova m2, m8 vpdpbusd m2, m4, m9 mova m3, m8 vpdpbusd m3, m5, m9 add tmp1q, 256 add tmp2q, 256 vpermt2b m2, m10, m3 mova m1, m0 vpermt2q m0, m12, m11 vpermt2q m1, m13, m11 mova [maskq], ym2 add maskq, 32 mova [dstq+strideq*0], m0 mova [dstq+strideq*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w64_loop RET .w128: pmovzxbq m14, [wm_420_perm64] mova m10, [wm_420_mask] psrlq m15, m14, 4 .w128_loop: W_MASK 0, 12, 0, 4 W_MASK 11, 13, 1, 5 mova m4, m8 vpdpbusd m4, m12, m9 mova m5, m8 vpdpbusd m5, m13, m9 mova m1, m0 vpermt2q m0, m14, m11 vpermt2q m1, m15, m11 mova [dstq+strideq*0+64*0], m0 mova [dstq+strideq*1+64*0], m1 W_MASK 0, 12, 2, 6 W_MASK 11, 13, 3, 7 vprold m4, 16 vprold m5, 16 vpdpbusd m4, m12, m9 vpdpbusd m5, m13, m9 add tmp1q, 512 add tmp2q, 512 vpermt2b m4, m10, m5 mova m1, m0 vpermt2q m0, m14, m11 vpermt2q m1, m15, m11 mova [maskq], m4 add maskq, 64 mova [dstq+strideq*0+64*1], m0 mova [dstq+strideq*1+64*1], m1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w128_loop RET cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_422_avx512icl_table lea r7, [w_mask_422_avx512icl_table] tzcnt wd, wm mov r6d, r7m ; sign movifnidn hd, hm movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m7, [base+pw_2048] vpbroadcastd m9, [base+pw_m128] mova m10, [base+wm_422_mask] vpbroadcastd m11, [base+pb_127] add wq, r7 vpbroadcastd m8, [base+wm_sign+4+r6*4] mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: pand xm8, xm11 mova [maskq], xm8 RET .w4_h16: vpbroadcastd m5, strided pmulld m5, [bidir_sctr_w4] W_MASK 0, 4, 0, 1 vpdpwssd m8, m4, m9 kxnorw k1, k1, k1 vpermb m8, m10, m8 pand ym8, ym11 mova [maskq], ym8 vpscatterdd [dstq+m5]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1 movhps xm10, [wm_422_mask+16] vpdpwssd ym8, ym4, ym9 vpermb ym8, ym10, ym8 pand xm8, xm11 mova [maskq], xm8 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 32 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 vpermb 
m1, m10, m1 pand ym1, ym11 mova [maskq], ym1 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg .w8_loop RET .w16_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 32 lea dstq, [dstq+strideq*4] .w16: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 vpermb m1, m10, m1 vpermq m0, m0, q3120 pand ym1, ym11 mova [maskq], ym1 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m5, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 pand ym1, ym11 mova [maskq], ym1 add maskq, 32 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m5, [pb_02461357] .w64_loop: W_MASK 0, 4, 0, 1 mova m1, m8 vpdpwssd m1, m4, m9 add tmp1q, 128 add tmp2q, 128 vpermb m1, m10, m1 vpermq m0, m5, m0 pand ym1, ym11 mova [maskq], ym1 add maskq, 32 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m13, [pb_02461357] .w128_loop: W_MASK 0, 4, 0, 1 W_MASK 12, 5, 2, 3 mova m2, m8 vpdpwssd m2, m4, m9 mova m3, m8 vpdpwssd m3, m5, m9 add tmp1q, 256 add tmp2q, 256 vpermt2b m2, m10, m3 vpermq m0, m13, m0 vpermq m1, m13, m12 pand m2, m11 mova [maskq], m2 add maskq, 64 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, strideq dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 %define base r7-w_mask_444_avx512icl_table lea r7, [w_mask_444_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r7+wq*4] vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 vpbroadcastd m5, [base+pb_64] vpbroadcastd m7, [base+pw_2048] mova m8, [base+wm_444_mask] add wq, r7 mov maskq, maskmp lea stride3q, [strideq*3] jmp wq .w4: cmp hd, 8 jg .w4_h16 WRAP_YMM W_MASK 0, 4, 0, 1, 1 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 vextracti32x4 xm1, m0, 1 movd [dstq+strideq*0], xm0 pextrd [dstq+strideq*1], xm0, 1 movd [dstq+strideq*2], xm1 pextrd [dstq+stride3q ], xm1, 1 jl .w4_end lea dstq, [dstq+strideq*4] pextrd [dstq+strideq*0], xm0, 2 pextrd [dstq+strideq*1], xm0, 3 pextrd [dstq+strideq*2], xm1, 2 pextrd [dstq+stride3q ], xm1, 3 .w4_end: RET .w4_h16: vpbroadcastd m9, strided pmulld m9, [bidir_sctr_w4] W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 kxnorw k1, k1, k1 mova [maskq], m4 vpscatterdd [dstq+m9]{k1}, m0 RET .w8: cmp hd, 4 jne .w8_h8 WRAP_YMM W_MASK 0, 4, 0, 1, 1 vinserti128 ym8, [wm_444_mask+32], 1 vpermb ym4, ym8, ym4 mova [maskq], ym4 vextracti32x4 xm1, ym0, 1 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm0 movhps [dstq+stride3q ], xm1 RET .w8_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 64 lea dstq, [dstq+strideq*4] .w8_h8: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 mova [maskq], m4 vextracti32x4 xm1, ym0, 1 vextracti32x4 xm2, m0, 2 vextracti32x4 xm3, m0, 3 movq [dstq+strideq*0], xm0 movq [dstq+strideq*1], xm1 movq [dstq+strideq*2], xm2 movq [dstq+stride3q ], xm3 lea dstq, [dstq+strideq*4] movhps [dstq+strideq*0], xm0 movhps [dstq+strideq*1], xm1 movhps [dstq+strideq*2], xm2 movhps [dstq+stride3q ], xm3 sub hd, 8 jg 
.w8_loop RET .w16_loop: add tmp1q, 128 add tmp2q, 128 add maskq, 64 lea dstq, [dstq+strideq*4] .w16: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 vpermq m0, m0, q3120 mova [maskq], m4 mova [dstq+strideq*0], xm0 vextracti32x4 [dstq+strideq*1], m0, 2 vextracti32x4 [dstq+strideq*2], ym0, 1 vextracti32x4 [dstq+stride3q ], m0, 3 sub hd, 4 jg .w16_loop RET .w32: pmovzxbq m9, [pb_02461357] .w32_loop: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 add tmp1q, 128 add tmp2q, 128 vpermq m0, m9, m0 mova [maskq], m4 add maskq, 64 mova [dstq+strideq*0], ym0 vextracti32x8 [dstq+strideq*1], m0, 1 lea dstq, [dstq+strideq*2] sub hd, 2 jg .w32_loop RET .w64: pmovzxbq m9, [pb_02461357] .w64_loop: W_MASK 0, 4, 0, 1, 1 vpermb m4, m8, m4 add tmp1q, 128 add tmp2q, 128 vpermq m0, m9, m0 mova [maskq], m4 add maskq, 64 mova [dstq], m0 add dstq, strideq dec hd jg .w64_loop RET .w128: pmovzxbq m11, [pb_02461357] .w128_loop: W_MASK 0, 4, 0, 1, 1 W_MASK 10, 9, 2, 3, 1 vpermb m4, m8, m4 vpermb m9, m8, m9 add tmp1q, 256 add tmp2q, 256 vpermq m0, m11, m0 vpermq m10, m11, m10 mova [maskq+64*0], m4 mova [maskq+64*1], m9 add maskq, 128 mova [dstq+64*0], m0 mova [dstq+64*1], m10 add dstq, strideq dec hd jg .w128_loop RET cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask %define base r6-blend_avx512icl_table lea r6, [blend_avx512icl_table] tzcnt wd, wm movifnidn maskq, maskmp movifnidn hd, hm movsxd wq, [r6+wq*4] vpbroadcastd m6, [base+pb_64] vpbroadcastd m7, [base+pw_512] sub tmpq, maskq add wq, r6 lea r6, [dsq*3] jmp wq .w4: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 vpbroadcastd xmm1, [dstq+dsq*2] pinsrd xmm1, [dstq+r6 ], 3 mova xmm4, [maskq] mova xmm5, [maskq+tmpq] add maskq, 4*4 psubb xmm3, xm6, xmm4 punpcklbw xmm0, xmm5 punpcklbw xmm2, xmm3, xmm4 punpckhbw xmm1, xmm5 punpckhbw xmm3, xmm4 pmaddubsw xmm0, xmm2 pmaddubsw xmm1, xmm3 pmulhrsw xmm0, xm7 pmulhrsw xmm1, xm7 packuswb xmm0, xmm1 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 pextrd [dstq+dsq*2], xmm0, 2 pextrd [dstq+r6 ], xmm0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w4 RET .w8: movq xmm0, [dstq+dsq*0] vpbroadcastq xmm1, [dstq+dsq*1] vpbroadcastq ymm2, [dstq+dsq*2] vpbroadcastq ymm3, [dstq+r6 ] mova ymm4, [maskq] mova ymm5, [maskq+tmpq] add maskq, 8*4 vpblendd ymm0, ymm2, 0x30 vpblendd ymm1, ymm3, 0xc0 psubb ymm3, ym6, ymm4 punpcklbw ymm0, ymm5 punpcklbw ymm2, ymm3, ymm4 punpckhbw ymm1, ymm5 punpckhbw ymm3, ymm4 pmaddubsw ymm0, ymm2 pmaddubsw ymm1, ymm3 pmulhrsw ymm0, ym7 pmulhrsw ymm1, ym7 packuswb ymm0, ymm1 vextracti128 xmm1, ymm0, 1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 movq [dstq+dsq*2], xmm1 movhps [dstq+r6 ], xmm1 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w8 vzeroupper RET .w16: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 vinserti32x4 m1, [dstq+dsq*2], 2 mova m4, [maskq] vinserti32x4 m1, [dstq+r6 ], 3 mova m5, [maskq+tmpq] add maskq, 16*4 psubb m3, m6, m4 punpcklbw m0, m1, m5 punpcklbw m2, m3, m4 punpckhbw m1, m5 punpckhbw m3, m4 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m7 pmulhrsw m1, m7 packuswb m0, m1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], ym0, 1 vextracti32x4 [dstq+dsq*2], m0, 2 vextracti32x4 [dstq+r6 ], m0, 3 lea dstq, [dstq+dsq*4] sub hd, 4 jg .w16 RET .w32: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 mova m4, [maskq] mova m5, [maskq+tmpq] add maskq, 32*2 psubb m3, m6, m4 punpcklbw m0, m1, m5 punpcklbw m2, m3, m4 punpckhbw m1, m5 punpckhbw m3, m4 pmaddubsw m0, m2 pmaddubsw m1, m3 pmulhrsw m0, m7 pmulhrsw m1, m7 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, 
[dstq+dsq*2] sub hd, 2 jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_avx512icl_table lea r5, [blend_v_avx512icl_table] tzcnt wd, wm movifnidn hd, hm movsxd wq, [r5+wq*4] vpbroadcastd m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_avx512icl_table jmp wq .w2: vpbroadcastd xmm2, [maskq+2*2] .w2_s0_loop: movd xmm0, [dstq+dsq*0] pinsrw xmm0, [dstq+dsq*1], 1 movd xmm1, [tmpq] add tmpq, 2*2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w2_s0_loop RET .w4: vpbroadcastq xmm2, [maskq+4*2] .w4_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movq xmm1, [tmpq] add tmpq, 4*2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: mova xmm3, [maskq+8*2] .w8_loop: movq xmm0, [dstq+dsq*0] vpbroadcastq xmm1, [dstq+dsq*1] mova xmm2, [tmpq] add tmpq, 8*2 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 pmaddubsw xmm0, xmm3 pmaddubsw xmm1, xmm3 pmulhrsw xmm0, xm5 pmulhrsw xmm1, xm5 packuswb xmm0, xmm1 movq [dstq+dsq*0], xmm0 movhps [dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: vbroadcasti32x4 ym3, [maskq+16*2] vbroadcasti32x4 ym4, [maskq+16*3] .w16_loop: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 mova ym2, [tmpq] add tmpq, 16*2 punpcklbw ym0, ym1, ym2 punpckhbw ym1, ym2 pmaddubsw ym0, ym3 pmaddubsw ym1, ym4 pmulhrsw ym0, ym5 pmulhrsw ym1, ym5 packuswb ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w16_loop RET .w32: mova m4, [maskq+32*2] vshufi32x4 m3, m4, m4, q2020 vshufi32x4 m4, m4, q3131 .w32_loop: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 mova m2, [tmpq] add tmpq, 32*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m4 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w32_loop RET cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base r6-blend_h_avx512icl_table lea r6, [blend_h_avx512icl_table] tzcnt wd, wm mov hd, hm movsxd wq, [r6+wq*4] lea maskq, [base+obmc_masks+hq*2] vpbroadcastd m5, [base+pw_512] lea hd, [hq*3] add wq, r6 shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd xmm0, [dstq+dsq*0] pinsrw xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movd xmm1, [tmpq] add tmpq, 2*2 punpcklwd xmm2, xmm2 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 pextrw [dstq+dsq*0], xmm0, 0 pextrw [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w2 RET .w4: mova xmm3, [blend_shuf] .w4_loop: movd xmm0, [dstq+dsq*0] pinsrd xmm0, [dstq+dsq*1], 1 movd xmm2, [maskq+hq*2] movq xmm1, [tmpq] add tmpq, 4*2 pshufb xmm2, xmm3 punpcklbw xmm0, xmm1 pmaddubsw xmm0, xmm2 pmulhrsw xmm0, xm5 packuswb xmm0, xmm0 movd [dstq+dsq*0], xmm0 pextrd [dstq+dsq*1], xmm0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w4_loop RET .w8: vbroadcasti128 ymm4, [blend_shuf] shufpd ymm4, ymm4, 0x03 .w8_loop: vpbroadcastq ymm1, [dstq+dsq*0] movq xmm0, [dstq+dsq*1] vpblendd ymm0, ymm1, 0x30 vpbroadcastd ymm3, [maskq+hq*2] movq xmm1, [tmpq+8*1] vinserti128 ymm1, [tmpq+8*0], 1 add tmpq, 8*2 pshufb ymm3, ymm4 punpcklbw ymm0, ymm1 pmaddubsw ymm0, ymm3 pmulhrsw ymm0, ym5 vextracti128 xmm1, ymm0, 1 packuswb xmm0, xmm1 movhps [dstq+dsq*0], xmm0 movq 
[dstq+dsq*1], xmm0 lea dstq, [dstq+dsq*2] add hq, 2 jl .w8_loop vzeroupper RET .w16: vbroadcasti32x4 ym4, [blend_shuf] shufpd ym4, ym4, 0x0c .w16_loop: mova xm1, [dstq+dsq*0] vinserti32x4 ym1, [dstq+dsq*1], 1 vpbroadcastd ym3, [maskq+hq*2] mova ym2, [tmpq] add tmpq, 16*2 pshufb ym3, ym4 punpcklbw ym0, ym1, ym2 punpckhbw ym1, ym2 pmaddubsw ym0, ym3 pmaddubsw ym1, ym3 pmulhrsw ym0, ym5 pmulhrsw ym1, ym5 packuswb ym0, ym1 mova [dstq+dsq*0], xm0 vextracti32x4 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w16_loop RET .w32: vbroadcasti32x4 m4, [blend_shuf] shufpd m4, m4, 0xf0 .w32_loop: mova ym1, [dstq+dsq*0] vinserti32x8 m1, [dstq+dsq*1], 1 vpbroadcastd m3, [maskq+hq*2] mova m2, [tmpq] add tmpq, 32*2 pshufb m3, m4 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq+dsq*0], ym0 vextracti32x8 [dstq+dsq*1], m0, 1 lea dstq, [dstq+dsq*2] add hq, 2 jl .w32_loop RET .w64: vpbroadcastw m3, [maskq+hq*2] mova m1, [dstq] mova m2, [tmpq] add tmpq, 32*2 punpcklbw m0, m1, m2 punpckhbw m1, m2 pmaddubsw m0, m3 pmaddubsw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 mova [dstq], m0 add dstq, dsq inc hq jl .w64 RET .w128: vpbroadcastw m6, [maskq+hq*2] mova m2, [dstq+64*0] mova m1, [tmpq+64*0] mova m3, [dstq+64*1] mova m4, [tmpq+64*1] add tmpq, 64*2 punpcklbw m0, m2, m1 punpckhbw m2, m1 pmaddubsw m0, m6 pmaddubsw m2, m6 punpcklbw m1, m3, m4 punpckhbw m3, m4 pmaddubsw m1, m6 pmaddubsw m3, m6 REPX {pmulhrsw x, m5}, m0, m2, m1, m3 packuswb m0, m2 packuswb m1, m3 mova [dstq+64*0], m0 mova [dstq+64*1], m1 add dstq, dsq inc hq jl .w128 RET cglobal resize_8bpc, 6, 12, 19, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 sub dword mx0m, 4<<14 sub dword src_wm, 8 mov r6, ~0 vpbroadcastd m5, dxm vpbroadcastd m8, mx0m vpbroadcastd m6, src_wm kmovq k3, r6 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ vpbroadcastd m3, [base+pw_m256] vpbroadcastd m7, [base+pd_63] vbroadcasti32x4 m15, [base+pb_8x0_8x8] vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15] pslld m5, 4 ; dx*16 pslld m6, 14 pxor m2, m2 mova m16, [base+resize_permA] mova m17, [base+resize_permB] mova xm18, [base+resize_permC] .loop_y: xor xd, xd mova m4, m8 ; per-line working version of mx .loop_x: pmaxsd m0, m4, m2 psrad m9, m4, 8 ; filter offset (unmasked) pminsd m0, m6 ; iclip(mx, 0, src_w-8) psubd m1, m4, m0 ; pshufb offset psrad m0, 14 ; clipped src_x offset psrad m1, 14 ; pshufb edge_emu offset vptestmd k4, m1, m1 pand m9, m7 ; filter offset (masked) ktestw k4, k4 jz .load vextracti32x8 ym12, m0, 1 vextracti32x8 ym13, m1, 1 kmovq k1, k3 kmovq k2, k3 vpgatherdq m10{k1}, [srcq+ym0] vpgatherdq m11{k2}, [srcq+ym12] kmovq k1, k3 kmovq k2, k3 vpgatherdq m14{k1}, [base+resize_shuf+4+ym1] vpgatherdq m0{k2}, [base+resize_shuf+4+ym13] mova m12, m16 mova m13, m17 paddb m14, m15 paddb m0, m15 pshufb m10, m14 pshufb m11, m0 vpermi2d m12, m10, m11 vpermi2d m13, m10, m11 jmp .filter .load: kmovq k1, k3 kmovq k2, k3 vpgatherdd m12{k1}, [srcq+m0+0] vpgatherdd m13{k2}, [srcq+m0+4] .filter: kmovq k1, k3 kmovq k2, k3 vpgatherdd m10{k1}, [base+resize_filter+m9*8+0] vpgatherdd m11{k2}, [base+resize_filter+m9*8+4] mova m14, m2 vpdpbusd m14, m12, m10 vpdpbusd m14, m13, m11 packssdw m14, m14 pmulhrsw m14, m3 packuswb m14, m14 vpermd m14, m18, m14 mova [dstq+xq], xm14 paddd m4, m5 add xd, 16 cmp xd, dst_wd jl .loop_x add dstq, dst_strideq add srcq, src_strideq dec hd jg .loop_y RET %endif ; ARCH_X86_64 
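; Scalar sketch of the rounding used by the bidir kernels above (assuming the
; 8bpc intermediate "prep" format; rounding to nearest comes from pmulhrsw):
;   pmulhrsw(x, pw_1024) = (x + 16) >> 5   ->  AVG:   (tmp1 + tmp2 + 16) >> 5
;   pmulhrsw(x, pw_2048) = (x +  8) >> 4   ->  final step of W_AVG / MASK
; W_AVG evaluates the identity given in its macro comment,
;   (a*w + b*(16-w) + 128) >> 8 = ((((a - b) * ((w-16) << 12)) >> 16) + a + 8) >> 4
; as pmulhw with m4 = (w-16)<<12, paddw, then pmulhrsw with pw_2048; the
; weight <= 7 entry path swaps tmp1/tmp2 and negates m4 (the equivalent
; ((b - a) * (-w << 12)) form), presumably so the multiplier magnitude stays
; within the signed 16-bit range that pmulhw operates on.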
rav1e-0.7.1/src/x86/mc_sse.asm000064400000000000000000011361061046102023000140460ustar 00000000000000; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; Copyright © 2018, VideoLabs ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 ; dav1d_obmc_masks[] with 64-x interleaved obmc_masks: db 0, 0, 0, 0 ; 2 @4 db 45, 19, 64, 0 ; 4 @8 db 39, 25, 50, 14, 59, 5, 64, 0 ; 8 @16 db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0 ; 16 @32 db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10 db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0 ; 32 @64 db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20 db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9 db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8 warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12 warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10 warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14 subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 bilin_h_shuf8: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 unpckw: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 rescale_mul: dd 0, 1, 2, 3 resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7 wm_420_sign: times 4 dw 258 times 4 dw 257 wm_422_sign: times 8 db 128 times 8 db 127 pb_8x0_8x8: times 8 db 0 times 8 db 8 bdct_lb_dw: times 4 db 0 times 4 db 4 times 4 db 8 times 4 db 12 pb_64: times 16 db 64 pw_m256: times 8 dw -256 pw_1: times 8 dw 1 pw_2: times 8 dw 2 pw_8: times 8 dw 8 pw_15: times 8 dw 15 pw_26: times 
8 dw 26 pw_34: times 8 dw 34 pw_512: times 8 dw 512 pw_1024: times 8 dw 1024 pw_2048: times 8 dw 2048 pw_6903: times 8 dw 6903 pw_8192: times 8 dw 8192 pd_32: times 4 dd 32 pd_63: times 4 dd 63 pd_512: times 4 dd 512 pd_16384: times 4 dd 16484 pd_32768: times 4 dd 32768 pd_262144:times 4 dd 262144 pd_0x3ff: times 4 dd 0x3ff pd_0x4000:times 4 dd 0x4000 pq_0x40000000: times 2 dq 0x40000000 const mc_warp_filter2 ; dav1d_mc_warp_filter[] reordered for pmaddubsw usage ; [-1, 0) db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0 db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0 db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0 db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0 db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0 db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0 db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0 db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0 db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0 db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0 db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0 db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0 db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0 db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0 db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, 0 db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0 db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0 db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0 db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0 db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0 db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0 db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0 db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0 db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0 db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0 db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0 db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0 db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0 db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0 db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0 db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0 db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0 ; [0, 1) db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0 db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0 db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1 db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1 db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1 db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1 db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1 db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1 db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2 db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2 db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2 db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2 db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2 db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2 db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2 db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2 db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 
8, 77, -22, -2 db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2 db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2 db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2 db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2 db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2 db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2 db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2 db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2 db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1 db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2 db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1 db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1 db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1 db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0 db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0 ; [1, 2) db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0 db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1 db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1 db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1 db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1 db 0, 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2 db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2 db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2 db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3 db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3 db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3 db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4 db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4 db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4 db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4 db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4 db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4 db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4 db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4 db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4 db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4 db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4 db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4 db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3 db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3 db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3 db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2 db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2 db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2 db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1 db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1 db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0 db 0, 0, 2, -1, 0, 0, 127, 0 pw_258: times 2 dw 258 cextern mc_subpel_filters %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8) %macro BIDIR_JMP_TABLE 2-* ;evaluated at definition time (in loop below) %xdefine %1_%2_table (%%table - 2*%3) %xdefine %%base %1_%2_table %xdefine %%prefix mangle(private_prefix %+ _%1_8bpc_%2) ; dynamically generated label %%table: %rep %0 - 2 ; repeat for num args dd %%prefix %+ .w%3 - %%base %rotate 1 %endrep %endmacro BIDIR_JMP_TABLE avg, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_avg, ssse3, 4, 8, 16, 32, 64, 
128 BIDIR_JMP_TABLE mask, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE blend, ssse3, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_v, ssse3, 2, 4, 8, 16, 32 BIDIR_JMP_TABLE blend_h, ssse3, 2, 4, 8, 16, 16, 16, 16 %macro BASE_JMP_TABLE 3-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base %1_%2 %%table: %rep %0 - 2 dw %%base %+ _w%3 - %%base %rotate 1 %endrep %endmacro %xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_8bpc_sse2.prep) %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_8bpc_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_8bpc_ssse3.prep) BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128 BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %macro HV_JMP_TABLE 5-* %xdefine %%prefix mangle(private_prefix %+ _%1_%2_8bpc_%3) %xdefine %%base %1_%3 %assign %%types %4 %if %%types & 1 %xdefine %1_%2_h_%3_table (%%h - %5) %%h: %rep %0 - 4 dw %%prefix %+ .h_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 2 %xdefine %1_%2_v_%3_table (%%v - %5) %%v: %rep %0 - 4 dw %%prefix %+ .v_w%5 - %%base %rotate 1 %endrep %rotate 4 %endif %if %%types & 4 %xdefine %1_%2_hv_%3_table (%%hv - %5) %%hv: %rep %0 - 4 dw %%prefix %+ .hv_w%5 - %%base %rotate 1 %endrep %endif %endmacro HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128 %macro SCALED_JMP_TABLE 2-* %xdefine %1_%2_table (%%table - %3) %xdefine %%base mangle(private_prefix %+ _%1_8bpc_%2) %%table: %rep %0 - 2 dw %%base %+ .w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_1024: %xdefine %1_%2_dy1_table (%%dy_1024 - %3) %rep %0 - 2 dw %%base %+ .dy1_w%3 - %%base %rotate 1 %endrep %rotate 2 %%dy_2048: %xdefine %1_%2_dy2_table (%%dy_2048 - %3) %rep %0 - 2 dw %%base %+ .dy2_w%3 - %%base %rotate 1 %endrep %endmacro SCALED_JMP_TABLE put_8tap_scaled, ssse3, 2, 4, 8, 16, 32, 64, 128 SCALED_JMP_TABLE prep_8tap_scaled, ssse3, 4, 8, 16, 32, 64, 128 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX SECTION .text INIT_XMM ssse3 %if ARCH_X86_32 DECLARE_REG_TMP 1 %define base t0-put_ssse3 %else DECLARE_REG_TMP 7 %define base 0 %endif %macro RESTORE_DSQ_32 1 %if ARCH_X86_32 mov %1, dsm ; restore dsq %endif %endmacro cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, h, mxy movifnidn mxyd, r6m ; mx LEA t0, put_ssse3 movifnidn srcq, srcmp movifnidn ssq, ssmp tzcnt wd, wm mov hd, hm test mxyd, mxyd jnz .h mov mxyd, r7m ; my test mxyd, mxyd jnz .v .put: movzx wd, word [t0+wq*2+table_offset(put,)] add wq, t0 RESTORE_DSQ_32 t0 jmp wq .put_w2: movzx r4d, word [srcq+ssq*0] movzx r6d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4w mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w2 RET .put_w4: mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mov [dstq+dsq*0], r4d mov [dstq+dsq*1], r6d lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w4 RET .put_w8: movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movq [dstq+dsq*0], m0 movq [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w8 RET .put_w16: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0], m0 mova 
[dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w16 RET .put_w32: movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] movu m3, [srcq+ssq*1+16*1] lea srcq, [srcq+ssq*2] mova [dstq+dsq*0+16*0], m0 mova [dstq+dsq*0+16*1], m1 mova [dstq+dsq*1+16*0], m2 mova [dstq+dsq*1+16*1], m3 lea dstq, [dstq+dsq*2] sub hd, 2 jg .put_w32 RET .put_w64: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] add srcq, ssq mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 add dstq, dsq dec hd jg .put_w64 RET .put_w128: movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] movu m3, [srcq+16*3] mova [dstq+16*0], m0 mova [dstq+16*1], m1 mova [dstq+16*2], m2 mova [dstq+16*3], m3 movu m0, [srcq+16*4] movu m1, [srcq+16*5] movu m2, [srcq+16*6] movu m3, [srcq+16*7] mova [dstq+16*4], m0 mova [dstq+16*5], m1 mova [dstq+16*6], m2 mova [dstq+16*7], m3 add srcq, ssq add dstq, dsq dec hd jg .put_w128 RET .h: ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] mova m0, [base+bilin_h_shuf4] add mxyd, 0x00100010 movd m5, mxyd mov mxyd, r7m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)] mova m3, [base+pw_2048] add wq, t0 movifnidn dsq, dsmp jmp wq .h_w2: pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} .h_w2_loop: movd m0, [srcq+ssq*0] movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpckldq m0, m1 pshufb m0, m4 pmaddubsw m0, m5 pmulhrsw m0, m3 packuswb m0, m0 movd r6d, m0 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: movq m4, [srcq+ssq*0] movhps m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m4, m0 pmaddubsw m4, m5 pmulhrsw m4, m3 packuswb m4, m4 movd [dstq+dsq*0], m4 psrlq m4, 32 movd [dstq+dsq*1], m4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4 RET .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w8 RET .h_w16: movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq], m0 add dstq, dsq dec hd jg .h_w16 RET .h_w32: movu m0, [srcq+mmsize*0+8*0] movu m1, [srcq+mmsize*0+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 movu m1, [srcq+mmsize*1+8*0] movu m2, [srcq+mmsize*1+8*1] add srcq, ssq pshufb m1, m4 pshufb m2, m4 pmaddubsw m1, m5 pmaddubsw m2, m5 pmulhrsw m1, m3 pmulhrsw m2, m3 packuswb m1, m2 mova [dstq+16*0], m0 mova [dstq+16*1], m1 add dstq, dsq dec hd jg .h_w32 RET .h_w64: mov r6, -16*3 .h_w64_loop: movu m0, [srcq+r6+16*3+8*0] movu m1, [srcq+r6+16*3+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*3], m0 add r6, 16 jle .h_w64_loop add srcq, ssq add dstq, dsq dec hd jg .h_w64 RET .h_w128: mov r6, -16*7 .h_w128_loop: movu m0, [srcq+r6+16*7+8*0] movu m1, [srcq+r6+16*7+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 pmulhrsw m0, m3 pmulhrsw m1, m3 packuswb m0, m1 mova [dstq+r6+16*7], m0 add r6, 16 jle .h_w128_loop add srcq, ssq add dstq, dsq dec hd jg .h_w128 RET .v: movzx wd, word 
[t0+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] add mxyd, 0x00100010 add wq, t0 movd m4, mxyd pshufd m4, m4, q0000 movifnidn dsq, dsmp jmp wq .v_w2: movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 lea srcq, [srcq+ssq*2] pshuflw m1, m0, q2301 pinsrw m0, [srcq+ssq*0], 0 ; 2 1 punpcklbw m1, m0 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd r6d, m1 mov [dstq+dsq*1], r6w shr r6d, 16 mov [dstq+dsq*0], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: movd m0, [srcq+ssq*0] .v_w4_loop: movd m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movd m0, [srcq+ssq*0] punpckldq m1, m2 ; 0 1 punpckldq m2, m0 ; 1 2 punpcklbw m1, m2 pmaddubsw m1, m4 pmulhrsw m1, m5 packuswb m1, m1 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 ; lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop RET .v_w8: movq m0, [srcq+ssq*0] .v_w8_loop: movq m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 movq m0, [srcq+ssq*0] punpcklbw m1, m2 punpcklbw m2, m0 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop RET %macro PUT_BILIN_V_W16 0 movu m0, [srcq+ssq*0] %%loop: movu m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m1, m0 mova m2, m0 movu m0, [srcq+ssq*0] punpcklbw m1, m3 punpckhbw m2, m3 pmaddubsw m1, m4 pmaddubsw m2, m4 pmulhrsw m1, m5 pmulhrsw m2, m5 packuswb m1, m2 punpcklbw m2, m3, m0 punpckhbw m3, m0 pmaddubsw m2, m4 pmaddubsw m3, m4 pmulhrsw m2, m5 pmulhrsw m3, m5 packuswb m2, m3 mova [dstq+dsq*0], m1 mova [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] sub hd, 2 jg %%loop %endmacro .v_w16: PUT_BILIN_V_W16 RET .v_w128: lea r6d, [hq+(7<<16)] jmp .v_w16gt .v_w64: lea r6d, [hq+(3<<16)] jmp .v_w16gt .v_w32: lea r6d, [hq+(1<<16)] .v_w16gt: mov r4, srcq %if ARCH_X86_64 mov r7, dstq %endif .v_w16gt_loop: PUT_BILIN_V_W16 %if ARCH_X86_64 add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %else mov dstq, dstmp add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstmp, dstq %endif sub r6d, 1<<16 jg .v_w16gt RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] WIN64_SPILL_XMM 8 shl mxyd, 11 ; can't shift by 12 due to signed overflow mova m7, [base+pw_15] movd m6, mxyd add wq, t0 pshuflw m6, m6, q0000 paddb m5, m5 punpcklqdq m6, m6 jmp wq .hv_w2: RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] punpckldq m0, m0 pshufb m0, m4 pmaddubsw m0, m5 .hv_w2_loop: movd m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] punpckldq m1, m2 pshufb m1, m4 pmaddubsw m1, m5 ; 1 _ 2 _ shufps m2, m0, m1, q1032 ; 0 _ 1 _ mova m0, m1 psubw m1, m2 ; 2 * (src[x + src_stride] - src[x]) pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x]) >> 4 pavgw m2, m7 ; src[x] + 8 paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8 psrlw m1, 4 packuswb m1, m1 %if ARCH_X86_64 movq r6, m1 %else pshuflw m1, m1, q2020 movd r6d, m1 %endif mov [dstq+dsq*0], r6w shr r6, gprsize*4 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET .hv_w4: mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w4_loop: movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movhps m1, [srcq+ssq*0] pshufb m1, m4 pmaddubsw m1, m5 ; 1 2 shufps m2, m0, m1, q1032 ; 0 1 mova m0, m1 psubw m1, m2 pmulhw m1, m6 pavgw m2, m7 paddw m1, m2 psrlw m1, 4 packuswb m1, m1 movd 
[dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 pmaddubsw m0, m5 .hv_w8_loop: movu m2, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m2, m4 pmaddubsw m2, m5 psubw m1, m2, m0 pmulhw m1, m6 pavgw m0, m7 paddw m1, m0 movu m0, [srcq+ssq*0] pshufb m0, m4 pmaddubsw m0, m5 psubw m3, m0, m2 pmulhw m3, m6 pavgw m2, m7 paddw m3, m2 psrlw m1, 4 psrlw m3, 4 packuswb m1, m3 movq [dstq+dsq*0], m1 movhps [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r6d, [hq+(7<<16)] jmp .hv_w16_start .hv_w64: lea r6d, [hq+(3<<16)] jmp .hv_w16_start .hv_w32: lea r6d, [hq+(1<<16)] .hv_w16_start: mov r4, srcq %if ARCH_X86_32 %define m8 [dstq] %else mov r7, dstq %endif .hv_w16: movifnidn dsq, dsmp %if WIN64 movaps r4m, m8 %endif .hv_w16_loop0: movu m0, [srcq+8*0] movu m1, [srcq+8*1] pshufb m0, m4 pshufb m1, m4 pmaddubsw m0, m5 pmaddubsw m1, m5 .hv_w16_loop: add srcq, ssq movu m2, [srcq+8*0] movu m3, [srcq+8*1] pshufb m2, m4 pshufb m3, m4 pmaddubsw m2, m5 pmaddubsw m3, m5 mova m8, m2 psubw m2, m0 pmulhw m2, m6 pavgw m0, m7 paddw m2, m0 mova m0, m3 psubw m3, m1 pmulhw m3, m6 pavgw m1, m7 paddw m3, m1 mova m1, m0 mova m0, m8 psrlw m2, 4 psrlw m3, 4 packuswb m2, m3 mova [dstq], m2 add dstq, dsmp dec hd jg .hv_w16_loop %if ARCH_X86_32 mov dstq, dstm add r4, 16 movzx hd, r6w add dstq, 16 mov srcq, r4 mov dstm, dstq %else add r4, 16 add r7, 16 movzx hd, r6b mov srcq, r4 mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w16_loop0 %if WIN64 movaps m8, r4m %endif RET %macro PSHUFB_BILIN_H8 2 ; dst, src %if cpuflag(ssse3) pshufb %1, %2 %else psrldq %2, %1, 1 punpcklbw %1, %2 %endif %endmacro %macro PSHUFB_BILIN_H4 3 ; dst, src, tmp %if cpuflag(ssse3) pshufb %1, %2 %else psrldq %2, %1, 1 punpckhbw %3, %1, %2 punpcklbw %1, %2 punpcklqdq %1, %3 %endif %endmacro %macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero %if cpuflag(ssse3) pmaddubsw %1, %2 %else %if %5 == 1 pxor %3, %3 %endif punpckhbw %4, %1, %3 punpcklbw %1, %1, %3 pmaddwd %4, %2 pmaddwd %1, %2 packssdw %1, %4 %endif %endmacro %macro PMULHRSW 5 ; dst, src, tmp, rndval, shift %if cpuflag(ssse3) pmulhrsw %1, %2 %else punpckhwd %3, %1, %4 punpcklwd %1, %4 pmaddwd %3, %2 pmaddwd %1, %2 psrad %3, %5 psrad %1, %5 packssdw %1, %3 %endif %endmacro %macro PREP_BILIN 0 %if ARCH_X86_32 %define base r6-prep%+SUFFIX %else %define base 0 %endif cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx LEA r6, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd jnz .h mov mxyd, r6m ; my test mxyd, mxyd jnz .v .prep: %if notcpuflag(ssse3) add r6, prep_ssse3 - prep_sse2 jmp prep_ssse3 %else movzx wd, word [r6+wq*2+table_offset(prep,)] pxor m4, m4 add wq, r6 lea stride3q, [strideq*3] jmp wq .prep_w4: movd m0, [srcq+strideq*0] movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m2, m3 punpcklbw m0, m4 punpcklbw m2, m4 psllw m0, 4 psllw m2, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .prep_w4 RET .prep_w8: movq m0, [srcq+strideq*0] movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m0, m4 punpcklbw m1, m4 punpcklbw m2, m4 punpcklbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .prep_w8 RET .prep_w16: movu m1, 
[srcq+strideq*0] movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .prep_w16 RET .prep_w128: mov r3, -128 jmp .prep_w32_start .prep_w64: mov r3, -64 jmp .prep_w32_start .prep_w32: mov r3, -32 .prep_w32_start: sub srcq, r3 .prep_w32_vloop: mov r6, r3 .prep_w32_hloop: movu m1, [srcq+r6+16*0] movu m3, [srcq+r6+16*1] punpcklbw m0, m1, m4 punpckhbw m1, m4 punpcklbw m2, m3, m4 punpckhbw m3, m4 psllw m0, 4 psllw m1, 4 psllw m2, 4 psllw m3, 4 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .prep_w32_hloop add srcq, strideq dec hd jg .prep_w32_vloop RET %endif .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] %if cpuflag(ssse3) imul mxyd, 0x00ff00ff mova m4, [base+bilin_h_shuf8] add mxyd, 0x00100010 %else imul mxyd, 0xffff add mxyd, 16 %endif movd m5, mxyd mov mxyd, r6m ; my pshufd m5, m5, q0000 test mxyd, mxyd jnz .hv movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)] %if notcpuflag(ssse3) WIN64_SPILL_XMM 8 pxor m6, m6 %endif add wq, r6 jmp wq .h_w4: %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] %endif lea stride3q, [strideq*3] .h_w4_loop: movq m0, [srcq+strideq*0] movhps m0, [srcq+strideq*1] movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] PSHUFB_BILIN_H4 m0, m4, m2 PMADDUBSW m0, m5, m6, m2, 0 PSHUFB_BILIN_H4 m1, m4, m2 PMADDUBSW m1, m5, m6, m2, 0 mova [tmpq+0 ], m0 mova [tmpq+16], m1 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: lea stride3q, [strideq*3] .h_w8_loop: movu m0, [srcq+strideq*0] movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .h_w8_loop RET .h_w16: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] movu m2, [srcq+strideq*1+8*0] movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 2 jg .h_w16 RET .h_w128: mov r3, -128 jmp .h_w32_start .h_w64: mov r3, -64 jmp .h_w32_start .h_w32: mov r3, -32 .h_w32_start: sub srcq, r3 .h_w32_vloop: mov r6, r3 .h_w32_hloop: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] movu m2, [srcq+r6+8*2] movu m3, [srcq+r6+8*3] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add r6, 32 jl .h_w32_hloop add srcq, strideq dec hd jg .h_w32_vloop RET .v: %if notcpuflag(ssse3) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 %endif movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)] %if cpuflag(ssse3) imul mxyd, 0x00ff00ff add mxyd, 
0x00100010 %else imul mxyd, 0xffff pxor m6, m6 add mxyd, 16 %endif add wq, r6 lea stride3q, [strideq*3] movd m5, mxyd pshufd m5, m5, q0000 jmp wq .v_w4: movd m0, [srcq+strideq*0] .v_w4_loop: movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] movd m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpckldq m0, m1 punpckldq m1, m2 punpcklbw m0, m1 ; 01 12 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m0 movd m0, [srcq+strideq*0] punpckldq m2, m3 punpckldq m3, m0 punpcklbw m2, m3 ; 23 34 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 4 jg .v_w4_loop RET .v_w8: movq m0, [srcq+strideq*0] .v_w8_loop: movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m0, m1 ; 01 punpcklbw m1, m2 ; 12 PMADDUBSW m0, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*0], m0 movq m0, [srcq+strideq*0] punpcklbw m2, m3 ; 23 punpcklbw m3, m0 ; 34 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*1], m1 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 sub hd, 4 jg .v_w8_loop RET .v_w16: movu m0, [srcq+strideq*0] .v_w16_loop: movu m1, [srcq+strideq*1] movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] punpcklbw m4, m0, m1 punpckhbw m0, m1 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpcklbw m4, m1, m2 punpckhbw m1, m2 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0] PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpcklbw m4, m2, m3 punpckhbw m2, m3 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m1 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*4], m4 punpcklbw m4, m3, m0 punpckhbw m3, m0 PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*5], m2 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*6], m4 mova [tmpq+16*7], m3 add tmpq, 16*8 sub hd, 4 jg .v_w16_loop RET .v_w128: lea r3d, [hq+(3<<8)] mov r6d, 256 jmp .v_w32_start .v_w64: lea r3d, [hq+(1<<8)] mov r6d, 128 jmp .v_w32_start .v_w32: xor r3d, r3d mov r6d, 64 .v_w32_start: %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif mov r5, srcq .v_w32_hloop: movu m0, [srcq+strideq*0+16*0] movu m1, [srcq+strideq*0+16*1] .v_w32_vloop: movu m2, [srcq+strideq*1+16*0] movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m0, m2 punpckhbw m0, m2 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m0, m5, m6, m7, 0 mova [tmpq+16*0], m4 mova [tmpq+16*1], m0 movu m0, [srcq+strideq*0+16*0] punpcklbw m4, m1, m3 punpckhbw m1, m3 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*2], m4 mova [tmpq+16*3], m1 movu m1, [srcq+strideq*0+16*1] add tmpq, r6 punpcklbw m4, m2, m0 punpckhbw m2, m0 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq+16*0], m4 mova [tmpq+16*1], m2 punpcklbw m4, m3, m1 punpckhbw m3, m1 PMADDUBSW m4, m5, m6, m7, 0 PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m4 mova [tmpq+16*3], m3 add tmpq, r6 sub hd, 2 jg .v_w32_vloop add r5, 32 movzx hd, r3b mov srcq, r5 %if ARCH_X86_64 add r7, 16*4 mov tmpq, r7 %else mov tmpq, tmpmp add tmpq, 16*4 mov tmpmp, tmpq %endif sub r3d, 1<<8 jg .v_w32_hloop %if WIN64 POP r7 %endif RET .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] %assign stack_offset stack_offset - stack_size_padded %if cpuflag(ssse3) imul mxyd, 0x08000800 WIN64_SPILL_XMM 8 %else or mxyd, 1<<16 WIN64_SPILL_XMM 9 %if ARCH_X86_64 mova m8, [base+pw_8] %else %define m8 [base+pw_8] %endif pxor m7, m7 
%endif movd m6, mxyd add wq, r6 pshufd m6, m6, q0000 jmp wq .hv_w4: %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+strideq*0] %else movhps m0, [srcq+strideq*0] %endif lea r3, [strideq*3] PSHUFB_BILIN_H4 m0, m4, m3 PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] movhps m1, [srcq+strideq*2] movq m2, [srcq+r3 ] lea srcq, [srcq+strideq*4] movhps m2, [srcq+strideq*0] PSHUFB_BILIN_H4 m1, m4, m3 PSHUFB_BILIN_H4 m2, m4, m3 PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 PMADDUBSW m2, m5, m7, m4, 0 ; 3 4 shufpd m0, m1, 0x01 ; 0 1 shufpd m3, m1, m2, 0x01 ; 2 3 psubw m1, m0 PMULHRSW m1, m6, m4, m8, 4 paddw m1, m0 mova m0, m2 psubw m2, m3 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*0], m1 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .hv_w4_loop RET .hv_w8: movu m0, [srcq+strideq*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: movu m1, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movu m2, [srcq+strideq*0] PSHUFB_BILIN_H8 m1, m4 PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 1 PMADDUBSW m2, m5, m7, m4, 0 ; 2 psubw m3, m1, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova m0, m2 psubw m2, m1 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m2 add tmpq, 16*2 sub hd, 2 jg .hv_w8_loop RET .hv_w128: lea r3d, [hq+(7<<8)] mov r5d, 256 jmp .hv_w16_start .hv_w64: lea r3d, [hq+(3<<8)] mov r5d, 128 jmp .hv_w16_start .hv_w32: lea r3d, [hq+(1<<8)] mov r5d, 64 jmp .hv_w16_start .hv_w16: xor r3d, r3d mov r5d, 32 .hv_w16_start: %if ARCH_X86_64 || cpuflag(ssse3) mov r6, srcq %endif %if ARCH_X86_64 %if WIN64 PUSH r7 %endif mov r7, tmpq %endif .hv_w16_hloop: movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] PSHUFB_BILIN_H8 m0, m4 PSHUFB_BILIN_H8 m1, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 0a PMADDUBSW m1, m5, m7, m4, 0 ; 0b .hv_w16_vloop: movu m2, [srcq+strideq*1+8*0] PSHUFB_BILIN_H8 m2, m4 PMADDUBSW m2, m5, m7, m4, 0 ; 1a psubw m3, m2, m0 PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova [tmpq+16*0], m3 movu m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] PSHUFB_BILIN_H8 m3, m4 PMADDUBSW m3, m5, m7, m4, 0 ; 1b psubw m0, m3, m1 PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, r5 movu m0, [srcq+strideq*0+8*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 2a psubw m1, m0, m2 PMULHRSW m1, m6, m4, m8, 4 paddw m1, m2 mova [tmpq+16*0], m1 movu m1, [srcq+strideq*0+8*1] PSHUFB_BILIN_H8 m1, m4 PMADDUBSW m1, m5, m7, m4, 0 ; 2b psubw m2, m1, m3 PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, r5 sub hd, 2 jg .hv_w16_vloop movzx hd, r3b %if ARCH_X86_64 add r6, 16 add r7, 2*16 mov srcq, r6 mov tmpq, r7 %elif cpuflag(ssse3) mov tmpq, tmpm add r6, 16 add tmpq, 2*16 mov srcq, r6 mov tmpm, tmpq %else mov srcq, srcm mov tmpq, tmpm add srcq, 16 add tmpq, 2*16 mov srcm, srcq mov tmpm, tmpq %endif sub r3d, 1<<8 jg .hv_w16_hloop %if WIN64 POP r7 %endif RET %endmacro ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 %macro FN 4 ; prefix, type, type_h, type_v cglobal %1_%2_8bpc mov t0d, FILTER_%3 %ifidn %3, %4 mov t1d, t0d %else mov t1d, FILTER_%4 %endif %ifnidn %2, regular ; skip the jump in the last filter jmp mangle(private_prefix %+ _%1_8bpc %+ SUFFIX) %endif %endmacro %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif FN put_8tap, sharp, SHARP, SHARP FN put_8tap, sharp_smooth, SHARP, SMOOTH FN put_8tap, smooth_sharp, SMOOTH, SHARP 
FN put_8tap, smooth, SMOOTH, SMOOTH FN put_8tap, sharp_regular, SHARP, REGULAR FN put_8tap, regular_sharp, REGULAR, SHARP FN put_8tap, smooth_regular, SMOOTH, REGULAR FN put_8tap, regular_smooth, REGULAR, SMOOTH FN put_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r1 %define base base_reg-put_ssse3 %else %define base_reg r8 %define base 0 %endif cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 %assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h %if ARCH_X86_64 imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v %else imul ssd, mym, 0x010101 add ssd, t1d ; 8tap_v, my, 4tap_v mov srcq, srcm %endif mov wd, wm movifnidn hd, hm LEA base_reg, put_ssse3 test mxd, 0xf00 jnz .h %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .v tzcnt wd, wd movzx wd, word [base_reg+wq*2+table_offset(put,)] add wq, base_reg ; put_bilin mangling jump %assign stack_offset org_stack_offset movifnidn dsq, dsmp movifnidn ssq, ssmp %if WIN64 pop r8 %endif lea r6, [ssq*3] jmp wq .h: %if ARCH_X86_32 test ssd, 0xf00 %else test myd, 0xf00 %endif jnz .hv movifnidn ssq, ssmp WIN64_SPILL_XMM 12 cmp wd, 4 jl .h_w2 je .h_w4 tzcnt wd, wd %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(put, _8tap_h)] movq m6, [base_reg+mxq*8+subpel_filters-put_ssse3] mova m7, [base+pw_34] ; 2 + (8 << 2) pshufd m5, m6, q0000 pshufd m6, m6, q1111 add wq, base_reg jmp wq .h_w2: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq mova m4, [base+subpel_h_shuf4] movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] mova m5, [base+pw_34] ; 2 + (8 << 2) pshufd m3, m3, q0000 movifnidn dsq, dsmp .h_w2_loop: movq m0, [srcq+ssq*0] movhps m0, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m0, m4 pmaddubsw m0, m3 phaddw m0, m0 paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 movd r6d, m0 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w2_loop RET .h_w4: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m3, [base_reg+mxq*8+subpel_filters-put_ssse3+2] mova m6, [base+subpel_h_shufA] mova m5, [base+pw_34] ; 2 + (8 << 2) pshufd m3, m3, q0000 movifnidn dsq, dsmp .h_w4_loop: movq m0, [srcq+ssq*0] ; 1 movq m1, [srcq+ssq*1] ; 2 lea srcq, [srcq+ssq*2] pshufb m0, m6 ; subpel_h_shufA pshufb m1, m6 ; subpel_h_shufA pmaddubsw m0, m3 ; subpel_filters pmaddubsw m1, m3 ; subpel_filters phaddw m0, m1 paddw m0, m5 ; pw34 psraw m0, 6 packuswb m0, m0 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] sub hd, 2 jg .h_w4_loop RET %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3] %if ARCH_X86_32 pshufb %2, %1, [base+subpel_h_shufB] pshufb %3, %1, [base+subpel_h_shufC] pshufb %1, [base+subpel_h_shufA] %else pshufb %2, %1, m11; subpel_h_shufB pshufb %3, %1, m9 ; subpel_h_shufC pshufb %1, m10 ; subpel_h_shufA %endif pmaddubsw %4, %2, m5 ; subpel +0 B0 pmaddubsw %2, m6 ; subpel +4 B4 pmaddubsw %3, m6 ; C4 pmaddubsw %1, m5 ; A0 paddw %3, %4 ; C4+B0 paddw %1, %2 ; A0+B4 phaddw %1, %3 paddw %1, m7 ; pw34 psraw %1, 6 %endmacro .h_w8: movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 %if ARCH_X86_32 movq [dstq], m0 add dstq, dsm movhps [dstq], m0 add dstq, dsm %else movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] %endif sub hd, 2 jg .h_w8 RET .h_w128: mov r4, 
-16*7 jmp .h_w16_start .h_w64: mov r4, -16*3 jmp .h_w16_start .h_w32: mov r4, -16*1 jmp .h_w16_start .h_w16: xor r4d, r4d .h_w16_start: sub srcq, r4 sub dstq, r4 .h_w16_loop_v: mov r6, r4 .h_w16_loop_h: movu m0, [srcq+r6+8*0] movu m1, [srcq+r6+8*1] PUT_8TAP_H m0, m2, m3, m4 PUT_8TAP_H m1, m2, m3, m4 packuswb m0, m1 mova [dstq+r6], m0 add r6, 16 jle .h_w16_loop_h add srcq, ssq add dstq, dsmp dec hd jg .h_w16_loop_v RET .v: %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] %endif tzcnt r6d, wd movzx r6d, word [base_reg+r6*2+table_offset(put, _8tap_v)] punpcklwd m0, m0 mova m7, [base+pw_512] add r6, base_reg %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (ds) as tmp for stack alignment if needed ALLOC_STACK -16*4 %assign regs_used 7 pshufd m1, m0, q0000 mova subpel0, m1 pshufd m1, m0, q1111 mova subpel1, m1 pshufd m1, m0, q2222 mova subpel2, m1 pshufd m1, m0, q3333 mova subpel3, m1 mov ssq, [rstk+stack_offset+gprsize*4] lea ssq, [ssq*3] sub srcq, ssq mov ssq, [rstk+stack_offset+gprsize*4] mov dsq, [rstk+stack_offset+gprsize*2] %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 lea ss3q, [ssq*3] pshufd m8, m0, q0000 sub srcq, ss3q pshufd m9, m0, q1111 pshufd m10, m0, q2222 pshufd m11, m0, q3333 %endif jmp r6 .v_w2: movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] movd m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m3, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %else movd m2, [srcq+ssq*2] add srcq, ss3q movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m4, [srcq+ssq*2] add srcq, ss3q %endif punpcklwd m1, m0 ; 0 1 punpcklwd m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+ssq*0] punpcklwd m2, m5 ; 2 3 punpcklwd m5, m3 ; 3 4 punpcklwd m3, m4 ; 4 5 punpcklwd m4, m0 ; 5 6 punpcklbw m2, m5 ; 23 34 punpcklbw m3, m4 ; 45 56 .v_w2_loop: movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 punpcklwd m3, m0, m4 ; 6 7 movd m0, [srcq+ssq*0] punpcklwd m4, m0 ; 7 8 punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 movd r6d, m5 mov [dstq+dsq*0], r6w shr r6d, 16 mov [dstq+dsq*1], r6w lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w2_loop RET .v_w4: %if ARCH_X86_32 .v_w8: .v_w16: .v_w32: .v_w64: .v_w128: shl wd, 14 %if STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*4+gprsize] mov dstm, dstq %endif lea r6d, [hq+wq-(1<<16)] mov r4, srcq .v_w4_loop0: %endif movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movd m2, [srcq+ssq*0] movd m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] movd m3, [srcq+ssq*0] movd m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] %else movd m2, [srcq+ssq*2] add srcq, ss3q movd m5, [srcq+ssq*0] movd m3, [srcq+ssq*1] movd m4, [srcq+ssq*2] add srcq, ss3q %endif punpckldq m1, m0 ; 0 1 punpckldq m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+ssq*0] punpckldq m2, m5 ; 2 3 punpckldq m5, m3 ; 3 4 punpckldq m3, m4 ; 4 5 punpckldq m4, m0 ; 5 6 punpcklbw m2, m5 ; 23 34 punpcklbw m3, m4 ; 45 56 .v_w4_loop: movd m4, [srcq+ssq*1] lea srcq, 
[srcq+ssq*2] pmaddubsw m5, m1, subpel0 ; a0 b0 mova m1, m2 pmaddubsw m2, subpel1 ; a1 b1 paddw m5, m2 mova m2, m3 pmaddubsw m3, subpel2 ; a2 b2 paddw m5, m3 punpckldq m3, m0, m4 ; 6 7 _ _ movd m0, [srcq+ssq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 pmaddubsw m4, m3, subpel3 ; a3 b3 paddw m5, m4 pmulhrsw m5, m7 packuswb m5, m5 movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov dstq, dstm add r4, 4 movzx hd, r6w add dstq, 4 mov srcq, r4 mov dstm, dstq sub r6d, 1<<16 jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: .v_w16: .v_w32: .v_w64: .v_w128: lea r6d, [wq*8-64] mov r4, srcq mov r7, dstq lea r6d, [hq+r6*4] .v_w8_loop0: movq m1, [srcq+ssq*0] movq m2, [srcq+ssq*1] movq m3, [srcq+ssq*2] add srcq, ss3q movq m4, [srcq+ssq*0] movq m5, [srcq+ssq*1] movq m6, [srcq+ssq*2] add srcq, ss3q movq m0, [srcq+ssq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m5 ; 34 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: movq m13, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddubsw m14, m1, subpel0 ; a0 mova m1, m3 pmaddubsw m15, m2, subpel0 ; b0 mova m2, m4 pmaddubsw m3, subpel1 ; a1 mova m12, m0 pmaddubsw m4, subpel1 ; b1 movq m0, [srcq+ssq*0] paddw m14, m3 paddw m15, m4 mova m3, m5 pmaddubsw m5, subpel2 ; a2 mova m4, m6 pmaddubsw m6, subpel2 ; b2 punpcklbw m12, m13 ; 67 punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 pmaddubsw m12, subpel3 ; a3 paddw m15, m6 mova m6, m13 pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 packuswb m14, m15 movq [dstq+dsq*0], m14 movhps [dstq+dsq*1], m14 lea dstq, [dstq+dsq*2] sub hd, 2 jg .v_w8_loop add r4, 8 add r7, 8 movzx hd, r6b mov srcq, r4 mov dstq, r7 sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 %undef subpel0 %undef subpel1 %undef subpel2 %undef subpel3 .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m1, [base_reg+mxq*8+subpel_filters-put_ssse3+2] %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m0, [base_reg+ssq*8+subpel_filters-put_ssse3] mov ssq, ssmp lea r6, [ssq*3] sub srcq, r6 %define base_reg r6 mov r6, r1; use as new base %assign regs_used 2 ALLOC_STACK -mmsize*14 %assign regs_used 7 mov dsq, [rstk+stack_offset+gprsize*2] %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 psraw m0, 8 ; sign-extend pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 mova subpelv1, m6 pshufd m6, m0, q2222 mova subpelv2, m6 pshufd m6, m0, q3333 mova subpelv3, m6 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-put_ssse3] ALLOC_STACK mmsize*14, 14 lea ss3q, [ssq*3] sub srcq, ss3q %define subpelv0 m10 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 psraw m0, 8 ; sign-extend mova m8, [base+pw_8192] mova m9, [base+pd_512] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 cmp wd, 4 je .hv_w4 .hv_w2: mova m6, [base+subpel_h_shuf4] movq m2, [srcq+ssq*0] ; 0 movhps m2, [srcq+ssq*1] ; 0 _ 1 %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] lea srcq, [srcq+ssq*2] movq m0, [srcq+ssq*0] ; 2 movhps m0, [srcq+ssq*1] ; 2 _ 3 lea srcq, [srcq+ssq*2] %else %define w8192reg m8 %define d512reg m9 movq m0, [srcq+ssq*2] ; 2 add srcq, ss3q movhps m0, 
[srcq+ssq*0] ; 2 _ 3 %endif pshufb m2, m6 ; 0 ~ 1 ~ pshufb m0, m6 ; 2 ~ 3 ~ pmaddubsw m2, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m2, m0 ; 0 1 2 3 pmulhrsw m2, w8192reg %if ARCH_X86_32 movq m3, [srcq+ssq*0] ; 4 movhps m3, [srcq+ssq*1] ; 4 _ 5 lea srcq, [srcq+ssq*2] %else movq m3, [srcq+ssq*1] ; 4 movhps m3, [srcq+ssq*2] ; 4 _ 5 add srcq, ss3q %endif movq m0, [srcq+ssq*0] ; 6 pshufb m3, m6 ; 4 ~ 5 ~ pshufb m0, m6 ; 6 ~ pmaddubsw m3, m7 ; subpel_filters pmaddubsw m0, m7 ; subpel_filters phaddw m3, m0 ; 4 5 6 _ pmulhrsw m3, w8192reg palignr m4, m3, m2, 4; V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 0 1 1 2 punpckhwd m2, m4 ; V 23 34 2 3 3 4 pshufd m0, m3, q2121; V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 4 5 5 6 .hv_w2_loop: movq m4, [srcq+ssq*1] ; V 7 lea srcq, [srcq+ssq*2] ; V movhps m4, [srcq+ssq*0] ; V 7 8 pshufb m4, m6 pmaddubsw m4, m7 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 ; V pmaddwd m2, subpelv1 ; V a1 b1 paddd m5, m2 ; V mova m2, m3 ; V pmaddwd m3, subpelv2 ; a2 b2 phaddw m4, m4 pmulhrsw m4, w8192reg paddd m5, m3 ; V palignr m3, m4, m0, 12 mova m0, m4 punpcklwd m3, m0 ; V 67 78 pmaddwd m4, m3, subpelv3 ; V a3 b3 paddd m5, d512reg paddd m5, m4 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 movd r4d, m5 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .hv_w2_loop RET %undef w8192reg %undef d512reg .hv_w4: %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 %define hv4_line_0_3 7 %define hv4_line_0_4 8 %define hv4_line_0_5 9 %define hv4_line_1_0 10 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %macro SAVELINE_W4 3 mova [rsp+mmsize*hv4_line_%3_%2], %1 %endmacro %macro RESTORELINE_W4 3 mova %1, [rsp+mmsize*hv4_line_%3_%2] %endmacro %if ARCH_X86_32 %define w8192reg [base+pw_8192] %define d512reg [base+pd_512] %else %define w8192reg m8 %define d512reg m9 %endif ; lower shuffle 0 1 2 3 4 mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 0 _ _ _ movhps m5, [srcq+ssq*1] ; 0 _ 1 _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movq m4, [srcq+ssq*0] ; 2 _ _ _ movhps m4, [srcq+ssq*1] ; 2 _ 3 _ lea srcq, [srcq+ssq*2] %else movq m4, [srcq+ssq*2] ; 2 _ _ _ movhps m4, [srcq+ss3q ] ; 2 _ 3 _ lea srcq, [srcq+ssq*4] %endif pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m2, m0 ;H 0 1 2 3 pmulhrsw m2, w8192reg ;H pw_8192 SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 mova m6, [base+subpel_h_shuf4+16] pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ pmaddubsw m2, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m2, m0 ;H 0 1 2 3 pmulhrsw m2, w8192reg ;H pw_8192 ; ; lower shuffle mova m6, [base+subpel_h_shuf4] movq m5, [srcq+ssq*0] ; 4 _ _ _ movhps m5, [srcq+ssq*1] ; 4 _ 5 _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] movq m4, [srcq+ssq*0] ; 6 _ _ _ add srcq, ssq %else movq m4, [srcq+ssq*2] ; 6 _ _ _ add srcq, ss3q %endif pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 SAVELINE_W4 m3, 3, 0 ; upper shuffle mova m6, [base+subpel_h_shuf4+16] pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ pmaddubsw m3, m7 ;H subpel_filters pmaddubsw m0, m7 ;H subpel_filters phaddw m3, m0 ;H 4 5 6 7 pmulhrsw m3, w8192reg ;H pw_8192 ;process high palignr m4, m3, 
m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 palignr m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 .hv_w4_loop: ;process low pmaddwd m5, m1, subpelv0 ; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 pmulhrsw m4, w8192reg ;H pw_8192 palignr m3, m4, m0, 12 ; 6 7 8 7 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m5, 10 SAVELINE_W4 m0, 0, 0 SAVELINE_W4 m1, 1, 0 SAVELINE_W4 m2, 2, 0 SAVELINE_W4 m3, 3, 0 SAVELINE_W4 m5, 5, 0 ;process high RESTORELINE_W4 m0, 0, 1 RESTORELINE_W4 m1, 1, 1 RESTORELINE_W4 m2, 2, 1 RESTORELINE_W4 m3, 3, 1 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 mova m6, [base+subpel_h_shuf4+16] movq m4, [srcq+ssq*0] ; 7 movhps m4, [srcq+ssq*1] ; 7 _ 8 _ lea srcq, [srcq+ssq*2] pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ pmaddubsw m4, m7 ;H subpel_filters phaddw m4, m4 ;H 7 8 7 8 pmulhrsw m4, w8192reg ;H pw_8192 palignr m3, m4, m0, 12 ; 6 7 8 7 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 paddd m5, d512reg ; pd_512 paddd m5, m4 psrad m4, m5, 10 RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 ; d -> w packuswb m5, m5 ; w -> b pshuflw m5, m5, q3120 movd [dstq+dsq*0], m5 psrlq m5, 32 movd [dstq+dsq*1], m5 lea dstq, [dstq+dsq*2] sub hd, 2 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 RESTORELINE_W4 m0, 0, 0 RESTORELINE_W4 m1, 1, 0 RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 jg .hv_w4_loop RET %undef subpelv0 %undef subpelv1 %undef subpelv2 %undef subpelv3 .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 %define hv8_line_4 3 %define hv8_line_6 4 %macro SAVELINE_W8 2 mova [rsp+hv8_line_%1*mmsize], %2 %endmacro %macro RESTORELINE_W8 2 mova %2, [rsp+hv8_line_%1*mmsize] %endmacro shr mxd, 16 sub srcq, 3 %if ARCH_X86_32 %define base_reg r1 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] %define subpelv1 [rsp+mmsize*8] %define subpelv2 [rsp+mmsize*9] %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] movq m1, [base_reg+mxq*8+subpel_filters-put_ssse3] movzx mxd, ssb shr ssd, 16 cmp hd, 6 cmovs ssd, mxd movq m5, [base_reg+ssq*8+subpel_filters-put_ssse3] mov ssq, ssmp ALLOC_STACK -mmsize*13 %if STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*13+gprsize*1] %define dsm [rsp+mmsize*13+gprsize*2] mov r6, [rstk+stack_offset+gprsize*2] mov dsm, r6 %endif pshufd m0, m1, q0000 pshufd m1, m1, q1111 punpcklbw m5, m5 psraw m5, 8 ; sign-extend pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 pshufd m5, m5, q3333 mova subpelh0, m0 mova subpelh1, m1 mova subpelv0, m2 mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 lea r6, [ssq*3] mov dstm, dstq sub srcq, r6 %else ALLOC_STACK 16*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 
%define subpelh1 m11
%define subpelv0 m12
%define subpelv1 m13 %define subpelv2 m14 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 movq m0, [base_reg+mxq*8+subpel_filters-put_ssse3] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-put_ssse3] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 punpcklbw m1, m1 psraw m1, 8 ; sign-extend pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea ss3q, [ssq*3] mov r7, dstq sub srcq, ss3q %endif shl wd, 14 lea r6d, [hq+wq-(1<<16)] mov r4, srcq .hv_w8_loop0: movu m4, [srcq+ssq*0] ; 0 = _ _ movu m5, [srcq+ssq*1] ; 1 = _ _ %if ARCH_X86_32 lea srcq, [srcq+ssq*2] %endif %macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3] %if ARCH_X86_32 pshufb %3, %1, [base+subpel_h_shufB] pshufb %4, %1, [base+subpel_h_shufC] pshufb %1, [base+subpel_h_shufA] %else pshufb %3, %1, %6 ; subpel_h_shufB pshufb %4, %1, %7 ; subpel_h_shufC pshufb %1, %5 ; subpel_h_shufA %endif pmaddubsw %2, %3, subpelh0 ; subpel +0 C0 pmaddubsw %4, subpelh1; subpel +4 B4 pmaddubsw %3, subpelh1; C4 pmaddubsw %1, subpelh0; A0 paddw %2, %4 ; C0+B4 paddw %1, %3 ; A0+C4 phaddw %1, %2 %endmacro %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %endif HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ %if ARCH_X86_32 movu m6, [srcq+ssq*0] ; 2 = _ _ movu m0, [srcq+ssq*1] ; 3 = _ _ lea srcq, [srcq+ssq*2] %else movu m6, [srcq+ssq*2] ; 2 = _ _ add srcq, ss3q movu m0, [srcq+ssq*0] ; 3 = _ _ %endif HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ mova m7, [base+pw_8192] pmulhrsw m4, m7 ; H pw_8192 pmulhrsw m5, m7 ; H pw_8192 pmulhrsw m6, m7 ; H pw_8192 pmulhrsw m0, m7 ; H pw_8192 punpcklwd m1, m4, m5 ; 0 1 ~ punpcklwd m2, m5, m6 ; 1 2 ~ punpcklwd m3, m6, m0 ; 2 3 ~ SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 mova m7, [base+subpel_h_shufA] %if ARCH_X86_32 movu m4, [srcq+ssq*0] ; 4 = _ _ movu m5, [srcq+ssq*1] ; 5 = _ _ lea srcq, [srcq+ssq*2] %else movu m4, [srcq+ssq*1] ; 4 = _ _ movu m5, [srcq+ssq*2] ; 5 = _ _ add srcq, ss3q %endif movu m6, [srcq+ssq*0] ; 6 = _ _ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ mova m7, [base+pw_8192] pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ punpcklwd m4, m0, m1 ; 3 4 ~ punpcklwd m5, m1, m2 ; 4 5 ~ punpcklwd m6, m2, m3 ; 5 6 ~ SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: ; m8 accu for V a ; m9 accu for V b SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 SAVELINE_W8 4, m6 %if ARCH_X86_32 pmaddwd m0, m1, subpelv0 ; a0 pmaddwd m7, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m0, m3 paddd m7, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_512] paddd m0, m5 ; pd_512 paddd m7, m5 ; pd_512 mova accuv0, m0 mova accuv1, m7 %else pmaddwd m8, m1, subpelv0 ; a0 pmaddwd m9, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m8, m3 paddd m9, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m8, m5 paddd m9, m6 mova m7, [base+pd_512] paddd m8, m7 ; pd_512 paddd m9, m7 ; pd_512 mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] %endif movu m0, [srcq+ssq*1] ; 7 movu m4, [srcq+ssq*2] ; 8 lea srcq, [srcq+ssq*2] HV_H_W8 
m0, m1, m2, m3, m5, m7, m6 HV_H_W8 m4, m1, m2, m3, m5, m7, m6 mova m5, [base+pw_8192] pmulhrsw m0, m5 ; H pw_8192 pmulhrsw m4, m5 ; H pw_8192 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 6 7 ~ punpcklwd m6, m0, m4 ; 7 8 ~ pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 paddd m1, m1, accuv1 ; H + V psrad m2, 10 psrad m1, 10 packssdw m2, m1 ; d -> w packuswb m2, m1 ; w -> b movd [dstq+dsq*0], m2 psrlq m2, 32 %if ARCH_X86_32 add dstq, dsm movd [dstq+dsq*0], m2 add dstq, dsm %else movd [dstq+dsq*1], m2 lea dstq, [dstq+dsq*2] %endif sub hd, 2 jle .hv_w8_outer SAVELINE_W8 6, m4 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: %if ARCH_X86_32 mov dstq, dstm add r4, 4 movzx hd, r6w add dstq, 4 mov srcq, r4 mov dstm, dstq %else add r4, 4 add r7, 4 movzx hd, r6b mov srcq, r4 mov dstq, r7 %endif sub r6d, 1<<16 jg .hv_w8_loop0 RET %macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask %if cpuflag(ssse3) pshufb %1, %2 %else %if %5 == 1 pcmpeqd %2, %2 psrlq %2, 32 %endif psrldq %3, %1, 1 pshufd %3, %3, q2301 pand %1, %2 pandn %4, %2, %3 por %1, %4 %endif %endmacro %macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask %ifnidn %1, %2 mova %1, %2 %endif PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 %endmacro %macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask %if notcpuflag(ssse3) psrlq %1, %2, 16 %elifnidn %1, %2 mova %1, %2 %endif PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 %endmacro %macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] %if cpuflag(ssse3) palignr %1, %2, %3, %4 %else %if %0 == 4 %assign %%i regnumof%+%1 + 1 %define %%tmp m %+ %%i %else %define %%tmp %5 %endif psrldq %1, %3, %4 pslldq %%tmp, %2, 16-%4 por %1, %%tmp %endif %endmacro %macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 %if cpuflag(ssse3) phaddw %1, %2 %elifnidn %1, %2 %if %4 == 1 mova %3, [base+pw_1] %endif pmaddwd %1, %3 pmaddwd %2, %3 packssdw %1, %2 %else %if %4 == 1 pmaddwd %1, [base+pw_1] %else pmaddwd %1, %3 %endif packssdw %1, %1 %endif %endmacro %macro PMULHRSW_POW2 4 ; dst, src1, src2, shift %if cpuflag(ssse3) pmulhrsw %1, %2, %3 %else paddw %1, %2, %3 psraw %1, %4 %endif %endmacro %macro PMULHRSW_8192 3 ; dst, src1, src2 PMULHRSW_POW2 %1, %2, %3, 2 %endmacro %macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] movd %1, [%2+0] movd %3, [%2+1] movd %4, [%2+2] movd %5, [%2+3] punpckldq %1, %3 punpckldq %4, %5 punpcklqdq %1, %4 %endmacro %macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc %if cpuflag(ssse3) movu m%1, [%2] pshufb m2, m%1, m11 ; subpel_h_shufB pshufb m3, m%1, m9 ; subpel_h_shufC pshufb m%1, m10 ; subpel_h_shufA %else %if ARCH_X86_64 SWAP m12, m5 SWAP m13, m6 SWAP m14, m7 %define %%mx0 m%+%%i %define %%mx1 m%+%%j %assign %%i 0 %rep 12 movd %%mx0, [%2+%%i] %assign %%i %%i+1 %endrep %assign %%i 0 %rep 6 %assign %%j %%i+1 punpckldq %%mx0, %%mx1 %assign %%i %%i+2 %endrep %assign %%i 0 %rep 3 %assign %%j %%i+2 punpcklqdq %%mx0, %%mx1 %assign %%i %%i+4 %endrep SWAP m%1, m0 SWAP m2, m4 SWAP m3, m8 SWAP m5, m12 SWAP m6, m13 SWAP m7, m14 %else PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 SWAP m%1, m0 %endif %endif %endmacro %macro PREP_8TAP_H 2 ; dst, src_memloc PREP_8TAP_H_LOAD %1, %2 %if ARCH_X86_64 && notcpuflag(ssse3) SWAP m8, m1 SWAP m9, m7 %endif %xdefine mX m%+%1 %assign %%i regnumof%+mX %define mX m%+%%i mova m4, m2 PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 
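; Note on the data flow in PREP_8TAP_H (a reading of the code above/below,
; not original commentary): the 8-tap horizontal filter is evaluated as two
; 4-tap halves. The overlapping byte shuffles (shufA/B/C) line the source
; pixels up so that taps 0-3 (m5) and taps 4-7 (m6) can each be applied
; with a single PMADDUBSW (native pmaddubsw on SSSE3, emulated otherwise).
; The halves are recombined below with paddw + PHADDW, and PMULHRSW_8192
; scales the sum to the prep intermediate, i.e. a rounded (x + 2) >> 2
; (pmulhrsw by pw_8192 on SSSE3, paddw pw_2 + psraw 2 on the SSE2 path).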
PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0 %undef mX %if ARCH_X86_64 && notcpuflag(ssse3) SWAP m1, m8 SWAP m7, m9 %endif paddw m3, m4 paddw m%1, m2 PHADDW m%1, m3, m15, ARCH_X86_32 %if ARCH_X86_64 || cpuflag(ssse3) PMULHRSW_8192 m%1, m%1, m7 %else PMULHRSW_8192 m%1, m%1, [base+pw_2] %endif %endmacro %macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] %if cpuflag(ssse3) movu %1, [%2] pshufb m2, %1, shufB pshufb m3, %1, shufC pshufb %1, shufA %else PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 %endif mova m1, m2 PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 paddw m1, m3 ; C0+B4 paddw %1, m2 ; A0+C4 PHADDW %1, m1, %3, 1 %endmacro %macro PREP_8TAP 0 %if ARCH_X86_32 DECLARE_REG_TMP 1, 2 %elif WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif FN prep_8tap, sharp, SHARP, SHARP FN prep_8tap, sharp_smooth, SHARP, SMOOTH FN prep_8tap, smooth_sharp, SMOOTH, SHARP FN prep_8tap, smooth, SMOOTH, SMOOTH FN prep_8tap, sharp_regular, SHARP, REGULAR FN prep_8tap, regular_sharp, REGULAR, SHARP FN prep_8tap, smooth_regular, SMOOTH, REGULAR FN prep_8tap, regular_smooth, REGULAR, SMOOTH FN prep_8tap, regular, REGULAR, REGULAR %if ARCH_X86_32 %define base_reg r2 %define base base_reg-prep%+SUFFIX %else %define base_reg r7 %define base 0 %endif cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %assign org_stack_offset stack_offset imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v mov wd, wm movifnidn srcd, srcm movifnidn hd, hm test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v LEA base_reg, prep_ssse3 tzcnt wd, wd movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] pxor m4, m4 add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] %assign stack_offset org_stack_offset %if WIN64 pop r8 pop r7 %endif jmp wq .h: LEA base_reg, prep%+SUFFIX test myd, 0xf00 jnz .hv %if cpuflag(ssse3) WIN64_SPILL_XMM 12 %else WIN64_SPILL_XMM 16 %endif %if ARCH_X86_32 %define strideq r6 mov strideq, stridem %endif cmp wd, 4 je .h_w4 tzcnt wd, wd %if cpuflag(ssse3) %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %else %define m10 [base+subpel_h_shufA] %define m11 [base+subpel_h_shufB] %define m9 [base+subpel_h_shufC] %endif %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] movq m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m7, [base+pw_8192] pshufd m5, m6, q0000 pshufd m6, m6, q1111 %else punpcklbw m6, m6 psraw m6, 8 %if ARCH_X86_64 mova m7, [pw_2] mova m15, [pw_1] %else %define m15 m4 %endif pshufd m5, m6, q1010 punpckhqdq m6, m6 %endif add wq, base_reg jmp wq .h_w4: %if ARCH_X86_32 and mxd, 0x7f %else movzx mxd, mxb %endif dec srcq movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] pshufd m4, m4, q0000 %else mova m6, [base+pw_2] %if ARCH_X86_64 mova m14, [pw_1] %else %define m14 m7 %endif punpcklbw m4, m4 psraw m4, 8 punpcklqdq m4, m4 %endif %if ARCH_X86_64 lea stride3q, [strideq*3] %endif .h_w4_loop: %if cpuflag(ssse3) movq m0, [srcq+strideq*0] ; 0 movq m1, [srcq+strideq*1] ; 1 %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m2, [srcq+strideq*0] ; 2 movq m3, [srcq+strideq*1] ; 3 lea 
srcq, [srcq+strideq*2] %else movq m2, [srcq+strideq*2] ; 2 movq m3, [srcq+stride3q ] ; 3 lea srcq, [srcq+strideq*4] %endif pshufb m0, m5 pshufb m1, m5 pshufb m2, m5 pshufb m3, m5 %elif ARCH_X86_64 movd m0, [srcq+strideq*0+0] movd m12, [srcq+strideq*0+1] movd m1, [srcq+strideq*1+0] movd m5, [srcq+strideq*1+1] movd m2, [srcq+strideq*2+0] movd m13, [srcq+strideq*2+1] movd m3, [srcq+stride3q +0] movd m7, [srcq+stride3q +1] punpckldq m0, m12 punpckldq m1, m5 punpckldq m2, m13 punpckldq m3, m7 movd m12, [srcq+strideq*0+2] movd m8, [srcq+strideq*0+3] movd m5, [srcq+strideq*1+2] movd m9, [srcq+strideq*1+3] movd m13, [srcq+strideq*2+2] movd m10, [srcq+strideq*2+3] movd m7, [srcq+stride3q +2] movd m11, [srcq+stride3q +3] lea srcq, [srcq+strideq*4] punpckldq m12, m8 punpckldq m5, m9 punpckldq m13, m10 punpckldq m7, m11 punpcklqdq m0, m12 ; 0 punpcklqdq m1, m5 ; 1 punpcklqdq m2, m13 ; 2 punpcklqdq m3, m7 ; 3 %else movd m0, [srcq+strideq*0+0] movd m1, [srcq+strideq*0+1] movd m2, [srcq+strideq*0+2] movd m3, [srcq+strideq*0+3] punpckldq m0, m1 punpckldq m2, m3 punpcklqdq m0, m2 ; 0 movd m1, [srcq+strideq*1+0] movd m2, [srcq+strideq*1+1] movd m3, [srcq+strideq*1+2] movd m7, [srcq+strideq*1+3] lea srcq, [srcq+strideq*2] punpckldq m1, m2 punpckldq m3, m7 punpcklqdq m1, m3 ; 1 movd m2, [srcq+strideq*0+0] movd m3, [srcq+strideq*0+1] movd m7, [srcq+strideq*0+2] movd m5, [srcq+strideq*0+3] punpckldq m2, m3 punpckldq m7, m5 punpcklqdq m2, m7 ; 2 movd m3, [srcq+strideq*1+0] movd m7, [srcq+strideq*1+1] punpckldq m3, m7 movd m7, [srcq+strideq*1+2] movd m5, [srcq+strideq*1+3] lea srcq, [srcq+strideq*2] punpckldq m7, m5 punpcklqdq m3, m7 ; 3 %endif PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 PMADDUBSW m1, m4, m5, m7, 0 PMADDUBSW m2, m4, m5, m7, 0 PMADDUBSW m3, m4, m5, m7, 0 PHADDW m0, m1, m14, ARCH_X86_32 PHADDW m2, m3, m14, 0 PMULHRSW_8192 m0, m0, m6 PMULHRSW_8192 m2, m2, m6 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 32 sub hd, 4 jg .h_w4_loop RET .h_w8: %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+strideq*0 PREP_8TAP_H 1, srcq+strideq*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 lea srcq, [srcq+strideq*2] add tmpq, 32 sub hd, 2 %else PREP_8TAP_H 0, srcq mova [tmpq], m0 add srcq, strideq add tmpq, 16 dec hd %endif jg .h_w8 RET .h_w16: mov r3, -16*1 jmp .h_start .h_w32: mov r3, -16*2 jmp .h_start .h_w64: mov r3, -16*4 jmp .h_start .h_w128: mov r3, -16*8 .h_start: sub srcq, r3 mov r5, r3 .h_loop: %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+r3+8*0 PREP_8TAP_H 1, srcq+r3+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r3, 16 %else PREP_8TAP_H 0, srcq+r3 mova [tmpq], m0 add tmpq, 16 add r3, 8 %endif jl .h_loop add srcq, strideq mov r3, r5 dec hd jg .h_loop RET .v: LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f %else %assign stack_offset org_stack_offset WIN64_SPILL_XMM 16 movzx mxd, myb %endif shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) mova m2, [base+pw_512] mova m7, [base+pw_8192] punpcklwd m0, m0 %else punpcklbw m0, m0 psraw m0, 8 %endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 6 ; use r5 (mx) as tmp for stack alignment if needed %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 %else ALLOC_STACK -mmsize*5 %endif %assign regs_used 7 mov strideq, [rstk+stack_offset+gprsize*3] pshufd m1, m0, q0000 mova subpel0, m1 pshufd m1, m0, q1111 mova subpel1, m1 lea r5, [strideq*3] pshufd m1, m0, q2222 mova subpel2, m1 
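; Vertical 8-tap layout used by this .v path (summary of the surrounding
; code): subpel0..subpel3 each hold one adjacent pair of filter taps,
; broadcast across the register. Source rows are interleaved byte-wise
; (01 12, 23 34, 45 56, 67 78), so each PMADDUBSW applies one tap pair to
; two rows at once; the four partial sums are added and the result is
; scaled to the prep intermediate with PMULHRSW_8192 ((x + 2) >> 2).
; The SSE2 fallback sign-extends the taps to words first (punpcklbw +
; psraw 8 above) and the PMADDUBSW wrapper emulates the multiply.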
pshufd m1, m0, q3333 mova subpel3, m1 sub srcq, r5 %else %define subpel0 m8 %define subpel1 m9 %define subpel2 m10 %define subpel3 m11 pshufd m8, m0, q0000 pshufd m9, m0, q1111 lea stride3q, [strideq*3] pshufd m10, m0, q2222 pshufd m11, m0, q3333 sub srcq, stride3q cmp wd, 8 jns .v_w8 %endif .v_w4: %if notcpuflag(ssse3) pxor m6, m6 %if ARCH_X86_64 mova m7, [base+pw_2] %endif %endif %if ARCH_X86_32 %if STACK_ALIGNMENT < mmsize %define srcm [esp+stack_size+gprsize*1] %define tmpm [esp+stack_size+gprsize*2] %endif mov tmpm, tmpq mov srcm, srcq lea r5d, [wq - 4] ; horizontal loop shl r5d, (16 - 2) ; (wq / 4) << 16 mov r5w, hw .v_w4_loop0: %endif movd m1, [srcq+strideq*0] movd m0, [srcq+strideq*1] %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movd m2, [srcq+strideq*0] movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] movd m3, [srcq+strideq*0] movd m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %else movd m2, [srcq+strideq*2] add srcq, stride3q movd m4, [srcq+strideq*0] movd m3, [srcq+strideq*1] movd m5, [srcq+strideq*2] add srcq, stride3q %endif punpckldq m1, m0 ; 0 1 punpckldq m0, m2 ; 1 2 punpcklbw m1, m0 ; 01 12 movd m0, [srcq+strideq*0] punpckldq m2, m4 ; 2 3 punpckldq m4, m3 ; 3 4 punpckldq m3, m5 ; 4 5 punpckldq m5, m0 ; 5 6 punpcklbw m2, m4 ; 23 34 punpcklbw m3, m5 ; 45 56 .v_w4_loop: %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel0 %define subpel0 m7 %endif mova m5, m1 PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel1 %define subpel1 m7 %endif mova m1, m2 PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 paddw m5, m2 %if ARCH_X86_32 && notcpuflag(ssse3) mova m7, subpel2 %define subpel2 m7 %endif mova m2, m3 PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 movd m4, [srcq+strideq*1] lea srcq, [srcq+strideq*2] paddw m5, m3 punpckldq m3, m0, m4 ; 6 7 _ _ movd m0, [srcq+strideq*0] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m12, m0 %else mova [esp+mmsize*4], m0 mova m7, subpel3 %define subpel3 m7 %endif %endif mova m4, m3 PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 paddw m5, m4 %if ARCH_X86_64 || cpuflag(ssse3) %if notcpuflag(ssse3) SWAP m0, m12 %endif PMULHRSW_8192 m5, m5, m7 %else mova m0, [esp+mmsize*4] PMULHRSW_8192 m5, m5, [base+pw_2] %endif movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w4_loop %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq sub r5d, 1<<16 ; horizontal-- jg .v_w4_loop0 %endif RET %if ARCH_X86_64 .v_w8: lea r6d, [wq*8-64] mov r5, srcq mov r8, tmpq lea r6d, [hq+r6*4] .v_w8_loop0: movq m1, [srcq+strideq*0] movq m2, [srcq+strideq*1] movq m3, [srcq+strideq*2] add srcq, stride3q movq m4, [srcq+strideq*0] movq m5, [srcq+strideq*1] movq m6, [srcq+strideq*2] add srcq, stride3q movq m0, [srcq+strideq*0] punpcklbw m1, m2 ; 01 punpcklbw m2, m3 ; 12 punpcklbw m3, m4 ; 23 punpcklbw m4, m5 ; 34 punpcklbw m5, m6 ; 45 punpcklbw m6, m0 ; 56 .v_w8_loop: movq m13, [srcq+strideq*1] lea srcq, [srcq+strideq*2] %if cpuflag(ssse3) pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 mova m2, m4 pmaddubsw m3, subpel1 ; a1 pmaddubsw m4, subpel1 ; b1 paddw m14, m3 paddw m15, m4 mova m3, m5 mova m4, m6 pmaddubsw m5, subpel2 ; a2 pmaddubsw m6, subpel2 ; b2 punpcklbw m12, m0, m13 ; 67 movq m0, [srcq+strideq*0] punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 pmaddubsw m12, subpel3 ; a3 paddw m15, m6 mova m6, m13 pmaddubsw m13, subpel3 ; b3 paddw m14, m12 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw 
m15, m7 %else mova m14, m1 PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 mova m15, m2 PMADDUBSW m15, subpel0, m7, m12, 0 ; b0 mova m1, m3 PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 mova m2, m4 PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 paddw m14, m3 mova m3, m5 PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 paddw m15, m4 mova m4, m6 PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 paddw m15, m6 punpcklbw m12, m0, m13 ; 67 movq m0, [srcq+strideq*0] punpcklbw m13, m0 ; 78 paddw m14, m5 mova m5, m12 PMADDUBSW m12, subpel3, m7, m6, 0 ; a3 paddw m14, m12 mova m6, m13 PMADDUBSW m13, subpel3, m7, m12, 0 ; b3 paddw m15, m13 PMULHRSW_8192 m14, m14, [base+pw_2] PMULHRSW_8192 m15, m15, [base+pw_2] %endif movu [tmpq+wq*0], m14 movu [tmpq+wq*2], m15 lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop add r5, 8 add r8, 16 movzx hd, r6b mov srcq, r5 mov tmpq, r8 sub r6d, 1<<8 jg .v_w8_loop0 RET %endif ;ARCH_X86_64 %undef subpel0 %undef subpel1 %undef subpel2 %undef subpel3 .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 and mxd, 0x7f movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 lea r5, [strideq*3+1] sub srcq, r5 %define subpelv0 [rsp+mmsize*0] %define subpelv1 [rsp+mmsize*1] %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 psraw m0, 8 pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 mova subpelv1, m6 pshufd m6, m0, q2222 mova subpelv2, m6 pshufd m6, m0, q3333 mova subpelv3, m6 %else movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) ALLOC_STACK mmsize*14, 14 %else ALLOC_STACK mmsize*14, 16 %endif lea stride3q, [strideq*3] sub srcq, stride3q dec srcq %define subpelv0 m10 %define subpelv1 m11 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 psraw m0, 8 %if cpuflag(ssse3) mova m8, [base+pw_8192] %else mova m8, [base+pw_2] %endif mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 pshufd m12, m0, q2222 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 %if notcpuflag(ssse3) punpcklbw m7, m7 psraw m7, 8 %endif %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 %define hv4_line_0_3 7 %define hv4_line_0_4 8 %define hv4_line_0_5 9 %define hv4_line_1_0 10 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 %if ARCH_X86_32 %if cpuflag(ssse3) %define w8192reg [base+pw_8192] %else %define w8192reg [base+pw_2] %endif %define d32reg [base+pd_32] %else %define w8192reg m8 %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %else %if ARCH_X86_64 mova m15, [pw_1] %else %define m15 m1 %endif %endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 2 _ _ _ movhps m4, [srcq+strideq*1] ; 2 _ 3 _ lea srcq, [srcq+strideq*2] %else movq m4, [srcq+strideq*2] ; 2 _ _ _ movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 PMULHRSW_8192 m2, m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 %if cpuflag(ssse3) mova m6, 
[base+subpel_h_shuf4+16] %endif PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 PMULHRSW_8192 m2, m2, w8192reg %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m2 %else mova [esp+mmsize*4], m2 %endif %endif ; lower shuffle %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m4, [srcq+strideq*0] ; 6 _ _ _ add srcq, strideq %else movq m4, [srcq+strideq*2] ; 6 _ _ _ add srcq, stride3q %endif PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 PMULHRSW_8192 m3, m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 PMULHRSW_8192 m3, m3, w8192reg %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m2, m14 %else mova m2, [esp+mmsize*4] %endif %endif ;process high PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 punpcklwd m3, m0 ; V 45 56 .hv_w4_loop: ;process low pmaddwd m5, m1, subpelv0 ; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m5 %else mova [esp+mmsize*4], m5 %define m15 m3 %endif %endif %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] %endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 PMULHRSW_8192 m4, m4, w8192reg PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m5, m14 %else mova m5, [esp+mmsize*4] %endif %endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 SAVELINE_W4 m0, 0, 0 SAVELINE_W4 m1, 1, 0 SAVELINE_W4 m2, 2, 0 SAVELINE_W4 m3, 3, 0 SAVELINE_W4 m5, 5, 0 ;process high RESTORELINE_W4 m0, 0, 1 RESTORELINE_W4 m1, 1, 1 RESTORELINE_W4 m2, 2, 1 RESTORELINE_W4 m3, 3, 1 pmaddwd m5, m1, subpelv0; V a0 b0 mova m1, m2 pmaddwd m2, subpelv1; V a1 b1 paddd m5, m2 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m14, m5 %else mova [esp+0xA0], m5 %endif %endif %if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] %endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 PMULHRSW_8192 m4, 
m4, w8192reg PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 %if notcpuflag(ssse3) %if ARCH_X86_64 SWAP m5, m14 %else mova m5, [esp+0xA0] %endif %endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 RESTORELINE_W4 m5, 5, 0 packssdw m5, m4 pshufd m5, m5, q3120 movu [tmpq], m5 lea srcq, [srcq+strideq*2] add tmpq, 16 sub hd, 2 SAVELINE_W4 m0, 0, 1 SAVELINE_W4 m1, 1, 1 SAVELINE_W4 m2, 2, 1 SAVELINE_W4 m3, 3, 1 RESTORELINE_W4 m0, 0, 0 RESTORELINE_W4 m1, 1, 0 RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 jg .hv_w4_loop RET %undef subpelv0 %undef subpelv1 %undef subpelv2 %undef subpelv3 .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 %define hv8_line_3 2 %define hv8_line_4 3 %define hv8_line_6 4 shr mxd, 16 %if ARCH_X86_32 %define subpelh0 [rsp+mmsize*5] %define subpelh1 [rsp+mmsize*6] %define subpelv0 [rsp+mmsize*7] %define subpelv1 [rsp+mmsize*8] %define subpelv2 [rsp+mmsize*9] %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov strideq, stridem %assign regs_used 6 ALLOC_STACK -mmsize*14 %assign regs_used 7 %if STACK_ALIGNMENT < mmsize %define tmpm [rsp+mmsize*13+gprsize*1] %define srcm [rsp+mmsize*13+gprsize*2] %define stridem [rsp+mmsize*13+gprsize*3] mov tmpm, tmpq mov stridem, strideq %endif %if cpuflag(ssse3) pshufd m0, m1, q0000 pshufd m1, m1, q1111 %else punpcklbw m1, m1 psraw m1, 8 pshufd m0, m1, q1010 punpckhqdq m1, m1 %endif punpcklbw m5, m5 psraw m5, 8 pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 pshufd m5, m5, q3333 mova subpelh0, m0 mova subpelh1, m1 mova subpelv0, m2 mova subpelv1, m3 mova subpelv2, m4 mova subpelv3, m5 lea r5, [strideq*3+3] sub srcq, r5 mov srcm, srcq %else ALLOC_STACK mmsize*5, 16 %define subpelh0 m10 %define subpelh1 m11 %define subpelv0 m12 %define subpelv1 m13 %define subpelv2 m14 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] %if cpuflag(ssse3) pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 %else punpcklbw m0, m0 psraw m0, 8 pshufd subpelh0, m0, q1010 pshufd subpelh1, m0, q3232 mova m7, [base+pw_2] %endif punpcklbw m1, m1 psraw m1, 8 pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q mov r6, srcq mov r8, tmpq %endif lea r5d, [wq-4] shl r5d, 14 add r5d, hd .hv_w8_loop0: %if cpuflag(ssse3) %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] %define shufA m7 %define shufB m8 %define shufC m9 %else %define shufA [base+subpel_h_shufA] %define shufB [base+subpel_h_shufB] %define shufC [base+subpel_h_shufC] %endif %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 %if ARCH_X86_64 PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 add srcq, stride3q PREP_8TAP_HV m0, srcq+strideq*0, m7, m9 %else lea srcq, [srcq+strideq*2] %if notcpuflag(ssse3) mova [esp], m4 %endif PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 lea srcq, [srcq+strideq*2] %endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %else mova m7, [base+pw_2] %if ARCH_X86_32 mova 
m4, [esp] %endif %endif PMULHRSW_8192 m4, m4, m7 PMULHRSW_8192 m5, m5, m7 PMULHRSW_8192 m6, m6, m7 PMULHRSW_8192 m0, m0, m7 punpcklwd m1, m4, m5 ; 01 punpcklwd m2, m5, m6 ; 12 punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] %endif %if ARCH_X86_64 PREP_8TAP_HV m4, srcq+strideq*1, m8, m9 PREP_8TAP_HV m5, srcq+strideq*2, m8, m9 add srcq, stride3q PREP_8TAP_HV m6, srcq+strideq*0, m8, m9 %else %if notcpuflag(ssse3) mova [esp+0x30], m0 %endif PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m6, srcq+strideq*0, m7, m0 %endif %if cpuflag(ssse3) mova m7, [base+pw_8192] %elif ARCH_X86_32 mova m0, [esp+0x30] mova m7, [base+pw_2] %endif PMULHRSW_8192 m1, m4, m7 PMULHRSW_8192 m2, m5, m7 PMULHRSW_8192 m3, m6, m7 punpcklwd m4, m0, m1 ; 34 punpcklwd m5, m1, m2 ; 45 punpcklwd m6, m2, m3 ; 56 SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 SAVELINE_W8 4, m6 %if ARCH_X86_32 pmaddwd m0, m1, subpelv0 ; a0 pmaddwd m7, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd m0, m3 paddd m7, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_32] paddd m0, m5 paddd m7, m5 mova accuv0, m0 mova accuv1, m7 %else pmaddwd accuv0, m1, subpelv0 ; a0 pmaddwd accuv1, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 paddd accuv0, m3 paddd accuv1, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 paddd accuv0, m5 paddd accuv1, m6 mova m7, [base+pd_32] paddd accuv0, m7 paddd accuv1, m7 %if cpuflag(ssse3) mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] %define shufA m5 %define shufB m7 %define shufC m6 %endif %endif PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 lea srcq, [srcq+strideq*2] PREP_8TAP_HV m4, srcq+strideq*0, m5, m6 %if cpuflag(ssse3) mova m5, [base+pw_8192] %else mova m5, [base+pw_2] %endif PMULHRSW_8192 m0, m0, m5 PMULHRSW_8192 m4, m4, m5 RESTORELINE_W8 6, m6 punpcklwd m5, m6, m0 ; 67 punpcklwd m6, m0, m4 ; 78 pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 paddd m1, m1, accuv1 psrad m2, 6 psrad m1, 6 packssdw m2, m1 movq [tmpq+wq*0], m2 movhps [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] sub hd, 2 jle .hv_w8_outer SAVELINE_W8 6, m4 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 RESTORELINE_W8 4, m4 jmp .hv_w8_loop .hv_w8_outer: %if ARCH_X86_32 mov srcq, srcm mov tmpq, tmpm movzx hd, r5w add srcq, 4 add tmpq, 8 mov srcm, srcq mov tmpm, tmpq %else add r6, 4 add r8, 8 movzx hd, r5b mov srcq, r6 mov tmpq, r8 %endif sub r5d, 1<<16 jg .hv_w8_loop0 RET %endmacro %macro movifprep 2 %if isprep mov %1, %2 %endif %endmacro %macro SAVE_REG 1 %xdefine r%1_save r%1 %xdefine r%1q_save r%1q %xdefine r%1d_save r%1d %if ARCH_X86_32 %define r%1m_save [rstk+stack_offset+(%1+1)*4] %endif %endmacro %macro LOAD_REG 1 %xdefine r%1 r%1_save %xdefine r%1q r%1q_save %xdefine r%1d r%1d_save %if ARCH_X86_32 %define r%1m r%1m_save %endif %undef r%1d_save %undef r%1q_save %undef r%1_save %endmacro %macro REMAP_REG 2-3 %xdefine r%1 r%2 %xdefine r%1q r%2q %xdefine r%1d r%2d %if ARCH_X86_32 %if %3 == 0 %xdefine r%1m r%2m %else %define r%1m [rstk+stack_offset+(%1+1)*4] %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 %if isprep %if ARCH_X86_64 SAVE_REG 14 %assign %%i 14 %rep 14 %assign %%j %%i-1 REMAP_REG 
%%i, %%j %assign %%i %%i-1 %endrep %else SAVE_REG 5 %assign %%i 5 %rep 5 %assign %%j %%i-1 REMAP_REG %%i, %%j, 0 %assign %%i %%i-1 %endrep %endif %endif %endmacro %macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 %if isprep %assign %%i 1 %if ARCH_X86_64 %rep 13 %assign %%j %%i+1 REMAP_REG %%i, %%j %assign %%i %%i+1 %endrep LOAD_REG 14 %else %rep 4 %assign %%j %%i+1 REMAP_REG %%i, %%j, 1 %assign %%i %%i+1 %endrep LOAD_REG 5 %endif %endif %endmacro %macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT RET %if %1 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %endif %endmacro %if ARCH_X86_64 %macro MC_8TAP_SCALED_H 12 ; dst[0-1], tmp[0-5], weights[0-3] SWAP m%2, m%5 movq m%1, [srcq+ r4] movq m%2, [srcq+ r6] movhps m%1, [srcq+ r7] movhps m%2, [srcq+ r9] movq m%3, [srcq+r10] movq m%4, [srcq+r11] movhps m%3, [srcq+r13] movhps m%4, [srcq+ rX] add srcq, ssq movq m%5, [srcq+ r4] movq m%6, [srcq+ r6] movhps m%5, [srcq+ r7] movhps m%6, [srcq+ r9] movq m%7, [srcq+r10] movq m%8, [srcq+r11] movhps m%7, [srcq+r13] movhps m%8, [srcq+ rX] add srcq, ssq pmaddubsw m%1, m%9 pmaddubsw m%5, m%9 pmaddubsw m%2, m%10 pmaddubsw m%6, m%10 pmaddubsw m%3, m%11 pmaddubsw m%7, m%11 pmaddubsw m%4, m%12 pmaddubsw m%8, m%12 phaddw m%1, m%2 phaddw m%5, m%6 phaddw m%3, m%4 phaddw m%7, m%8 phaddw m%1, m%3 phaddw m%5, m%7 pmulhrsw m%1, m12 pmulhrsw m%5, m12 SWAP m%2, m%5 %endmacro %else %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem_start, load_fh_offsets %if %3 == 1 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] %endif movq m0, [srcq+r0] movq m1, [srcq+rX] movhps m0, [srcq+r4] movhps m1, [srcq+r5] add srcq, ssq movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] sub srcq, ssq movq m2, [srcq+r0] movq m3, [srcq+rX] movhps m2, [srcq+r4] movhps m3, [srcq+r5] add srcq, ssq movq m6, [srcq+r0] movq m7, [srcq+rX] movhps m6, [srcq+r4] movhps m7, [srcq+r5] add srcq, ssq pmaddubsw m0, [esp+%1+ 0] pmaddubsw m4, [esp+%1+ 0] pmaddubsw m1, [esp+%1+16] pmaddubsw m5, [esp+%1+16] pmaddubsw m2, [esp+%1+32] pmaddubsw m6, [esp+%1+32] pmaddubsw m3, [esp+%1+48] pmaddubsw m7, [esp+%1+48] phaddw m0, m1 phaddw m4, m5 phaddw m2, m3 phaddw m6, m7 phaddw m0, m2 phaddw m4, m6 pmulhrsw m0, m12 pmulhrsw m4, m12 %if %2 != 0 mova [esp+%2+ 0], m0 mova [esp+%2+16], m4 %endif %endmacro %endif %macro MC_8TAP_SCALED 1 %ifidn %1, put %assign isprep 0 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 2, 15, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 2, 14, 16, 0x180, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal put_8tap_scaled_8bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy %else cglobal put_8tap_scaled_8bpc, 0, 7, 8, -0x200-0x20, dst, ds, src, ss, w, h, mx, my, dx, dy %endif %endif %xdefine base_reg r12 %define rndshift 10 %else ; prep %assign isprep 1 %if ARCH_X86_64 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_8bpc, 2, 15, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %xdefine tmp_stridem r14q %else cglobal prep_8tap_scaled_8bpc, 2, 14, 16, 0x180, tmp, src, ss, w, h, mx, my, dx, dy %define tmp_stridem qword [rsp+0x138] %endif %xdefine base_reg r11 %else ; ARCH_X86_32 %if required_stack_alignment <= STACK_ALIGNMENT cglobal prep_8tap_scaled_8bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %else cglobal 
prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy %endif %define tmp_stridem dword [esp+0x138] %endif %define rndshift 6 %endif %if ARCH_X86_32 mov [esp+0x1f0], t0d mov [esp+0x1f4], t1d %if !isprep && required_stack_alignment > STACK_ALIGNMENT mov dstd, dstm mov dsd, dsm mov srcd, srcm mov ssd, ssm mov hd, hm mov r4, mxm %define r0m [esp+0x200] %define dsm [esp+0x204] %define dsmp dsm %define r1m dsm %define r2m [esp+0x208] %define ssm [esp+0x20c] %define r3m ssm %define hm [esp+0x210] %define mxm [esp+0x214] mov r0m, dstd mov dsm, dsd mov r2m, srcd mov ssm, ssd mov hm, hd mov r0, mym mov r1, dxm mov r2, dym %define mym [esp+0x218] %define dxm [esp+0x09c] %define dym [esp+0x21c] mov mxm, r4 mov mym, r0 mov dxm, r1 mov dym, r2 tzcnt wd, wm %endif %if isprep && required_stack_alignment > STACK_ALIGNMENT %xdefine base_reg r5 %else %xdefine base_reg r6 %endif mov ssd, ssm %endif LEA base_reg, %1_8tap_scaled_8bpc_ssse3 %xdefine base base_reg-%1_8tap_scaled_8bpc_ssse3 %if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT tzcnt wd, wm %endif %if ARCH_X86_32 %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 %endif movd m8, dxm movd m14, mxm pshufd m8, m8, q0000 pshufd m14, m14, q0000 %if isprep && UNIX64 mov r5d, t0d DECLARE_REG_TMP 5, 7 %endif %if ARCH_X86_64 mov dyd, dym %endif %ifidn %1, put %if WIN64 mov r8d, hm DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 %define hm r5m %define dxm r8m %elif ARCH_X86_64 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 %define hm r6m %endif %if ARCH_X86_64 %if required_stack_alignment > STACK_ALIGNMENT %define dsm [rsp+0x138] %define rX r1 %define rXd r1d %else %define dsm dsq %define rX r14 %define rXd r14d %endif %else %define rX r1 %endif %else ; prep %if WIN64 mov r7d, hm DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 %define hm r4m %define dxm r7m %elif ARCH_X86_64 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 %define hm [rsp+0x94] %endif MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %if ARCH_X86_64 %define rX r14 %define rXd r14d %else %define rX r3 %endif %endif %if ARCH_X86_64 mova m10, [base+pd_0x3ff] mova m12, [base+pw_8192] %ifidn %1, put mova m13, [base+pd_512] %else mova m13, [base+pd_32] %endif %else %define m10 [base+pd_0x3ff] %define m12 [base+pw_8192] %ifidn %1, put %define m13 [base+pd_512] %else %define m13 [base+pd_32] %endif %endif pxor m9, m9 %if ARCH_X86_64 lea ss3q, [ssq*3] movzx r7d, t1b shr t1d, 16 cmp hd, 6 cmovs t1d, r7d sub srcq, ss3q %else MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT mov r1, [esp+0x1f4] lea r0, [ssq*3] movzx r2, r1b shr r1, 16 cmp dword hm, 6 cmovs r1, r2 mov [esp+0x1f4], r1 mov r1, r1m mov r2, r2m sub srcq, r0 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV %define ss3q r0 %define myd r4 %define dyd dword dym %define hd dword hm %endif cmp dyd, 1024 je .dy1 cmp dyd, 2048 je .dy2 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, 
[base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [rsp+0x180], m14 SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 pshufb m14, m5 paddb m14, m6 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] movhps m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 %else pand m7, m8, m11 pandn m8, m15 %define m8 m6 %define m15 m5 por m15, m7 mova [rsp+0x190], m15 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 1 2 3 pmulhrsw m1, m12 ; 4 5 6 7 palignr m2, m1, m0, 4 ; 1 2 3 4 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 pshufd m5, m1, q0321 ; 5 6 7 _ punpcklwd m2, m1, m5 ; 45 56 punpckhwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mov myd, mym mov r0, r0m mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif .w2_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 pmaddwd m8, m4, m11 paddd m5, m6 paddd m7, m8 %else mov mym, myd mov r1, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r1, [r1+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r1*8+0] cmovnz r3, [base+subpel_filters+r1*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m5, m7, q0000 pshufd m6, m7, q1111 pmaddwd m3, m5 pmaddwd m0, m6 pshufd m5, m7, q2222 pshufd m7, m7, q3333 pmaddwd m2, m5 pmaddwd m4, m7 paddd m3, m0 paddd m2, m4 SWAP m5, m3 SWAP m7, m2 %endif paddd m5, m13 paddd m5, m7 psrad m5, 10 packssdw m5, m5 packuswb m5, m5 %if ARCH_X86_64 pextrw r6d, m5, 0 mov [dstq], r6w add dstq, dsq dec hd jz .ret add myd, dyd %else pextrw r3d, m5, 0 mov [dstq], r3w add dstq, dsm dec hd jz .ret mov myd, mym add myd, dym %endif test myd, ~0x3ff %if ARCH_X86_32 SWAP m3, m5 SWAP m2, m7 mova m3, [rsp+0x1a0] mova m0, [rsp+0x1b0] mova m2, [rsp+0x1c0] mova m4, [rsp+0x1d0] %define m14 [esp+0x180] %define m15 [esp+0x190] %endif jz .w2_loop %if ARCH_X86_32 mov r3, r3m %endif movq m5, [srcq] test myd, 0x400 jz .w2_skip_line add srcq, ssq shufps m3, m0, q1032 ; 01 12 shufps m0, m2, q1032 ; 23 34 shufps m2, m4, q1032 ; 45 56 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 palignr m4, m5, m1, 12 punpcklqdq m1, m4, m4 ; 6 7 6 7 punpcklwd m4, m1, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop .w2_skip_line: movhps m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m3, m0 ; 01 12 mova m0, m2 ; 23 34 pshufb m5, m14 pmaddubsw m5, m15 phaddw m5, m5 pmulhrsw m5, m12 ; 6 7 6 7 palignr m4, m5, m1, 8 ; 4 5 6 7 pshufd m5, m4, q0321 ; 5 6 7 _ mova m1, m4 punpcklwd m2, m4, m5 ; 45 56 punpckhwd m4, m5 ; 67 __ %if ARCH_X86_32 mova [rsp+0x1a0], m3 mova [rsp+0x1b0], m0 mova [rsp+0x1c0], m2 mova [rsp+0x1d0], m4 %endif jmp .w2_loop %endif INIT_XMM ssse3 .w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 
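; Scaled .w4 (as set up below): with horizontal scaling each of the 4
; output columns has its own subpel position, so mx+dx*[0-3] is built via
; rescale_mul. Each column's phase ((pos & 0x3ff) >> 6) selects a 4-tap
; filter from subpel_filters (+2 skips the outer taps), while the integer
; part (pos >> 10) is turned into a per-column gather shuffle through
; bdct_lb_dw + subpel_s_shuf2 before the usual pmaddubsw/phaddw pass.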
movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m0, m14, m10 psrld m0, 6 paddd m15, m0 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] %else movd r0, m15 movd rX, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 movd m1, [base+subpel_filters+r0*8+2] movd m2, [base+subpel_filters+rX*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m7, [base+subpel_filters+r5*8+2] movifprep r3, r3m SWAP m4, m7 %define m15 m1 %endif mova m5, [base+bdct_lb_dw] movq m6, [base+subpel_s_shuf2] psrld m14, 10 punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 punpcklqdq m6, m6 pshufb m14, m5 paddb m14, m6 %if ARCH_X86_64 pcmpeqd m0, m9 pand m11, m0 %else mova [esp+0x180], m14 SWAP m7, m4 pxor m3, m3 pcmpeqd m0, m3 pand m2, m11, m0 %define m11 m2 %endif pandn m0, m15 %if ARCH_X86_64 SWAP m15, m0 %else %define m15 m0 %endif por m15, m11 %if ARCH_X86_64 movu m7, [srcq+ssq*0] movu m9, [srcq+ssq*1] movu m8, [srcq+ssq*2] movu m10, [srcq+ss3q ] lea srcq, [srcq+ssq*4] movu m2, [srcq+ssq*0] movu m4, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m5, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m7, m14 pshufb m9, m14 pshufb m8, m14 pshufb m10, m14 pshufb m2, m14 pshufb m4, m14 pshufb m3, m14 pshufb m5, m14 pmaddubsw m7, m15 pmaddubsw m9, m15 pmaddubsw m8, m15 pmaddubsw m10, m15 pmaddubsw m2, m15 pmaddubsw m4, m15 pmaddubsw m3, m15 pmaddubsw m5, m15 phaddw m7, m9 phaddw m8, m10 phaddw m9, m2, m4 phaddw m3, m5 pmulhrsw m7, m12 ; 0 1 pmulhrsw m8, m12 ; 2 3 pmulhrsw m9, m12 ; 4 5 pmulhrsw m3, m12 ; 6 7 shufps m4, m7, m8, q1032 ; 1 2 shufps m5, m8, m9, q1032 ; 3 4 shufps m6, m9, m3, q1032 ; 5 6 psrldq m11, m3, 8 ; 7 _ punpcklwd m0, m7, m4 ; 01 punpckhwd m7, m4 ; 12 punpcklwd m1, m8, m5 ; 23 punpckhwd m8, m5 ; 34 punpcklwd m2, m9, m6 ; 45 punpckhwd m9, m6 ; 56 punpcklwd m3, m11 ; 67 mova [rsp+0x00], m7 mova [rsp+0x10], m8 mova [rsp+0x20], m9 %else mova [esp+0x190], m15 lea ss3q, [ssq*3] movu m2, [srcq+ssq*0] movu m3, [srcq+ssq*1] movu m7, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m2, m14 pshufb m3, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m2, m15 pmaddubsw m3, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m2, m3 phaddw m7, m6 movu m1, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m6, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m1, m14 pshufb m5, m14 pshufb m3, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m5, m15 pmaddubsw m3, m15 pmaddubsw m6, m15 phaddw m1, m5 phaddw m3, m6 pmulhrsw m2, m12 pmulhrsw m7, m12 pmulhrsw m1, m12 pmulhrsw m3, m12 shufps m4, m2, m7, q1032 ; 1 2 shufps m5, m7, m1, q1032 ; 3 4 shufps m6, m1, m3, q1032 ; 5 6 psrldq m0, m3, 8 ; 7 _ mova [esp+0x1a0], m0 %define m11 [esp+0x1a0] punpcklwd m0, m2, m4 ; 01 punpckhwd m2, m4 ; 12 punpcklwd m4, m7, m5 ; 23 punpckhwd m7, m5 ; 34 punpcklwd m5, m1, m6 ; 45 punpckhwd m1, m6 ; 56 punpcklwd m3, [esp+0x1a0] ; 67 mov myd, mym mov r0, r0m mova [esp+0x1b0], m0 ; 01 mova [esp+0x1c0], m4 ; 23 mova [esp+0x1d0], m5 ; 45 mova [esp+0x1e0], m3 ; 67 mova [rsp+0x00], m2 ; 12 mova [rsp+0x10], m7 ; 34 mova [rsp+0x20], m1 ; 56 SWAP m1, m4 SWAP m2, m5 %endif .w4_loop: and myd, 0x3ff %if ARCH_X86_64 mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea 
r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m10, r6q punpcklbw m10, m10 psraw m10, 8 pshufd m7, m10, q0000 pshufd m8, m10, q1111 pshufd m9, m10, q2222 pshufd m10, m10, q3333 pmaddwd m4, m0, m7 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m10 paddd m4, m5 paddd m6, m7 paddd m4, m13 paddd m4, m6 %else mov mym, myd mov r5, [esp+0x1f4] xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pshufd m6, m7, q2222 pshufd m7, m7, q3333 pmaddwd m0, m4 pmaddwd m1, m5 pmaddwd m2, m6 pmaddwd m3, m7 paddd m0, m1 paddd m2, m3 paddd m0, m13 paddd m0, m2 SWAP m4, m0 %endif psrad m4, rndshift packssdw m4, m4 %ifidn %1, put packuswb m4, m4 movd [dstq], m4 add dstq, dsmp %else movq [tmpq], m4 add tmpq, 8 %endif dec hd jz .ret %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .w4_loop %else SWAP m0, m4 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff jnz .w4_next_line mova m0, [esp+0x1b0] mova m1, [esp+0x1c0] mova m2, [esp+0x1d0] mova m3, [esp+0x1e0] jmp .w4_loop .w4_next_line: %define m14 [esp+0x180] %define m15 [esp+0x190] %endif movu m4, [srcq] test myd, 0x400 jz .w4_skip_line %if ARCH_X86_64 mova m0, [rsp+0x00] mova [rsp+0x00], m1 mova m1, [rsp+0x10] mova [rsp+0x10], m2 mova m2, [rsp+0x20] mova [rsp+0x20], m3 %else mova m5, [esp+0x1c0] mova m0, [rsp+0x000] mova [rsp+0x00], m5 mova [esp+0x1b0], m0 mova m6, [esp+0x1d0] mova m1, [rsp+0x010] mova [rsp+0x10], m6 mova [esp+0x1c0], m1 mova m7, [esp+0x1e0] mova m2, [rsp+0x020] mova [rsp+0x20], m7 mova [esp+0x1d0], m2 %endif pshufb m4, m14 pmaddubsw m4, m15 phaddw m4, m4 pmulhrsw m4, m12 punpcklwd m3, m11, m4 %if ARCH_X86_32 mova [esp+0x1e0], m3 %endif mova m11, m4 add srcq, ssq jmp .w4_loop .w4_skip_line: %if ARCH_X86_32 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] mova m2, [esp+0x1e0] %endif movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] mova m6, [rsp+0x10] mova m7, [rsp+0x20] pshufb m4, m14 pshufb m5, m14 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m4, m5 pmulhrsw m4, m12 punpcklwd m5, m11, m4 mova [rsp+0x00], m6 mova [rsp+0x10], m7 mova [rsp+0x20], m5 %if ARCH_X86_64 psrldq m11, m4, 8 mova m0, m1 mova m1, m2 mova m2, m3 punpcklwd m3, m4, m11 %else psrldq m6, m4, 8 punpcklwd m3, m4, m6 mova [esp+0x1a0], m6 mova [esp+0x1b0], m0 mova [esp+0x1c0], m1 mova [esp+0x1d0], m2 mova [esp+0x1e0], m3 %endif jmp .w4_loop INIT_XMM ssse3 .w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .w_start: %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 movd m15, t0d %else %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq ssm %endif mov r4, [esp+0x1f0] shr r4, 16 movd m15, r4 mov r0, r0m mov myd, mym %endif sub srcq, 3 pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 mov r5, hm mov [esp+0x094], myd mov [esp+0x134], r5 %endif jmp .hloop .hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm 
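; w8..w128 strip loop: the block is processed as [rsp+0x90] strips of 8
; columns. each .hloop_prep pass restores srcq from [rsp+0x098], advances the
; dst/tmp pointer saved at [rsp+0x130] by 8 pixels (8*(isprep+1) bytes), and
; rebuilds the per-column x positions for the next strip from the saved state
; plus dx*4 ([rsp+0x100]).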
%else add dword [esp+0x130], 8*(isprep+1) mov myd, [esp+0x094] mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %endif mova m15, [rsp+0x120] pxor m9, m9 mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov mym, myd mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .hloop: %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m14, m5, q1100 pshufd m5, m5, q3322 pand m7, m11, m4 pand m8, m11, m6 pand m15, m11, m14 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m14, m2 pandn m5, m3 por m7, m4 por m8, m6 por m15, m14 por m11, m5 mova [rsp+0x10], m7 mova [rsp+0x20], m8 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10, 7, 8, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10, 7, 8, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 9, 10, 7, 8, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m9, [rsp+0x80] mov myd, mym mov dyd, dym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m9 ; 23a punpckhwd m3, m9 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 SWAP m14, m8 .vloop: and myd, 0x3ff mov r6d, 64 << 24 mov r4d, myd shr r4d, 6 lea r4d, [t1+r4] cmovnz r6q, [base+subpel_filters+r4*8] movq m11, r6q punpcklbw m11, m11 psraw m11, 8 pshufd m5, m11, q0000 pshufd m7, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 pmaddwd m4, m5, m0 pmaddwd m5, m5, m1 pmaddwd m6, m7, m2 pmaddwd m7, m7, m3 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 pmaddwd m8, [rsp+0x70], m11 pmaddwd m9, [rsp+0x80], m11 paddd m4, m6 paddd m5, m7 paddd m4, m8 paddd m5, m9 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, 
[base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x140, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x160 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x180 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 6-7 mova m5, [esp+0x180] mova m6, [esp+0x190] mova m7, [esp+0x1a0] mova m0, [esp+0x1b0] mov myd, mym punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x180], m4 mova [esp+0x190], m5 mova [esp+0x1a0], m6 mova [esp+0x1b0], m7 mova m1, [esp+0x140] mova m2, [esp+0x150] mova m3, [esp+0x160] mova m4, [esp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x140], m0 mova [esp+0x150], m1 mova [esp+0x160], m2 mova [esp+0x170], m3 .vloop: mov r0, r0m mov r5, [esp+0x1f4] and myd, 0x3ff mov mym, myd xor r3, r3 shr r4, 6 lea r5, [r5+r4] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] movd m7, r4 movd m6, r3 punpckldq m7, m6 punpcklbw m7, m7 psraw m7, 8 pshufd m4, m7, q0000 pshufd m5, m7, q1111 pmaddwd m0, m4 pmaddwd m1, m4 pmaddwd m2, m5 pmaddwd m3, m5 pshufd m6, m7, q2222 pshufd m7, m7, q3333 paddd m0, m2 paddd m1, m3 pmaddwd m2, [esp+0x180], m6 pmaddwd m3, [esp+0x190], m6 pmaddwd m4, [esp+0x1a0], m7 pmaddwd m5, [esp+0x1b0], m7 paddd m0, m2 paddd m1, m3 paddd m0, m13 paddd m1, m13 paddd m4, m0 paddd m5, m1 %endif psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif dec hd jz .hloop_prep %if ARCH_X86_64 add myd, dyd test myd, ~0x3ff jz .vloop test myd, 0x400 mov [rsp+0x140], myd mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] jz .skip_line mova m14, [base+unpckw] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ rX] movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] add srcq, ssq mov myd, [rsp+0x140] mov dyd, dym pshufd m9, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m9 ; 3a 2a pshufb m3, m9 ; 3b 2b pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] phaddw m6, m7 phaddw m4, m5 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x50], m14 ; 4a 5a pshufb m6, [rsp+0x60], m14 ; 4b 5b pshufb m7, [rsp+0x70], m9 ; 7a 6a pshufb m8, [rsp+0x80], m9 ; 7b 6b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m5 ; 34a punpcklwd m3, m6 ; 34b punpckhwd m5, m7 ; 56a punpckhwd m6, m8 ; 56b punpcklwd m7, m4 ; 78a punpckhqdq m4, m4 punpcklwd m8, m4 ; 78b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m8 jmp .vloop .skip_line: mova m0, [rsp+0x10] mova m1, [rsp+0x20] mova m14, [rsp+0x30] mova m15, [rsp+0x40] MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11, 0, 1, 14, 15 mov myd, [rsp+0x140] mov dyd, dym mova m0, m2 ; 01a mova m1, m3 ; 01b mova m2, [rsp+0x50] ; 23a mova m3, [rsp+0x60] ; 23b mova m5, [rsp+0x70] ; 45a mova m6, [rsp+0x80] ; 45b punpcklwd m7, m4, m8 
; 67a punpckhwd m4, m8 ; 67b mova [rsp+0x50], m5 mova [rsp+0x60], m6 mova [rsp+0x70], m7 mova [rsp+0x80], m4 %else mov r0m, r0 mov myd, mym mov r3, r3m add myd, dym test myd, ~0x3ff mov mym, myd jnz .next_line mova m0, [esp+0x140] mova m1, [esp+0x150] mova m2, [esp+0x160] mova m3, [esp+0x170] jmp .vloop .next_line: test myd, 0x400 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] jz .skip_line mova m6, [base+unpckw] mova m0, [esp+0x140] mova m1, [esp+0x150] mova m7, [esp+0x180] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x190] mova m2, [esp+0x160] phaddw m4, m3 mova m3, [esp+0x170] pmulhrsw m4, m12 ; 8a 8b mov myd, mym pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x140], m0 mova [esp+0x150], m1 mova m0, [esp+0x1a0] mova m1, [esp+0x1b0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x160], m2 mova [esp+0x170], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x180], m7 mova [esp+0x190], m5 mova [esp+0x1a0], m0 mova [esp+0x1b0], m1 mova m0, [esp+0x140] mova m1, [esp+0x150] jmp .vloop .skip_line: MC_8TAP_SCALED_H 0x20, 0x1c0, 0 mov myd, mym mova m0, [esp+0x160] mova m1, [esp+0x170] mova m2, [esp+0x180] mova m3, [esp+0x190] mova [esp+0x140], m0 mova [esp+0x150], m1 mova m4, [esp+0x1a0] mova m5, [esp+0x1b0] mova [esp+0x160], m2 mova [esp+0x170], m3 mova m6, [esp+0x1c0] mova m7, [esp+0x1d0] mova [esp+0x180], m4 mova [esp+0x190], m5 punpcklwd m4, m6, m7 punpckhwd m6, m7 mova [esp+0x1a0], m4 mova [esp+0x1b0], m6 %endif jmp .vloop INIT_XMM ssse3 .dy1: movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %else %define m11 [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m2, [srcq+ssq*2] movhps m0, [srcq+ssq*1] movhps m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4 %else mov myd, mym mov r5, [esp+0x1f4] xor r3, r3 shr myd, 6 lea r5, [r5+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r5*8+0] cmovnz r3, [base+subpel_filters+r5*8+4] %define m10 m4 movd 
m10, r4 movd m3, r3 mov r3, r3m punpckldq m10, m3 %endif movq m1, [srcq+ssq*0] movq m3, [srcq+ssq*2] movhps m1, [srcq+ssq*1] add srcq, ss3q punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r0, r0m pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova m11, m4 %endif pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 pmulhrsw m1, m12 palignr m2, m1, m0, 4 pshufd m4, m1, q2121 punpcklwd m3, m0, m2 ; 01 12 punpckhwd m0, m2 ; 23 34 punpcklwd m2, m1, m4 ; 45 56 .dy1_w2_loop: movq m1, [srcq+ssq*0] movhps m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pmaddwd m5, m3, m8 pmaddwd m6, m0, m9 pmaddwd m7, m2, m10 mova m3, m0 mova m0, m2 paddd m5, m13 paddd m6, m7 pshufb m1, m14 pmaddubsw m1, m15 phaddw m1, m1 pmulhrsw m1, m12 palignr m7, m1, m4, 12 punpcklwd m2, m7, m1 ; 67 78 pmaddwd m7, m2, m11 mova m4, m1 paddd m5, m6 paddd m5, m7 psrad m5, rndshift packssdw m5, m5 packuswb m5, m5 movd r4d, m5 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy1_w2_loop RET %endif INIT_XMM ssse3 .dy1_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] mov myd, mym mov rX, [esp+0x1f4] xor r5, r5 shr myd, 6 lea rX, [rX+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+rX*8+0] cmovnz r5, [base+subpel_filters+rX*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] %endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 movq m6, [base+subpel_s_shuf2] %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] movu m2, [srcq+ssq*2] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] movu m7, [srcq+ssq*2] add srcq, ss3q pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m10, r4q punpcklbw m10, m10 psraw m10, 8 pshufb m0, m14 pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pshufb m7, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw 
m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 pmaddubsw m7, m15 phaddw m0, m1 phaddw m2, m3 phaddw m4, m5 phaddw m6, m7, m7 pmulhrsw m0, m12 ; 0 1 pmulhrsw m2, m12 ; 2 3 pmulhrsw m4, m12 ; 4 5 pmulhrsw m6, m12 ; 6 _ shufps m1, m0, m2, q1032 ; 1 2 shufps m3, m2, m4, q1032 ; 3 4 shufps m5, m4, m6, q1032 ; 5 6 punpcklwd m7, m0, m1 ; 01 punpckhwd m0, m1 ; 12 punpcklwd m8, m2, m3 ; 23 punpckhwd m2, m3 ; 34 punpcklwd m9, m4, m5 ; 45 punpckhwd m4, m5 ; 56 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*1] movu m3, [srcq+ssq*2] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m5, m0 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova [esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 phaddw m1, m2 movu m2, [srcq+ss3q ] lea srcq, [srcq+ssq*4] mov r0, r0m phaddw m3, m0 pshufb m2, m14 pmaddubsw m2, m15 %define m14 [esp+0x00] %define m15 [esp+0x10] phaddw m7, m6 phaddw m2, m2 movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mova [esp+0x20], m6 pmulhrsw m1, m12 ; 0 1 pmulhrsw m3, m12 ; 2 3 pmulhrsw m7, m12 ; 4 5 pmulhrsw m2, m12 ; 6 _ shufps m0, m1, m3, q1032 ; 1 2 shufps m4, m3, m7, q1032 ; 3 4 shufps m5, m7, m2, q1032 ; 5 6 punpcklwd m6, m1, m0 ; 01 punpckhwd m1, m0 ; 12 mova [esp+0x30], m1 punpcklwd m1, m3, m4 ; 23 punpckhwd m3, m4 ; 34 mova [esp+0x40], m3 punpcklwd m3, m7, m5 ; 45 punpckhwd m7, m5 ; 56 mova [esp+0x50], m7 mova [esp+0x60], m2 mova m0, [esp+0x20] %xdefine m8 m1 %xdefine m9 m3 %xdefine m10 m0 SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 %endif pshufd m1, m10, q0000 pshufd m3, m10, q1111 pshufd m5, m10, q2222 pshufd m10, m10, q3333 %if ARCH_X86_64 mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova [esp+0x70], m8 mova [esp+0x80], m9 mova [esp+0x90], m1 mova [esp+0xa0], m3 mova [esp+0xb0], m5 mova [esp+0xc0], m10 %ifidn %1, put mov dsd, dsm %endif %define m11 m6 %endif .dy1_w4_loop: %if ARCH_X86_64 movu m11, [srcq+ssq*0] pmaddwd m7, m1 pmaddwd m8, m3 pmaddwd m0, m1 pmaddwd m2, m3 pmaddwd m9, m5 pmaddwd m4, m5 paddd m7, m8 paddd m0, m2 movu m8, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m11, m14 pmaddubsw m11, m15 paddd m7, m13 paddd m0, m13 paddd m7, m9 paddd m0, m4 pshufb m8, m14 pmaddubsw m8, m15 phaddw m11, m8 mova m8, [rsp+0x20] pmulhrsw m11, m12 punpcklwd m9, m6, m11 ; 67 psrldq m6, m11, 8 punpcklwd m4, m11, m6 ; 78 pmaddwd m2, m9, m10 pmaddwd m11, m4, m10 paddd m7, m2 mova m2, [rsp+0x30] paddd m0, m11 %else SWAP m7, m6 SWAP m1, m4 SWAP m3, m2 movu m5, [srcq+ssq*0] mova m0, [esp+0x30] mova m2, [esp+0x40] mova m4, [esp+0x50] pmaddwd m6, [esp+0x90] pmaddwd m1, [esp+0xa0] pmaddwd m0, [esp+0x90] pmaddwd m2, [esp+0xa0] pmaddwd m3, [esp+0xb0] pmaddwd m4, [esp+0xb0] paddd m6, m1 paddd m0, m2 movu m7, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pshufb m5, m14 pmaddubsw m5, m15 paddd m6, m13 paddd m0, m13 paddd m6, m3 paddd m0, m4 pshufb m7, m14 pmaddubsw m7, m15 phaddw m5, m7 mova m7, [rsp+0x80] pmulhrsw m5, m12 punpcklwd m3, [esp+0x60], m5 ; 67 psrldq m1, m5, 8 punpcklwd m4, m5, m1 ; 78 pmaddwd m2, m3, [esp+0xc0] pmaddwd m5, m4, [esp+0xc0] mova [esp+0x60], m1 paddd m6, m2 mova m2, [esp+0x50] paddd m0, m5 SWAP m7, m6 %endif psrad m7, rndshift psrad m0, rndshift packssdw m7, m0 %if ARCH_X86_64 mova m0, [rsp+0x10] %else mova 
m0, [esp+0x40] %define m11 m5 %endif %ifidn %1, put packuswb m7, m7 psrldq m11, m7, 4 movd [dstq+dsq*0], m7 movd [dstq+dsq*1], m11 lea dstq, [dstq+dsq*2] %else mova [tmpq], m7 add tmpq, 16 %endif sub hd, 2 jz .ret %if ARCH_X86_64 mova m7, [rsp+0x00] mova [rsp+0x00], m8 mova [rsp+0x10], m2 mova [rsp+0x20], m9 mova [rsp+0x30], m4 %else mova m7, [esp+0x70] ; 01 mova m1, [esp+0x80] ; 23 mova m2, [esp+0x50] ; 34 mova [esp+0x30], m0 mova [esp+0x70], m1 mova [esp+0x40], m2 mova [esp+0x80], m3 mova [esp+0x50], m4 %endif jmp .dy1_w4_loop INIT_XMM ssse3 .dy1_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy1_w_start: mov myd, mym %ifidn %1, put movifnidn dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define ssq ssm %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy1_hloop .dy1_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy1_hloop: pxor m9, m9 %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, 
m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova [rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 mova m14, [base+unpckw] %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x060], m0 mova [esp+0x070], m1 mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy1_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m9 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 %else pmaddwd m6, [rsp+0x1a0], m10 pmaddwd m7, [rsp+0x1b0], m10 %endif paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x70], m11 pmaddwd m7, [rsp+0x80], m11 %else pmaddwd m6, [rsp+0x1c0], m11 
pmaddwd m7, [rsp+0x1d0], m11 %endif paddd m4, m6 paddd m5, m7 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif %if ARCH_X86_32 mov r0m, r0 %endif dec hd jz .dy1_hloop_prep %if ARCH_X86_64 movq m4, [srcq+ r4] movq m5, [srcq+ r6] movhps m4, [srcq+ r7] movhps m5, [srcq+ r9] movq m6, [srcq+r10] movq m7, [srcq+r11] movhps m6, [srcq+r13] movhps m7, [srcq+ rX] add srcq, ssq pshufd m15, m14, q1032 pshufb m0, m14 ; 0a 1a pshufb m1, m14 ; 0b 1b pshufb m2, m15 ; 3a 2a pshufb m3, m15 ; 3b 2b pmaddubsw m4, [rsp+0x10] pmaddubsw m5, [rsp+0x20] pmaddubsw m6, [rsp+0x30] pmaddubsw m7, [rsp+0x40] phaddw m4, m5 phaddw m6, m7 phaddw m4, m6 pmulhrsw m4, m12 pshufb m5, [rsp+0x70], m15 ; 7a 6a pshufb m7, [rsp+0x80], m15 ; 7b 6b pshufb m6, [rsp+0x50], m14 ; 4a 5a pshufb m15, [rsp+0x60], m14 ; 4b 5b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b punpcklwd m2, m6 ; 34a punpcklwd m3, m15 ; 34b punpckhwd m6, m5 ; 56a punpckhwd m15, m7 ; 56b punpcklwd m5, m4 ; 78a psrldq m4, 8 punpcklwd m7, m4 ; 78b mova [rsp+0x50], m6 mova [rsp+0x60], m15 mova [rsp+0x70], m5 mova [rsp+0x80], m7 %else mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova m6, [base+unpckw] mova m0, [esp+0x060] mova m1, [esp+0x070] mova m7, [esp+0x1a0] movq m4, [srcq+r0] movq m5, [srcq+rX] movhps m4, [srcq+r4] movhps m5, [srcq+r5] pshufb m0, m6 ; 0a 1a pshufb m1, m6 ; 0b 1b pshufb m7, m6 ; 4a 5a mov r0, [esp+16] mov rX, [esp+24] mov r4, [esp+20] mov r5, [esp+28] movq m3, [srcq+r0] movq m2, [srcq+rX] movhps m3, [srcq+r4] movhps m2, [srcq+r5] add srcq, ssq pmaddubsw m4, [esp+0x20] pmaddubsw m5, [esp+0x30] pmaddubsw m3, [esp+0x40] pmaddubsw m2, [esp+0x50] phaddw m4, m5 phaddw m3, m2 mova m5, [esp+0x1b0] mova m2, [esp+0x180] phaddw m4, m3 mova m3, [esp+0x190] pmulhrsw m4, m12 ; 8a 8b pshufb m5, m6 ; 4b 5b pshufd m6, m6, q1032 pshufb m2, m6 ; 3a 2a pshufb m3, m6 ; 3b 2b punpckhwd m0, m2 ; 12a punpckhwd m1, m3 ; 12b mova [esp+0x60], m0 mova [esp+0x70], m1 mova m0, [esp+0x1c0] mova m1, [esp+0x1d0] punpcklwd m2, m7 ; 34a punpcklwd m3, m5 ; 34b mova [esp+0x180], m2 mova [esp+0x190], m3 pshufb m0, m6 ; 7a 6a pshufb m1, m6 ; 7b 6b punpckhwd m7, m0 ; 56a punpckhwd m5, m1 ; 56b punpcklwd m0, m4 punpckhqdq m4, m4 punpcklwd m1, m4 mova [esp+0x1a0], m7 mova [esp+0x1b0], m5 mova [esp+0x1c0], m0 mova [esp+0x1d0], m1 mova m0, [esp+0x60] mova m1, [esp+0x70] %endif jmp .dy1_vloop INIT_XMM ssse3 .dy2: movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %define m14 m4 %define m15 m3 movzx r5, byte [esp+0x1f0] dec srcd movd m15, r5 %endif punpckldq m9, m8 SWAP m8, m9 paddd m14, m8 ; mx+dx*[0-1] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 pand m8, m14, m10 psrld m8, 6 paddd m15, m8 movd r4d, m15 psrldq m15, 4 %if ARCH_X86_64 movd r6d, m15 %else movd r3d, m15 %endif mova m5, [base+bdct_lb_dw] mova m6, [base+subpel_s_shuf2] movd m15, [base+subpel_filters+r4*8+2] %if ARCH_X86_64 movd m7, [base+subpel_filters+r6*8+2] %else movd m7, [base+subpel_filters+r3*8+2] %endif pxor m9, m9 pcmpeqd m8, m9 psrld m14, 10 %if ARCH_X86_32 mov r3, r3m pshufb m14, m5 paddb m14, m6 mova [esp+0x00], m14 %define m14 [esp+0x00] SWAP m5, m0 SWAP m6, m3 %define m8 m5 %define m15 m6 %endif movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] 
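; .dy2_w2: the vertical step is two source rows per output row, so rows are
; gathered even/odd: m0 gets rows 0,2 and m1 gets rows 1,3 (movq+movhps),
; giving the "0 2 _ 4" / "1 3 _ 5" layout from which the 01/23/45/67 word
; pairs for the 8-tap vertical pmaddwd are built. each .dy2_w2_loop iteration
; reads four new rows and emits two output rows.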
movhps m0, [srcq+ssq*2] movhps m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] %if ARCH_X86_64 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] pshufb m14, m5 paddb m14, m6 movq m10, r4q %else mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %define m10 m4 movd m10, r4 movd m3, r5 punpckldq m10, m3 %endif movq m3, [srcq+ssq*0] movhps m3, [srcq+ssq*1] lea srcq, [srcq+ssq*2] punpcklbw m10, m10 psraw m10, 8 punpckldq m15, m7 punpcklqdq m15, m15 %if ARCH_X86_64 pand m11, m8 %else pand m7, m11, m8 %define m11 m7 %endif pandn m8, m15 SWAP m15, m8 por m15, m11 %if ARCH_X86_64 pshufd m8, m10, q0000 pshufd m9, m10, q1111 pshufd m11, m10, q3333 pshufd m10, m10, q2222 %else mova [esp+0x10], m15 %define m15 [esp+0x10] mov r5, r0m %define dstq r5 mov dsd, dsm pshufd m5, m4, q0000 pshufd m6, m4, q1111 pshufd m7, m4, q2222 pshufd m4, m4, q3333 %define m8 [esp+0x20] %define m9 [esp+0x30] %define m10 [esp+0x40] %define m11 [esp+0x50] mova m8, m5 mova m9, m6 mova m10, m7 mova m11, m4 %endif pshufb m0, m14 pshufb m1, m14 pshufb m3, m14 pmaddubsw m0, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pslldq m2, m3, 8 phaddw m0, m2 phaddw m1, m3 pmulhrsw m0, m12 ; 0 2 _ 4 pmulhrsw m1, m12 ; 1 3 _ 5 pshufd m2, m0, q3110 ; 0 2 2 4 pshufd m1, m1, q3110 ; 1 3 3 5 punpcklwd m3, m2, m1 ; 01 23 punpckhwd m2, m1 ; 23 45 .dy2_w2_loop: movq m6, [srcq+ssq*0] movq m7, [srcq+ssq*1] movhps m6, [srcq+ssq*2] movhps m7, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pmaddwd m4, m3, m8 pmaddwd m5, m2, m9 pshufb m6, m14 pshufb m7, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 phaddw m6, m7 pmulhrsw m6, m12 psrldq m7, m6, 8 palignr m6, m0, 8 palignr m7, m1, 8 mova m0, m6 mova m1, m7 pshufd m6, m6, q3221 pshufd m7, m7, q3221 punpcklwd m3, m6, m7 ; 45 67 punpckhwd m2, m6, m7 ; 67 89 pmaddwd m6, m3, m10 pmaddwd m7, m2, m11 paddd m4, m5 paddd m4, m13 paddd m6, m7 paddd m4, m6 psrad m4, rndshift packssdw m4, m4 packuswb m4, m4 movd r4d, m4 mov [dstq+dsq*0], r4w shr r4d, 16 mov [dstq+dsq*1], r4w lea dstq, [dstq+dsq*2] sub hd, 2 jg .dy2_w2_loop RET %endif INIT_XMM ssse3 .dy2_w4: %if ARCH_X86_64 mov myd, mym movzx t0d, t0b dec srcq movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %xdefine m14 m4 %define m15 m3 %define dstq r0 %if isprep %define ssq r3 %endif movzx r4, byte [esp+0x1f0] dec srcq movd m15, r4 %endif pmaddwd m8, [base+rescale_mul] %if ARCH_X86_64 mova m11, [base+pd_0x4000] %endif pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] pand m8, m14, m10 psrld m8, 6 paddd m15, m8 psrldq m7, m15, 8 %if ARCH_X86_64 movd r4d, m15 movd r11d, m7 psrldq m15, 4 psrldq m7, 4 movd r6d, m15 movd r13d, m7 movd m15, [base+subpel_filters+ r4*8+2] movd m2, [base+subpel_filters+r11*8+2] movd m3, [base+subpel_filters+ r6*8+2] movd m4, [base+subpel_filters+r13*8+2] movq m6, [base+subpel_s_shuf2] shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] %else movd r1, m15 movd r3, m7 psrldq m15, 4 psrldq m7, 4 movd r4, m15 movd r5, m7 %define m15 m5 SWAP m4, m7 movd m15, [base+subpel_filters+r1*8+2] movd m2, [base+subpel_filters+r3*8+2] movd m3, [base+subpel_filters+r4*8+2] movd m4, [base+subpel_filters+r5*8+2] movq m6, [base+subpel_s_shuf2] mov myd, mym mov r3, [esp+0x1f4] xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r3, r3m %if isprep lea ss3q, [ssq*3] 
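; .dy2_w4 primes six source rows (pairs "0 2", "1 3", "4 5" after the
; horizontal filter); each .dy2_w4_loop iteration filters four fresh rows into
; "6 7" / "8 9" and emits two output rows, sliding the 8-tap vertical window
; by four source rows per pair of outputs.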
%endif %endif punpckldq m15, m3 punpckldq m2, m4 punpcklqdq m15, m2 %if ARCH_X86_64 pcmpeqd m8, m9 psrld m14, 10 movu m0, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m1, [srcq+ssq*1] movu m3, [srcq+ss3q ] lea srcq, [srcq+ssq*4] punpcklqdq m6, m6 pshufb m14, [base+bdct_lb_dw] movu m4, [srcq+ssq*0] movu m5, [srcq+ssq*1] lea srcq, [srcq+ssq*2] pand m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m11 paddb m14, m6 movq m11, r4q punpcklbw m11, m11 psraw m11, 8 pshufb m0, m14 pshufb m2, m14 pshufb m1, m14 pshufb m3, m14 pshufb m4, m14 pshufb m5, m14 pmaddubsw m0, m15 pmaddubsw m2, m15 pmaddubsw m1, m15 pmaddubsw m3, m15 pmaddubsw m4, m15 pmaddubsw m5, m15 phaddw m0, m2 phaddw m1, m3 phaddw m4, m5 pmulhrsw m0, m12 ; 0 2 pmulhrsw m1, m12 ; 1 3 pmulhrsw m4, m12 ; 4 5 pshufd m8, m11, q0000 pshufd m9, m11, q1111 pshufd m10, m11, q2222 pshufd m11, m11, q3333 %else pxor m3, m3 pcmpeqd m8, m3 psrld m14, 10 pshufb m14, [base+bdct_lb_dw] movu m1, [srcq+ssq*0] movu m2, [srcq+ssq*2] movu m3, [srcq+ssq*1] add srcq, ss3q punpcklqdq m6, m6 SWAP m4, m7 pand m7, m11, m8 pandn m8, m15 SWAP m15, m8 por m15, m7 paddb m14, m6 movu m0, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m6, [srcq+ssq*2] add srcq, ss3q pshufb m1, m14 pshufb m2, m14 pshufb m3, m14 pshufb m0, m14 pshufb m7, m14 pshufb m6, m14 pmaddubsw m1, m15 pmaddubsw m2, m15 pmaddubsw m3, m15 mova [esp+0x00], m14 mova [esp+0x10], m15 pmaddubsw m0, m15 pmaddubsw m7, m15 pmaddubsw m6, m15 %define m14 [esp+0x00] %define m15 [esp+0x10] phaddw m1, m2 phaddw m3, m0 phaddw m7, m6 %ifidn %1, put mov dsd, dsm %define dstq r5 %else %define tmpq r5 %endif movd m6, r4 movd m0, r5 punpckldq m6, m0 punpcklbw m6, m6 psraw m6, 8 mov r5, r0m pmulhrsw m1, m12 ; 0 2 pmulhrsw m3, m12 ; 1 3 pmulhrsw m7, m12 ; 4 5 SWAP m0, m1, m3 SWAP m4, m7 pshufd m2, m6, q0000 pshufd m3, m6, q1111 pshufd m7, m6, q2222 pshufd m6, m6, q3333 mova [esp+0x30], m2 mova [esp+0x40], m3 mova [esp+0x50], m7 mova [esp+0x60], m6 %define m8 [esp+0x30] %define m9 [esp+0x40] %define m10 [esp+0x50] %define m11 [esp+0x60] %endif psrldq m5, m4, 8 ; 5 _ punpckhwd m2, m0, m1 ; 23 punpcklwd m0, m1 ; 01 punpcklwd m4, m5 ; 45 .dy2_w4_loop: pmaddwd m0, m8 ; a0 pmaddwd m5, m2, m8 ; b0 pmaddwd m2, m9 ; a1 pmaddwd m7, m4, m9 ; b1 pmaddwd m3, m4, m10 ; a2 paddd m0, m13 paddd m5, m13 paddd m0, m2 paddd m5, m7 paddd m0, m3 movu m6, [srcq+ssq*0] movu m7, [srcq+ssq*1] movu m3, [srcq+ssq*2] movu m1, [srcq+ss3q ] lea srcq, [srcq+ssq*4] pshufb m6, m14 pshufb m7, m14 pshufb m3, m14 pshufb m1, m14 pmaddubsw m6, m15 pmaddubsw m7, m15 pmaddubsw m3, m15 pmaddubsw m1, m15 phaddw m6, m7 phaddw m3, m1 pmulhrsw m6, m12 ; 6 7 pmulhrsw m3, m12 ; 8 9 psrldq m7, m6, 8 psrldq m1, m3, 8 punpcklwd m6, m7 ; 67 punpcklwd m3, m1 ; 89 mova m2, m6 pmaddwd m1, m6, m10 ; b2 pmaddwd m6, m11 ; a3 pmaddwd m7, m3, m11 ; b3 paddd m5, m1 paddd m0, m6 paddd m5, m7 psrad m0, rndshift psrad m5, rndshift packssdw m0, m5 %ifidn %1, put packuswb m0, m0 psrldq m1, m0, 4 movd [dstq+dsq*0], m0 movd [dstq+dsq*1], m1 lea dstq, [dstq+dsq*2] %else mova [tmpq], m0 add tmpq, 16 %endif mova m0, m4 mova m4, m3 sub hd, 2 jg .dy2_w4_loop MC_8TAP_SCALED_RET INIT_XMM ssse3 .dy2_w8: mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy2_w_start: mov myd, mym %ifidn %1, put movifnidn 
dsm, dsq %endif %if ARCH_X86_64 shr t0d, 16 sub srcq, 3 shr myd, 6 mov r4d, 64 << 24 lea myd, [t1+myq] cmovnz r4q, [base+subpel_filters+myq*8] movd m15, t0d %else %define m10 [base+pd_0x3ff] %define m11 [base+pd_0x4000] %define m8 m0 %define m9 m1 %xdefine m14 m4 %xdefine m15 m3 %if isprep %define tmpq r0 %define ssq ssm %else %define dstq r0 %endif mov r5, [esp+0x1f0] mov r3, [esp+0x1f4] shr r5, 16 sub srcq, 3 movd m15, r5 xor r5, r5 shr myd, 6 lea r3, [r3+myd] mov r4, 64 << 24 cmovnz r4, [base+subpel_filters+r3*8+0] cmovnz r5, [base+subpel_filters+r3*8+4] mov r0, r0m mov r3, r3m %endif pslld m7, m8, 2 ; dx*4 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] pshufd m15, m15, q0000 paddd m14, m8 ; mx+dx*[0-3] %if ARCH_X86_64 movq m3, r4q punpcklbw m3, m3 psraw m3, 8 %else movd m5, r4 movd m6, r5 punpckldq m5, m6 punpcklbw m5, m5 psraw m5, 8 SWAP m3, m5 %endif mova [rsp+0x100], m7 mova [rsp+0x120], m15 mov [rsp+0x098], srcq mov [rsp+0x130], r0q ; dstq / tmpq pshufd m0, m3, q0000 pshufd m1, m3, q1111 pshufd m2, m3, q2222 pshufd m3, m3, q3333 mova [rsp+0x140], m0 mova [rsp+0x150], m1 mova [rsp+0x160], m2 mova [rsp+0x170], m3 %if ARCH_X86_64 && UNIX64 mov hm, hd %elif ARCH_X86_32 SWAP m5, m3 mov r5, hm mov [esp+0x134], r5 %endif jmp .dy2_hloop .dy2_hloop_prep: dec dword [rsp+0x090] jz .ret %if ARCH_X86_64 add qword [rsp+0x130], 8*(isprep+1) mov hd, hm %else add dword [rsp+0x130], 8*(isprep+1) mov r5, [esp+0x134] mov r0, [esp+0x130] %endif mova m7, [rsp+0x100] mova m14, [rsp+0x110] %if ARCH_X86_64 mova m10, [base+pd_0x3ff] %else %define m10 [base+pd_0x3ff] %endif mova m15, [rsp+0x120] mov srcq, [rsp+0x098] %if ARCH_X86_64 mov r0q, [rsp+0x130] ; dstq / tmpq %else mov hm, r5 mov r0m, r0 mov r3, r3m %endif paddd m14, m7 .dy2_hloop: pxor m9, m9 %if ARCH_X86_64 mova m11, [base+pq_0x40000000] %else %define m11 [base+pq_0x40000000] %endif psrld m2, m14, 10 mova [rsp], m2 pand m6, m14, m10 psrld m6, 6 paddd m5, m15, m6 pcmpeqd m6, m9 psrldq m2, m5, 8 %if ARCH_X86_64 movd r4d, m5 movd r6d, m2 psrldq m5, 4 psrldq m2, 4 movd r7d, m5 movd r9d, m2 movq m0, [base+subpel_filters+r4*8] movq m1, [base+subpel_filters+r6*8] movhps m0, [base+subpel_filters+r7*8] movhps m1, [base+subpel_filters+r9*8] %else movd r0, m5 movd rX, m2 psrldq m5, 4 psrldq m2, 4 movd r4, m5 movd r5, m2 movq m0, [base+subpel_filters+r0*8] movq m1, [base+subpel_filters+rX*8] movhps m0, [base+subpel_filters+r4*8] movhps m1, [base+subpel_filters+r5*8] pxor m2, m2 %define m9 m2 %endif paddd m14, m7 ; mx+dx*[4-7] pand m5, m14, m10 psrld m5, 6 paddd m15, m5 pcmpeqd m5, m9 mova [rsp+0x110], m14 psrldq m4, m15, 8 %if ARCH_X86_64 movd r10d, m15 movd r11d, m4 psrldq m15, 4 psrldq m4, 4 movd r13d, m15 movd rXd, m4 movq m2, [base+subpel_filters+r10*8] movq m3, [base+subpel_filters+r11*8] movhps m2, [base+subpel_filters+r13*8] movhps m3, [base+subpel_filters+ rX*8] psrld m14, 10 psrldq m4, m14, 8 movd r10d, m14 movd r11d, m4 psrldq m14, 4 psrldq m4, 4 movd r13d, m14 movd rXd, m4 mov r4d, [rsp+ 0] mov r6d, [rsp+ 8] mov r7d, [rsp+ 4] mov r9d, [rsp+12] pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m8, m11, m4 pand m9, m11, m6 pand m15, m11, m7 pand m11, m11, m5 pandn m4, m0 pandn m6, m1 pandn m7, m2 pandn m5, m3 por m8, m4 por m9, m6 por m15, m7 por m11, m5 mova [rsp+0x10], m8 mova [rsp+0x20], m9 mova [rsp+0x30], m15 mova [rsp+0x40], m11 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 10, 8, 9, 15, 11 ; 0-1 mova [rsp+0x50], m1 mova [rsp+0x60], m2 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 7, 10, 8, 9, 15, 11 ; 2-3 mova [rsp+0x70], m3 mova 
[rsp+0x80], m4 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 4-5 MC_8TAP_SCALED_H 0,14, 1, 2, 3, 4, 7, 10, 8, 9, 15, 11 ; 6-7 SWAP m7, m0 SWAP m8, m14 mova m1, [rsp+0x50] mova m2, [rsp+0x60] mova m3, [rsp+0x70] mova m15, [rsp+0x80] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m8 ; 67a punpckhwd m7, m8 ; 67b SWAP m14, m8 mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m15; 23a punpckhwd m3, m15 ; 23b mova [rsp+0x50], m4 mova [rsp+0x60], m5 mova [rsp+0x70], m6 mova [rsp+0x80], m7 %else movd r0, m15 movd rX, m4 psrldq m15, 4 psrldq m4, 4 movd r4, m15 movd r5, m4 mova m14, [esp+0x110] movq m2, [base+subpel_filters+r0*8] movq m3, [base+subpel_filters+rX*8] movhps m2, [base+subpel_filters+r4*8] movhps m3, [base+subpel_filters+r5*8] psrld m14, 10 mova [esp+16], m14 mov r0, [esp+ 0] mov rX, [esp+ 8] mov r4, [esp+ 4] mov r5, [esp+12] mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 pshufd m4, m6, q1100 pshufd m6, m6, q3322 pshufd m7, m5, q1100 pshufd m5, m5, q3322 pand m0, m11, m4 pand m1, m11, m6 pand m2, m11, m7 pand m3, m11, m5 pandn m4, [esp+0x20] pandn m6, [esp+0x30] pandn m7, [esp+0x40] pandn m5, [esp+0x50] por m0, m4 por m1, m6 por m2, m7 por m3, m5 mova [esp+0x20], m0 mova [esp+0x30], m1 mova [esp+0x40], m2 mova [esp+0x50], m3 MC_8TAP_SCALED_H 0x20, 0x60, 0 ; 0-1 MC_8TAP_SCALED_H 0x20, 0x180 ; 2-3 MC_8TAP_SCALED_H 0x20, 0x1a0 ; 4-5 MC_8TAP_SCALED_H 0x20, 0x1c0 ; 6-7 mova m5, [esp+0x1a0] mova m6, [esp+0x1b0] mova m7, [esp+0x1c0] mova m0, [esp+0x1d0] punpcklwd m4, m5, m6 ; 45a punpckhwd m5, m6 ; 45b punpcklwd m6, m7, m0 ; 67a punpckhwd m7, m0 ; 67b mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 mova [esp+0x1d0], m7 mova m1, [esp+0x060] mova m2, [esp+0x070] mova m3, [esp+0x180] mova m4, [esp+0x190] punpcklwd m0, m1, m2 ; 01a punpckhwd m1, m2 ; 01b punpcklwd m2, m3, m4 ; 23a punpckhwd m3, m4 ; 23b mova [esp+0x180], m2 mova [esp+0x190], m3 %define m8 [esp+0x140] %define m9 [esp+0x150] %define m10 [esp+0x160] %define m11 [esp+0x170] %endif .dy2_vloop: %if ARCH_X86_32 mov r0, r0m %endif pmaddwd m4, m0, m8 pmaddwd m5, m1, m8 pmaddwd m6, m2, m9 pmaddwd m7, m3, m9 paddd m4, m13 paddd m5, m13 paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x50], m10 pmaddwd m7, [rsp+0x60], m10 %else pmaddwd m6, [esp+0x1a0], m10 pmaddwd m7, [esp+0x1b0], m10 %endif paddd m4, m6 paddd m5, m7 %if ARCH_X86_64 pmaddwd m6, [rsp+0x70], m11 pmaddwd m7, [rsp+0x80], m11 %else pmaddwd m6, [esp+0x1c0], m11 pmaddwd m7, [esp+0x1d0], m11 %endif paddd m4, m6 paddd m5, m7 psrad m4, rndshift psrad m5, rndshift packssdw m4, m5 %ifidn %1, put packuswb m4, m4 movq [dstq], m4 add dstq, dsm %else mova [tmpq], m4 add tmpq, tmp_stridem %endif %if ARCH_X86_32 mov r0m, r0 %endif dec hd jz .dy2_hloop_prep %if ARCH_X86_64 mova m8, [rsp+0x10] mova m9, [rsp+0x20] mova m10, [rsp+0x30] mova m11, [rsp+0x40] mova m0, m2 ; 01a mova m1, m3 ; 01b MC_8TAP_SCALED_H 2, 6, 3, 4, 5, 7, 14, 15, 8, 9, 10, 11 mova m3, [rsp+0x50] ; 23a mova m4, [rsp+0x60] ; 23b mova m5, [rsp+0x70] ; 45a mova m7, [rsp+0x80] ; 45b mova m8, [rsp+0x140] mova m9, [rsp+0x150] mova m10, [rsp+0x160] mova m11, [rsp+0x170] punpcklwd m14, m2, m6 ; 67a punpckhwd m2, m6 ; 67b mova [rsp+0x50], m5 mova [rsp+0x60], m7 mova [rsp+0x70], m14 mova [rsp+0x80], m2 mova m2, m3 mova m3, m4 %else MC_8TAP_SCALED_H 0x20, 0 punpcklwd m6, m0, m4 punpckhwd m7, m0, m4 mova m0, [esp+0x180] ; 01a mova m1, [esp+0x190] ; 
01b mova m2, [rsp+0x1a0] ; 23a mova m3, [esp+0x1b0] ; 23b mova m4, [esp+0x1c0] ; 45a mova m5, [esp+0x1d0] ; 45b mova [esp+0x180], m2 mova [esp+0x190], m3 mova [esp+0x1a0], m4 mova [esp+0x1b0], m5 mova [esp+0x1c0], m6 ; 67a mova [esp+0x1d0], m7 ; 67b %endif jmp .dy2_vloop .ret: MC_8TAP_SCALED_RET 0 %if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT %define r0m [rstk+stack_offset+ 4] %define r1m [rstk+stack_offset+ 8] %define r2m [rstk+stack_offset+12] %define r3m [rstk+stack_offset+16] %endif %undef isprep %endmacro %macro BILIN_SCALED_FN 1 cglobal %1_bilin_scaled_8bpc mov t0d, (5*15 << 16) | 5*15 mov t1d, (5*15 << 16) | 5*15 jmp mangle(private_prefix %+ _%1_8tap_scaled_8bpc %+ SUFFIX) %endmacro %if WIN64 DECLARE_REG_TMP 6, 5 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 8 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN put FN put_8tap_scaled, sharp, SHARP, SHARP FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN put_8tap_scaled, smooth, SMOOTH, SMOOTH FN put_8tap_scaled, sharp_regular, SHARP, REGULAR FN put_8tap_scaled, regular_sharp, REGULAR, SHARP FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN put_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED put %if WIN64 DECLARE_REG_TMP 5, 4 %elif ARCH_X86_64 DECLARE_REG_TMP 6, 7 %else DECLARE_REG_TMP 1, 2 %endif BILIN_SCALED_FN prep FN prep_8tap_scaled, sharp, SHARP, SHARP FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH FN prep_8tap_scaled, regular, REGULAR, REGULAR MC_8TAP_SCALED prep %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 mov alpham, alphad mov betam, betad %endmacro %macro SAVE_DELTA_GAMMA 0 mov deltam, deltad mov gammam, gammad %endmacro %macro LOAD_ALPHA_BETA_MX 0 mov mym, myd mov alphad, alpham mov betad, betam mov mxd, mxm %endmacro %macro LOAD_DELTA_GAMMA_MY 0 mov mxm, mxd mov deltad, deltam mov gammad, gammam mov myd, mym %endmacro %define PIC_reg r2 %define PIC_base_offset $$ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset) %else %define SAVE_ALPHA_BETA %define SAVE_DELTA_GAMMA %define PIC_sym(sym) sym %endif %if ARCH_X86_32 %if STACK_ALIGNMENT < required_stack_alignment %assign copy_args 8*4 %else %assign copy_args 0 %endif %endif %macro RELOC_ARGS 0 %if copy_args mov r0, r0m mov r1, r1m mov r2, r2m mov r3, r3m mov r5, r5m mov dstm, r0 mov dsm, r1 mov srcm, r2 mov ssm, r3 mov mxm, r5 mov r0, r6m mov mym, r0 %endif %endmacro %macro BLENDHWDW 2 ; blend high words from dwords, src1, src2 %if cpuflag(sse4) pblendw %1, %2, 0xAA %else pand %2, m10 por %1, %2 %endif %endmacro %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 %if ARCH_X86_32 %define m8 m4 %define m9 m5 %define m14 m6 %define m15 m7 %define m11 m7 %endif %if notcpuflag(ssse3) || ARCH_X86_32 pxor m11, m11 %endif lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq m2, [filterq+myq *8] ; a movq m8, [filterq+tmp1q*8] ; e lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+deltaq*1] shr tmp2d, 10 shr tmp1d, 10 movq m3, [filterq+tmp2q*8] ; b movq m0, [filterq+tmp1q*8] ; f punpcklwd m2, m3 punpcklwd m8, m0 lea tmp1d, [myq+deltaq*4] lea tmp2d, [myq+deltaq*1] shr myd, 10 shr tmp1d, 10 movq m0, [filterq+myq *8] ; c movq m9, 
[filterq+tmp1q*8] ; g lea tmp1d, [tmp2q+deltaq*4] lea myd, [tmp2q+gammaq] ; my += gamma shr tmp2d, 10 shr tmp1d, 10 movq m3, [filterq+tmp2q*8] ; d movq m1, [filterq+tmp1q*8] ; h punpcklwd m0, m3 punpcklwd m9, m1 punpckldq m1, m2, m0 punpckhdq m2, m0 punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8 punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8 punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8 punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8 pmaddwd m0, %3 pmaddwd m3, %5 pmaddwd m1, %7 pmaddwd m14, %9 paddd m0, m3 paddd m1, m14 paddd m0, m1 mova %1, m0 %if ARCH_X86_64 SWAP m3, m14 %endif punpckldq m0, m8, m9 punpckhdq m8, m9 punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8 punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8 punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8 punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8 pmaddwd m1, %4 pmaddwd m14, %6 pmaddwd m2, %8 pmaddwd m15, %10 paddd m1, m14 paddd m2, m15 paddd m1, m2 mova %2, m1 %if ARCH_X86_64 SWAP m14, m3 %endif %endmacro %if ARCH_X86_64 %define counterd r4d %else %if copy_args == 0 %define counterd dword r4m %else %define counterd dword [esp+stack_size-4*7] %endif %endif %macro WARP_AFFINE_8X8T 0 %if ARCH_X86_64 cglobal warp_affine_8x8t_8bpc, 6, 14, 16, 0x90, tmp, ts %else cglobal warp_affine_8x8t_8bpc, 0, 7, 16, -0x130-copy_args, tmp, ts %if copy_args %define tmpm [esp+stack_size-4*1] %define tsm [esp+stack_size-4*2] %endif %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main .loop: %if ARCH_X86_32 %define m12 m4 %define m13 m5 %define m14 m6 %define m15 m7 mova m12, [esp+0xC0] mova m13, [esp+0xD0] mova m14, [esp+0xE0] mova m15, [esp+0xF0] %endif %if cpuflag(ssse3) psrad m12, 13 psrad m13, 13 psrad m14, 13 psrad m15, 13 packssdw m12, m13 packssdw m14, m15 mova m13, [PIC_sym(pw_8192)] pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7 pmulhrsw m14, m13 %else %if ARCH_X86_32 %define m10 m0 %endif mova m10, [PIC_sym(pd_16384)] paddd m12, m10 paddd m13, m10 paddd m14, m10 paddd m15, m10 psrad m12, 15 psrad m13, 15 psrad m14, 15 psrad m15, 15 packssdw m12, m13 packssdw m14, m15 %endif mova [tmpq+tsq*0], m12 mova [tmpq+tsq*2], m14 dec counterd jz mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).end %if ARCH_X86_32 mov tmpm, tmpd mov r0, [esp+0x100] mov r1, [esp+0x104] %endif call mangle(private_prefix %+ _warp_affine_8x8_8bpc_%+cpuname).main2 lea tmpq, [tmpq+tsq*4] jmp .loop %endmacro %macro WARP_AFFINE_8X8 0 %if ARCH_X86_64 cglobal warp_affine_8x8_8bpc, 6, 14, 16, 0x90, \ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ filter, tmp1, delta, my, gamma %else cglobal warp_affine_8x8_8bpc, 0, 7, 16, -0x130-copy_args, \ dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \ filter, tmp1, delta, my, gamma %define alphaq r0 %define alphad r0 %define alpham [esp+gprsize+0x100] %define betaq r1 %define betad r1 %define betam [esp+gprsize+0x104] %define deltaq r0 %define deltad r0 %define deltam [esp+gprsize+0x108] %define gammaq r1 %define gammad r1 %define gammam [esp+gprsize+0x10C] %define filterq r3 %define tmp1q r4 %define tmp1d r4 %define tmp1m [esp+gprsize+0x110] %define myq r5 %define myd r5 %define mym r6m %if copy_args %define dstm [esp+stack_size-4*1] %define dsm [esp+stack_size-4*2] %define srcm [esp+stack_size-4*3] %define ssm [esp+stack_size-4*4] %define mxm [esp+stack_size-4*5] %define mym [esp+stack_size-4*6] %endif %endif call .main jmp .start .loop: %if ARCH_X86_32 mov dstm, dstd mov alphad, [esp+0x100] mov betad, [esp+0x104] %endif call .main2 lea dstq, [dstq+dsq*2] .start: %if 
notcpuflag(sse4) %if cpuflag(ssse3) %define roundval pw_8192 %else %define roundval pd_262144 %endif %if ARCH_X86_64 mova m10, [PIC_sym(roundval)] %else %define m10 [PIC_sym(roundval)] %endif %endif %if ARCH_X86_32 %define m12 m5 %define m13 m6 mova m12, [esp+0xC0] mova m13, [esp+0xD0] %endif %if cpuflag(sse4) %if ARCH_X86_32 %define m11 m4 pxor m11, m11 %endif psrad m12, 18 psrad m13, 18 packusdw m12, m13 pavgw m12, m11 ; (x + (1 << 10)) >> 11 %else %if cpuflag(ssse3) psrad m12, 17 psrad m13, 17 packssdw m12, m13 pmulhrsw m12, m10 %else paddd m12, m10 paddd m13, m10 psrad m12, 19 psrad m13, 19 packssdw m12, m13 %endif %endif %if ARCH_X86_32 %define m14 m6 %define m15 m7 mova m14, [esp+0xE0] mova m15, [esp+0xF0] %endif %if cpuflag(sse4) psrad m14, 18 psrad m15, 18 packusdw m14, m15 pavgw m14, m11 ; (x + (1 << 10)) >> 11 %else %if cpuflag(ssse3) psrad m14, 17 psrad m15, 17 packssdw m14, m15 pmulhrsw m14, m10 %else paddd m14, m10 paddd m15, m10 psrad m14, 19 psrad m15, 19 packssdw m14, m15 %endif %endif packuswb m12, m14 movq [dstq+dsq*0], m12 movhps [dstq+dsq*1], m12 dec counterd jg .loop .end: RET ALIGN function_align .main: %assign stack_offset stack_offset+gprsize %if ARCH_X86_32 %assign stack_size stack_size+4 %if copy_args %assign stack_offset stack_offset-4 %endif RELOC_ARGS LEA PIC_reg, $$ %define PIC_mem [esp+gprsize+0x114] mov abcdd, abcdm %if copy_args == 0 mov ssd, ssm mov mxd, mxm %endif mov PIC_mem, PIC_reg mov srcd, srcm %endif movsx deltad, word [abcdq+2*2] movsx gammad, word [abcdq+2*3] lea tmp1d, [deltaq*3] sub gammad, tmp1d ; gamma -= delta*3 SAVE_DELTA_GAMMA %if ARCH_X86_32 mov abcdd, abcdm %endif movsx alphad, word [abcdq+2*0] movsx betad, word [abcdq+2*1] lea tmp1q, [ssq*3+3] add mxd, 512+(64<<10) lea tmp2d, [alphaq*3] sub srcq, tmp1q ; src -= src_stride*3 + 3 %if ARCH_X86_32 mov srcm, srcd mov PIC_reg, PIC_mem %endif sub betad, tmp2d ; beta -= alpha*3 lea filterq, [PIC_sym(mc_warp_filter2)] %if ARCH_X86_64 mov myd, r6m %if cpuflag(ssse3) pxor m11, m11 %endif %endif call .h psrld m2, m0, 16 psrld m3, m1, 16 %if ARCH_X86_32 %if notcpuflag(ssse3) mova [esp+gprsize+0x00], m2 %endif mova [esp+gprsize+0x10], m3 %endif call .h psrld m4, m0, 16 psrld m5, m1, 16 %if ARCH_X86_32 mova [esp+gprsize+0x20], m4 mova [esp+gprsize+0x30], m5 %endif call .h %if ARCH_X86_64 %define blendmask [rsp+gprsize+0x80] %else %if notcpuflag(ssse3) mova m2, [esp+gprsize+0x00] %endif mova m3, [esp+gprsize+0x10] %define blendmask [esp+gprsize+0x120] %define m10 m7 %endif pcmpeqd m10, m10 pslld m10, 16 mova blendmask, m10 BLENDHWDW m2, m0 ; 0 BLENDHWDW m3, m1 ; 2 mova [rsp+gprsize+0x00], m2 mova [rsp+gprsize+0x10], m3 call .h %if ARCH_X86_32 mova m4, [esp+gprsize+0x20] mova m5, [esp+gprsize+0x30] %endif mova m10, blendmask BLENDHWDW m4, m0 ; 1 BLENDHWDW m5, m1 ; 3 mova [rsp+gprsize+0x20], m4 mova [rsp+gprsize+0x30], m5 call .h %if ARCH_X86_32 %if notcpuflag(ssse3) mova m2, [esp+gprsize+0x00] %endif mova m3, [esp+gprsize+0x10] %define m10 m5 %endif psrld m6, m2, 16 psrld m7, m3, 16 mova m10, blendmask BLENDHWDW m6, m0 ; 2 BLENDHWDW m7, m1 ; 4 mova [rsp+gprsize+0x40], m6 mova [rsp+gprsize+0x50], m7 call .h %if ARCH_X86_32 mova m4, [esp+gprsize+0x20] mova m5, [esp+gprsize+0x30] %endif psrld m2, m4, 16 psrld m3, m5, 16 mova m10, blendmask BLENDHWDW m2, m0 ; 3 BLENDHWDW m3, m1 ; 5 mova [rsp+gprsize+0x60], m2 mova [rsp+gprsize+0x70], m3 call .h %if ARCH_X86_32 mova m6, [esp+gprsize+0x40] mova m7, [esp+gprsize+0x50] %define m10 m7 %endif psrld m4, m6, 16 psrld m5, m7, 16 mova m10, blendmask BLENDHWDW m4, m0 ; 
4 BLENDHWDW m5, m1 ; 6 %if ARCH_X86_64 add myd, 512+(64<<10) mova m6, m2 mova m7, m3 %else mova [esp+gprsize+0x80], m4 mova [esp+gprsize+0x90], m5 add dword mym, 512+(64<<10) %endif mov counterd, 4 SAVE_ALPHA_BETA .main2: call .h %if ARCH_X86_32 mova m6, [esp+gprsize+0x60] mova m7, [esp+gprsize+0x70] %define m10 m5 %endif psrld m6, 16 psrld m7, 16 mova m10, blendmask BLENDHWDW m6, m0 ; 5 BLENDHWDW m7, m1 ; 7 %if ARCH_X86_64 WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ m4, m5, \ [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ m6, m7 %else mova [esp+gprsize+0xA0], m6 mova [esp+gprsize+0xB0], m7 LOAD_DELTA_GAMMA_MY WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \ [esp+gprsize+0x00], [esp+gprsize+0x10], \ [esp+gprsize+0x80], [esp+gprsize+0x90], \ [esp+gprsize+0x20], [esp+gprsize+0x30], \ [esp+gprsize+0xA0], [esp+gprsize+0xB0] LOAD_ALPHA_BETA_MX %endif call .h mova m2, [rsp+gprsize+0x40] mova m3, [rsp+gprsize+0x50] %if ARCH_X86_32 mova m4, [rsp+gprsize+0x80] mova m5, [rsp+gprsize+0x90] %define m10 m7 %endif mova [rsp+gprsize+0x00], m2 mova [rsp+gprsize+0x10], m3 mova [rsp+gprsize+0x40], m4 mova [rsp+gprsize+0x50], m5 psrld m4, 16 psrld m5, 16 mova m10, blendmask BLENDHWDW m4, m0 ; 6 BLENDHWDW m5, m1 ; 8 %if ARCH_X86_64 WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \ m6, m7, \ [rsp+gprsize+0x00], [rsp+gprsize+0x10], \ m4, m5 %else mova [esp+gprsize+0x80], m4 mova [esp+gprsize+0x90], m5 LOAD_DELTA_GAMMA_MY WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \ [esp+gprsize+0x20], [esp+gprsize+0x30], \ [esp+gprsize+0xA0], [esp+gprsize+0xB0], \ [esp+gprsize+0x00], [esp+gprsize+0x10], \ [esp+gprsize+0x80], [esp+gprsize+0x90] mov mym, myd mov dstd, dstm mov dsd, dsm mov mxd, mxm %endif mova m2, [rsp+gprsize+0x60] mova m3, [rsp+gprsize+0x70] %if ARCH_X86_32 mova m6, [esp+gprsize+0xA0] mova m7, [esp+gprsize+0xB0] %endif mova [rsp+gprsize+0x20], m2 mova [rsp+gprsize+0x30], m3 mova [rsp+gprsize+0x60], m6 mova [rsp+gprsize+0x70], m7 ret ALIGN function_align .h: %if ARCH_X86_32 %define m8 m3 %define m9 m4 %define m10 m5 %define m14 m6 %define m15 m7 %endif lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] %if ARCH_X86_32 %assign stack_offset stack_offset+4 %assign stack_size stack_size+4 %define PIC_mem [esp+gprsize*2+0x114] mov PIC_mem, PIC_reg mov srcd, srcm %endif movu m10, [srcq] %if ARCH_X86_32 add srcd, ssm mov srcm, srcd mov PIC_reg, PIC_mem %else add srcq, ssq %endif shr mxd, 10 shr tmp1d, 10 movq m1, [filterq+mxq *8] ; 0 X movq m8, [filterq+tmp1q*8] ; 4 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+alphaq*1] shr tmp2d, 10 shr tmp1d, 10 movhps m1, [filterq+tmp2q*8] ; 0 1 movhps m8, [filterq+tmp1q*8] ; 4 5 lea tmp1d, [mxq+alphaq*4] lea tmp2d, [mxq+alphaq*1] shr mxd, 10 shr tmp1d, 10 %if cpuflag(ssse3) movq m14, [filterq+mxq *8] ; 2 X movq m9, [filterq+tmp1q*8] ; 6 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 movhps m14, [filterq+tmp2q*8] ; 2 3 movhps m9, [filterq+tmp1q*8] ; 6 7 pshufb m0, m10, [PIC_sym(warp_8x8_shufA)] pmaddubsw m0, m1 pshufb m1, m10, [PIC_sym(warp_8x8_shufB)] pmaddubsw m1, m8 pshufb m15, m10, [PIC_sym(warp_8x8_shufC)] pmaddubsw m15, m14 pshufb m10, m10, [PIC_sym(warp_8x8_shufD)] pmaddubsw m10, m9 phaddw m0, m15 phaddw m1, m10 %else %if ARCH_X86_32 %define m11 m2 %endif pcmpeqw m0, m0 psrlw m14, m0, 8 psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15 pand m14, m10 ; 00 02 04 06 08 10 12 14 packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 psrldq m9, m0, 4 pshufd m0, m14, q0220 pand m0, m9 psrldq m14, 1 ; 02 
04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ pslldq m15, m14, 12 por m0, m15 ; shufA psrlw m15, m0, 8 psraw m11, m1, 8 psllw m0, 8 psllw m1, 8 psrlw m0, 8 psraw m1, 8 pmullw m15, m11 pmullw m0, m1 paddw m0, m15 ; pmaddubsw m0, m1 pshufd m15, m14, q0220 pand m15, m9 psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ pslldq m1, m14, 12 por m15, m1 ; shufC pshufd m1, m14, q0220 pand m1, m9 psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ pslldq m11, m14, 12 por m1, m11 ; shufB pshufd m10, m14, q0220 pand m10, m9 psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __ pslldq m14, m14, 12 por m10, m14 ; shufD psrlw m9, m1, 8 psraw m11, m8, 8 psllw m1, 8 psllw m8, 8 psrlw m1, 8 psraw m8, 8 pmullw m9, m11 pmullw m1, m8 paddw m1, m9 ; pmaddubsw m1, m8 movq m14, [filterq+mxq *8] ; 2 X movq m9, [filterq+tmp1q*8] ; 6 X lea tmp1d, [tmp2q+alphaq*4] lea mxd, [tmp2q+betaq] ; mx += beta shr tmp2d, 10 shr tmp1d, 10 movhps m14, [filterq+tmp2q*8] ; 2 3 movhps m9, [filterq+tmp1q*8] ; 6 7 psrlw m8, m15, 8 psraw m11, m14, 8 psllw m15, 8 psllw m14, 8 psrlw m15, 8 psraw m14, 8 pmullw m8, m11 pmullw m15, m14 paddw m15, m8 ; pmaddubsw m15, m14 psrlw m8, m10, 8 psraw m11, m9, 8 psllw m10, 8 psllw m9, 8 psrlw m10, 8 psraw m9, 8 pmullw m8, m11 pmullw m10, m9 paddw m10, m8 ; pmaddubsw m10, m9 pslld m8, m0, 16 pslld m9, m1, 16 pslld m14, m15, 16 pslld m11, m10, 16 paddw m0, m8 paddw m1, m9 paddw m15, m14 paddw m10, m11 psrad m0, 16 psrad m1, 16 psrad m15, 16 psrad m10, 16 packssdw m0, m15 ; phaddw m0, m15 packssdw m1, m10 ; phaddw m1, m10 %endif mova m14, [PIC_sym(pw_8192)] mova m9, [PIC_sym(pd_32768)] pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13 pmaddwd m1, m14 paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword paddd m1, m9 ret %endmacro %if WIN64 DECLARE_REG_TMP 6, 4 %else DECLARE_REG_TMP 6, 7 %endif %macro BIDIR_FN 1 ; op %1 0 lea stride3q, [strideq*3] jmp wq .w4_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*4] .w4: ; tile 4x movd [dstq ], m0 ; copy dw[0] pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] movd [dstq+strideq*1], m1 ; copy dw[1] punpckhqdq m0, m0 ; swap dw[3,2] with dw[1,0] movd [dstq+strideq*2], m0 ; dw[2] psrlq m0, 32 ; shift right in dw[3] movd [dstq+stride3q ], m0 ; copy sub hd, 4 jg .w4_loop RET .w8_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq*2] .w8: movq [dstq ], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: %1_INC_PTR 2 %1 0 lea dstq, [dstq+strideq] .w16: mova [dstq ], m0 dec hd jg .w16_loop RET .w32_loop: %1_INC_PTR 4 %1 0 lea dstq, [dstq+strideq] .w32: mova [dstq ], m0 %1 2 mova [dstq + 16 ], m0 dec hd jg .w32_loop RET .w64_loop: %1_INC_PTR 8 %1 0 add dstq, strideq .w64: %assign i 0 %rep 4 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 4 %1 2*i %endif %endrep dec hd jg .w64_loop RET .w128_loop: %1_INC_PTR 16 %1 0 add dstq, strideq .w128: %assign i 0 %rep 8 mova [dstq + i*16 ], m0 %assign i i+1 %if i < 8 %1 2*i %endif %endrep dec hd jg .w128_loop RET %endmacro %macro AVG 1 ; src_offset ; writes AVG of tmp1 tmp2 uint16 coeffs into uint8 pixel mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 coef(2bytes) from tmp1 paddw m0, [tmp2q+(%1+0)*mmsize] ; load/add 8 coef(2bytes) tmp2 mova m1, [tmp1q+(%1+1)*mmsize] paddw m1, [tmp2q+(%1+1)*mmsize] pmulhrsw m0, m2 pmulhrsw m1, m2 packuswb m0, m1 ; pack/trunc 16 bits from m0 & m1 to 8 bit %endmacro %macro AVG_INC_PTR 1 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro cglobal avg_8bpc, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3 LEA r6, avg_ssse3_table tzcnt wd, wm ; leading zeros 
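; note: tzcnt counts trailing zeros rather than leading zeros; since the
; block widths are powers of two this yields log2(w), which is used to
; index the per-width jump table loaded below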
movifnidn hd, hm ; move h(stack) to h(register) if not already that register movsxd wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg mova m2, [pw_1024+r6-avg_ssse3_table] ; fill m2 with shift/align add wq, r6 BIDIR_FN AVG %macro W_AVG 1 ; src_offset ; (a * weight + b * (16 - weight) + 128) >> 8 ; = ((a - b) * weight + (b << 4) + 128) >> 8 ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4 ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4 mova m2, [tmp1q+(%1+0)*mmsize] mova m0, m2 psubw m2, [tmp2q+(%1+0)*mmsize] mova m3, [tmp1q+(%1+1)*mmsize] mova m1, m3 psubw m3, [tmp2q+(%1+1)*mmsize] pmulhw m2, m4 pmulhw m3, m4 paddw m0, m2 paddw m1, m3 pmulhrsw m0, m5 pmulhrsw m1, m5 packuswb m0, m1 %endmacro %define W_AVG_INC_PTR AVG_INC_PTR cglobal w_avg_8bpc, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3 LEA r6, w_avg_ssse3_table tzcnt wd, wm movd m4, r6m movifnidn hd, hm pxor m0, m0 movsxd wq, dword [r6+wq*4] mova m5, [pw_2048+r6-w_avg_ssse3_table] pshufb m4, m0 psllw m4, 12 ; (weight-16) << 12 when interpreted as signed add wq, r6 cmp dword r6m, 7 jg .weight_gt7 mov r6, tmp1q psubw m0, m4 mov tmp1q, tmp2q mova m4, m0 ; -weight mov tmp2q, r6 .weight_gt7: BIDIR_FN W_AVG %macro MASK 1 ; src_offset ; (a * m + b * (64 - m) + 512) >> 10 ; = ((a - b) * m + (b << 6) + 512) >> 10 ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4 mova m3, [maskq+(%1+0)*(mmsize/2)] mova m0, [tmp2q+(%1+0)*mmsize] ; b psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a mova m6, m3 ; m psubb m3, m4, m6 ; -m paddw m1, m1 ; (b - a) << 1 paddb m3, m3 ; -m << 1 punpcklbw m2, m4, m3 ; -m << 9 (<< 8 when ext as uint16) pmulhw m1, m2 ; (-m * (b - a)) << 10 paddw m0, m1 ; + b mova m1, [tmp2q+(%1+1)*mmsize] ; b psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a paddw m2, m2 ; (b - a) << 1 mova m6, m3 ; (-m << 1) punpckhbw m3, m4, m6 ; (-m << 9) pmulhw m2, m3 ; (-m << 9) paddw m1, m2 ; (-m * (b - a)) << 10 pmulhrsw m0, m5 ; round pmulhrsw m1, m5 ; round packuswb m0, m1 ; interleave 16 -> 8 %endmacro %macro MASK_INC_PTR 1 add maskq, %1*mmsize/2 add tmp1q, %1*mmsize add tmp2q, %1*mmsize %endmacro %if ARCH_X86_64 cglobal mask_8bpc, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3 movifnidn hd, hm %else cglobal mask_8bpc, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3 %define hd dword r5m %endif %define base r6-mask_ssse3_table LEA r6, mask_ssse3_table tzcnt wd, wm movsxd wq, dword [r6+wq*4] pxor m4, m4 mova m5, [base+pw_2048] add wq, r6 mov maskq, r6m BIDIR_FN MASK %undef hd %macro W_MASK_420_END 1-* %rep %0 call .main paddw m2, [maskq+16*%1] mova [maskq+16*%1], m2 mova [dstq+strideq*1+16*(2*%1+0)], m0 call .main psubw m3, m7, m2 psubw m1, m7, [maskq+16*%1] psubw m3, [dstq+strideq*1+16*(2*%1+1)] psrlw m1, 2 psrlw m3, 2 packuswb m1, m3 mova [maskq+16*%1], m1 mova [dstq+strideq*1+16*(2*%1+1)], m0 %rotate 1 %endrep %endmacro %if UNIX64 DECLARE_REG_TMP 7 %else DECLARE_REG_TMP 5 %endif cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_420_ssse3_table LEA t0, w_mask_420_ssse3_table tzcnt wd, wm mov r6d, r7m ; sign sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_420_sign+r6*8] ; 258 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8 movifnidn hd, hm %else %define m8 [base+pw_6903] %define hd dword hm %endif mov maskq, maskmp call .main jmp wq .w4_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w4: pshufd m3, m2, q2020 pshufd m2, m2, q3131 psubw m1, m7, m3 psubw m1, m2 psrlw m1, 2 
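; m1 now holds the 4:2:0 chroma mask, effectively
; (m00 + m01 + m10 + m11 + 2 - sign) >> 2 per 2x2 luma block; the
; "258 - sign" constant in m7 cancels the (64 - m) bias of the pair sums
; produced by .main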
packuswb m1, m1 movd [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main add maskq, 4 lea dstq, [dstq+strideq*2] .w8: movhlps m3, m2 psubw m1, m7, m2 psubw m1, m3 psrlw m1, 2 packuswb m1, m1 movd [maskq], m1 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main add maskq, 8 lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*1], m2 mova [dstq+strideq*0], m0 call .main psubw m1, m7, [dstq+strideq*1] psubw m1, m2 psrlw m1, 2 packuswb m1, m1 movq [maskq], m1 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add maskq, 16 lea dstq, [dstq+strideq*2] .w32: mova [maskq], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 W_MASK_420_END 0 sub hd, 2 jg .w32_loop RET .w64_loop: call .main add maskq, 16*2 lea dstq, [dstq+strideq*2] .w64: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 W_MASK_420_END 0, 1 sub hd, 2 jg .w64_loop RET .w128_loop: call .main add maskq, 16*4 lea dstq, [dstq+strideq*2] .w128: mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main mova [dstq+strideq*1+16*1], m2 mova [dstq+strideq*0+16*1], m0 call .main mova [maskq+16*1], m2 mova [dstq+strideq*0+16*2], m0 call .main mova [dstq+strideq*1+16*3], m2 mova [dstq+strideq*0+16*3], m0 call .main mova [maskq+16*2], m2 mova [dstq+strideq*0+16*4], m0 call .main mova [dstq+strideq*1+16*5], m2 mova [dstq+strideq*0+16*5], m0 call .main mova [maskq+16*3], m2 mova [dstq+strideq*0+16*6], m0 call .main mova [dstq+strideq*1+16*7], m2 mova [dstq+strideq*0+16*7], m0 W_MASK_420_END 0, 1, 2, 3 sub hd, 2 jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m8, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m8, m5 psrlw m3, 8 phaddw m2, m3 psllw m3, 10 pmulhw m4, m3 paddw m1, m4 pmulhrsw m0, m6 pmulhrsw m1, m6 packuswb m0, m1 ret %macro W_MASK_422_BACKUP 1 ; mask_offset %if ARCH_X86_64 mova m10, m2 %else mova [maskq+16*%1], m2 %endif %endmacro %macro W_MASK_422_END 1 ; mask_offset %if ARCH_X86_64 packuswb m10, m2 psubb m1, m7, m10 pavgb m1, m9 %else mova m3, [maskq+16*%1] packuswb m3, m2 pxor m2, m2 psubb m1, m7, m3 pavgb m1, m2 %endif mova [maskq+16*%1], m1 %endmacro cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_422_ssse3_table LEA t0, w_mask_422_ssse3_table tzcnt wd, wm mov r6d, r7m ; sign sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_2048] movddup m7, [base+wm_422_sign+r6*8] ; 128 - sign add wq, t0 %if ARCH_X86_64 mova m8, [base+pw_6903] pxor m9, m9 movifnidn hd, hm %else add t0, w_mask_420_ssse3_table-w_mask_422_ssse3_table %define hd dword hm %endif mov maskq, maskmp call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main jmp wq .w4_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 8 lea dstq, [dstq+strideq*2] .w4: packuswb m2, m2 psubb m1, m7, m2 %if ARCH_X86_64 pavgb m1, m9 %else pxor m2, m2 pavgb m1, m2 %endif 
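; m1 now holds the 4:2:2 chroma mask, effectively
; (m0 + m1 + 1 - sign) >> 1 per horizontal pixel pair; pavgb against a
; zero register performs the rounded halving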
movq [maskq], m1 movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w8: W_MASK_422_BACKUP 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main lea dstq, [dstq+strideq*2] W_MASK_422_END 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 4 jg .w8_loop RET .w16_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 lea dstq, [dstq+strideq*2] .w16: W_MASK_422_BACKUP 0 mova [dstq+strideq*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16 add dstq, strideq .w32: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 dec hd jg .w32_loop RET .w64_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*2 add dstq, strideq .w64: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main add maskq, 16*4 add dstq, strideq .w128: W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 0 mova [dstq+16*1], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 1 mova [dstq+16*2], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 1 mova [dstq+16*3], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 2 mova [dstq+16*4], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 2 mova [dstq+16*5], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_BACKUP 3 mova [dstq+16*6], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main W_MASK_422_END 3 mova [dstq+16*7], m0 dec hd jg .w128_loop RET cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask %define base t0-w_mask_444_ssse3_table LEA t0, w_mask_444_ssse3_table tzcnt wd, wm mov maskq, maskmp sub tmp2q, tmp1q movsxd wq, [t0+wq*4] mova m6, [base+pw_6903] mova m7, [base+pw_2048] add wq, t0 %if ARCH_X86_64 mova m8, [base+pb_64] movifnidn hd, hm %else %define m8 [base+pb_64] %define hd dword hm %endif call .main jmp wq .w4_loop: call .main lea dstq, [dstq+strideq*2] .w4: movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 punpckhqdq m0, m0 lea dstq, [dstq+strideq*2] movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 sub hd, 4 jg .w4_loop RET .w8_loop: call .main lea dstq, [dstq+strideq*2] .w8: movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 jg .w8_loop RET .w16_loop: call .main lea dstq, [dstq+strideq*2] .w16: mova [dstq+strideq*0], m0 call .main mova [dstq+strideq*1], m0 sub hd, 2 jg .w16_loop RET .w32_loop: call .main add dstq, strideq .w32: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 
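; unlike the 420/422 variants, w_mask_444 needs no mask subsampling: .main
; already stores the per-pixel mask and advances maskq, so the per-width
; code here only stores the blended pixels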
dec hd jg .w32_loop RET .w64_loop: call .main add dstq, strideq .w64: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 dec hd jg .w64_loop RET .w128_loop: call .main add dstq, strideq .w128: mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 call .main mova [dstq+16*2], m0 call .main mova [dstq+16*3], m0 call .main mova [dstq+16*4], m0 call .main mova [dstq+16*5], m0 call .main mova [dstq+16*6], m0 call .main mova [dstq+16*7], m0 dec hd jg .w128_loop RET ALIGN function_align .main: mova m0, [tmp1q +16*0] mova m3, [tmp1q+tmp2q+16*0] mova m1, [tmp1q +16*1] mova m4, [tmp1q+tmp2q+16*1] add tmp1q, 16*2 psubw m3, m0 psubw m4, m1 pabsw m5, m3 psubusw m2, m6, m5 psrlw m2, 8 ; 64 - m psllw m5, m2, 10 pmulhw m3, m5 pabsw m5, m4 paddw m0, m3 psubusw m3, m6, m5 psrlw m3, 8 packuswb m2, m3 psllw m3, 10 pmulhw m4, m3 psubb m3, m8, m2 paddw m1, m4 pmulhrsw m0, m7 pmulhrsw m1, m7 mova [maskq], m3 add maskq, 16 packuswb m0, m1 ret %macro BLEND_64M 4; a, b, mask1, mask2 punpcklbw m0, %1, %2; {b;a}[7..0] punpckhbw %1, %2 ; {b;a}[15..8] pmaddubsw m0, %3 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmaddubsw %1, %4 ; {b*m[1] + (64-m[1])*a}[15..8] u16 pmulhrsw m0, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 pmulhrsw %1, m5 ; {((b*m[1] + (64-m[0])*a) + 1) / 32}[15..8] u16 packuswb m0, %1 ; {blendpx}[15..0] u8 %endmacro %macro BLEND 2; a, b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpckhbw m3, m0 ; {m;(64-m)}[15..8] BLEND_64M %1, %2, m2, m3 %endmacro cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask %define base r6-blend_ssse3_table LEA r6, blend_ssse3_table tzcnt wd, wm movifnidn hd, hm movifnidn maskq, maskmp movsxd wq, dword [r6+wq*4] mova m4, [base+pb_64] mova m5, [base+pw_512] add wq, r6 lea r6, [dsq*3] jmp wq .w4: movq m0, [maskq]; m movd m1, [dstq+dsq*0] ; a movd m6, [dstq+dsq*1] punpckldq m1, m6 movq m6, [tmpq] ; b psubb m3, m4, m0 ; m3 = (64 - m) punpcklbw m2, m3, m0 ; {m;(64-m)}[7..0] punpcklbw m1, m6 ; {b;a}[7..0] pmaddubsw m1, m2 ; {b*m[0] + (64-m[0])*a}[7..0] u16 pmulhrsw m1, m5 ; {((b*m[0] + (64-m[0])*a) + 1) / 32}[7..0] u16 packuswb m1, m0 ; {blendpx}[15..0] u8 movd [dstq+dsq*0], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 add maskq, 8 add tmpq, 8 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w4 RET .w8: mova m0, [maskq]; m movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m6, [tmpq] ; b BLEND m1, m6 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 add maskq, 16 add tmpq, 16 lea dstq, [dstq+dsq*2] ; dst_stride * 2 sub hd, 2 jg .w8 RET .w16: mova m0, [maskq]; m mova m1, [dstq] ; a mova m6, [tmpq] ; b BLEND m1, m6 mova [dstq], m0 add maskq, 16 add tmpq, 16 add dstq, dsq ; dst_stride dec hd jg .w16 RET .w32: %assign i 0 %rep 2 mova m0, [maskq+16*i]; m mova m1, [dstq+16*i] ; a mova m6, [tmpq+16*i] ; b BLEND m1, m6 mova [dstq+i*16], m0 %assign i i+1 %endrep add maskq, 32 add tmpq, 32 add dstq, dsq ; dst_stride dec hd jg .w32 RET cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mask %define base r5-blend_v_ssse3_table LEA r5, blend_v_ssse3_table tzcnt wd, wm movifnidn hd, hm movsxd wq, dword [r5+wq*4] mova m5, [base+pw_512] add wq, r5 add maskq, obmc_masks-blend_v_ssse3_table jmp wq .w2: movd m3, [maskq+4] punpckldq m3, m3 ; 2 mask blend is provided for 4 pixels / 2 lines .w2_loop: movd m1, [dstq+dsq*0] ; a {..;a;a} pinsrw m1, [dstq+dsq*1], 1 movd m2, [tmpq] ; b punpcklbw m0, m1, m2; {b;a}[7..0] pmaddubsw m0, m3 ; {b*m + (64-m)*a}[7..0] u16 pmulhrsw m0, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] 
u16 packuswb m0, m1 ; {blendpx}[8..0] u8 movd r3d, m0 mov [dstq+dsq*0], r3w shr r3d, 16 mov [dstq+dsq*1], r3w add tmpq, 2*2 lea dstq, [dstq + dsq * 2] sub hd, 2 jg .w2_loop RET .w4: movddup m3, [maskq+8] ; 4 mask blend is provided for 8 pixels / 2 lines .w4_loop: movd m1, [dstq+dsq*0] ; a movd m2, [dstq+dsq*1] ; punpckldq m1, m2 movq m2, [tmpq] ; b punpcklbw m1, m2 ; {b;a}[7..0] pmaddubsw m1, m3 ; {b*m + (64-m)*a}[7..0] u16 pmulhrsw m1, m5 ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16 packuswb m1, m1 ; {blendpx}[8..0] u8 movd [dstq], m1 psrlq m1, 32 movd [dstq+dsq*1], m1 add tmpq, 2*4 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w4_loop RET .w8: mova m3, [maskq+16] ; 8 mask blend is provided for 16 pixels .w8_loop: movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m2, [tmpq]; b BLEND_64M m1, m2, m3, m3 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 add tmpq, 16 lea dstq, [dstq+dsq*2] sub hd, 2 jg .w8_loop RET .w16: ; 16 mask blend is provided for 32 pixels mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) .w16_loop: mova m1, [dstq] ; a mova m2, [tmpq] ; b BLEND_64M m1, m2, m3, m4 mova [dstq], m0 add tmpq, 16 add dstq, dsq dec hd jg .w16_loop RET .w32: %if WIN64 mova [rsp+8], xmm6 %endif mova m3, [maskq+64] ; obmc_masks_32[0] (64-m[0]) mova m4, [maskq+80] ; obmc_masks_32[1] (64-m[1]) mova m6, [maskq+96] ; obmc_masks_32[2] (64-m[2]) ; 16 mask blend is provided for 64 pixels .w32_loop: mova m1, [dstq+16*0] ; a mova m2, [tmpq+16*0] ; b BLEND_64M m1, m2, m3, m4 movq m1, [dstq+16*1] ; a punpcklbw m1, [tmpq+16*1] ; b pmaddubsw m1, m6 pmulhrsw m1, m5 packuswb m1, m1 mova [dstq+16*0], m0 movq [dstq+16*1], m1 add tmpq, 32 add dstq, dsq dec hd jg .w32_loop %if WIN64 mova xmm6, [rsp+8] %endif RET cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mask %define base t0-blend_h_ssse3_table %if ARCH_X86_32 ; We need to keep the PIC pointer for w4, reload wd from stack instead DECLARE_REG_TMP 6 %else DECLARE_REG_TMP 5 mov r6d, wd %endif LEA t0, blend_h_ssse3_table tzcnt wd, wm mov hd, hm movsxd wq, dword [t0+wq*4] mova m5, [base+pw_512] add wq, t0 lea maskq, [base+obmc_masks+hq*2] lea hd, [hq*3] shr hd, 2 ; h * 3/4 lea maskq, [maskq+hq*2] neg hq jmp wq .w2: movd m0, [dstq+dsq*0] pinsrw m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] movd m1, [tmpq] punpcklwd m2, m2 punpcklbw m0, m1 pmaddubsw m0, m2 pmulhrsw m0, m5 packuswb m0, m0 movd r3d, m0 mov [dstq+dsq*0], r3w shr r3d, 16 mov [dstq+dsq*1], r3w lea dstq, [dstq+dsq*2] add tmpq, 2*2 add hq, 2 jl .w2 RET .w4: %if ARCH_X86_32 mova m3, [base+blend_shuf] %else mova m3, [blend_shuf] %endif .w4_loop: movd m0, [dstq+dsq*0] movd m2, [dstq+dsq*1] punpckldq m0, m2 ; a movq m1, [tmpq] ; b movq m2, [maskq+hq*2] ; m pshufb m2, m3 punpcklbw m0, m1 pmaddubsw m0, m2 pmulhrsw m0, m5 packuswb m0, m0 movd [dstq+dsq*0], m0 psrlq m0, 32 movd [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add tmpq, 4*2 add hq, 2 jl .w4_loop RET .w8: movd m4, [maskq+hq*2] punpcklwd m4, m4 pshufd m3, m4, q0000 pshufd m4, m4, q1111 movq m1, [dstq+dsq*0] ; a movhps m1, [dstq+dsq*1] mova m2, [tmpq] BLEND_64M m1, m2, m3, m4 movq [dstq+dsq*0], m0 movhps [dstq+dsq*1], m0 lea dstq, [dstq+dsq*2] add tmpq, 8*2 add hq, 2 jl .w8 RET ; w16/w32/w64/w128 .w16: %if ARCH_X86_32 mov r6d, wm %endif sub dsq, r6 .w16_loop0: movd m3, [maskq+hq*2] pshuflw m3, m3, q0000 punpcklqdq m3, m3 mov wd, r6d .w16_loop: mova m1, [dstq] ; a mova m2, [tmpq] ; b BLEND_64M m1, m2, m3, m3 mova [dstq], m0 add dstq, 16 add tmpq, 16 sub wd, 16 jg .w16_loop add dstq, dsq inc hq jl .w16_loop0 RET ; emu_edge 
args: ; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih, ; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride, ; const pixel *ref, const ptrdiff_t ref_stride ; ; bw, bh total filled size ; iw, ih, copied block -> fill bottom, right ; x, y, offset in bw/bh -> fill top, left cglobal emu_edge_8bpc, 10, 13, 2, bw, bh, iw, ih, x, \ y, dst, dstride, src, sstride, \ bottomext, rightext, blk ; we assume that the buffer (stride) is larger than width, so we can ; safely overwrite by a few bytes pxor m1, m1 %if ARCH_X86_64 %define reg_zero r12q %define reg_tmp r10 %define reg_src srcq %define reg_bottomext bottomextq %define reg_rightext rightextq %define reg_blkm r9m %else %define reg_zero r6 %define reg_tmp r0 %define reg_src r1 %define reg_bottomext r0 %define reg_rightext r1 %define reg_blkm r2m %endif ; ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) xor reg_zero, reg_zero lea reg_tmp, [ihq-1] cmp yq, ihq cmovs reg_tmp, yq test yq, yq cmovs reg_tmp, reg_zero %if ARCH_X86_64 imul reg_tmp, sstrideq add srcq, reg_tmp %else imul reg_tmp, sstridem mov reg_src, srcm add reg_src, reg_tmp %endif ; ; ref += iclip(x, 0, iw - 1) lea reg_tmp, [iwq-1] cmp xq, iwq cmovs reg_tmp, xq test xq, xq cmovs reg_tmp, reg_zero add reg_src, reg_tmp %if ARCH_X86_32 mov srcm, reg_src %endif ; ; bottom_ext = iclip(y + bh - ih, 0, bh - 1) %if ARCH_X86_32 mov r1, r1m ; restore bh %endif lea reg_bottomext, [yq+bhq] sub reg_bottomext, ihq lea r3, [bhq-1] cmovs reg_bottomext, reg_zero ; DEFINE_ARGS bw, bh, iw, ih, x, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; top_ext = iclip(-y, 0, bh - 1) neg topextq cmovs topextq, reg_zero cmp reg_bottomext, bhq cmovns reg_bottomext, r3 cmp topextq, bhq cmovg topextq, r3 %if ARCH_X86_32 mov r4m, reg_bottomext ; ; right_ext = iclip(x + bw - iw, 0, bw - 1) mov r0, r0m ; restore bw %endif lea reg_rightext, [xq+bwq] sub reg_rightext, iwq lea r2, [bwq-1] cmovs reg_rightext, reg_zero DEFINE_ARGS bw, bh, iw, ih, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; left_ext = iclip(-x, 0, bw - 1) neg leftextq cmovs leftextq, reg_zero cmp reg_rightext, bwq cmovns reg_rightext, r2 %if ARCH_X86_32 mov r3m, r1 %endif cmp leftextq, bwq cmovns leftextq, r2 %undef reg_zero %undef reg_tmp %undef reg_src %undef reg_bottomext %undef reg_rightext DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \ topext, dst, dstride, src, sstride, \ bottomext, rightext, blk ; center_h = bh - top_ext - bottom_ext %if ARCH_X86_64 lea r3, [bottomextq+topextq] sub centerhq, r3 %else mov r1, centerhm ; restore r1 sub centerhq, topextq sub centerhq, r4m mov r1m, centerhq %endif ; ; blk += top_ext * PXSTRIDE(dst_stride) mov r2, topextq %if ARCH_X86_64 imul r2, dstrideq %else mov r6, r6m ; restore dstq imul r2, dstridem %endif add dstq, r2 mov reg_blkm, dstq ; save pointer for ext ; ; center_w = bw - left_ext - right_ext mov centerwq, bwq %if ARCH_X86_64 lea r3, [rightextq+leftextq] sub centerwq, r3 %else sub centerwq, r3m sub centerwq, leftextq %endif ; vloop Macro %macro v_loop 3 ; need_left_ext, need_right_ext, suffix %if ARCH_X86_64 %define reg_tmp r12 %else %define reg_tmp r0 %endif .v_loop_%3: %if ARCH_X86_32 mov r0, r0m mov r1, r1m %endif %if %1 ; left extension %if ARCH_X86_64 movd m0, [srcq] %else mov r3, srcm movd m0, [r3] %endif pshufb m0, m1 xor r3, r3 .left_loop_%3: mova [dstq+r3], m0 add r3, mmsize cmp r3, leftextq jl .left_loop_%3 ; body lea reg_tmp, [dstq+leftextq] %endif xor r3, r3 .body_loop_%3: %if ARCH_X86_64 movu 
m0, [srcq+r3] %else mov r1, srcm movu m0, [r1+r3] %endif %if %1 movu [reg_tmp+r3], m0 %else movu [dstq+r3], m0 %endif add r3, mmsize cmp r3, centerwq jl .body_loop_%3 %if %2 ; right extension %if %1 add reg_tmp, centerwq %else lea reg_tmp, [dstq+centerwq] %endif %if ARCH_X86_64 movd m0, [srcq+centerwq-1] %else mov r3, srcm movd m0, [r3+centerwq-1] %endif pshufb m0, m1 xor r3, r3 .right_loop_%3: movu [reg_tmp+r3], m0 add r3, mmsize %if ARCH_X86_64 cmp r3, rightextq %else cmp r3, r3m %endif jl .right_loop_%3 %endif %if ARCH_X86_64 add dstq, dstrideq add srcq, sstrideq dec centerhq jg .v_loop_%3 %else add dstq, dstridem mov r0, sstridem add srcm, r0 sub dword centerhm, 1 jg .v_loop_%3 mov r0, r0m ; restore r0 %endif %endmacro ; vloop MACRO test leftextq, leftextq jnz .need_left_ext %if ARCH_X86_64 test rightextq, rightextq jnz .need_right_ext %else cmp leftextq, r3m ; leftextq == 0 jne .need_right_ext %endif v_loop 0, 0, 0 jmp .body_done ;left right extensions .need_left_ext: %if ARCH_X86_64 test rightextq, rightextq %else mov r3, r3m test r3, r3 %endif jnz .need_left_right_ext v_loop 1, 0, 1 jmp .body_done .need_left_right_ext: v_loop 1, 1, 2 jmp .body_done .need_right_ext: v_loop 0, 1, 3 .body_done: ; r0 ; bw ; r1 ;; x loop ; r4 ;; y loop ; r5 ; topextq ; r6 ;dstq ; r7 ;dstrideq ; r8 ; srcq %if ARCH_X86_64 %define reg_dstride dstrideq %else %define reg_dstride r2 %endif ; ; bottom edge extension %if ARCH_X86_64 test bottomextq, bottomextq jz .top %else xor r1, r1 cmp r1, r4m je .top %endif ; %if ARCH_X86_64 mov srcq, dstq sub srcq, dstrideq xor r1, r1 %else mov r3, dstq mov reg_dstride, dstridem sub r3, reg_dstride mov srcm, r3 %endif ; .bottom_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] lea r3, [dstq+r1] mov r4, bottomextq %else mov r3, srcm mova m0, [r3+r1] lea r3, [dstq+r1] mov r4, r4m %endif ; .bottom_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .bottom_y_loop add r1, mmsize cmp r1, bwq jl .bottom_x_loop .top: ; top edge extension test topextq, topextq jz .end %if ARCH_X86_64 mov srcq, reg_blkm %else mov r3, reg_blkm mov reg_dstride, dstridem %endif mov dstq, dstm xor r1, r1 ; .top_x_loop: %if ARCH_X86_64 mova m0, [srcq+r1] %else mov r3, reg_blkm mova m0, [r3+r1] %endif lea r3, [dstq+r1] mov r4, topextq ; .top_y_loop: mova [r3], m0 add r3, reg_dstride dec r4 jg .top_y_loop add r1, mmsize cmp r1, bwq jl .top_x_loop .end: RET %undef reg_dstride %undef reg_blkm %undef reg_tmp cextern resize_filter %macro SCRATCH 3 %if ARCH_X86_32 mova [rsp+%3*mmsize], m%1 %define m%2 [rsp+%3*mmsize] %else SWAP %1, %2 %endif %endmacro %if ARCH_X86_64 cglobal resize_8bpc, 0, 12, 14, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %elif STACK_ALIGNMENT >= 16 cglobal resize_8bpc, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %else cglobal resize_8bpc, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ dst_w, h, src_w, dx, mx0 %endif movifnidn dstq, dstmp movifnidn srcq, srcmp %if STACK_ALIGNMENT >= 16 movifnidn dst_wd, dst_wm %endif %if ARCH_X86_64 movifnidn hd, hm %endif sub dword mx0m, 4<<14 sub dword src_wm, 8 movd m7, dxm movd m6, mx0m movd m5, src_wm pshufd m7, m7, q0000 pshufd m6, m6, q0000 pshufd m5, m5, q0000 %if ARCH_X86_64 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x LEA r7, $$ %define base r7-$$ %else DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x %define hd dword r5m %if STACK_ALIGNMENT >= 16 LEA r6, $$ %define base r6-$$ %else LEA r4, $$ %define base r4-$$ %endif %endif %if ARCH_X86_64 mova m10, [base+pw_m256] mova m9, 
[base+pd_63] mova m8, [base+pb_8x0_8x8] %else %define m10 [base+pw_m256] %define m9 [base+pd_63] %define m8 [base+pb_8x0_8x8] %endif pmaddwd m4, m7, [base+rescale_mul] ; dx*[0,1,2,3] pslld m7, 2 ; dx*4 pslld m5, 14 paddd m6, m4 ; mx+[0..3]*dx SCRATCH 7, 13, 0 SCRATCH 6, 12, 1 SCRATCH 5, 11, 2 ; m10 = pmulhrsw constant for x=(x+64)>>7 ; m12 = mx+[0..3]*dx, m13 = dx*4, m11 = src_w, m9 = 0x3f, m8=0,8 .loop_y: xor xd, xd mova m0, m12 ; per-line working version of mx .loop_x: pxor m1, m1 pcmpgtd m1, m0 pandn m1, m0 psrad m2, m0, 8 ; filter offset (unmasked) pcmpgtd m3, m11, m1 pand m1, m3 pandn m3, m11 por m1, m3 psubd m3, m0, m1 ; pshufb offset psrad m1, 14 ; clipped src_x offset psrad m3, 14 ; pshufb edge_emu offset pand m2, m9 ; filter offset (masked) ; load source pixels %if ARCH_X86_64 movd r8d, m1 pshuflw m1, m1, q3232 movd r9d, m1 punpckhqdq m1, m1 movd r10d, m1 psrlq m1, 32 movd r11d, m1 movq m4, [srcq+r8] movq m5, [srcq+r10] movhps m4, [srcq+r9] movhps m5, [srcq+r11] %else movd r3d, m1 pshufd m1, m1, q3312 movd r1d, m1 pshuflw m1, m1, q3232 movq m4, [srcq+r3] movq m5, [srcq+r1] movd r3d, m1 punpckhqdq m1, m1 movd r1d, m1 movhps m4, [srcq+r3] movhps m5, [srcq+r1] %endif ; if no emulation is required, we don't need to shuffle or emulate edges ; this also saves 2 quasi-vpgatherdqs pxor m6, m6 pcmpeqb m6, m3 %if ARCH_X86_64 pmovmskb r8d, m6 cmp r8d, 0xffff %else pmovmskb r3d, m6 cmp r3d, 0xffff %endif je .filter %if ARCH_X86_64 movd r8d, m3 pshuflw m3, m3, q3232 movd r9d, m3 punpckhqdq m3, m3 movd r10d, m3 psrlq m3, 32 movd r11d, m3 movsxd r8, r8d movsxd r9, r9d movsxd r10, r10d movsxd r11, r11d movq m6, [base+resize_shuf+4+r8] movq m7, [base+resize_shuf+4+r10] movhps m6, [base+resize_shuf+4+r9] movhps m7, [base+resize_shuf+4+r11] %else movd r3d, m3 pshufd m3, m3, q3312 movd r1d, m3 pshuflw m3, m3, q3232 movq m6, [base+resize_shuf+4+r3] movq m7, [base+resize_shuf+4+r1] movd r3d, m3 punpckhqdq m3, m3 movd r1d, m3 movhps m6, [base+resize_shuf+4+r3] movhps m7, [base+resize_shuf+4+r1] %endif paddb m6, m8 paddb m7, m8 pshufb m4, m6 pshufb m5, m7 .filter: %if ARCH_X86_64 movd r8d, m2 pshuflw m2, m2, q3232 movd r9d, m2 punpckhqdq m2, m2 movd r10d, m2 psrlq m2, 32 movd r11d, m2 movq m6, [base+resize_filter+r8*8] movq m7, [base+resize_filter+r10*8] movhps m6, [base+resize_filter+r9*8] movhps m7, [base+resize_filter+r11*8] %else movd r3d, m2 pshufd m2, m2, q3312 movd r1d, m2 pshuflw m2, m2, q3232 movq m6, [base+resize_filter+r3*8] movq m7, [base+resize_filter+r1*8] movd r3d, m2 punpckhqdq m2, m2 movd r1d, m2 movhps m6, [base+resize_filter+r3*8] movhps m7, [base+resize_filter+r1*8] %endif pmaddubsw m4, m6 pmaddubsw m5, m7 phaddw m4, m5 phaddsw m4, m4 pmulhrsw m4, m10 ; x=(x+64)>>7 packuswb m4, m4 movd [dstq+xq], m4 paddd m0, m13 add xd, 4 %if STACK_ALIGNMENT >= 16 cmp xd, dst_wd %else cmp xd, dst_wm %endif jl .loop_x add dstq, dst_stridemp add srcq, src_stridemp dec hd jg .loop_y RET INIT_XMM ssse3 PREP_BILIN PREP_8TAP WARP_AFFINE_8X8 WARP_AFFINE_8X8T INIT_XMM sse4 WARP_AFFINE_8X8 WARP_AFFINE_8X8T INIT_XMM sse2 PREP_BILIN PREP_8TAP WARP_AFFINE_8X8 WARP_AFFINE_8X8T rav1e-0.7.1/src/x86/me.asm000064400000000000000000000064211046102023000131710ustar 00000000000000; Copyright (c) 2018, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION .text %macro W_ABS_DIFF 8 psubw %1, %5 psubw %2, %6 psubw %3, %7 psubw %4, %8 pabsw %1, %1 pabsw %2, %2 pabsw %3, %3 pabsw %4, %4 %endmacro INIT_XMM ssse3 cglobal sad_4x4_hbd, 4, 6, 8, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] movq m0, [srcq] movq m1, [srcq+src_strideq*1] movq m2, [srcq+src_strideq*2] movq m3, [srcq+src_stride3q] movq m4, [dstq] movq m5, [dstq+dst_strideq*1] movq m6, [dstq+dst_strideq*2] movq m7, [dstq+dst_stride3q] W_ABS_DIFF m0, m1, m2, m3, m4, m5, m6, m7 ; Don't convert to 32 bit integers: 4*4 abs diffs of 12-bits fits in 16 bits. ; Accumulate onto m0 %define sum m0 paddw sum, m1 paddw m2, m3 paddw sum, m2 ; Horizontal reduction pshuflw m1, sum, q2323 paddw sum, m1 pshuflw m1, sum, q1111 paddw sum, m1 movd eax, sum ; Convert to 16-bits since the upper half of eax is dirty movzx eax, ax RET %if ARCH_X86_64 INIT_XMM ssse3 cglobal sad_16x16_hbd, 4, 5, 10, src, src_stride, dst, dst_stride, \ cnt mov cntd, 8 %define sum m0 pxor sum, sum pxor m9, m9 .loop: movu m1, [srcq] movu m2, [srcq+16] movu m3, [srcq+src_strideq] movu m4, [srcq+src_strideq+16] lea srcq, [srcq+src_strideq*2] movu m5, [dstq] movu m6, [dstq+16] movu m7, [dstq+dst_strideq] movu m8, [dstq+dst_strideq+16] lea dstq, [dstq+dst_strideq*2] W_ABS_DIFF m1, m2, m3, m4, m5, m6, m7, m8 paddw m1, m2 paddw m3, m4 ; Convert to 32-bits punpcklwd m2, m1, m9 punpcklwd m4, m3, m9 punpckhwd m1, m9 punpckhwd m3, m9 paddd m1, m2 paddd m3, m4 paddd sum, m1 paddd sum, m3 dec cntd jg .loop ; Horizontal reduction movhlps m1, sum paddd sum, m1 pshufd m1, sum, q1111 paddd sum, m1 movd eax, sum RET %endif rav1e-0.7.1/src/x86/msac.asm000064400000000000000000000440461046102023000135200ustar 00000000000000; Copyright © 2019, VideoLAN and dav1d authors ; Copyright © 2019, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
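; This file provides the SIMD hot paths of the range ("msac") decoder:
; msac_decode_symbol_adapt{4,8,16} decode one symbol from an adaptive CDF of
; up to 4/8/16 entries, msac_decode_bool{,_adapt,_equi} decode single bits,
; and msac_decode_hi_tok loops a 3-symbol CDF for high coefficient tokens.
; When update_cdf is set, the CDF adaptation follows the scalar form quoted
; in the comments below:
;   rate = 4 + (count >> 4) + (n_symbols > 2)
;   cdf[i] += (32768 - cdf[i]) >> rate        for i <  val
;   cdf[i] += ((-1 - cdf[i]) >> rate) + 1     for i >= val
;   count  += (count < 32)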
%include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 64 ; avoids cacheline splits min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 pw_0xff00: times 8 dw 0xff00 pw_32: times 8 dw 32 %if ARCH_X86_64 %define resp resq %define movp movq %define c_shuf q3333 %macro DECODE_SYMBOL_ADAPT_INIT 0-1 %endmacro %else %define resp resd %define movp movd %define c_shuf q1111 %macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok mov t0, r0m mov t1, r1m %if %1 == 0 mov t2, r2m %endif %if STACK_ALIGNMENT >= 16 sub esp, 40-%1*4 %else mov eax, esp and esp, ~15 sub esp, 40-%1*4 mov [esp], eax %endif %endmacro %endif struc msac .buf: resp 1 .end: resp 1 .dif: resp 1 .rng: resd 1 .cnt: resd 1 .update_cdf: resd 1 endstruc %define m(x, y) mangle(private_prefix %+ _ %+ x %+ y) SECTION .text %if WIN64 DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8 %define buf rsp+stack_offset+8 ; shadow space %elif UNIX64 DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8 %define buf rsp-40 ; red zone %else DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3 %define buf esp+8 %endif INIT_XMM sse2 cglobal msac_decode_symbol_adapt4, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT LEA rax, pw_0xff00 movd m2, [t0+msac.rng] movq m1, [t1] movp m3, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 ; -(n_symbols + 1) pshuflw m2, m2, q0000 movd [buf+12], m2 pand m2, [rax] mova m0, m1 psrlw m1, 6 psllw m1, 7 pmulhuw m1, m2 movq m2, [rax+t2*2] pshuflw m3, m3, c_shuf paddw m1, m2 mova [buf+16], m1 psubusw m1, m3 pxor m2, m2 pcmpeqw m1, m2 ; c >= v pmovmskb eax, m1 test t3d, t3d jz .renorm ; !allow_update_cdf ; update_cdf: movzx t3d, word [t1+t4*2] ; count pcmpeqw m2, m2 mov t2d, t3d shr t3d, 4 cmp t4d, 3 sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4 cmp t2d, 32 adc t2d, 0 ; count + (count < 32) movd m3, t3d pavgw m2, m1 ; i >= val ? 
-1 : 32768 psubw m2, m0 ; for (i = 0; i < val; i++) psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate; psraw m2, m3 ; for (; i < n_symbols; i++) paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1; movq [t1], m0 mov [t1+t4*2], t2w .renorm: tzcnt eax, eax mov t4, [t0+msac.dif] movzx t1d, word [buf+rax+16] ; v movzx t2d, word [buf+rax+14] ; u shr eax, 1 .renorm2: %if ARCH_X86_64 == 0 %if STACK_ALIGNMENT >= 16 add esp, 40 %else mov esp, [esp] %endif %endif not t4 sub t2d, t1d ; rng shl t1, gprsize*8-16 add t4, t1 ; ~dif .renorm3: mov t1d, [t0+msac.cnt] movifnidn t7, t0 .renorm4: bsr ecx, t2d xor ecx, 15 ; d .renorm5: shl t2d, cl shl t4, cl mov [t7+msac.rng], t2d not t4 sub t1d, ecx jae .end ; no refill required ; refill: mov t2, [t7+msac.buf] mov rcx, [t7+msac.end] %if ARCH_X86_64 == 0 push t5 %endif lea t5, [t2+gprsize] cmp t5, rcx ja .refill_eob mov t2, [t2] lea ecx, [t1+23] add t1d, 16 shr ecx, 3 ; shift_bytes bswap t2 sub t5, rcx shl ecx, 3 ; shift_bits shr t2, cl sub ecx, t1d ; shift_bits - 16 - cnt mov t1d, gprsize*8-16 shl t2, cl mov [t7+msac.buf], t5 sub t1d, ecx ; cnt + gprsize*8 - shift_bits xor t4, t2 %if ARCH_X86_64 == 0 pop t5 %endif .end: mov [t7+msac.cnt], t1d mov [t7+msac.dif], t4 RET .refill_eob: ; avoid overreading the input buffer mov t5, rcx mov ecx, gprsize*8-24 sub ecx, t1d ; c .refill_eob_loop: cmp t2, t5 jae .refill_eob_end ; eob reached movzx t1d, byte [t2] inc t2 shl t1, cl xor t4, t1 sub ecx, 8 jge .refill_eob_loop .refill_eob_end: mov t1d, gprsize*8-24 %if ARCH_X86_64 == 0 pop t5 %endif sub t1d, ecx mov [t7+msac.buf], t2 mov [t7+msac.dif], t4 mov [t7+msac.cnt], t1d RET cglobal msac_decode_symbol_adapt8, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT LEA rax, pw_0xff00 movd m2, [t0+msac.rng] mova m1, [t1] movp m3, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 pshuflw m2, m2, q0000 movd [buf+12], m2 punpcklqdq m2, m2 mova m0, m1 psrlw m1, 6 pand m2, [rax] psllw m1, 7 pmulhuw m1, m2 movu m2, [rax+t2*2] pshuflw m3, m3, c_shuf paddw m1, m2 punpcklqdq m3, m3 mova [buf+16], m1 psubusw m1, m3 pxor m2, m2 pcmpeqw m1, m2 pmovmskb eax, m1 test t3d, t3d jz m(msac_decode_symbol_adapt4, SUFFIX).renorm movzx t3d, word [t1+t4*2] pcmpeqw m2, m2 mov t2d, t3d shr t3d, 4 cmp t4d, 3 ; may be called with n_symbols <= 2 sbb t3d, -5 cmp t2d, 32 adc t2d, 0 movd m3, t3d pavgw m2, m1 psubw m2, m0 psubw m0, m1 psraw m2, m3 paddw m0, m2 mova [t1], m0 mov [t1+t4*2], t2w jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm cglobal msac_decode_symbol_adapt16, 0, 6, 6 DECODE_SYMBOL_ADAPT_INIT LEA rax, pw_0xff00 movd m4, [t0+msac.rng] mova m2, [t1] mova m3, [t1+16] movp m5, [t0+msac.dif] mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 %if WIN64 sub rsp, 48 ; need 36 bytes, shadow space is only 32 %endif pshuflw m4, m4, q0000 movd [buf-4], m4 punpcklqdq m4, m4 mova m0, m2 psrlw m2, 6 mova m1, m3 psrlw m3, 6 pand m4, [rax] psllw m2, 7 psllw m3, 7 pmulhuw m2, m4 pmulhuw m3, m4 movu m4, [rax+t2*2] pshuflw m5, m5, c_shuf paddw m2, m4 psubw m4, [rax-pw_0xff00+pw_32] punpcklqdq m5, m5 paddw m3, m4 mova [buf], m2 psubusw m2, m5 mova [buf+16], m3 psubusw m3, m5 pxor m4, m4 pcmpeqw m2, m4 pcmpeqw m3, m4 packsswb m5, m2, m3 pmovmskb eax, m5 test t3d, t3d jz .renorm movzx t3d, word [t1+t4*2] pcmpeqw m4, m4 mova m5, m4 lea t2d, [t3+80] ; only support n_symbols > 2 shr t2d, 4 cmp t3d, 32 adc t3d, 0 pavgw m4, m2 pavgw m5, m3 psubw m4, m0 psubw m0, m2 movd m2, t2d psubw m5, m1 psubw m1, m3 psraw m4, m2 psraw m5, m2 paddw m0, m4 paddw m1, m5 mova [t1], m0 mova [t1+16], m1 mov [t1+t4*2], t3w .renorm: tzcnt eax, 
eax mov t4, [t0+msac.dif] movzx t1d, word [buf+rax*2] movzx t2d, word [buf+rax*2-2] %if WIN64 add rsp, 48 %endif jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2 cglobal msac_decode_bool_adapt, 0, 6, 0 movifnidn t1, r1mp movifnidn t0, r0mp movzx eax, word [t1] movzx t3d, byte [t0+msac.rng+1] mov t4, [t0+msac.dif] mov t2d, [t0+msac.rng] %if ARCH_X86_64 mov t5d, eax %endif and eax, ~63 imul eax, t3d %if UNIX64 mov t6, t4 %endif shr eax, 7 add eax, 4 ; v mov t3d, eax shl rax, gprsize*8-16 ; vw sub t2d, t3d ; r - v sub t4, rax ; dif - vw setb al cmovb t2d, t3d mov t3d, [t0+msac.update_cdf] %if UNIX64 cmovb t4, t6 %else cmovb t4, [t0+msac.dif] %endif %if ARCH_X86_64 == 0 movzx eax, al %endif not t4 test t3d, t3d jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3 %if UNIX64 == 0 push t6 %endif movzx t6d, word [t1+2] %if ARCH_X86_64 == 0 push t5 movzx t5d, word [t1] %endif movifnidn t7, t0 lea ecx, [t6+64] cmp t6d, 32 adc t6d, 0 mov [t1+2], t6w imul t6d, eax, -32769 shr ecx, 4 ; rate add t6d, t5d ; if (bit) sub t5d, eax ; cdf[0] -= ((cdf[0] - 32769) >> rate) + 1; sar t6d, cl ; else sub t5d, t6d ; cdf[0] -= cdf[0] >> rate; mov [t1], t5w %if WIN64 mov t1d, [t7+msac.cnt] pop t6 jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4 %else %if ARCH_X86_64 == 0 pop t5 pop t6 %endif jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 %endif cglobal msac_decode_bool_equi, 0, 6, 0 movifnidn t0, r0mp mov t1d, [t0+msac.rng] mov t4, [t0+msac.dif] mov t2d, t1d mov t1b, 8 mov t3, t4 mov eax, t1d shr t1d, 1 ; v shl rax, gprsize*8-17 ; vw sub t2d, t1d ; r - v sub t4, rax ; dif - vw cmovb t2d, t1d mov t1d, [t0+msac.cnt] cmovb t4, t3 movifnidn t7, t0 mov ecx, 0xbfff setb al ; the upper 32 bits contains garbage but that's OK sub ecx, t2d not t4 ; In this case of this function, (d =) 16 - clz(v) = 2 - (v >> 14) ; i.e. 
(0 <= d <= 2) and v < (3 << 14) shr ecx, 14 ; d %if ARCH_X86_64 == 0 movzx eax, al %endif jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5 cglobal msac_decode_bool, 0, 6, 0 movifnidn t0, r0mp movifnidn t1d, r1m movzx eax, byte [t0+msac.rng+1] ; r >> 8 mov t4, [t0+msac.dif] mov t2d, [t0+msac.rng] and t1d, ~63 imul eax, t1d mov t3, t4 shr eax, 7 add eax, 4 ; v mov t1d, eax shl rax, gprsize*8-16 ; vw sub t2d, t1d ; r - v sub t4, rax ; dif - vw cmovb t2d, t1d cmovb t4, t3 setb al not t4 %if ARCH_X86_64 == 0 movzx eax, al %endif jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3 %macro HI_TOK 1 ; update_cdf %if ARCH_X86_64 == 0 mov eax, -24 %endif %%loop: %if %1 movzx t2d, word [t1+3*2] %endif mova m1, m0 pshuflw m2, m2, q0000 psrlw m1, 6 movd [buf+12], m2 pand m2, m4 psllw m1, 7 pmulhuw m1, m2 %if ARCH_X86_64 == 0 add eax, 5 mov [buf+8], eax %endif pshuflw m3, m3, c_shuf paddw m1, m5 movq [buf+16], m1 psubusw m1, m3 pxor m2, m2 pcmpeqw m1, m2 pmovmskb eax, m1 %if %1 lea ecx, [t2+80] pcmpeqw m2, m2 shr ecx, 4 cmp t2d, 32 adc t2d, 0 movd m3, ecx pavgw m2, m1 psubw m2, m0 psubw m0, m1 psraw m2, m3 paddw m0, m2 movq [t1], m0 mov [t1+3*2], t2w %endif tzcnt eax, eax movzx ecx, word [buf+rax+16] movzx t2d, word [buf+rax+14] not t4 %if ARCH_X86_64 add t6d, 5 %endif sub eax, 5 ; setup for merging the tok_br and tok branches sub t2d, ecx shl rcx, gprsize*8-16 add t4, rcx bsr ecx, t2d xor ecx, 15 shl t2d, cl shl t4, cl movd m2, t2d mov [t7+msac.rng], t2d not t4 sub t5d, ecx jae %%end mov t2, [t7+msac.buf] mov rcx, [t7+msac.end] %if UNIX64 == 0 push t8 %endif lea t8, [t2+gprsize] cmp t8, rcx ja %%refill_eob mov t2, [t2] lea ecx, [t5+23] add t5d, 16 shr ecx, 3 bswap t2 sub t8, rcx shl ecx, 3 shr t2, cl sub ecx, t5d mov t5d, gprsize*8-16 shl t2, cl mov [t7+msac.buf], t8 %if UNIX64 == 0 pop t8 %endif sub t5d, ecx xor t4, t2 %%end: movp m3, t4 %if ARCH_X86_64 add t6d, eax ; CF = tok_br < 3 || tok == 15 jnc %%loop lea eax, [t6+30] %else add eax, [buf+8] jnc %%loop add eax, 30 %if STACK_ALIGNMENT >= 16 add esp, 36 %else mov esp, [esp] %endif %endif mov [t7+msac.dif], t4 shr eax, 1 mov [t7+msac.cnt], t5d RET %%refill_eob: mov t8, rcx mov ecx, gprsize*8-24 sub ecx, t5d %%refill_eob_loop: cmp t2, t8 jae %%refill_eob_end movzx t5d, byte [t2] inc t2 shl t5, cl xor t4, t5 sub ecx, 8 jge %%refill_eob_loop %%refill_eob_end: %if UNIX64 == 0 pop t8 %endif mov t5d, gprsize*8-24 mov [t7+msac.buf], t2 sub t5d, ecx jmp %%end %endmacro cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6 DECODE_SYMBOL_ADAPT_INIT 1 %if ARCH_X86_64 == 0 && PIC LEA t2, min_prob+12*2 %define base t2-(min_prob+12*2) %else %define base 0 %endif movq m0, [t1] movd m2, [t0+msac.rng] mov eax, [t0+msac.update_cdf] movq m4, [base+pw_0xff00] movp m3, [t0+msac.dif] movq m5, [base+min_prob+12*2] mov t4, [t0+msac.dif] mov t5d, [t0+msac.cnt] %if ARCH_X86_64 mov t6d, -24 %endif movifnidn t7, t0 test eax, eax jz .no_update_cdf HI_TOK 1 .no_update_cdf: HI_TOK 0 %if ARCH_X86_64 INIT_YMM avx2 cglobal msac_decode_symbol_adapt16, 3, 6, 6 lea rax, [pw_0xff00] vpbroadcastw m2, [t0+msac.rng] mova m0, [t1] vpbroadcastw m3, [t0+msac.dif+6] vbroadcasti128 m4, [rax] mov t3d, [t0+msac.update_cdf] mov t4d, t2d not t2 mov r5, rsp %if WIN64 and rsp, ~31 sub rsp, 40 %else and r5, ~31 %define buf r5-32 %endif psrlw m1, m0, 6 movd [buf-4], xm2 pand m2, m4 psllw m1, 7 pmulhuw m1, m2 paddw m1, [rax+t2*2] mova [buf], m1 pmaxuw m1, m3 pcmpeqw m1, m3 pmovmskb eax, m1 test t3d, t3d jz .renorm movzx t3d, word [t1+t4*2] pcmpeqw m2, m2 lea t2d, [t3+80] shr t2d, 4 cmp t3d, 32 adc t3d, 0 movd 
xm3, t2d pavgw m2, m1 psubw m2, m0 psubw m0, m1 psraw m2, xm3 paddw m0, m2 mova [t1], m0 mov [t1+t4*2], t3w .renorm: tzcnt eax, eax mov t4, [t0+msac.dif] movzx t1d, word [buf+rax-0] movzx t2d, word [buf+rax-2] shr eax, 1 %if WIN64 mov rsp, r5 %endif vzeroupper jmp m(msac_decode_symbol_adapt4, _sse2).renorm2 %endif rav1e-0.7.1/src/x86/sad_avx.asm000064400000000000000000000140551046102023000142170ustar 00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; ; %include "config.asm" %include "ext/x86/x86inc.asm" SECTION .text %macro SAD_FN 4 %if %4 == 0 %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 %endmacro ; unsigned int aom_sad128x128_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 SAD_FN 128, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+32] movu m3, [refq+64] movu m4, [refq+96] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif vpsadbw m1, [srcq] vpsadbw m2, [srcq+32] vpsadbw m3, [srcq+64] vpsadbw m4, [srcq+96] add refq, ref_strideq add srcq, src_strideq vpaddd m1, m2 vpaddd m3, m4 vpaddd m0, m1 vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 SAD128XN 128 ; sad128x128_avx2 SAD128XN 128, 1 ; sad128x128_avg_avx2 SAD128XN 64 ; sad128x64_avx2 SAD128XN 64, 1 ; sad128x64_avg_avx2 ; unsigned int aom_sad64x64_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+32] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+32] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif vpsadbw m1, [srcq] vpsadbw m2, [srcq+32] vpsadbw m3, [srcq+src_strideq] vpsadbw m4, [srcq+src_strideq+32] vpaddd m1, m2 vpaddd m3, m4 lea refq, [refq+ref_strideq*2] vpaddd m0, m1 lea srcq, [srcq+src_strideq*2] vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 
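; each SADnXN line below instantiates one cglobal function from the macro
; above; the optional second argument selects the *_avg variant, which
; averages ref with second_pred (vpavgb) before computing the SAD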
SAD64XN 128 ; sad64x128_avx2 SAD64XN 128, 1 ; sad64x128_avg_avx2 SAD64XN 64 ; sad64x64_avx2 SAD64XN 32 ; sad64x32_avx2 SAD64XN 64, 1 ; sad64x64_avg_avx2 SAD64XN 32, 1 ; sad64x32_avg_avx2 SAD64XN 16 ; sad64x16_avx2 SAD64XN 16, 1 ; sad64x16_avg_avx2 ; unsigned int aom_sad32x32_avx2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] %if %2 == 1 vpavgb m1, [second_predq+mmsize*0] vpavgb m2, [second_predq+mmsize*1] vpavgb m3, [second_predq+mmsize*2] vpavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] psadbw m4, [srcq+src_stride3q] vpaddd m1, m2 vpaddd m3, m4 lea refq, [refq+ref_strideq*4] vpaddd m0, m1 lea srcq, [srcq+src_strideq*4] vpaddd m0, m3 dec n_rowsd jg .loop vextracti128 xm1, m0, 1 paddd xm0, xm1 movhlps xm1, xm0 paddd xm0, xm1 movd eax, xm0 RET %endmacro INIT_YMM avx2 SAD32XN 64 ; sad32x64_avx2 SAD32XN 32 ; sad32x32_avx2 SAD32XN 16 ; sad32x16_avx2 SAD32XN 64, 1 ; sad32x64_avg_avx2 SAD32XN 32, 1 ; sad32x32_avg_avx2 SAD32XN 16, 1 ; sad32x16_avg_avx2 SAD32XN 8 ; sad_32x8_avx2 SAD32XN 8, 1 ; sad_32x8_avg_avx2 rav1e-0.7.1/src/x86/sad_plane.asm000064400000000000000000000163141046102023000145200ustar 00000000000000; Copyright (c) 2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
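; sad_plane_8bpc(p1, p2, stride, width, rows) sums |p1 - p2| over a whole
; plane. Each row is handled as an unrolled 4-vector loop, then 0-3 whole
; trailing vectors dispatched through the JMP_TABLE below, then a sub-vector
; remainder whose bytes beyond the width are zeroed with a mask_lut row so
; they do not contribute to the sum.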
%include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA align 32 mask_lut: db \ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, 0, \ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0, %macro JMP_TABLE 3-* %xdefine %%func mangle(private_prefix %+ _%1_%2) %xdefine %%table %1_%2_table %%table: %rep %0 - 2 dd (%%func %+ .%3) - (%%table) %rotate 1 %endrep %endmacro JMP_TABLE sad_plane_8bpc, avx2, vec0, vec1, vec2, vec3 JMP_TABLE sad_plane_8bpc, sse2, vec0, vec1, 
vec2, vec3 %use ifunc SECTION .text %macro SAD_PLANE_FN 0 cglobal sad_plane_8bpc, 5, 9, 9, p1, p2, stride, width, rows, \ resid_simd, resid, width_unrll, tmp0 mov resid_simdq, widthq mov residd, widthd and residd, mmsize - 1 and resid_simdq, -(mmsize) and widthq, -(4*mmsize) ; LUT row size is always 32 regardless of mmsize (because the ; start of the rows would be the same, so we reuse the same LUT) shl residd, ilog2(32) pxor xm0, xm0 pxor xm1, xm1 pxor xm2, xm2 pxor xm3, xm3 ; load mask from lookup table into m8 lea tmp0q, [mask_lut] mova m8, [tmp0q + residq] DEFINE_ARGS p1, p2, stride, width, rows, \ resid_simd, resid, width_unrll, skip_ptr sub resid_simdq, widthq ; need to divide by mmsize to load skip pointer shr resid_simdq, ilog2(mmsize) %if mmsize == 32 %define jmp_table sad_plane_8bpc_avx2_table %elif mmsize == 16 %define jmp_table sad_plane_8bpc_sse2_table %endif lea r6, [jmp_table] movsxd skip_ptrq, [r6 + 4*resid_simdq] add skip_ptrq, r6 ; shift back (for residual to load correct number of bytes) shl resid_simdq, ilog2(mmsize) ; set pointer to point after end of width of first row add p1q, widthq add p2q, widthq mov width_unrllq, widthq neg widthq .loop_row: test widthq, widthq jz .skip .loop: mova m4, [p1q + widthq + 0*mmsize] mova m5, [p1q + widthq + 1*mmsize] mova m6, [p1q + widthq + 2*mmsize] mova m7, [p1q + widthq + 3*mmsize] psadbw m4, m4, [p2q + widthq + 0*mmsize] psadbw m5, m5, [p2q + widthq + 1*mmsize] psadbw m6, m6, [p2q + widthq + 2*mmsize] psadbw m7, m7, [p2q + widthq + 3*mmsize] paddq m0, m4 paddq m1, m5 paddq m2, m6 paddq m3, m7 add widthq, 4*mmsize jnz .loop .skip: jmp skip_ptrq .vec3: mova m6, [p1q + 2*mmsize] psadbw m6, m6, [p2q + 2*mmsize] paddq m2, m6 .vec2: mova m5, [p1q + 1*mmsize] psadbw m5, m5, [p2q + 1*mmsize] paddq m1, m5 .vec1: mova m4, [p1q + 0*mmsize] psadbw m4, m4, [p2q + 0*mmsize] paddq m0, m4 .vec0: ; skip residual element add if necessary test residd, residd jz .next_row ; load residual elements and mask out elements past the width pand m4, m8, [p1q + resid_simdq] pand m5, m8, [p2q + resid_simdq] psadbw m4, m4, m5 paddq m2, m4 .next_row: ; width is 0 after the unrolled loop, so subtracting is basically a mov + neg sub widthq, width_unrllq ; since we started with p1+width, adding stride will get the ; pointer at the end of the next row add p1q, strideq add p2q, strideq dec rowsd jnz .loop_row ; final horizontal reduction paddq m2, m3 paddq m0, m1 paddq m0, m2 %if mmsize == 32 vextracti128 xm1, ym0, 1 paddq xm0, xm1 %endif pshufd xm1, xm0, q0032 paddq xm0, xm1 movq rax, xm0 RET %endmacro INIT_XMM sse2 SAD_PLANE_FN INIT_YMM avx2 SAD_PLANE_FN %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/sad_sse2.asm000064400000000000000000000252631046102023000143000ustar 00000000000000; ; Copyright (c) 2016, Alliance for Open Media. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
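; This file is the SSE2 baseline counterpart of sad_avx.asm above: it
; instantiates the same SAD_FN-based macros with 16-byte vectors instead
; of 32-byte ones.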
; ; %include "config.asm" %include "ext/x86/x86inc.asm" SECTION .text %macro SAD_FN 4 %if %4 == 0 %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 %else ; avg %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows %else ; %3 == 7 cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ ref, ref_stride, \ second_pred, \ src_stride3, ref_stride3 %if ARCH_X86_64 %define n_rowsd r7d %else ; x86-32 %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 %endif ; avg/sad movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 lea src_stride3q, [src_strideq*3] lea ref_stride3q, [ref_strideq*3] %endif ; %3 == 7 %endmacro ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 SAD_FN 128, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 movu m1, [refq+64] movu m2, [refq+80] movu m3, [refq+96] movu m4, [refq+112] %if %2 == 1 pavgb m1, [second_predq+mmsize*4] pavgb m2, [second_predq+mmsize*5] pavgb m3, [second_predq+mmsize*6] pavgb m4, [second_predq+mmsize*7] lea second_predq, [second_predq+mmsize*8] %endif psadbw m1, [srcq+64] psadbw m2, [srcq+80] psadbw m3, [srcq+96] psadbw m4, [srcq+112] add refq, ref_strideq add srcq, src_strideq paddd m1, m2 paddd m3, m4 paddd m0, m1 paddd m0, m3 sub n_rowsd, 1 jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD128XN 128 ; sad128x128_sse2 SAD128XN 128, 1 ; sad128x128_avg_sse2 SAD128XN 64 ; sad128x64_sse2 SAD128XN 64, 1 ; sad128x64_avg_sse2 ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 mov n_rowsd, %1 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+32] movu m4, [refq+48] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+16] psadbw m3, [srcq+32] psadbw m4, [srcq+48] paddd m1, m2 paddd m3, m4 add refq, ref_strideq paddd m0, m1 add srcq, src_strideq paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD64XN 128 ; sad64x128_sse2 SAD64XN 128, 1 ; sad64x128_avg_sse2 SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 SAD64XN 16 ; sad64x16_sse2 SAD64XN 16, 1 ; sad64x16_avg_sse2 ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 mov n_rowsd, %1/2 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+16] movu m3, [refq+ref_strideq] movu m4, [refq+ref_strideq+16] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, 
[srcq+16] psadbw m3, [srcq+src_strideq] psadbw m4, [srcq+src_strideq+16] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*2] paddd m0, m1 lea srcq, [srcq+src_strideq*2] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD32XN 64 ; sad32x64_sse2 SAD32XN 32 ; sad32x32_sse2 SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 SAD32XN 8 ; sad_32x8_sse2 SAD32XN 8, 1 ; sad_32x8_avg_sse2 ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movu m1, [refq] movu m2, [refq+ref_strideq] movu m3, [refq+ref_strideq*2] movu m4, [refq+ref_stride3q] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] pavgb m3, [second_predq+mmsize*2] pavgb m4, [second_predq+mmsize*3] lea second_predq, [second_predq+mmsize*4] %endif psadbw m1, [srcq] psadbw m2, [srcq+src_strideq] psadbw m3, [srcq+src_strideq*2] psadbw m4, [srcq+src_stride3q] paddd m1, m2 paddd m3, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m3 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD16XN 32 ; sad16x32_sse2 SAD16XN 16 ; sad16x16_sse2 SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 SAD16XN 64 ; sad_16x64_sse2 SAD16XN 64, 1 ; sad_16x64_avg_sse2 ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movh m1, [refq] movhps m1, [refq+ref_strideq] movh m2, [refq+ref_strideq*2] movhps m2, [refq+ref_stride3q] %if %2 == 1 pavgb m1, [second_predq+mmsize*0] pavgb m2, [second_predq+mmsize*1] lea second_predq, [second_predq+mmsize*2] %endif movh m3, [srcq] movhps m3, [srcq+src_strideq] movh m4, [srcq+src_strideq*2] movhps m4, [srcq+src_stride3q] psadbw m1, m3 psadbw m2, m4 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] paddd m0, m2 dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD8XN 16 ; sad8x16_sse2 SAD8XN 8 ; sad8x8_sse2 SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 SAD8XN 32 ; sad_8x32_sse2 SAD8XN 32, 1 ; sad_8x32_avg_sse2 ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 mov n_rowsd, %1/4 pxor m0, m0 .loop: movd m1, [refq] movd m2, [refq+ref_strideq] movd m3, [refq+ref_strideq*2] movd m4, [refq+ref_stride3q] punpckldq m1, m2 punpckldq m3, m4 movlhps m1, m3 %if %2 == 1 pavgb m1, [second_predq+mmsize*0] lea second_predq, [second_predq+mmsize*1] %endif movd m2, [srcq] movd m5, [srcq+src_strideq] movd m4, [srcq+src_strideq*2] movd m3, [srcq+src_stride3q] punpckldq m2, m5 punpckldq m4, m3 movlhps m2, m4 psadbw m1, m2 lea refq, [refq+ref_strideq*4] paddd m0, m1 lea srcq, [srcq+src_strideq*4] dec n_rowsd jg .loop movhlps m1, m0 paddd m0, m1 movd eax, m0 RET %endmacro INIT_XMM sse2 SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse SAD4XN 16 ; sad_4x16_sse2 SAD4XN 16, 1 ; sad_4x16_avg_sse2 
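; Illustrative reference (a sketch, not part of the kernels above): every
; SADNxM macro in this file computes the same scalar quantity, roughly
;
;     sum = 0
;     for y in 0..M:
;         for x in 0..N:
;             sum += abs(src[y*src_stride + x] - ref[y*ref_stride + x])
;
; psadbw evaluates this for groups of 8 unsigned bytes at a time, leaving
; partial sums in the low word of each 64-bit lane; the kernels accumulate
; them with paddd and reduce to a single value with movhlps/paddd/movd at the
; end. The *_avg variants first average ref with second_pred (pavgb) before
; taking the SAD, which is why they step second_predq in mmsize-byte chunks.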
rav1e-0.7.1/src/x86/satd.asm000064400000000000000000000775361046102023000135420ustar 00000000000000; Copyright (c) 2019, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "ext/x86/x86inc.asm" %if ARCH_X86_64 SECTION_RODATA 32 maddubsw_hsub: times 16 db 1, -1 SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; Perform 4x4 hadamard transform on input with 2 rows per register. ; Rows 0 and 2 are in m0 and rows 1 and 3 are in m1. ; A second set of packed input can also be taken in m2 and m3. ; Ends with sums in every other entry (i.e. already reduced horizontally). %macro HADAMARD_4x4_PACKED 1 %if %1 == 1 %define tmp m2 ; 2->0, 1->2, 0->2 %define ROTATE SWAP 2, 1, 0 %elif %1 == 2 %define tmp m4 ; 4->0, 3->2, 2->3, 1->2, 0->1 %define ROTATE SWAP 4, 3, 2, 1, 0 %endif ; m0 d2 c2 b2 a2 d0 c0 b0 a0 ; m1 d3 c3 b3 a3 d1 c1 b1 a1 ; Stage 1 ; m0 d2+d3 c2+c3 b2+b3 a2+a3 d0+d1 c0+c1 b0+b1 a0+a1 ; m1 d2-d3 c2-c3 b2-b3 a2-a3 d0-d1 c0-c1 b0-b1 a0-a1 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; Stage 2 ; m0 d0-d1 d0+d1 c0-c1 c0+c1 b0-b1 b0+b1 a0-a1 a0+a1 ; m1 d2-d3 d2+d3 c2-c3 c2+c3 b2-b3 b2+b3 a2-a3 a2+a3 punpcklwd tmp, m0, m1 punpckhwd m0, m1 %if %1 == 2 punpcklwd m1, m2, m3 punpckhwd m2, m3 %endif ROTATE ; m0 d0-d1+d2-d3 d0+d1+d2+d3 c0-c1+c2-c3 c0+c1+c2+c3 ; b0-b1+b2-b3 b0+b1+b2+b3 a0-a1+a2-a3 a0+a1+a2+a3 ; m1 d0-d2-d2+d3 d0+d1-d2-d3 c0-c1-c2+c3 c0+c1-c2-c3 ; b0-b1-b2+b3 b0+b1-b2-b3 a0-a1-a2-a3 a0+a1-a2-a3 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; m0 s2 s0 r2 r0 q2 q0 p2 p0 ; m1 s3 s1 r3 r1 q3 q1 p3 p1 ; Stage 1 ; m0 q3 q1 q2 q0 p3 p1 p2 p0 ; m1 s3 s1 s2 s0 r3 r1 r2 r0 punpckldq tmp, m0, m1 punpckhdq m0, m1 %if %1 == 2 punpckldq m1, m2, m3 punpckhdq m2, m3 %endif ROTATE ; m0 q3+s3 q1+s1 q2+s2 q0+s0 p3+r3 p1+r1 p2+r2 p0+r0 ; m1 q3-s3 q1-s1 q2-s2 q0-s0 p3-r3 p1-r1 p2-r2 p0-r0 paddw tmp, m0, m1 psubw m0, m1 %if %1 == 2 paddw m1, m2, m3 psubw m2, m3 %endif ROTATE ; Stage 2 ; m0 p3-r3 p1-r1 p2-r2 p0-r0 p3+r3 p1+r1 p2+r2 p0+r0 ; m1 q3-s3 q1-s1 q2-s2 q0-s0 q3+s3 q1+s1 q2+s2 q0+s0 punpcklqdq tmp, m0, m1 punpckhqdq m0, m1 %if %1 == 2 punpcklqdq m1, m2, m3 punpckhqdq m2, m3 %endif ROTATE ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. 
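; Illustrative check of the identity above: with a = 3 and b = -5,
;   (abs(a+b) + abs(a-b)) / 2 = (abs(-2) + abs(8)) / 2 = (2 + 8) / 2 = 5,
; which equals max(abs(3), abs(-5)). Below, pmaxsw produces max(a, b) and
; paddsw clamps a+b+0x7FFF at 0x7FFF; subtracting the clamped value from
; max(a, b) gives max(abs(a), abs(b)) - 0x7FFF, i.e. half of
; abs(a+b)+abs(a-b) minus the bias that the final offset adds back.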
paddw tmp, m0, m1 pmaxsw m0, m1 ; m1 is free ; 0x7FFF pcmpeqb m1, m1 psrlw m1, 1 paddsw tmp, m1 psubw m0, tmp %if %1 == 2 paddw tmp, m2, m3 pmaxsw m2, m3 paddsw tmp, m1 psubw m2, tmp paddw m0, m2 %endif %endmacro ; Load diffs of 4 entries for 2 rows %macro LOAD_PACK_DIFF_Dx2 7 movd m%1, %2 movd m%6, %4 punpckldq m%1, m%6 pmovzxbw m%1, m%1 movd m%6, %3 movd m%7, %5 punpckldq m%6, m%7 pmovzxbw m%6, m%6 psubw m%1, m%6 %endmacro ; Can only use 128-bit vectors %macro SATD_4x4_FN 0 cglobal satd_4x4, 4, 6, 4, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] ; Load rows 0 and 2 to m0 and 1 and 3 to m1 LOAD_PACK_DIFF_Dx2 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ 2, 3 LOAD_PACK_DIFF_Dx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ 2, 3 HADAMARD_4x4_PACKED 1 ; Reduce horizontally pshufd m1, m0, q3232 paddw m0, m1 pshuflw m1, m0, q3232 paddw m0, m1 pshuflw m1, m0, q1111 ; Perform normalization during the final stage of accumulation pavgw m0, m1 movd eax, m0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 4 RET %endmacro INIT_XMM sse4 SATD_4x4_FN INIT_XMM avx2 SATD_4x4_FN ; Load diffs of 8 entries for 2 row ; Each set of 4 columns share an 128-bit lane %macro LOAD_PACK_DIFF_Qx2 7 movq xm%1, %2 movq xm%6, %4 punpckldq xm%1, xm%6 pmovzxbw m%1, xm%1 movq xm%6, %3 movq xm%7, %5 punpckldq xm%6, xm%7 pmovzxbw m%6, xm%6 psubw m%1, m%6 %endmacro INIT_YMM avx2 cglobal satd_8x4, 4, 6, 4, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] ; Load rows 0 and 2 to m0 and 1 and 3 to m1 ; Each set of 4 columns share 128-bit lanes LOAD_PACK_DIFF_Qx2 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ 2, 3 LOAD_PACK_DIFF_Qx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ 2, 3 HADAMARD_4x4_PACKED 1 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 8 RET ; Load diffs of 4 entries for 4 rows ; Each set of two rows share 128-bit lanes %macro LOAD_PACK_DIFF_Dx4 12 movd xm%1, %2 movd xm%10, %4 punpckldq xm%1, xm%10 movd xm%10, %6 movd xm%11, %8 punpckldq xm%10, xm%11 punpcklqdq xm%1, xm%10 pmovzxbw m%1, xm%1 movd xm%10, %3 movd xm%11, %5 punpckldq xm%10, xm%11 movd xm%11, %7 movd xm%12, %9 punpckldq xm%11, xm%12 punpcklqdq xm%10, xm%11 pmovzxbw m%10, xm%10 psubw m%1, m%10 %endmacro INIT_YMM avx2 cglobal satd_4x8, 4, 8, 5, src, src_stride, dst, dst_stride, \ src4, dst4, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src4q, [srcq+src_strideq*4] lea dst4q, [dstq+dst_strideq*4] ; Load rows 0, 2, 4 and 6 to m0 and 1, 3, 5 and 7 to m1. ; Lanes split the low and high rows of m0 and m1. 
LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 2, 3, 4 LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 2, 3, 4 HADAMARD_4x4_PACKED 1 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation. pavgw xm0, xm1 movd eax, xm0 movzx eax, ax sub ax, 8 RET ; Rudimentary fast hadamard transform ; Two Hadamard transforms share an 128-bit lane. %macro HADAMARD_4x4 0 ; 4->0, 3->2, 2->3, 1->2, 0->1 %define ROTATE SWAP 4, 3, 2, 1, 0 ; Stage 1 paddw m0, m1, m2 psubw m1, m2 paddw m2, m3, m4 psubw m3, m4 ROTATE ; Stage 2 paddw m0, m1, m3 psubw m1, m3 paddw m3, m2, m4 psubw m2, m4 SWAP 3, 2, 1 ROTATE ; Transpose ; Since two transforms share an 128-bit lane, unpacking results in a single ; transform's values on each register. This has to be resolved later. ; A and B indicate different 4x4 transforms. ; Start ; m1 B (a3 a2 a1 a0) A (a3 a2 a1 a0) ; m2 B (b3 b2 b1 b0) A (b3 b2 b1 b0) ; m3 B (c3 c2 c1 c0) A (c3 c2 c1 c0) ; m4 B (d3 d2 d1 d0) A (d3 d2 d1 d0) ; Stage 1 ; m1 A (b3 a3 b2 a2 b1 a1 b0 a0) ; m2 B (b3 a3 b2 a2 b1 a1 b0 a0) ; m3 A (d3 c3 d2 c2 d1 c1 d0 c0) ; m4 B (d3 c3 d2 c2 d1 c1 d0 c0) punpcklwd m0, m1, m2 punpckhwd m1, m2 punpcklwd m2, m3, m4 punpckhwd m3, m4 ROTATE ; m1 A (d3 c3 b3 a3 d2 c2 b2 a2) ; m2 A (d1 c1 b1 a1 d0 c0 b0 a0) ; m3 B (d3 c3 b3 a3 d2 c2 b2 a2) ; m4 B (d1 c1 b1 a1 d0 c0 b0 a0) punpckldq m0, m1, m3 punpckhdq m1, m3 punpckldq m3, m2, m4 punpckhdq m2, m4 SWAP 3, 2, 1 ROTATE ; Make the transforms share 128-bit lanes again. ; m1 B (d0 c0 b0 a0) A (d0 c0 b0 a0) ; m2 B (d1 c1 b1 a1) A (d1 c1 b1 a1) ; m3 B (d2 c2 b2 a2) A (d2 c2 b2 a2) ; m4 B (d3 c3 b3 a3) A (d3 c3 b3 a3) punpcklqdq m0, m1, m2 punpckhqdq m1, m2 punpcklqdq m2, m3, m4 punpckhqdq m3, m4 ROTATE ; Stage 1 paddw m0, m1, m2 psubw m1, m2 paddw m2, m3, m4 psubw m3, m4 ROTATE ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. 
paddw m0, m1, m3 pmaxsw m1, m3 ; m2 is free ; 0x7FFF pcmpeqb m3, m3 psrlw m3, 1 paddsw m0, m3 psubw m1, m0 paddw m0, m2, m4 pmaxsw m2, m4 paddsw m0, m3 psubw m2, m0 paddw m1, m2 SWAP 1, 0 %endmacro ; Load diffs of 16 entries for 1 row %macro LOAD_DIFF_DQ 4 movu xm%1, %2 movu xm%4, %3 vpmovzxbw m%1, xm%1 vpmovzxbw m%4, xm%4 psubw m%1, m%4 %endmacro INIT_YMM avx2 cglobal satd_16x4, 4, 6, 5, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_4x4 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation ; Avoids overflow in this case pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 16 RET INIT_YMM avx2 cglobal satd_4x16, 4, 8, 7, src, src_stride, dst, dst_stride, \ src4, dst4, src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src4q, [srcq+src_strideq*4] lea dst4q, [dstq+dst_strideq*4] LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 4, 5, 6 LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 4, 5, 6 lea srcq, [srcq+src_strideq*8] lea dstq, [dstq+dst_strideq*8] lea src4q, [src4q+src_strideq*8] lea dst4q, [dst4q+dst_strideq*8] LOAD_PACK_DIFF_Dx4 2, [srcq], [dstq], \ [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src4q], [dst4q], \ [src4q+src_strideq*2], [dst4q+dst_strideq*2], \ 4, 5, 6 LOAD_PACK_DIFF_Dx4 3, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [srcq+src_stride3q], [dstq+dst_stride3q], \ [src4q+src_strideq*1], [dst4q+dst_strideq*1], \ [src4q+src_stride3q], [dst4q+dst_stride3q], \ 4, 5, 6 HADAMARD_4x4_PACKED 2 ; Reduce horizontally vextracti128 xm1, m0, 1 paddw xm0, xm1 pshufd xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q3232 paddw xm0, xm1 pshuflw xm1, xm0, q1111 ; Perform normalization during the final stage of accumulation pavgw xm0, xm1 movd eax, xm0 movzx eax, ax ; Add an offset for how the final butterfly stage and the first stage of ; accumulation was done. Since this offset is an even number, this can ; safely be done after normalization using pavgw. sub ax, 16 RET ; On x86-64 we can transpose in-place without spilling registers. ; By clever choices of the order to apply the butterflies and the order of ; their outputs, we can take the rows in order and output the columns in order ; without any extra operations and using just one temporary register. 
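; Illustrative sketch of the idea (each row below is one register of eight
; 16-bit words): the transpose is three rounds of unpacks, widening the
; interleave granularity each time.
;
;   round 1 (punpcklwd/punpckhwd):  rows r0,r1 -> { r0[0] r1[0] r0[1] r1[1] ... }
;   round 2 (punpckldq/punpckhdq):  interleave 32-bit pairs of those results
;   round 3 (punpcklqdq/punpckhqdq): interleave 64-bit halves, yielding columns
;
; Because each unpack pair frees exactly the register that the next pair
; overwrites (tracked by the "m%N is free" comments), one spare register (%9)
; is enough to cycle through all eight rows in place.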
%macro TRANSPOSE8x8 9 punpckhwd m%9, m%5, m%6 punpcklwd m%5, m%6 ; m%6 is free punpckhwd m%6, m%1, m%2 punpcklwd m%1, m%2 ; m%2 is free punpckhwd m%2, m%7, m%8 punpcklwd m%7, m%8 ; m%8 is free punpckhwd m%8, m%3, m%4 punpcklwd m%3, m%4 ; m%4 is free punpckhdq m%4, m%1, m%3 punpckldq m%1, m%3 ; m%3 is free punpckldq m%3, m%5, m%7 punpckhdq m%5, m%7 ; m%7 is free punpckhdq m%7, m%6, m%8 punpckldq m%6, m%8 ; m%8 is free punpckldq m%8, m%9, m%2 punpckhdq m%9, m%2 ; m%2 is free punpckhqdq m%2, m%1, m%3 punpcklqdq m%1, m%3 ; m%3 is free punpcklqdq m%3, m%4, m%5 punpckhqdq m%4, m%5 ; m%5 is free punpcklqdq m%5, m%6, m%8 punpckhqdq m%6, m%8 ; m%8 is free punpckhqdq m%8, m%7, m%9 punpcklqdq m%7, m%9 %endmacro ; Load diff of 8 entries for 1 row %macro LOAD_DIFF_Q 4 movq %1, %2 movq %4, %3 punpcklbw %1, %4 pmaddubsw %1, hsub %endmacro %macro HADAMARD_8_STAGE_1 9 paddw m%9, m%1, m%2 psubw m%1, m%2 paddw m%2, m%3, m%4 psubw m%3, m%4 paddw m%4, m%5, m%6 psubw m%5, m%6 paddw m%6, m%7, m%8 psubw m%7, m%8 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro %macro HADAMARD_8_STAGE_2 9 paddw m%9, m%1, m%3 ; 0 psubw m%1, m%3 ; 2 paddw m%3, m%2, m%4 ; 1 psubw m%2, m%4 ; 3 SWAP %3, %2, %1 paddw m%4, m%5, m%7 ; 4 psubw m%5, m%7 ; 6 paddw m%7, m%6, m%8 ; 5 psubw m%6, m%8 ; 7 SWAP %7, %6, %5 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro %macro HADAMARD_8_STAGE_3 9 paddw m%9, m%1, m%5 ; 0 psubw m%1, m%5 ; 4 paddw m%5, m%2, m%6 ; 1 psubw m%2, m%6 ; 5 paddw m%6, m%3, m%7 ; 2 psubw m%3, m%7 ; 6 paddw m%7, m%4, m%8 ; 3 psubw m%4, m%8 ; 7 SWAP %5, %2, %6, %3, %7, %4, %1 ; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1 SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9 %endmacro ; Rudimentary fast hadamard transform %macro HADAMARD_8x8 0 HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_3 1, 2, 3, 4, 5, 6, 7, 8, 0 TRANSPOSE8x8 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0 HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0 ; Stage 3 ; Use the fact that ; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b)) ; to merge the final butterfly with the abs and the first stage of ; accumulation. ; Avoid pabsw by using max(a, b) + max(a + b + 0x7FFF, 0x7FFF) instead. ; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF. ; The final sum must be offset to compensate for subtracting 0x7FFF. 
paddw m0, m1, m5 pmaxsw m1, m5 ; m1 is free ; 0x7FFF pcmpeqb m5, m5 psrlw m5, 1 paddsw m0, m5 psubw m1, m0 paddw m0, m2, m6 pmaxsw m2, m6 paddsw m0, m5 psubw m2, m0 paddw m0, m3, m7 pmaxsw m3, m7 paddsw m0, m5 psubw m3, m0 paddw m0, m4, m8 pmaxsw m4, m8 paddsw m0, m5 psubw m4, m0 paddw m1, m2 paddw m3, m4 paddw m1, m3 SWAP 1, 0 %endmacro ; Only works with 128 bit vectors %macro SATD_8x8_FN 0 cglobal satd_8x8, 4, 6, 10, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_Q m1, [srcq], [dstq], m2 LOAD_DIFF_Q m2, [srcq+src_strideq*1], [dstq+dst_strideq*1], m3 LOAD_DIFF_Q m3, [srcq+src_strideq*2], [dstq+dst_strideq*2], m4 LOAD_DIFF_Q m4, [srcq+src_stride3q], [dstq+dst_stride3q], m5 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_Q m5, [srcq], [dstq], m6 LOAD_DIFF_Q m6, [srcq+src_strideq*1], [dstq+dst_strideq*1], m7 LOAD_DIFF_Q m7, [srcq+src_strideq*2], [dstq+dst_strideq*2], m8 LOAD_DIFF_Q m8, [srcq+src_stride3q], [dstq+dst_stride3q], m9 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 pshufd m1, m0, q3232 paddd m0, m1 pshuflw m1, m0, q3232 paddd m0, m1 movd eax, m0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. sub eax, 32-2 shr eax, 2 RET %endmacro INIT_XMM ssse3 SATD_8x8_FN INIT_XMM avx2 SATD_8x8_FN INIT_YMM avx2 cglobal satd_16x8, 4, 6, 9, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3 ; Load rows into m1-m8 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_DQ 5, [srcq], [dstq], 0 LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
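; In other words, the sub below computes (sum - 64 + 2) and the following
; shr eax, 2 divides by 4 with rounding: the 64 is the offset described
; above and scales with block size (compare 32-2 in satd_8x8 and
; %1*%2/2 - 2 in SATD_NXM further down), while the +2 is half the divisor.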
sub eax, 64-2 shr eax, 2 RET %macro LOAD_DIFF_Qx2 7 movq xm%1, %2 movq xm%6, %3 punpcklbw xm%1, xm%6 movq xm%6, %4 movq xm%7, %5 punpcklbw xm%6, xm%7 vinserti128 m%1, xm%6, 1 pmaddubsw m%1, hsub %endmacro INIT_YMM avx2 cglobal satd_8x16, 4, 8, 11, src, src_stride, dst, dst_stride, \ src8, dst8, src_stride3, dst_stride3 %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 lea src8q, [srcq+src_strideq*8] lea dst8q, [dstq+dst_strideq*8] lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] LOAD_DIFF_Qx2 1, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] lea src8q, [src8q+src_strideq*4] lea dst8q, [dst8q+dst_strideq*4] LOAD_DIFF_Qx2 5, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pxor m2, m2 punpcklwd m1, m0, m2 punpckhwd m0, m2 paddd m0, m1 vextracti128 xm1, m0, 1 paddd xm0, xm1 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
sub eax, 64-2 shr eax, 2 RET ; Less optimized, boilerplate implementations INIT_YMM avx2 cglobal satd_8x32, 4, 9, 13, src, src_stride, dst, dst_stride, \ src8, dst8, src_stride3, dst_stride3, cnt ; ones for converting to 32-bit with pmaddwd pcmpeqw m11, m11 pabsw m11, m11 ; sum pxor m12, m12 mov cntd, 1 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] lea src8q, [srcq+src_strideq*8] lea dst8q, [dstq+dst_strideq*8] .loop: %define hsub m0 mova hsub, [maddubsw_hsub] ; Load rows into m1-m8 LOAD_DIFF_Qx2 1, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] lea src8q, [src8q+src_strideq*4] lea dst8q, [dst8q+dst_strideq*4] LOAD_DIFF_Qx2 5, [srcq], [dstq], \ [src8q], [dst8q], \ 9, 10 LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \ [src8q+src_strideq*1], [dst8q+dst_strideq*1], \ 9, 10 LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \ [src8q+src_strideq*2], [dst8q+dst_strideq*2], \ 9, 10 LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \ [src8q+src_stride3q], [dst8q+dst_stride3q], \ 9, 10 HADAMARD_8x8 ; Reduce horizontally and convert to 32 bits pmaddwd m0, m11 paddd m12, m0 lea srcq, [srcq+src_stride3q*4] lea dstq, [dstq+dst_stride3q*4] lea src8q, [src8q+src_stride3q*4] lea dst8q, [dst8q+dst_stride3q*4] dec cntd jge .loop vextracti128 xm0, m12, 1 paddd xm0, xm12 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. 
sub eax, 128-2 shr eax, 2 RET INIT_YMM avx2 cglobal satd_16x8_internal, 0, 0, 0, \ dummy1, src_stride, dummy2, dst_stride, \ src_stride3, dst_stride3, src, dst %define hadd m9 %define sum m10 ; Load rows into m1-m8 LOAD_DIFF_DQ 1, [srcq], [dstq], 0 LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0 lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] LOAD_DIFF_DQ 5, [srcq], [dstq], 0 LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0 LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0 LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0 HADAMARD_8x8 pmaddwd m0, hadd paddd sum, m0 ret %macro SATD_NXM 2 %if %1 > 16 %if %2 > 8 cglobal satd_%1x%2, 4, 10, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ w, h %else cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ w %endif %else ; %2 > 8 cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \ src_stride3, dst_stride3, call_src, call_dst, \ h %endif ; ones for converting to 32-bit with pmaddwd pcmpeqw m9, m9 pabsw m9, m9 ; sum pxor m10, m10 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] %if %2 > 8 mov hd, %2/8 - 1 .looph: %endif %if %1 > 16 mov wd, %1/16 - 1 .loopv: %endif mov call_srcq, srcq mov call_dstq, dstq call m(satd_16x8_internal) %if %1 > 16 add srcq, 16 add dstq, 16 dec wd jge .loopv sub srcq, %1 sub dstq, %1 %endif %if %2 > 8 lea srcq, [srcq+src_strideq*8] lea dstq, [dstq+dst_strideq*8] dec hd jge .looph %endif ; Reduce horizontally vextracti128 xm0, m10, 1 paddd xm0, xm10 pshufd xm1, xm0, q3232 paddd xm0, xm1 pshuflw xm1, xm0, q3232 paddd xm0, xm1 movd eax, xm0 ; Normalize ; Add rounding offset and an offset for how the final butterfly stage and ; the first stage of accumulation was done. sub eax, %1*%2/2 - 2 shr eax, 2 RET %endmacro INIT_YMM avx2 SATD_NXM 16, 16 SATD_NXM 32, 32 SATD_NXM 64, 64 SATD_NXM 128, 128 SATD_NXM 16, 32 SATD_NXM 32, 16 SATD_NXM 32, 64 SATD_NXM 64, 32 SATD_NXM 64, 128 SATD_NXM 128, 64 SATD_NXM 32, 8 SATD_NXM 16, 64 SATD_NXM 64, 16 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/satd16_avx2.asm000064400000000000000000000726311046102023000146400ustar 00000000000000; Copyright (c) 2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "ext/x86/x86inc.asm" %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %if ARCH_X86_64 SECTION_RODATA 32 align 32 pw_1x16: times 16 dw 1 SECTION .text %macro NORMALIZE4PT 0 add eax, 2 shr eax, 2 %endmacro %macro NORMALIZE8PT 0 add eax, 4 shr eax, 3 %endmacro ; Add and subtract registers ; ; Takes m0 and m1 as both input and output. ; Requires m2 as a free register. 
; ; If we start with this permutation: ; ; m0 0 1 2 3 4 5 6 7 ; m1 8 9 10 11 12 13 14 15 ; ; Then the output will be as such: ; ; m0 [0+8][1+9][2+10][3+11] [4+12][5+13][6+14][7+15] ; m1 [0-8][1-9][2-10][3-11] [4-12][5-13][6-14][7-15] %macro BUTTERFLY 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 ; use alternate registers 3,4,5 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 32 %define V ym %elif VEC_SIZE == 16 %define V xm %endif ; Use m2 as a temporary register, then swap ; so that m0 and m1 contain the output. %if BIT_PRECISION == 16 paddw V%+ 2, V%+ 0, V%+ 1 psubw V%+ 0, V%+ 1 %elif BIT_PRECISION == 32 paddd ym2, ym0, ym1 psubd ym0, ym1 %else %error Incorrect precision specified (16 or 32 expected) %endif SWAP 2, 1, 0 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro ; Interleave packed rows together (in m0 and m1). ; m2 should contain a free register. ; ; Macro argument takes size in bits of each element (where one ; element is the difference between two original source pixels). ; ; If we start with this permutation: ; ; m0 0 1 2 3 4 5 6 7 ; m1 8 9 10 11 12 13 14 15 ; ; Then, after INTERLEAVE, this will be the permutation: ; ; m0 0 8 1 9 2 10 3 11 ; m1 4 12 5 13 6 14 7 15 %macro INTERLEAVE 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif %if BIT_PRECISION == 16 punpcklwd V%+ 2, V%+ 0, V%+ 1 punpckhwd V%+ 0, V%+ 1 SWAP 2, 1, 0 %elif BIT_PRECISION == 32 punpckldq ym2, ym0, ym1 punpckhdq ym0, ym1 ; AVX2 shuffles operate over 128-bit halves of the full ymm register ; in parallel, so these shuffles are required to fix up the permutation. vperm2i128 ym1, ym2, ym0, 0x20 vperm2i128 ym0, ym2, ym0, 0x31 SWAP 0, 1 %else %error Incorrect precision specified (16 or 32 expected) %endif %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro ; Interleave pairs of 2 elements (in m0 and m1) ; m2 should contain a free register. %macro INTERLEAVE_PAIRS 3 %define BIT_PRECISION %1 %define VEC_SIZE %2 %define USE_ALT %3 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif %if BIT_PRECISION == 16 punpckldq V%+ 2, V%+ 0, V%+ 1 punpckhdq V%+ 0, V%+ 1 %elif BIT_PRECISION == 32 punpcklqdq ym2, ym0, ym1 punpckhqdq ym0, ym1 %else %error Incorrect precision specified (16 or 32 expected) %endif SWAP 2, 1, 0 %if USE_ALT == 1 SWAP 3, 0 SWAP 4, 1 SWAP 5, 2 %endif %endmacro %macro HADAMARD_4X4_PACKED 2 %define BIT_PRECISION %1 ; Register size to use (in bytes) %define VEC_SIZE %2 %if VEC_SIZE == 16 %define V xm %elif VEC_SIZE == 32 %define V ym %else %error Invalid vector size (expected 16 or 32) %endif ; Starting registers: ; m0 0 1 2 3 ; m1 4 5 6 7 ; m2 8 9 10 11 ; m3 12 13 14 15 ; Where each number represents an index of the ; original block of differences. ; Pack rows 0,2 and 1,3 into m0 and m1 %if BIT_PRECISION == 16 %if VEC_SIZE == 16 ; In this case, each row only has 64 bits, so we use ; punpcklqdq only. The high 64 bits are always 0. 
punpcklqdq xm0, xm2 punpcklqdq xm1, xm3 %elif VEC_SIZE == 32 ; The upper 128 bits of all input registers are zeroed punpcklqdq m4, m0, m2 punpcklqdq m5, m1, m3 punpckhqdq m0, m0, m2 punpckhqdq m1, m1, m3 vinserti128 m0, m4, xm0, 1 vinserti128 m1, m5, xm1, 1 %endif %elif BIT_PRECISION == 32 vinserti128 ym0, ym0, xm2, 1 vinserti128 ym1, ym1, xm3, 1 %else %error Invalid bit precision (expected 16 or 32) %endif ; Now that we've packed rows 0-2 and 1-3 together, ; this is our permutation: ; m0 0 1 2 3 8 9 10 11 ; m1 4 5 6 7 12 13 14 15 ; For a 8x4 transform (with 16-bit coefficients), this pattern is ; extended for each 128-bit half but for the second block, and thus ; all comments also apply to the upper 128-bits for the 8x4 transform. BUTTERFLY %1, %2, 0 ; m0 [0+4][1+5][2+6][3+7] [8+12][9+13][10+14][11+15] ; m1 [0-4][1-5][2-6][3-7] [8-12][9-13][10-14][11-15] INTERLEAVE %1, %2, 0 ; m0 [ 0+4][ 0-4][ 1+5][ 1-5] [2 + 6][2 - 6][3 + 7][3 - 7] ; m1 [8+12][8-12][9+13][9-13] [10+14][10-14][11+15][11-15] BUTTERFLY %1, %2, 0 ; m0 [0+4+8+12][0-4+8-12][1+5+9+13][1-5+9-13] [2+6+10+14][2-6+10-14][3+7+11+15][3-7+11-15] ; m1 [0+4-8-12][0-4-8+12][1+5-9-13][1-5-9+13] [2+6-10-14][2-6-10+14][3+7-11-15][3-7-11+15] ; for one row: ; [0+1+2+3][0-1+2-3][0+1-2-3][0-1-2+3] ; For the vertical transform, these are packed into a new column. INTERLEAVE_PAIRS %1, %2, 0 ; p0 p1 p2 p3 ; m0 [0+4+ 8+12][0-4+ 8-12][0+4- 8-12][0-4- 8+12] [1+5+ 9+13][1-5+ 9-13][1+5- 9-13][1-5- 9+13] ; m1 [2+6+10+14][2-6+10-14][2+6-10-14][2-6-10+14] [3+7+11+15][3-7+11-15][3+7-11-15][3-7-11+15] ; According to this grid: ; p0 q0 r0 s0 ; p1 q1 r1 s1 ; p2 q2 r2 s2 ; p3 q3 r3 s3 ; Horizontal transform; since the output is transposed from the original order, ; we can do the same steps as the vertical transform and the result will be the same. BUTTERFLY %1, %2, 0 INTERLEAVE %1, %2, 0 BUTTERFLY %1, %2, 0 ; Finished horizontal transform except for the last step (interleaving pairs), ; which we skip, because after this we add up the absolute value of the ; coefficients, which is a commutative operation (order does not matter). %endmacro ; Horizontal sum of mm register ; ; Inputs: ; %1 = Element size in bits (16 or 32) ; %2 = Size of input register in bytes (16 or 32) ; You can e.g. pass 16 for this argument if you ; only want to sum up the bottom 128-bits of a ; ymm register. 
; %3 = Input register number ; %4 = Temporary register number ; %5 = Output register (e.g., eax) %macro HSUM 5 %define E_SIZE %1 %define REG_SIZE %2 %define INPUT %3 %define TMP %4 %define OUTPUT %5 %if REG_SIZE == 16 %define V xm %elif REG_SIZE == 32 %define V ym %else %error Invalid register size (expected 16 or 32) %endif %if E_SIZE == 16 ; Add adjacent pairs of 16-bit elements to produce 32-bit results, ; then proceed with 32-bit sum pmaddwd V%+INPUT, [pw_1x16] %endif %if mmsize == 32 && REG_SIZE == 32 ; Add upper half of ymm to xmm vextracti128 xm%+TMP, ym%+INPUT, 1 paddd xm%+INPUT, xm%+TMP %endif ; Reduce 32-bit results pshufd xm%+TMP, xm%+INPUT, q2323 paddd xm%+INPUT, xm%+TMP pshufd xm%+TMP, xm%+INPUT, q1111 paddd xm%+INPUT, xm%+TMP movd OUTPUT, xm%+INPUT %endmacro ; given m0-7, do butterfly as follows: ; (m0, m1) = butterfly(m0, m1) ; (m2, m3) = butterfly(m2, m3) ; (m4, m5) = butterfly(m4, m5) ; (m6, m7) = butterfly(m6, m7) %macro BUTTERFLY_8X8 0 ; m8 is free paddd m8, m0, m1 psubd m0, m1 SWAP 8, 1, 0 ; m8 is free paddd m8, m2, m3 psubd m2, m3 SWAP 8, 3, 2 paddd m8, m4, m5 psubd m4, m5 SWAP 8, 5, 4 paddd m8, m6, m7 psubd m6, m7 SWAP 8, 7, 6 %endmacro %macro HADAMARD_8X8_VERTICAL 0 BUTTERFLY_8X8 ; m0-7 contain a0-7 SWAP 2, 1 SWAP 6, 5 BUTTERFLY_8X8 SWAP 1, 4 SWAP 3, 6 BUTTERFLY_8X8 SWAP 2, 1 SWAP 2, 4 SWAP 3, 6 SWAP 5, 6 %endmacro ; Transpose rows m0-7. ; Output is also contained in m0-7. ; ; Uses m8, m10-15 as temporary registers (i.e. m9 is left unchanged.) %macro TRANSPOSE8X8D 0 SWAP 9, 0 SWAP 10, 1 SWAP 11, 2 SWAP 12, 3 SWAP 13, 4 SWAP 14, 5 SWAP 15, 6 SWAP 2, 7 punpckldq m6, m9, m10 punpckldq m1, m11, m12 punpckhdq m8, m9, m10 punpckldq m4, m13, m14 punpckldq m9, m15, m2 vshufps m3, m6, m1, 0x4e vpblendd m10, m6, m3, 0xcc vshufps m6, m4, m9, 0x4e punpckhdq m7, m11, m12 vpblendd m11, m4, m6, 0xcc vpblendd m12, m3, m1, 0xcc vperm2i128 m3, m10, m11, 0x20 punpckhdq m5, m13, m14 vpblendd m13, m6, m9, 0xcc punpckhdq m4, m15, m2 vperm2i128 m2, m12, m13, 0x20 vshufps m14, m8, m7, 0x4e vpblendd m15, m14, m7, 0xcc vshufps m7, m5, m4, 0x4e vpblendd m8, m8, m14, 0xcc vpblendd m5, m5, m7, 0xcc vperm2i128 m6, m8, m5, 0x20 vpblendd m4, m7, m4, 0xcc vperm2i128 m7, m15, m4, 0x20 vperm2i128 m1, m10, m11, 0x31 vperm2i128 m9, m12, m13, 0x31 vperm2i128 m5, m8, m5, 0x31 vperm2i128 m4, m15, m4, 0x31 SWAP 0, 9 ; Output order is as follows: ; 3 2 6 7 1 0 5 4 ; sort rows SWAP 3, 0 ; 0 2 6 7 1 3 5 4 SWAP 1, 2 ; 0 1 6 7 2 3 5 4 SWAP 6, 2 ; 0 1 2 7 6 3 5 4 SWAP 7, 3 ; 0 1 2 3 6 7 5 4 SWAP 6, 4 ; 0 1 2 3 4 7 5 6 SWAP 7, 5 ; 0 1 2 3 4 5 7 6 SWAP 6, 7 ; 0 1 2 3 4 5 6 7 %endmacro ; m0-7 as input; add coefficients to ymm9. 
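; Note that this is an internal helper ending in a bare `ret` rather than the
; RET macro: the `0, 0, 0` signature means it allocates nothing itself and
; reuses the caller's register layout. The callers below (the SATD_NXM /
; SATD_NX8 / SATD_8XM macros) are responsible for zeroing ymm9, running
; LOAD_DIFF_8X8 to fill m0-7 before each call, and reading the accumulated
; sum out of ymm9 once all 8x8 blocks have been processed.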
INIT_YMM avx2 cglobal satd_8x8_hbd_internal, 0, 0, 0, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 HADAMARD_8X8_VERTICAL TRANSPOSE8X8D HADAMARD_8X8_VERTICAL REPX {pabsd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; Add m0-7 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 paddd ymm9, m0 ret %macro LOAD_DIFF_8X8 0 movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] movu xm4, [srcq + 0*src_strideq] movu xm5, [srcq + 1*src_strideq] movu xm6, [srcq + 2*src_strideq] movu xm7, [srcq + src_stride3q ] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] pmovsxwd m0, xm0 pmovsxwd m1, xm1 pmovsxwd m2, xm2 pmovsxwd m3, xm3 pmovsxwd m4, xm4 pmovsxwd m5, xm5 pmovsxwd m6, xm6 pmovsxwd m7, xm7 %endmacro INIT_YMM avx2 cglobal satd_8x8_hbd, 5, 7, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] LOAD_DIFF_8X8 ; m0-7 contain rows of 8x8 block to transform ; with 32-bit coefficients HADAMARD_8X8_VERTICAL TRANSPOSE8X8D HADAMARD_8X8_VERTICAL REPX {pabsd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; Add m0-7 paddd m0, m4 paddd m1, m5 paddd m2, m6 paddd m3, m7 paddd m0, m2 paddd m1, m3 paddd m0, m1 HSUM 32, 32, 0, 1, eax NORMALIZE8PT RET INIT_YMM avx2 cglobal satd_4x4_hbd, 5, 7, 8, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] ; src -= dst psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] HADAMARD_4X4_PACKED 16, 16 ; Sum up absolute value of transform coefficients pabsw xm0, xm0 pabsw xm1, xm1 paddw xm0, xm1 HSUM 16, 16, 0, 1, eax NORMALIZE4PT RET .12bpc: ; this gives a nicer disassembly RESET_MM_PERMUTATION ; Load src rows pmovzxwd xm0, [srcq + 0*src_strideq] pmovzxwd xm1, [srcq + 1*src_strideq] pmovzxwd xm2, [srcq + 2*src_strideq] pmovzxwd xm3, [srcq + src_stride3q ] ; Load dst rows pmovzxwd xm4, [dstq + 0*dst_strideq] pmovzxwd xm5, [dstq + 1*dst_strideq] pmovzxwd xm6, [dstq + 2*dst_strideq] pmovzxwd xm7, [dstq + dst_stride3q ] ; src -= dst psubd xm0, xm4 psubd xm1, xm5 psubd xm2, xm6 psubd xm3, xm7 HADAMARD_4X4_PACKED 32, 32 pabsd m0, m0 pabsd m1, m1 paddd m0, m1 HSUM 32, 32, 0, 1, eax NORMALIZE4PT RET ; 32-bit input rows are in m0-3; result is in m0. ; Uses m0-5 as temporary registers. %macro HADAMARD_8X4_12BPC 0 vperm2i128 m4, m0, m2, 0x31 vperm2i128 m5, m1, m3, 0x31 vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 ; Swap so m3,m4 are used as inputs. SWAP 3, 4, 5 ; instead of using HADAMARD_4X4_PACKED twice, we interleave ; 2 transforms operating over different registers for more ; opportunity for instruction level parallelism. 
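; The third macro argument selects the register set: 0 works on m0-m2 and 1
; swaps in the alternate set m3-m5 (see USE_ALT in BUTTERFLY / INTERLEAVE
; above), so each adjacent pair of invocations below forms two independent
; dependency chains that can execute in parallel.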
BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE 32, 32, 0 INTERLEAVE 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE_PAIRS 32, 32, 0 INTERLEAVE_PAIRS 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 INTERLEAVE 32, 32, 0 INTERLEAVE 32, 32, 1 BUTTERFLY 32, 32, 0 BUTTERFLY 32, 32, 1 pabsd m0, m0 pabsd m1, m1 pabsd m3, m3 pabsd m4, m4 paddd m0, m1 paddd m3, m4 paddd m0, m3 %endmacro INIT_YMM avx2 cglobal satd_16x4_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movu m0, [srcq + 0*src_strideq] movu m1, [srcq + 1*src_strideq] movu m2, [srcq + 2*src_strideq] movu m3, [srcq + src_stride3q ] ; src -= dst psubw m0, [dstq + 0*dst_strideq] psubw m1, [dstq + 1*dst_strideq] psubw m2, [dstq + 2*dst_strideq] psubw m3, [dstq + dst_stride3q ] .10bpc_main: ; Original permutation ; m0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ; m1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 ; m2 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 ; m3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 ; Two registers perform 2 4x4 transforms in parallel punpcklqdq m4, m0, m2 punpcklqdq m5, m1, m3 punpckhqdq m0, m0, m2 punpckhqdq m1, m1, m3 SWAP 4, 3 SWAP 5, 4 ; New permutation ; m0 0 1 2 3 32 33 34 35 8 9 10 11 40 41 42 43 ; m1 16 17 18 19 48 49 50 51 24 25 26 27 56 57 58 59 ; m3 4 5 6 7 36 37 38 39 12 13 14 15 44 45 46 47 ; m4 20 21 22 23 52 53 54 55 28 29 30 31 60 61 62 63 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE 16, 32, 0 INTERLEAVE 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE_PAIRS 16, 32, 0 INTERLEAVE_PAIRS 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 INTERLEAVE 16, 32, 0 INTERLEAVE 16, 32, 1 BUTTERFLY 16, 32, 0 BUTTERFLY 16, 32, 1 pabsw m0, m0 pabsw m1, m1 pabsw m3, m3 pabsw m4, m4 paddw m0, m1 paddw m3, m4 paddw m0, m3 HSUM 16, 32, 0, 1, eax NORMALIZE4PT RET .12bpc: RESET_MM_PERMUTATION mov bdmaxd, 2 pxor m6, m6 .12bpc_loop: movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] pmovsxwd m0, xm0 pmovsxwd m1, xm1 pmovsxwd m2, xm2 pmovsxwd m3, xm3 add srcq, 16 add dstq, 16 HADAMARD_8X4_12BPC paddd m6, m0 dec bdmaxd jnz .12bpc_loop HSUM 32, 32, 6, 1, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_4x16_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; BLOCK 1 movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; BLOCK 2 movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 ; BLOCK 3 movq xm4, [srcq + 0*src_strideq] movq xm5, 
[srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; BLOCK 4 movq xm8, [srcq + 0*src_strideq] movq xm9, [srcq + 1*src_strideq] movq xm10, [srcq + 2*src_strideq] movq xm11, [srcq + src_stride3q ] psubw xm8, [dstq + 0*dst_strideq] psubw xm9, [dstq + 1*dst_strideq] psubw xm10, [dstq + 2*dst_strideq] psubw xm11, [dstq + dst_stride3q ] vinserti128 m4, m4, xm8, 1 vinserti128 m5, m5, xm9, 1 vinserti128 m6, m6, xm10, 1 vinserti128 m7, m7, xm11, 1 punpcklqdq m0, m0, m4 punpcklqdq m1, m1, m5 punpcklqdq m2, m2, m6 punpcklqdq m3, m3, m7 jmp m(satd_16x4_hbd).10bpc_main .12bpc: mov bdmaxd, 2 pxor m8, m8 .12bpc_loop: ; BLOCK 1 movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] pmovsxwd xm0, xm0 pmovsxwd xm1, xm1 pmovsxwd xm2, xm2 pmovsxwd xm3, xm3 ; BLOCK 2 movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] pmovsxwd xm4, xm4 pmovsxwd xm5, xm5 pmovsxwd xm6, xm6 pmovsxwd xm7, xm7 vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 HADAMARD_8X4_12BPC paddd m8, m0 dec bdmaxd jnz .12bpc_loop HSUM 32, 32, 8, 0, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_8x4_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc ; Load src rows movu xm0, [srcq + 0*src_strideq] movu xm1, [srcq + 1*src_strideq] movu xm2, [srcq + 2*src_strideq] movu xm3, [srcq + src_stride3q ] ; src -= dst psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] .10bpc_main: HADAMARD_4X4_PACKED 16, 32 pabsw m0, m0 pabsw m1, m1 paddw m0, m1 HSUM 16, 32, 0, 1, eax NORMALIZE4PT RET .12bpc: RESET_MM_PERMUTATION pmovzxwd m0, [srcq + 0*src_strideq] pmovzxwd m1, [srcq + 1*src_strideq] pmovzxwd m2, [srcq + 2*src_strideq] pmovzxwd m3, [srcq + src_stride3q ] pmovzxwd m4, [dstq + 0*dst_strideq] pmovzxwd m5, [dstq + 1*dst_strideq] pmovzxwd m6, [dstq + 2*dst_strideq] pmovzxwd m7, [dstq + dst_stride3q ] ; src -= dst psubd m0, m4 psubd m1, m5 psubd m2, m6 psubd m3, m7 .12bpc_main: HADAMARD_8X4_12BPC HSUM 32, 32, 0, 1, eax NORMALIZE4PT RET INIT_YMM avx2 cglobal satd_4x8_hbd, 5, 7, 12, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] cmp bdmaxd, (1 << 10) - 1 jne .12bpc movq xm0, [srcq + 0*src_strideq] movq xm1, [srcq + 1*src_strideq] movq xm2, [srcq + 2*src_strideq] movq xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] movq xm4, [srcq + 0*src_strideq] movq xm5, [srcq + 1*src_strideq] movq xm6, [srcq + 2*src_strideq] movq xm7, [srcq + src_stride3q ] ; This loads past the number of elements we are technically 
supposed ; to read, however, this should still be safe, since at least one ; valid element is in the memory address. psubw xm0, [dstq + 0*dst_strideq] psubw xm1, [dstq + 1*dst_strideq] psubw xm2, [dstq + 2*dst_strideq] psubw xm3, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] psubw xm4, [dstq + 0*dst_strideq] psubw xm5, [dstq + 1*dst_strideq] psubw xm6, [dstq + 2*dst_strideq] psubw xm7, [dstq + dst_stride3q ] punpcklqdq xm0, xm0, xm4 punpcklqdq xm1, xm1, xm5 punpcklqdq xm2, xm2, xm6 punpcklqdq xm3, xm3, xm7 ; Jump to HADAMARD_4X4_PACKED in 8x4 satd, this saves us some binary size ; by deduplicating the shared code. jmp m(satd_8x4_hbd).10bpc_main ; No return; we return in the other function. .12bpc: RESET_MM_PERMUTATION pmovzxwd xm0, [srcq + 0*src_strideq] pmovzxwd xm1, [srcq + 1*src_strideq] pmovzxwd xm2, [srcq + 2*src_strideq] pmovzxwd xm3, [srcq + src_stride3q ] lea srcq, [srcq + 4*src_strideq] pmovzxwd xm4, [dstq + 0*dst_strideq] pmovzxwd xm5, [dstq + 1*dst_strideq] pmovzxwd xm6, [dstq + 2*dst_strideq] pmovzxwd xm7, [dstq + dst_stride3q ] lea dstq, [dstq + 4*dst_strideq] ; src -= dst psubd xm0, xm4 psubd xm1, xm5 psubd xm2, xm6 psubd xm3, xm7 pmovzxwd xm4, [srcq + 0*src_strideq] pmovzxwd xm5, [srcq + 1*src_strideq] pmovzxwd xm6, [srcq + 2*src_strideq] pmovzxwd xm7, [srcq + src_stride3q ] pmovzxwd xm8, [dstq + 0*dst_strideq] pmovzxwd xm9, [dstq + 1*dst_strideq] pmovzxwd xm10, [dstq + 2*dst_strideq] pmovzxwd xm11, [dstq + dst_stride3q ] ; src -= dst (second block) psubd xm4, xm8 psubd xm5, xm9 psubd xm6, xm10 psubd xm7, xm11 vinserti128 m0, m0, xm4, 1 vinserti128 m1, m1, xm5, 1 vinserti128 m2, m2, xm6, 1 vinserti128 m3, m3, xm7, 1 ; Jump to HADAMARD_4X4_PACKED in 8x4 satd, this saves us some binary size ; by deduplicating the shared code. jmp m(satd_8x4_hbd).12bpc_main ; No return; we return in the other function. ; , %macro SATD_NXM 2 INIT_YMM avx2 cglobal satd_%1x%2_hbd, 5, 10, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3, nsrc_stride4, ndst_stride4, rows lea nsrc_stride4q, [4*src_strideq] lea ndst_stride4q, [4*dst_strideq] lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] neg nsrc_stride4q neg ndst_stride4q pxor m9, m9 ; Height contains the number of rows. mov rowsd, %2/8 .outer: mov bdmaxd, %1/8 ; Loop over blocks in same row. .loop: LOAD_DIFF_8X8 ; Fix up pointers and go to next block in same row. 
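; (LOAD_DIFF_8X8 has already advanced srcq/dstq by four rows, so adding the
; pre-negated 4*stride rewinds to the top of the block, and the +16 bytes
; step over the 8 just-processed 16-bit pixels to the next 8x8 block.)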
lea srcq, [srcq + nsrc_stride4q + 16] lea dstq, [dstq + ndst_stride4q + 16] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop lea srcq, [srcq + 8*src_strideq - (%1*16)/8] lea dstq, [dstq + 8*dst_strideq - (%1*16)/8] dec rowsd jnz .outer HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro %macro SATD_NX8 1 INIT_YMM avx2 cglobal satd_%1x8_hbd, 5, 9, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3, nsrc_stride4, ndst_stride4 lea nsrc_stride4q, [4*src_strideq] lea ndst_stride4q, [4*dst_strideq] lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] neg nsrc_stride4q neg ndst_stride4q pxor m9, m9 mov bdmaxd, %1/8 .loop: LOAD_DIFF_8X8 lea srcq, [srcq + nsrc_stride4q + 16] lea dstq, [dstq + ndst_stride4q + 16] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro %macro SATD_8XM 1 INIT_YMM avx2 cglobal satd_8x%1_hbd, 5, 7, 16, src, src_stride, dst, dst_stride, bdmax, \ src_stride3, dst_stride3 lea src_stride3q, [3*src_strideq] lea dst_stride3q, [3*dst_strideq] pxor m9, m9 mov bdmaxd, %1/8 .loop: LOAD_DIFF_8X8 lea srcq, [srcq + 4*src_strideq] lea dstq, [dstq + 4*dst_strideq] call m(satd_8x8_hbd_internal) dec bdmaxd jnz .loop HSUM 32, 32, 9, 0, eax NORMALIZE8PT RET %endmacro SATD_NXM 16, 16 SATD_NXM 32, 32 SATD_NXM 64, 64 SATD_NXM 128, 128 SATD_NXM 16, 32 SATD_NXM 16, 64 SATD_NXM 32, 16 SATD_NXM 32, 64 SATD_NXM 64, 16 SATD_NXM 64, 32 SATD_NXM 64, 128 SATD_NXM 128, 64 SATD_NX8 16 SATD_NX8 32 SATD_8XM 16 SATD_8XM 32 %endif ; ARCH_X86_64 rav1e-0.7.1/src/x86/sse.asm000064400000000000000000000365001046102023000133630ustar 00000000000000; Copyright (c) 2020-2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. %include "config.asm" %include "ext/x86/x86inc.asm" ; Must match crate::dist::rust::GET_WEIGHTED_SSE_SHIFT %define get_weighted_sse_shift 8 %define get_weighted_sse_round (1 << get_weighted_sse_shift >> 1) SECTION_RODATA 32 addsub: times 16 db 1, -1 rounding: times 4 dq get_weighted_sse_round SECTION .text %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; Consolidate scaling and rounding to one place so that it is easier to change. %macro SSE_SCALE_4X4 0 ; Multiply and shift using scalar code mov scaled, [scaleq] imul rax, scaleq add rax, get_weighted_sse_round shr rax, get_weighted_sse_shift %endmacro ; 1 is the input and output register. ; 2-3 are tmp registers. %macro SSE_SCALE 2-3 ; Reduce 32-bit sums to 64-bits sums. pshufd m%2, m%1, q3311 paddd m%1, m%2 LOAD_SCALES %2, %3 ; Multiply and shift with rounding. pmuludq m%1, m%2 mova m%2, [rounding] paddq m%1, m%2 psrlq m%1, get_weighted_sse_shift %endmacro %macro LOAD_SCALES_4X8 2 ; Load 1 scale from each of the 2 rows. movd m%1, [scaleq] movd m%2, [scaleq+scale_strideq] ; 64-bit unpack since our loads have only one value each. punpcklqdq m%1, m%2 %endmacro ; 2 is unused %macro LOAD_SCALES_8X4 2 ; Convert to 64-bits. ; It doesn't matter that the upper halves are full of garbage. 
movq m%1, [scaleq] pshufd m%1, m%1, q1100 %endmacro ; 2 is unused %macro LOAD_SCALES_16X4 2 pmovzxdq m%1, [scaleq] %endmacro ; Separate from other scale macros, since it uses 2 inputs. ; 1-2 are inputs regs and 1 is the output reg. ; 3-4 are tmp registers %macro SSE_SCALE_32X4 4 pshufd m%3, m%1, q3311 paddd m%1, m%3 pshufd m%3, m%2, q3311 paddd m%2, m%3 ; Load scale for 4x4 blocks and convert to 64-bits. ; It doesn't matter if the upper halves are full of garbage. ; raw load: 0, 1, 2, 3 | 4, 5, 6, 7 ; unpack low: 0, 1 | 4, 5 ; unpack high: 2, 3, | 6, 7 mova m%4, [scaleq] punpckldq m%3, m%4, m%4 punpckhdq m%4, m%4 pmuludq m%1, m%3 pmuludq m%2, m%4 mova m%3, [rounding] paddq m%1, m%3 paddq m%2, m%3 psrlq m%1, get_weighted_sse_shift psrlq m%2, get_weighted_sse_shift paddq m%1, m%2 %endmacro INIT_XMM ssse3 ; Use scale_stride's register to store src_stride3 cglobal weighted_sse_4x4, 6, 7, 5, \ src, src_stride, dst, dst_stride, scale, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] movq m0, [addsub] movd m1, [srcq] movd m2, [dstq] punpcklbw m1, m2 movd m2, [srcq+src_strideq] movd m3, [dstq+dst_strideq] punpcklbw m2, m3 pmaddubsw m1, m0 pmaddubsw m2, m0 pmaddwd m1, m1 pmaddwd m2, m2 paddd m1, m2 movd m2, [srcq+src_strideq*2] movd m3, [dstq+dst_strideq*2] punpcklbw m2, m3 movd m3, [srcq+src_stride3q] movd m4, [dstq+dst_stride3q] punpcklbw m3, m4 pmaddubsw m2, m0 pmaddubsw m3, m0 pmaddwd m2, m2 pmaddwd m3, m3 paddd m2, m3 paddd m1, m2 pshuflw m0, m1, q3232 paddd m0, m1 movd eax, m0 ; Multiply and shift using scalar code. SSE_SCALE_4X4 RET %macro WEIGHTED_SSE_4X8_KERNEL 0 movd m1, [srcq] movd m2, [srcq+src_strideq*4] punpckldq m1, m2 movd m2, [dstq] movd m3, [dstq+dst_strideq*4] add srcq, src_strideq add dstq, dst_strideq punpckldq m2, m3 punpcklbw m1, m2 movd m2, [srcq] movd m3, [srcq+src_strideq*4] punpckldq m2, m3 movd m3, [dstq] movd m4, [dstq+dst_strideq*4] add srcq, src_strideq add dstq, dst_strideq punpckldq m3, m4 punpcklbw m2, m3 pmaddubsw m1, m0 pmaddubsw m2, m0 pmaddwd m1, m1 pmaddwd m2, m2 paddd m1, m2 movd m2, [srcq] movd m3, [srcq+src_strideq*4] punpckldq m2, m3 movd m3, [dstq] movd m4, [dstq+dst_strideq*4] add srcq, src_strideq add dstq, dst_strideq punpckldq m3, m4 punpcklbw m2, m3 movd m3, [srcq] movd m4, [srcq+src_strideq*4] punpckldq m3, m4 movd m4, [dstq] movd m5, [dstq+dst_strideq*4] punpckldq m4, m5 punpcklbw m3, m4 pmaddubsw m2, m0 pmaddubsw m3, m0 pmaddwd m2, m2 pmaddwd m3, m3 paddd m2, m3 paddd m1, m2 %define LOAD_SCALES LOAD_SCALES_4X8 SSE_SCALE 1, 2, 3 %endmacro INIT_XMM ssse3 cglobal weighted_sse_4x8, 6, 6, 6, \ src, src_stride, dst, dst_stride, scale, scale_stride mova m0, [addsub] WEIGHTED_SSE_4X8_KERNEL pshufd m0, m1, q3232 paddq m1, m0 movq rax, m1 RET INIT_XMM ssse3 cglobal weighted_sse_4x16, 6, 6, 7, \ src, src_stride, dst, dst_stride, scale, scale_stride mova m0, [addsub] WEIGHTED_SSE_4X8_KERNEL ; Swap so the use of this macro will use m6 as the result SWAP 1, 6 lea scaleq, [scaleq+scale_strideq*2] ; Already incremented by stride 3 times, but must go up 5 more to get to 8 add srcq, src_strideq add dstq, dst_strideq lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] WEIGHTED_SSE_4X8_KERNEL paddq m6, m1 pshufd m0, m6, q3232 paddq m6, m0 movq rax, m6 RET %macro WEIGHTED_SSE_8X4_KERNEL 0 movq m1, [srcq] movq m2, [dstq] punpcklbw m1, m2 movq m2, [srcq+src_strideq] movq m3, [dstq+dst_strideq] punpcklbw m2, m3 pmaddubsw m1, m0 pmaddubsw m2, m0 pmaddwd m1, m1 pmaddwd m2, m2 paddd m1, m2 movq m2, [srcq+src_strideq*2] 
movq m3, [dstq+dst_strideq*2] punpcklbw m2, m3 movq m3, [srcq+src_stride3q] movq m4, [dstq+dst_stride3q] punpcklbw m3, m4 pmaddubsw m2, m0 pmaddubsw m3, m0 pmaddwd m2, m2 pmaddwd m3, m3 paddd m2, m3 paddd m1, m2 %define LOAD_SCALES LOAD_SCALES_8X4 SSE_SCALE 1, 2 %endmacro %macro WEIGHTED_SSE_16X4_KERNEL 0 pmovzxbw m0, [srcq] pmovzxbw m1, [dstq] psubw m0, m1 pmaddwd m0, m0 pmovzxbw m1, [srcq+src_strideq] pmovzxbw m2, [dstq+dst_strideq] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 pmovzxbw m1, [srcq+src_strideq*2] pmovzxbw m2, [dstq+dst_strideq*2] psubw m1, m2 pmaddwd m1, m1 pmovzxbw m2, [srcq+src_stride3q] pmovzxbw m3, [dstq+dst_stride3q] psubw m2, m3 pmaddwd m2, m2 paddd m1, m2 paddd m1, m0 %define LOAD_SCALES LOAD_SCALES_16X4 SSE_SCALE 1, 2 %endmacro %macro WEIGHTED_SSE_32X4_KERNEL 0 ; Unpacking high and low results in sums that are 8 samples apart. To ; correctly apply weights, two separate registers are needed to accumulate. mova m2, [srcq] mova m3, [dstq] punpcklbw m1, m2, m3 punpckhbw m2, m3 mova m4, [srcq+src_strideq] mova m5, [dstq+dst_strideq] punpcklbw m3, m4, m5 punpckhbw m4, m5 pmaddubsw m1, m0 pmaddubsw m2, m0 pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 pmaddwd m4, m4 ; Accumulate paddd m1, m3 paddd m2, m4 mova m4, [srcq+src_strideq*2] mova m5, [dstq+dst_strideq*2] punpcklbw m3, m4, m5 punpckhbw m4, m5 mova m6, [srcq+src_stride3q] mova m7, [dstq+dst_stride3q] punpcklbw m5, m6, m7 punpckhbw m6, m7 pmaddubsw m3, m0 pmaddubsw m4, m0 pmaddubsw m5, m0 pmaddubsw m6, m0 pmaddwd m3, m3 pmaddwd m4, m4 pmaddwd m5, m5 pmaddwd m6, m6 paddd m3, m5 paddd m4, m6 paddd m1, m3 paddd m2, m4 SSE_SCALE_32X4 1, 2, 3, 4 %endmacro %macro WEIGHTED_SSE 2 ; w, h %if %1 == 8 %if %2 == 4 ; Use scale_stride's register to store src_stride3 cglobal weighted_sse_%1x%2, 6, 7, 5, \ src, src_stride, dst, dst_stride, scale, \ src_stride3, dst_stride3 %else cglobal weighted_sse_%1x%2, 6, 9, 6, \ src, src_stride, dst, dst_stride, scale, scale_stride, \ src_stride3, dst_stride3, h %endif %elif %1 == 16 %if %2 == 4 ; Use scale_stride's register to store src_stride3 cglobal weighted_sse_%1x%2, 6, 7, 4, \ src, src_stride, dst, dst_stride, scale, \ src_stride3, dst_stride3 %else cglobal weighted_sse_%1x%2, 6, 9, 5, \ src, src_stride, dst, dst_stride, scale, scale_stride, \ src_stride3, dst_stride3, h %endif %elif %1 == 32 cglobal weighted_sse_%1x%2, 6, 9, 9, \ src, src_stride, dst, dst_stride, scale, scale_stride, \ src_stride3, dst_stride3, h %else ; > 32 cglobal weighted_sse_%1x%2, 6, 10, 9, \ src, src_stride, dst, dst_stride, scale, scale_stride, \ src_stride3, dst_stride3, h, w %endif ; === Setup === ; kernel_width/kernel_height: number of elements that the kernel processes. ; m0: except for when w == 16, m0 is used to hold a constant 1, -1... vector ; register for diffing the two sources. ; sum: The kernel stores it's results on m1. The last vector register is used ; unless only one iteration is done. ; Default the kernel width to the width of this function. %define kernel_width %1 %define kernel_height 4 %if %1 == 8 mova m0, [addsub] %endif %if %1 >= 32 mova m0, [addsub] ; Iterate multiple times when w > 32. %define kernel_width 32 %endif %if %1 > kernel_width || %2 > kernel_height ; Add onto the last used vector register. 
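; As a rough scalar sketch of what these kernels accumulate (hypothetical
; reference only, not assembled): for every 4x4 sub-block of the w x h area,
;     sse  = sum over the 16 pixels of (src - dst)^2
;     acc += (sse * scale[block] + rounding) >> get_weighted_sse_shift
; where scale appears to hold one u32 per 4x4 block (as the scaleq and
; scale_strideq stepping below implies) and acc is a 64-bit running total.
; The loops below tile the area 4 rows (and, for widths above 32, 32 columns)
; at a time, keeping that running total in the sum register defined next.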
%assign sum xmm_regs_used-1 %else ; Use the result from the kernel %define sum 1 %endif lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] %if %1 > kernel_width || %2 > kernel_height pxor m%[sum], m%[sum] %endif %if %2 > kernel_height mov hd, %2/kernel_height-1 .loop: %endif %if %1 > kernel_width mov wd, %1/kernel_width-1 .loop_horiz: %endif WEIGHTED_SSE_%[kernel_width]X%[kernel_height]_KERNEL %if %2 > kernel_height || %1 > kernel_width paddq m%[sum], m1 %endif %if %1 > kernel_width add scaleq, kernel_width*4/4 add srcq, kernel_width add dstq, kernel_width dec wq jge .loop_horiz %endif %if %2 > kernel_height ; Move down 4 rows. %if %1 > kernel_width ; src/dst is incremented by width when processing multi iteration rows. ; Reduce the offset by the width of the row. lea srcq, [srcq+src_strideq*4 - %1] lea dstq, [dstq+dst_strideq*4 - %1] ; The behaviour for scale is similar lea scaleq, [scaleq+scale_strideq - %1*4/4] %else lea srcq, [srcq+src_strideq*4] lea dstq, [dstq+dst_strideq*4] add scaleq, scale_strideq %endif dec hq jge .loop %endif %if mmsize == 16 pshufd m2, m%[sum], q3232 paddq m%[sum], m2 movq rax, m%[sum] %elif mmsize == 32 vextracti128 xm2, m%[sum], 1 paddq xm%[sum], xm2 pshufd xm2, xm%[sum], q3232 paddq xm%[sum], xm2 movq rax, xm%[sum] %endif RET %undef sum, kernel_width, res %endmacro INIT_XMM ssse3 WEIGHTED_SSE 8, 4 %if ARCH_X86_64 WEIGHTED_SSE 8, 8 WEIGHTED_SSE 8, 16 WEIGHTED_SSE 8, 32 %endif ; ARCH_X86_64 INIT_YMM avx2 WEIGHTED_SSE 16, 4 %if ARCH_X86_64 WEIGHTED_SSE 16, 8 WEIGHTED_SSE 16, 16 WEIGHTED_SSE 16, 32 WEIGHTED_SSE 16, 64 WEIGHTED_SSE 32, 8 WEIGHTED_SSE 32, 16 WEIGHTED_SSE 32, 32 WEIGHTED_SSE 32, 64 WEIGHTED_SSE 64, 16 WEIGHTED_SSE 64, 32 WEIGHTED_SSE 64, 64 WEIGHTED_SSE 64, 128 WEIGHTED_SSE 128, 64 WEIGHTED_SSE 128, 128 %endif ; ARCH_X86_64 INIT_XMM sse2 cglobal weighted_sse_4x4_hbd, 6, 8, 4, \ src, src_stride, dst, dst_stride, scale, scale_stride, \ src_stride3, dst_stride3 lea src_stride3q, [src_strideq*3] lea dst_stride3q, [dst_strideq*3] movq m0, [srcq] movq m1, [dstq] psubw m0, m1 pmaddwd m0, m0 movq m1, [srcq+src_strideq] movq m2, [dstq+dst_strideq] psubw m1, m2 pmaddwd m1, m1 paddd m0, m1 movq m1, [srcq+src_strideq*2] movq m2, [dstq+dst_strideq*2] psubw m1, m2 pmaddwd m1, m1 movq m2, [srcq+src_stride3q] movq m3, [dstq+dst_stride3q] psubw m2, m3 pmaddwd m2, m2 paddd m1, m2 paddd m0, m1 pshuflw m1, m0, q3232 paddd m0, m1 movd eax, m0 ; Multiply and shift using scalar code. SSE_SCALE_4X4 RET rav1e-0.7.1/src/x86/tables.asm000064400000000000000000001317001046102023000140410ustar 00000000000000; Copyright (c) 2019-2022, The rav1e contributors. All rights reserved ; ; This source code is subject to the terms of the BSD 2 Clause License and ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License ; was not distributed with this source code in the LICENSE file, you can ; obtain it at www.aomedia.org/license/software. If the Alliance for Open ; Media Patent License 1.0 was not distributed with this source code in the ; PATENTS file, you can obtain it at www.aomedia.org/license/patent. ; Copyright © 2018, VideoLAN and dav1d authors ; Copyright © 2018, Two Orioles, LLC ; All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions are met: ; ; 1. Redistributions of source code must retain the above copyright notice, this ; list of conditions and the following disclaimer. ; ; 2. 
Redistributions in binary form must reproduce the above copyright notice, ; this list of conditions and the following disclaimer in the documentation ; and/or other materials provided with the distribution. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "config.asm" %include "ext/x86/x86inc.asm" SECTION_RODATA 16 align 8, db 0 const mc_subpel_filters, db 0, 1, -3, 63, 4, -1, 0, 0, ; REGULAR db 0, 1, -5, 61, 9, -2, 0, 0, db 0, 1, -6, 58, 14, -4, 1, 0, db 0, 1, -7, 55, 19, -5, 1, 0, db 0, 1, -7, 51, 24, -6, 1, 0, db 0, 1, -8, 47, 29, -6, 1, 0, db 0, 1, -7, 42, 33, -6, 1, 0, db 0, 1, -7, 38, 38, -7, 1, 0, db 0, 1, -6, 33, 42, -7, 1, 0, db 0, 1, -6, 29, 47, -8, 1, 0, db 0, 1, -6, 24, 51, -7, 1, 0, db 0, 1, -5, 19, 55, -7, 1, 0, db 0, 1, -4, 14, 58, -6, 1, 0, db 0, 0, -2, 9, 61, -5, 1, 0, db 0, 0, -1, 4, 63, -3, 1, 0, db 0, 1, 14, 31, 17, 1, 0, 0, ; SMOOTH db 0, 0, 13, 31, 18, 2, 0, 0, db 0, 0, 11, 31, 20, 2, 0, 0, db 0, 0, 10, 30, 21, 3, 0, 0, db 0, 0, 9, 29, 22, 4, 0, 0, db 0, 0, 8, 28, 23, 5, 0, 0, db 0, -1, 8, 27, 24, 6, 0, 0, db 0, -1, 7, 26, 26, 7, -1, 0, db 0, 0, 6, 24, 27, 8, -1, 0, db 0, 0, 5, 23, 28, 8, 0, 0, db 0, 0, 4, 22, 29, 9, 0, 0, db 0, 0, 3, 21, 30, 10, 0, 0, db 0, 0, 2, 20, 31, 11, 0, 0, db 0, 0, 2, 18, 31, 13, 0, 0, db 0, 0, 1, 17, 31, 14, 1, 0, db -1, 1, -3, 63, 4, -1, 1, 0, ; SHARP db -1, 3, -6, 62, 8, -3, 2, -1, db -1, 4, -9, 60, 13, -5, 3, -1, db -2, 5, -11, 58, 19, -7, 3, -1, db -2, 5, -11, 54, 24, -9, 4, -1, db -2, 5, -12, 50, 30, -10, 4, -1, db -2, 5, -12, 45, 35, -11, 5, -1, db -2, 6, -12, 40, 40, -12, 6, -2, db -1, 5, -11, 35, 45, -12, 5, -2, db -1, 4, -10, 30, 50, -12, 5, -2, db -1, 4, -9, 24, 54, -11, 5, -2, db -1, 3, -7, 19, 58, -11, 5, -2, db -1, 3, -5, 13, 60, -9, 4, -1, db -1, 2, -3, 8, 62, -6, 3, -1, db 0, 1, -1, 4, 63, -3, 1, -1, db 0, 0, -2, 63, 4, -1, 0, 0, ; REGULAR 4 db 0, 0, -4, 61, 9, -2, 0, 0, db 0, 0, -5, 58, 14, -3, 0, 0, db 0, 0, -6, 55, 19, -4, 0, 0, db 0, 0, -6, 51, 24, -5, 0, 0, db 0, 0, -7, 47, 29, -5, 0, 0, db 0, 0, -6, 42, 33, -5, 0, 0, db 0, 0, -6, 38, 38, -6, 0, 0, db 0, 0, -5, 33, 42, -6, 0, 0, db 0, 0, -5, 29, 47, -7, 0, 0, db 0, 0, -5, 24, 51, -6, 0, 0, db 0, 0, -4, 19, 55, -6, 0, 0, db 0, 0, -3, 14, 58, -5, 0, 0, db 0, 0, -2, 9, 61, -4, 0, 0, db 0, 0, -1, 4, 63, -2, 0, 0, db 0, 0, 15, 31, 17, 1, 0, 0, ; SMOOTH 4 db 0, 0, 13, 31, 18, 2, 0, 0, db 0, 0, 11, 31, 20, 2, 0, 0, db 0, 0, 10, 30, 21, 3, 0, 0, db 0, 0, 9, 29, 22, 4, 0, 0, db 0, 0, 8, 28, 23, 5, 0, 0, db 0, 0, 7, 27, 24, 6, 0, 0, db 0, 0, 6, 26, 26, 6, 0, 0, db 0, 0, 6, 24, 27, 7, 0, 0, db 0, 0, 5, 23, 28, 8, 0, 0, db 0, 0, 4, 22, 29, 9, 0, 0, db 0, 0, 3, 21, 30, 10, 0, 0, db 0, 0, 2, 20, 31, 11, 0, 0, db 0, 0, 2, 18, 31, 13, 0, 0, db 0, 0, 1, 17, 31, 15, 0, 0, ; Bilin scaled being very rarely used, add a new table entry ; and use the put/prep_8tap_scaled 
code, thus acting as a ; scaled bilinear filter. db 0, 0, 0, 60, 4, 0, 0, 0, db 0, 0, 0, 56, 8, 0, 0, 0, db 0, 0, 0, 52, 12, 0, 0, 0, db 0, 0, 0, 48, 16, 0, 0, 0, db 0, 0, 0, 44, 20, 0, 0, 0, db 0, 0, 0, 40, 24, 0, 0, 0, db 0, 0, 0, 36, 28, 0, 0, 0, db 0, 0, 0, 32, 32, 0, 0, 0, db 0, 0, 0, 28, 36, 0, 0, 0, db 0, 0, 0, 24, 40, 0, 0, 0, db 0, 0, 0, 20, 44, 0, 0, 0, db 0, 0, 0, 16, 48, 0, 0, 0, db 0, 0, 0, 12, 52, 0, 0, 0, db 0, 0, 0, 8, 56, 0, 0, 0, db 0, 0, 0, 4, 60, 0, 0, 0 align 64, db 0 const filter_intra_taps, db -6, 10, -5, 2, -3, 1, -3, 1, ; 0 db -4, 6, -3, 2, -3, 2, -3, 1, db 0, 0, 10, 0, 1, 10, 1, 2, db 0, 0, 6, 0, 2, 6, 2, 2, db 0, 12, 0, 9, 0, 7, 10, 5, db 0, 2, 0, 2, 0, 2, 6, 3, db 0, 0, 0, 0, 0, 0, 0, 0, db 12, 0, 9, 0, 7, 0, 5, 0, db -10, 16, -6, 0, -4, 0, -2, 0, ; 1 db -10, 16, -6, 0, -4, 0, -2, 0, db 0, 0, 16, 0, 0, 16, 0, 0, db 0, 0, 16, 0, 0, 16, 0, 0, db 0, 10, 0, 6, 0, 4, 16, 2, db 0, 0, 0, 0, 0, 0, 16, 0, db 0, 0, 0, 0, 0, 0, 0, 0, db 10, 0, 6, 0, 4, 0, 2, 0, db -8, 8, -8, 0, -8, 0, -8, 0, ; 2 db -4, 4, -4, 0, -4, 0, -4, 0, db 0, 0, 8, 0, 0, 8, 0, 0, db 0, 0, 4, 0, 0, 4, 0, 0, db 0, 16, 0, 16, 0, 16, 8, 16, db 0, 0, 0, 0, 0, 0, 4, 0, db 0, 0, 0, 0, 0, 0, 0, 0, db 16, 0, 16, 0, 16, 0, 16, 0, db -2, 8, -1, 3, -1, 2, 0, 1, ; 3 db -1, 4, -1, 3, -1, 2, -1, 2, db 0, 0, 8, 0, 3, 8, 2, 3, db 0, 0, 4, 0, 3, 4, 2, 3, db 0, 10, 0, 6, 0, 4, 8, 2, db 0, 3, 0, 4, 0, 4, 4, 3, db 0, 0, 0, 0, 0, 0, 0, 0, db 10, 0, 6, 0, 4, 0, 3, 0, db -12, 14, -10, 0, -9, 0, -8, 0, ; 4 db -10, 12, -9, 1, -8, 0, -7, 0, db 0, 0, 14, 0, 0, 14, 0, 0, db 0, 0, 12, 0, 0, 12, 0, 1, db 0, 14, 0, 12, 0, 11, 14, 10, db 0, 0, 0, 0, 0, 1, 12, 1, db 0, 0, 0, 0, 0, 0, 0, 0, db 14, 0, 12, 0, 11, 0, 9, 0 align 64, db 0 const sgr_x_by_x, db 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, db 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, db 8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, db 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, db 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, db 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, db 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, db 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, db 0 align 8, db 0 const mc_warp_filter, db 0, 127, 0, 0, 0, 1, 0, 0, 0, 127, 0, 0, -1, 2, 0, 0, ; [-1, 0) db 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, 0, db 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, 0, db 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, 0, db 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, 0, db 2, 120, -7, 0, -11, 22, 2, 0, 2, 119, -8, 0, -12, 25, 2, 0, db 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, 0, db 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, 0, db 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, 0, db 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, 0, db 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, 0, db 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, 0, db 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, 0, db 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, 0, db 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, 
-18, 70, 4, 0, db 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, 0, db 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, 0, db 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, 0, db 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, 0, db 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, 0, db 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, 0, db 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, 0, db 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, 0, db 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, 0, db 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, 0, db 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, 0, db 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, 0, db 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, 0, db 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, 0, db 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, 0, db 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, 0, db 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, 0, db 0, 0, 1, 0, 0, 127, 0, 0, 0, -1, 2, 0, 0, 127, 0, 0, ; [0, 1) db 0, -3, 4, 1, 1, 127, -2, 0, 0, -5, 6, 1, 1, 127, -2, 0, db 0, -6, 8, 1, 2, 126, -3, 0, -1, -7, 11, 2, 2, 126, -4, -1, db -1, -8, 13, 2, 3, 125, -5, -1, -1, -10, 16, 3, 3, 124, -6, -1, db -1, -11, 18, 3, 4, 123, -7, -1, -1, -12, 20, 3, 4, 122, -7, -1, db -1, -13, 23, 3, 4, 121, -8, -1, -2, -14, 25, 4, 5, 120, -9, -1, db -1, -15, 27, 4, 5, 119, -10, -1, -1, -16, 30, 4, 5, 118, -11, -1, db -2, -17, 33, 5, 6, 116, -12, -1, -2, -17, 35, 5, 6, 114, -12, -1, db -2, -18, 38, 5, 6, 113, -13, -1, -2, -19, 41, 6, 7, 111, -14, -2, db -2, -19, 43, 6, 7, 110, -15, -2, -2, -20, 46, 6, 7, 108, -15, -2, db -2, -20, 49, 6, 7, 106, -16, -2, -2, -21, 51, 7, 7, 104, -16, -2, db -2, -21, 54, 7, 7, 102, -17, -2, -2, -21, 56, 7, 8, 100, -18, -2, db -2, -22, 59, 7, 8, 98, -18, -2, -2, -22, 62, 7, 8, 96, -19, -2, db -2, -22, 64, 7, 8, 94, -19, -2, -2, -22, 67, 8, 8, 91, -20, -2, db -2, -22, 69, 8, 8, 89, -20, -2, -2, -22, 72, 8, 8, 87, -21, -2, db -2, -21, 74, 8, 8, 84, -21, -2, -2, -22, 77, 8, 8, 82, -21, -2, db -2, -21, 79, 8, 8, 79, -21, -2, -2, -21, 82, 8, 8, 77, -22, -2, db -2, -21, 84, 8, 8, 74, -21, -2, -2, -21, 87, 8, 8, 72, -22, -2, db -2, -20, 89, 8, 8, 69, -22, -2, -2, -20, 91, 8, 8, 67, -22, -2, db -2, -19, 94, 8, 7, 64, -22, -2, -2, -19, 96, 8, 7, 62, -22, -2, db -2, -18, 98, 8, 7, 59, -22, -2, -2, -18, 100, 8, 7, 56, -21, -2, db -2, -17, 102, 7, 7, 54, -21, -2, -2, -16, 104, 7, 7, 51, -21, -2, db -2, -16, 106, 7, 6, 49, -20, -2, -2, -15, 108, 7, 6, 46, -20, -2, db -2, -15, 110, 7, 6, 43, -19, -2, -2, -14, 111, 7, 6, 41, -19, -2, db -1, -13, 113, 6, 5, 38, -18, -2, -1, -12, 114, 6, 5, 35, -17, -2, db -1, -12, 116, 6, 5, 33, -17, -2, -1, -11, 118, 5, 4, 30, -16, -1, db -1, -10, 119, 5, 4, 27, -15, -1, -1, -9, 120, 5, 4, 25, -14, -2, db -1, -8, 121, 4, 3, 23, -13, -1, -1, -7, 122, 4, 3, 20, -12, -1, db -1, -7, 123, 4, 3, 18, -11, -1, -1, -6, 124, 3, 3, 16, -10, -1, db -1, -5, 125, 3, 2, 13, -8, -1, -1, -4, 126, 2, 2, 11, -7, -1, db 0, -3, 126, 2, 1, 8, -6, 0, 0, -2, 127, 1, 1, 6, -5, 0, db 0, -2, 127, 1, 1, 4, -3, 0, 0, 0, 127, 0, 0, 2, -1, 0, db 0, 0, 127, 0, 0, 1, 0, 0, 0, 0, 127, 0, 0, -1, 2, 0, ; [1, 2) db 0, 1, 127, -1, 0, -3, 4, 0, 0, 1, 126, -2, 0, -4, 6, 1, db 0, 1, 126, -3, 0, -5, 8, 1, 0, 1, 125, -4, 0, -6, 11, 1, db 0, 1, 124, -4, 0, -7, 13, 1, 0, 2, 123, -5, 0, -8, 15, 1, db 0, 2, 122, -6, 0, -9, 18, 1, 0, 2, 121, -6, 0, -10, 20, 1, db 0, 2, 120, -7, 0, -11, 22, 2, 
0, 2, 119, -8, 0, -12, 25, 2, db 0, 3, 117, -8, 0, -13, 27, 2, 0, 3, 116, -9, 0, -13, 29, 2, db 0, 3, 114, -10, 0, -14, 32, 3, 0, 3, 113, -10, 0, -15, 35, 2, db 0, 3, 111, -11, 0, -15, 37, 3, 0, 3, 109, -11, 0, -16, 40, 3, db 0, 3, 108, -12, 0, -16, 42, 3, 0, 4, 106, -13, 0, -17, 45, 3, db 0, 4, 104, -13, 0, -17, 47, 3, 0, 4, 102, -14, 0, -17, 50, 3, db 0, 4, 100, -14, 0, -17, 52, 3, 0, 4, 98, -15, 0, -18, 55, 4, db 0, 4, 96, -15, 0, -18, 58, 3, 0, 4, 94, -16, 0, -18, 60, 4, db 0, 4, 91, -16, 0, -18, 63, 4, 0, 4, 89, -16, 0, -18, 65, 4, db 0, 4, 87, -17, 0, -18, 68, 4, 0, 4, 85, -17, 0, -18, 70, 4, db 0, 4, 82, -17, 0, -18, 73, 4, 0, 4, 80, -17, 0, -18, 75, 4, db 0, 4, 78, -18, 0, -18, 78, 4, 0, 4, 75, -18, 0, -17, 80, 4, db 0, 4, 73, -18, 0, -17, 82, 4, 0, 4, 70, -18, 0, -17, 85, 4, db 0, 4, 68, -18, 0, -17, 87, 4, 0, 4, 65, -18, 0, -16, 89, 4, db 0, 4, 63, -18, 0, -16, 91, 4, 0, 4, 60, -18, 0, -16, 94, 4, db 0, 3, 58, -18, 0, -15, 96, 4, 0, 4, 55, -18, 0, -15, 98, 4, db 0, 3, 52, -17, 0, -14, 100, 4, 0, 3, 50, -17, 0, -14, 102, 4, db 0, 3, 47, -17, 0, -13, 104, 4, 0, 3, 45, -17, 0, -13, 106, 4, db 0, 3, 42, -16, 0, -12, 108, 3, 0, 3, 40, -16, 0, -11, 109, 3, db 0, 3, 37, -15, 0, -11, 111, 3, 0, 2, 35, -15, 0, -10, 113, 3, db 0, 3, 32, -14, 0, -10, 114, 3, 0, 2, 29, -13, 0, -9, 116, 3, db 0, 2, 27, -13, 0, -8, 117, 3, 0, 2, 25, -12, 0, -8, 119, 2, db 0, 2, 22, -11, 0, -7, 120, 2, 0, 1, 20, -10, 0, -6, 121, 2, db 0, 1, 18, -9, 0, -6, 122, 2, 0, 1, 15, -8, 0, -5, 123, 2, db 0, 1, 13, -7, 0, -4, 124, 1, 0, 1, 11, -6, 0, -4, 125, 1, db 0, 1, 8, -5, 0, -3, 126, 1, 0, 1, 6, -4, 0, -2, 126, 1, db 0, 0, 4, -3, 0, -1, 127, 1, 0, 0, 2, -1, 0, 0, 127, 0, db 0, 0, 2, -1, 0, 0, 127, 0 ; dummy (replicate row index 191) ; Values that are 0 will never be used align 2, db 0 const dr_intra_derivative, dw 0, ; Angles: dw 1023, 0, ; 3, 93, 183 dw 547, ; 6, 96, 186 dw 372, 0, 0, ; 9, 99, 189 dw 273, ; 14, 104, 194 dw 215, 0, ; 17, 107, 197 dw 178, ; 20, 110, 200 dw 151, 0, ; 23, 113, 203 (113 & 203 are base angles) dw 132, ; 26, 116, 206 dw 116, 0, ; 29, 119, 209 dw 102, 0, ; 32, 122, 212 dw 90, ; 36, 126, 216 dw 80, 0, ; 39, 129, 219 dw 71, ; 42, 132, 222 dw 64, 0, ; 45, 135, 225 (45 & 135 are base angles) dw 57, ; 48, 138, 228 dw 51, 0, ; 51, 141, 231 dw 45, 0, ; 54, 144, 234 dw 40, ; 58, 148, 238 dw 35, 0, ; 61, 151, 241 dw 31, ; 64, 154, 244 dw 27, 0, ; 67, 157, 247 (67 & 157 are base angles) dw 23, ; 70, 160, 250 dw 19, 0, ; 73, 163, 253 dw 15, 0, ; 76, 166, 256 dw 11, 0, ; 81, 171, 261 dw 7, ; 84, 174, 264 dw 3 ; 87, 177, 267 ; Taken from the spec. 
Range is [-2048, 2047], mean is 0 and stddev is 512 align 2, db 0 const gaussian_sequence, dw 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, dw 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, dw 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, dw -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, dw 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, dw 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, dw 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, dw 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, dw 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, dw 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, dw 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, dw -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, dw 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, dw 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, dw -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, dw -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, dw -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, dw -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, dw 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, dw 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, dw 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, dw -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, dw -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, dw -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, dw 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, dw 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, dw 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, dw -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, dw 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, dw -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, dw 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, dw -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, dw 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, dw -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, dw -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, dw -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, dw -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, dw -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, dw 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, dw 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, dw -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, dw -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, dw 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, dw 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, dw -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, dw 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, dw 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, dw -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, dw 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, dw -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, dw 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, dw -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, dw -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, dw 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, dw -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, dw -1196, -288, -560, 1040, -472, 116, 
-848, -1116, 116, 636, 696, dw 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, dw 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, dw -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, dw 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, dw 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, dw 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, dw -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, dw -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, dw -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, dw 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, dw -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, dw -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, dw -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, dw -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, dw -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, dw 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, dw -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, dw -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, dw 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, dw -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, dw -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, dw -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, dw 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, dw -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, dw 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, dw 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, dw 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, dw -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, dw -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, dw 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, dw 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, dw -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, dw -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, dw -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, dw -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, dw 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, dw 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, dw 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, dw 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, dw 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, dw 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, dw 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, dw -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, dw 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, dw -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, dw -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, dw -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, dw 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, dw -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, dw -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, dw 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, dw 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, dw 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, dw 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, dw 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, dw 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, dw 648, -116, -180, 316, 476, 12, -564, 96, 476, 
-252, -364, dw -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, dw -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, dw -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, dw 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, dw -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, dw -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, dw 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, dw -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, dw 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, dw 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, dw 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, dw -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, dw 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, dw -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, dw 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, dw 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, dw 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, dw 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, dw -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, dw -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, dw 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, dw -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, dw 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, dw 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, dw 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, dw -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, dw -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, dw 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, dw 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, dw 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, dw -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, dw -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, dw 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, dw -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, dw -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, dw -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, dw 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, dw -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, dw 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, dw -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, dw 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, dw -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, dw 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, dw 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, dw 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, dw 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, dw -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, dw -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, dw -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, dw -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, dw 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, dw 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, dw 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, dw 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, dw -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, dw 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, dw -820, 148, 1112, 128, 164, 456, 
700, -924, 144, -668, -384, dw 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, dw 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, dw -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, dw -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, dw -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, dw -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, dw 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, dw -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, dw -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, dw -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, dw -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, dw 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, dw 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, dw -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, dw -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, dw 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, dw 428, -484 align 8, db 0 const resize_filter, db 0, 0, 0, -128, 0, 0, 0, 0, 0, 0, 1, -128, -2, 1, 0, 0, db 0, -1, 3, -127, -4, 2, -1, 0, 0, -1, 4, -127, -6, 3, -1, 0, db 0, -2, 6, -126, -8, 3, -1, 0, 0, -2, 7, -125, -11, 4, -1, 0, db 1, -2, 8, -125, -13, 5, -2, 0, 1, -3, 9, -124, -15, 6, -2, 0, db 1, -3, 10, -123, -18, 6, -2, 1, 1, -3, 11, -122, -20, 7, -3, 1, db 1, -4, 12, -121, -22, 8, -3, 1, 1, -4, 13, -120, -25, 9, -3, 1, db 1, -4, 14, -118, -28, 9, -3, 1, 1, -4, 15, -117, -30, 10, -4, 1, db 1, -5, 16, -116, -32, 11, -4, 1, 1, -5, 16, -114, -35, 12, -4, 1, db 1, -5, 17, -112, -38, 12, -4, 1, 1, -5, 18, -111, -40, 13, -5, 1, db 1, -5, 18, -109, -43, 14, -5, 1, 1, -6, 19, -107, -45, 14, -5, 1, db 1, -6, 19, -105, -48, 15, -5, 1, 1, -6, 19, -103, -51, 16, -5, 1, db 1, -6, 20, -101, -53, 16, -6, 1, 1, -6, 20, -99, -56, 17, -6, 1, db 1, -6, 20, -97, -58, 17, -6, 1, 1, -6, 20, -95, -61, 18, -6, 1, db 2, -7, 20, -93, -64, 18, -6, 2, 2, -7, 20, -91, -66, 19, -6, 1, db 2, -7, 20, -88, -69, 19, -6, 1, 2, -7, 20, -86, -71, 19, -6, 1, db 2, -7, 20, -84, -74, 20, -7, 2, 2, -7, 20, -81, -76, 20, -7, 1, db 2, -7, 20, -79, -79, 20, -7, 2, 1, -7, 20, -76, -81, 20, -7, 2, db 2, -7, 20, -74, -84, 20, -7, 2, 1, -6, 19, -71, -86, 20, -7, 2, db 1, -6, 19, -69, -88, 20, -7, 2, 1, -6, 19, -66, -91, 20, -7, 2, db 2, -6, 18, -64, -93, 20, -7, 2, 1, -6, 18, -61, -95, 20, -6, 1, db 1, -6, 17, -58, -97, 20, -6, 1, 1, -6, 17, -56, -99, 20, -6, 1, db 1, -6, 16, -53, -101, 20, -6, 1, 1, -5, 16, -51, -103, 19, -6, 1, db 1, -5, 15, -48, -105, 19, -6, 1, 1, -5, 14, -45, -107, 19, -6, 1, db 1, -5, 14, -43, -109, 18, -5, 1, 1, -5, 13, -40, -111, 18, -5, 1, db 1, -4, 12, -38, -112, 17, -5, 1, 1, -4, 12, -35, -114, 16, -5, 1, db 1, -4, 11, -32, -116, 16, -5, 1, 1, -4, 10, -30, -117, 15, -4, 1, db 1, -3, 9, -28, -118, 14, -4, 1, 1, -3, 9, -25, -120, 13, -4, 1, db 1, -3, 8, -22, -121, 12, -4, 1, 1, -3, 7, -20, -122, 11, -3, 1, db 1, -2, 6, -18, -123, 10, -3, 1, 0, -2, 6, -15, -124, 9, -3, 1, db 0, -2, 5, -13, -125, 8, -2, 1, 0, -1, 4, -11, -125, 7, -2, 0, db 0, -1, 3, -8, -126, 6, -2, 0, 0, -1, 3, -6, -127, 4, -1, 0, db 0, -1, 2, -4, -127, 3, -1, 0, 0, 0, 1, -2, -128, 1, 0, 0, align 16, db 0 ; Unused const obmc_masks, db 0, 0, ; 2 db 19, 0, ; 4 db 25, 14, 5, 0, ; 8 db 28, 22, 16, 11, 7, 3, 0, 0, ; 16 db 30, 27, 24, 21, 18, 15, 12, 10, 8, 6, 4, 3, 0, 0, 0, 0, ; 32 db 31, 29, 28, 26, 24, 23, 21, 20, 19, 17, 16, 14, 13, 12, 11, 9, db 8, 7, 6, 5, 4, 4, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0,